algorithms for cleaning oil data, cleaning crude oil data and saving as csv

In [8]:
# using pandas functionality to manipulate data
import pandas as pd
# using path libray for importing and writing csvs
from pathlib import Path
# using hvplot to test the plotability of data
import hvplot.pandas

In [9]:
#using pandas read_csv function to pull in file and set index to the date
oil_df = pd.read_csv("../Resources/Crude Oil WTI Futures.csv", index_col='Date', parse_dates=True, infer_datetime_format=True)

# dropping NaN rows
oil_df = oil_df.dropna()

# displaying first 10 rows of the dataframe
oil_df.head(10)

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-27,53.14,53.7,53.71,52.13,712.88K,-1.94%
2020-01-24,54.19,55.69,55.95,53.85,586.20K,-2.52%
2020-01-23,55.59,56.11,56.27,54.77,704.00K,-2.03%
2020-01-22,56.74,58.26,58.38,56.03,620.12K,-2.74%
2020-01-21,58.34,59.17,59.73,57.68,50.18K,-0.66%
2020-01-17,58.54,58.59,58.98,58.27,122.56K,0.03%
2020-01-16,58.52,58.1,58.87,57.56,182.87K,1.23%
2020-01-15,57.81,58.2,58.36,57.36,433.17K,-0.72%
2020-01-14,58.23,58.03,58.72,57.72,507.71K,0.26%
2020-01-13,58.08,59.04,59.27,57.91,584.00K,-1.63%


In [10]:
# fixing the change % column

# replacing % with nothing and making into a float
oil_df['Change %'] = oil_df['Change %'].replace("%", "", regex=True)
# multiplying by dividing by 100 to account for % and make into number
oil_df['Change %'] = pd.to_numeric(oil_df['Change %']) / 100

oil_df.head(10)

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-27,53.14,53.7,53.71,52.13,712.88K,-0.0194
2020-01-24,54.19,55.69,55.95,53.85,586.20K,-0.0252
2020-01-23,55.59,56.11,56.27,54.77,704.00K,-0.0203
2020-01-22,56.74,58.26,58.38,56.03,620.12K,-0.0274
2020-01-21,58.34,59.17,59.73,57.68,50.18K,-0.0066
2020-01-17,58.54,58.59,58.98,58.27,122.56K,0.0003
2020-01-16,58.52,58.1,58.87,57.56,182.87K,0.0123
2020-01-15,57.81,58.2,58.36,57.36,433.17K,-0.0072
2020-01-14,58.23,58.03,58.72,57.72,507.71K,0.0026
2020-01-13,58.08,59.04,59.27,57.91,584.00K,-0.0163


In [11]:
# changing K and M char cells into floats and multiply by respective multiples to make into manipulatable numbers

for cell in oil_df['Vol.']:
    if cell.find('K') and cell.find('M') == False:
        # replacing vol. column K with nothing and making into a float
        cell = cell.replace('K', '', regex=True)
        # multiplying by 1000 to account for K
        cell = pd.to_numeric(cell) * 1000
    elif cell.find('M') and cell.find('K') == False:
        display(True)
        # replacing vol. column K with nothing and making into a float
        cell = cell.replace('M', '', regex=True)
        # multiplying by 1000000 to account for M
        cell = pd.to_numeric(cell) * 1000000

# displaying first 10 rows of the dataframe
oil_df.head(10)

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-27,53.14,53.7,53.71,52.13,712.88K,-0.0194
2020-01-24,54.19,55.69,55.95,53.85,586.20K,-0.0252
2020-01-23,55.59,56.11,56.27,54.77,704.00K,-0.0203
2020-01-22,56.74,58.26,58.38,56.03,620.12K,-0.0274
2020-01-21,58.34,59.17,59.73,57.68,50.18K,-0.0066
2020-01-17,58.54,58.59,58.98,58.27,122.56K,0.0003
2020-01-16,58.52,58.1,58.87,57.56,182.87K,0.0123
2020-01-15,57.81,58.2,58.36,57.36,433.17K,-0.0072
2020-01-14,58.23,58.03,58.72,57.72,507.71K,0.0026
2020-01-13,58.08,59.04,59.27,57.91,584.00K,-0.0163


In [12]:
# displaying the converted data from the oil csv file

oil_df.hvplot(
    # setting axes data
    x='Date', y='High',
    # setting axes labels
    xlabel='Date', ylabel='High',
    # title
    title='Crude Oil Close',
)

In [None]:
oil_df.to_csv("../Resources/clean-data/oil_futures_clean.csv")