## Combining energy data

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
from functools import reduce

In [10]:
# Load the seven fuel price tables from CSV files in the working directory.
# The targets on the left line up one-to-one, in order, with the file names
# on the right.
(
    cgas_df,
    crude_oil_df,
    diesel_df,
    gas_df,
    hoil_df,
    jet_fuel_df,
    propane_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'cgas.csv',
        'crude_oil.csv',
        'diesel.csv',
        'gas.csv',
        'hoil.csv',
        'jet_fuel.csv',
        'propane.csv',
    )
)

In [11]:
# All loaded tables, in the order they will be previewed and merged.
# Despite the name, this list holds the DataFrames themselves, not their names.
df_names = [
    cgas_df,
    crude_oil_df,
    diesel_df,
    gas_df,
    hoil_df,
    jet_fuel_df,
    propane_df,
]

In [12]:
# Print the leading rows of each table, one table after another, to eyeball
# the column layout of every input.
for frame in df_names:
    preview = frame.head()
    print(preview)

         Date  NY_cgas  US_cgas  Unnamed: 3
0  1986-06-02    0.468    0.445         NaN
1  1986-06-03    0.436    0.418         NaN
2  1986-06-04    0.418    0.398         NaN
3  1986-06-05    0.431    0.415         NaN
4  1986-06-06    0.421    0.403         NaN
         Date  WTI_crude_oil  Brent_crude_oil  Unnamed: 3
0  1986-01-02          25.56              NaN         NaN
1  1986-01-03          26.00              NaN         NaN
2  1986-01-06          26.53              NaN         NaN
3  1986-01-07          25.85              NaN         NaN
4  1986-01-08          25.87              NaN         NaN
         Date  NY_diesel  US__diesel  LA_diesel  Unnamed: 4
0  1996-04-17        NaN         NaN      0.905         NaN
1  1996-04-18        NaN         NaN      0.930         NaN
2  1996-04-19        NaN         NaN      0.940         NaN
3  1996-04-22        NaN         NaN      0.960         NaN
4  1996-04-23        NaN         NaN      0.955         NaN
         Date  LA_gas  Unnam

In [13]:
# Print the per-column dtypes of each table, one table after another.
for frame in df_names:
    column_types = frame.dtypes
    print(column_types)

Date           object
NY_cgas       float64
US_cgas       float64
Unnamed: 3    float64
dtype: object
Date                object
WTI_crude_oil      float64
Brent_crude_oil    float64
Unnamed: 3         float64
dtype: object
Date           object
NY_diesel     float64
US__diesel    float64
LA_diesel     float64
Unnamed: 4    float64
dtype: object
Date           object
LA_gas        float64
Unnamed: 2    float64
dtype: object
Date           object
NY_hoil       float64
Unnamed: 2    float64
dtype: object
Date            object
US_jet_fuel    float64
Unnamed: 2     float64
dtype: object
Date           object
TX_propane    float64
Unnamed: 2    float64
dtype: object


In [14]:
# Outer-join every table on 'Date' so that a date present in any input is
# kept; columns from tables lacking that date are filled with NaN.
#
# The junk 'Unnamed: N' columns are stripped from each input *before* merging.
# Several inputs carry an identically named 'Unnamed: 2' column, so chaining
# merges makes pandas append '_x'/'_y' suffixes repeatedly and eventually
# produce two columns both called 'Unnamed: 2_x'. pandas >= 2.0 rejects that
# with MergeError (older versions only warned). Removing those columns up
# front does not affect the join itself, which is keyed on 'Date' only.
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=['Date'], how='outer'),
    (
        frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
        for frame in df_names
    ),
)

In [15]:
# Discard every column whose label starts with 'Unnamed' and keep the rest,
# then display the result.
unnamed_mask = df_merged.columns.str.contains('^Unnamed')
df_merged = df_merged.loc[:, ~unnamed_mask]
df_merged

Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,Brent_crude_oil,NY_diesel,US__diesel,LA_diesel,LA_gas,NY_hoil,US_jet_fuel,TX_propane
0,1986-06-02,0.468,0.445,13.80,,,,,,0.402,,
1,1986-06-03,0.436,0.418,13.35,,,,,,0.393,,
2,1986-06-04,0.418,0.398,13.15,,,,,,0.378,,
3,1986-06-05,0.431,0.415,13.21,,,,,,0.390,,
4,1986-06-06,0.421,0.403,12.73,,,,,,0.385,,
...,...,...,...,...,...,...,...,...,...,...,...,...
9317,2022-01-17,,,,87.82,,,,,,,
9318,2022-02-21,,,,98.95,,,,,,,
9319,2022-05-30,,,,123.01,,,,,,,
9320,1990-04-13,,,,,,,,,,0.528,


In [16]:
# Drop the price series that are not needed downstream, leaving 'Date',
# 'NY_cgas', 'US_cgas', 'WTI_crude_oil' and 'NY_hoil'.
#
# A new frame is bound instead of using inplace=True, so the frame produced by
# the earlier .loc column selection is never mutated in place.
# errors='ignore' makes the cell safe to re-run: labels that are already gone
# are skipped instead of raising KeyError. Trade-off: a misspelled label is
# also skipped silently, so check the displayed columns below.
df_merged = df_merged.drop(
    columns=[
        'Brent_crude_oil',
        'NY_diesel',
        'US__diesel',
        'LA_diesel',
        'LA_gas',
        'US_jet_fuel',
        'TX_propane',
    ],
    errors='ignore',
)
df_merged


Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,NY_hoil
0,1986-06-02,0.468,0.445,13.80,0.402
1,1986-06-03,0.436,0.418,13.35,0.393
2,1986-06-04,0.418,0.398,13.15,0.378
3,1986-06-05,0.431,0.415,13.21,0.390
4,1986-06-06,0.421,0.403,12.73,0.385
...,...,...,...,...,...
9317,2022-01-17,,,,
9318,2022-02-21,,,,
9319,2022-05-30,,,,
9320,1990-04-13,,,,


In [17]:
# Keep only the rows in which every remaining column has a value, then
# renumber the index from 0 so it is contiguous again.
complete_rows = df_merged.dropna()
df_merged = complete_rows.reset_index(drop=True)
df_merged

Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,NY_hoil
0,1986-06-02,0.468,0.445,13.80,0.402
1,1986-06-03,0.436,0.418,13.35,0.393
2,1986-06-04,0.418,0.398,13.15,0.378
3,1986-06-05,0.431,0.415,13.21,0.390
4,1986-06-06,0.421,0.403,12.73,0.385
...,...,...,...,...,...
9039,2022-05-24,3.849,3.794,112.55,3.911
9040,2022-05-25,3.898,3.843,112.88,3.994
9041,2022-05-26,3.927,3.887,116.19,3.961
9042,2022-05-27,4.102,4.027,114.96,3.902


In [18]:
from sklearn.preprocessing import StandardScaler

# Standardize every price column with scikit-learn's StandardScaler.
# 'Date' is set aside first because the scaler only accepts numeric input.
normalizer = StandardScaler()
df_dropped = df_merged.drop(columns='Date')
scaled_values = normalizer.fit_transform(df_dropped)

# fit_transform returns a bare array, so rebuild a frame with the original
# column labels and put 'Date' back as the first column.
# The insert aligns on index labels; both frames use a default 0..n-1 index
# here because df_merged had its index reset after the NaN rows were dropped.
normalizer_merged_df = pd.DataFrame(scaled_values, columns=df_dropped.columns)
normalizer_merged_df.insert(0, 'Date', df_merged['Date'])
normalizer_merged_df

Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,NY_hoil
0,1986-06-02,-1.024280,-1.033515,-1.091807,-1.043857
1,1986-06-03,-1.062590,-1.066443,-1.107214,-1.053922
2,1986-06-04,-1.084140,-1.090834,-1.114062,-1.070697
3,1986-06-05,-1.068576,-1.070102,-1.112008,-1.057277
4,1986-06-06,-1.080548,-1.084736,-1.128443,-1.062869
...,...,...,...,...,...
9039,2022-05-24,3.023402,3.050755,2.289297,2.880384
9040,2022-05-25,3.082064,3.110513,2.300596,2.973205
9041,2022-05-26,3.116782,3.164173,2.413927,2.936300
9042,2022-05-27,3.326289,3.334910,2.371813,2.870319


In [19]:
# Write the merged (un-normalized) table to 'energy_data.csv' in the working
# directory.
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if no consumer of this file needs it - confirm before changing.
# NOTE(review): in the cells visible here, normalizer_merged_df is computed
# but never saved - confirm that saving df_merged is the intent.
df_merged.to_csv(path_or_buf='energy_data.csv')