## Combining energy data

In [1]:
from datetime import datetime
from functools import reduce

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load each energy dataset from its CSV file in the working directory.
# The files are read in the listed order and bound to one frame apiece.
(
    cgas_df,
    crude_oil_df,
    diesel_df,
    gas_df,
    hoil_df,
    jet_fuel_df,
    propane_df,
) = map(
    pd.read_csv,
    [
        'cgas.csv',
        'crude_oil.csv',
        'diesel.csv',
        'gas.csv',
        'hoil.csv',
        'jet_fuel.csv',
        'propane.csv',
    ],
)

In [3]:
# All source frames, in the order they are later merged on 'Date'.
df_names = [
    cgas_df,
    crude_oil_df,
    diesel_df,
    gas_df,
    hoil_df,
    jet_fuel_df,
    propane_df,
]

In [4]:
# Print the first rows of every source frame to inspect its columns.
for frame in df_names:
    preview = frame.head()
    print(preview)

         Date  NY_cgas  US_cgas  Unnamed: 3
0  1986-06-02    0.468    0.445         NaN
1  1986-06-03    0.436    0.418         NaN
2  1986-06-04    0.418    0.398         NaN
3  1986-06-05    0.431    0.415         NaN
4  1986-06-06    0.421    0.403         NaN
         Date  WTI_crude_oil  Brent_crude_oil  Unnamed: 3
0  1986-01-02          25.56              NaN         NaN
1  1986-01-03          26.00              NaN         NaN
2  1986-01-06          26.53              NaN         NaN
3  1986-01-07          25.85              NaN         NaN
4  1986-01-08          25.87              NaN         NaN
         Date  NY_diesel  US__diesel  LA_diesel  Unnamed: 4
0  1996-04-17        NaN         NaN      0.905         NaN
1  1996-04-18        NaN         NaN      0.930         NaN
2  1996-04-19        NaN         NaN      0.940         NaN
3  1996-04-22        NaN         NaN      0.960         NaN
4  1996-04-23        NaN         NaN      0.955         NaN
         Date  LA_gas  Unnam

In [5]:
# Print the column dtypes of every source frame.
for frame in df_names:
    column_types = frame.dtypes
    print(column_types)

Date           object
NY_cgas       float64
US_cgas       float64
Unnamed: 3    float64
dtype: object
Date                object
WTI_crude_oil      float64
Brent_crude_oil    float64
Unnamed: 3         float64
dtype: object
Date           object
NY_diesel     float64
US__diesel    float64
LA_diesel     float64
Unnamed: 4    float64
dtype: object
Date           object
LA_gas        float64
Unnamed: 2    float64
dtype: object
Date           object
NY_hoil       float64
Unnamed: 2    float64
dtype: object
Date            object
US_jet_fuel    float64
Unnamed: 2     float64
dtype: object
Date           object
TX_propane    float64
Unnamed: 2    float64
dtype: object


In [6]:
# Outer-join every source frame on 'Date' so no trading day from any series is lost.
#
# The empty 'Unnamed: N' placeholder columns are stripped from each frame *before*
# merging. Several inputs share the same placeholder name (e.g. 'Unnamed: 2'), and
# chaining merges makes the automatic '_x'/'_y' suffixes collide with each other;
# recent pandas versions reject suffix-induced duplicate column names instead of
# silently producing them.
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=['Date'], how='outer'),
    (frame.loc[:, ~frame.columns.str.contains('^Unnamed')] for frame in df_names),
)

In [7]:
# Discard every column whose name starts with 'Unnamed'.
unnamed_mask = df_merged.columns.str.contains('^Unnamed')
df_merged = df_merged.loc[:, ~unnamed_mask]
df_merged

Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,Brent_crude_oil,NY_diesel,US__diesel,LA_diesel,LA_gas,NY_hoil,US_jet_fuel,TX_propane
0,1986-06-02,0.468,0.445,13.80,,,,,,0.402,,
1,1986-06-03,0.436,0.418,13.35,,,,,,0.393,,
2,1986-06-04,0.418,0.398,13.15,,,,,,0.378,,
3,1986-06-05,0.431,0.415,13.21,,,,,,0.390,,
4,1986-06-06,0.421,0.403,12.73,,,,,,0.385,,
...,...,...,...,...,...,...,...,...,...,...,...,...
9317,2022-01-17,,,,87.82,,,,,,,
9318,2022-02-21,,,,98.95,,,,,,,
9319,2022-05-30,,,,123.01,,,,,,,
9320,1990-04-13,,,,,,,,,,0.528,


In [8]:
# Keep only the rows in which every column has a value, then renumber
# the index from zero.
df_merged = df_merged.dropna(axis=0, how='any')
df_merged = df_merged.reset_index(drop=True)
df_merged

Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,Brent_crude_oil,NY_diesel,US__diesel,LA_diesel,LA_gas,NY_hoil,US_jet_fuel,TX_propane
0,2006-06-14,1.962,2.034,69.12,65.52,2.104,2.152,2.244,2.276,1.889,2.079,1.091
1,2006-06-15,1.987,2.033,69.78,66.04,2.105,2.145,2.220,2.297,1.895,2.085,1.091
2,2006-06-16,1.952,1.996,69.75,65.01,2.088,2.085,2.203,2.323,1.868,2.048,1.093
3,2006-06-19,1.941,2.006,69.21,66.40,2.041,2.021,2.168,2.286,1.828,1.983,1.086
4,2006-06-20,1.956,2.021,69.30,67.57,2.048,2.028,2.153,2.296,1.836,1.991,1.083
...,...,...,...,...,...,...,...,...,...,...,...,...
3966,2022-05-24,3.849,3.794,112.55,115.77,3.981,3.721,3.973,3.929,3.911,3.636,1.198
3967,2022-05-25,3.898,3.843,112.88,116.41,4.064,3.834,3.998,4.122,3.994,3.741,1.198
3968,2022-05-26,3.927,3.887,116.19,119.81,4.031,3.938,4.123,4.143,3.961,3.823,1.198
3969,2022-05-27,4.102,4.027,114.96,121.19,4.022,3.959,4.257,4.314,3.902,3.852,1.198


In [9]:
# Standardize every non-date column with sklearn's StandardScaler
# (each column is rescaled independently).
normalizer = StandardScaler()
df_dropped = df_merged.drop('Date', axis=1)

# fit_transform returns a bare array, so the source index is carried over
# explicitly: the 'Date' column inserted below is aligned by index label and
# would otherwise only line up when df_merged's index happens to be 0..n-1.
normalizer_merged_df = pd.DataFrame(
    normalizer.fit_transform(df_dropped),
    columns=df_dropped.columns,
    index=df_dropped.index,
)
normalizer_merged_df.insert(loc=0, column='Date', value=df_merged['Date'])
normalizer_merged_df

Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,Brent_crude_oil,NY_diesel,US__diesel,LA_diesel,LA_gas,NY_hoil,US_jet_fuel,TX_propane
0,2006-06-14,-0.199180,0.005882,-0.105839,-0.416149,-0.170916,-0.030454,-0.001119,0.071327,-0.354853,-0.083801,0.444677
1,2006-06-15,-0.158731,0.004259,-0.077321,-0.395983,-0.169509,-0.040522,-0.036176,0.104172,-0.346284,-0.075459,0.444677
2,2006-06-16,-0.215359,-0.055776,-0.078618,-0.435927,-0.193430,-0.126815,-0.061008,0.144838,-0.384846,-0.126900,0.450259
3,2006-06-19,-0.233157,-0.039550,-0.101951,-0.382022,-0.259564,-0.218862,-0.112132,0.086967,-0.441976,-0.217270,0.430721
4,2006-06-20,-0.208887,-0.015212,-0.098062,-0.336649,-0.249715,-0.208794,-0.134043,0.102608,-0.430550,-0.206148,0.422348
...,...,...,...,...,...,...,...,...,...,...,...,...
3966,2022-05-24,2.853907,2.861612,1.770738,1.532564,2.470243,2.226125,2.524446,2.656740,2.533043,2.080909,0.743324
3967,2022-05-25,2.933187,2.941118,1.784997,1.557383,2.587034,2.388645,2.560964,2.958606,2.651586,2.226891,0.743324
3968,2022-05-26,2.980107,3.012511,1.928020,1.689236,2.540599,2.538221,2.743553,2.991452,2.604454,2.340896,0.743324
3969,2022-05-27,3.263250,3.239672,1.874873,1.742753,2.527935,2.568423,2.939287,3.258908,2.520188,2.381215,0.743324


In [10]:
# Write the merged (unscaled) frame to disk. The row index is written too,
# so the file's first column has an empty header.
df_merged.to_csv('energy_data.csv', index=True)