## Combing energy data

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from functools import reduce

In [2]:
cgas_df = pd.read_csv('cgas.csv')
crude_oil_df = pd.read_csv('crude_oil.csv')
diesel_df = pd.read_csv('diesel.csv')
gas_df = pd.read_csv('gas.csv')
hoil_df = pd.read_csv('hoil.csv')
jet_fuel_df = pd.read_csv('jet_fuel.csv')
propane_df = pd.read_csv('propane.csv')

In [3]:
df_names = [cgas_df,crude_oil_df,diesel_df,gas_df,hoil_df,jet_fuel_df,propane_df]

In [4]:
for df in df_names:
    print(df.head())

         Date  NY_cgas  US_cgas  Unnamed: 3
0  1986-06-02    0.468    0.445         NaN
1  1986-06-03    0.436    0.418         NaN
2  1986-06-04    0.418    0.398         NaN
3  1986-06-05    0.431    0.415         NaN
4  1986-06-06    0.421    0.403         NaN
         Date  WTI_crude_oil  Brent_crude_oil  Unnamed: 3
0  1986-01-02          25.56              NaN         NaN
1  1986-01-03          26.00              NaN         NaN
2  1986-01-06          26.53              NaN         NaN
3  1986-01-07          25.85              NaN         NaN
4  1986-01-08          25.87              NaN         NaN
         Date  NY_diesel  US__diesel  LA_diesel  Unnamed: 4
0  1996-04-17        NaN         NaN      0.905         NaN
1  1996-04-18        NaN         NaN      0.930         NaN
2  1996-04-19        NaN         NaN      0.940         NaN
3  1996-04-22        NaN         NaN      0.960         NaN
4  1996-04-23        NaN         NaN      0.955         NaN
         Date  LA_gas  Unnam

In [5]:
for df in df_names:
    print(df.dtypes)

Date           object
NY_cgas       float64
US_cgas       float64
Unnamed: 3    float64
dtype: object
Date                object
WTI_crude_oil      float64
Brent_crude_oil    float64
Unnamed: 3         float64
dtype: object
Date           object
NY_diesel     float64
US__diesel    float64
LA_diesel     float64
Unnamed: 4    float64
dtype: object
Date           object
LA_gas        float64
Unnamed: 2    float64
dtype: object
Date           object
NY_hoil       float64
Unnamed: 2    float64
dtype: object
Date            object
US_jet_fuel    float64
Unnamed: 2     float64
dtype: object
Date           object
TX_propane    float64
Unnamed: 2    float64
dtype: object


In [6]:
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Date'],
                                            how='left'), df_names)

In [7]:
#remove unnamed column
df_merged = df_merged.loc[:, ~df_merged.columns.str.contains('^Unnamed')]
df_merged.head()

Unnamed: 0,Date,NY_cgas,US_cgas,WTI_crude_oil,Brent_crude_oil,NY_diesel,US__diesel,LA_diesel,LA_gas,NY_hoil,US_jet_fuel,TX_propane
0,1986-06-02,0.468,0.445,13.8,,,,,,0.402,,
1,1986-06-03,0.436,0.418,13.35,,,,,,0.393,,
2,1986-06-04,0.418,0.398,13.15,,,,,,0.378,,
3,1986-06-05,0.431,0.415,13.21,,,,,,0.39,,
4,1986-06-06,0.421,0.403,12.73,,,,,,0.385,,


In [8]:
#drop NAN rows
df_merged = df_merged.dropna().reset_index()

In [9]:
df_merged['Date']

0       2006-06-14
1       2006-06-15
2       2006-06-16
3       2006-06-19
4       2006-06-20
           ...    
3966    2022-05-24
3967    2022-05-25
3968    2022-05-26
3969    2022-05-27
3970    2022-05-31
Name: Date, Length: 3971, dtype: object

In [10]:
from sklearn.preprocessing import StandardScaler

#normalize data using sklearn
normalizer = StandardScaler()
df_dropped = df_merged.drop('Date', axis = 1)
normalizer_merged_df = pd.DataFrame(normalizer.fit_transform(df_dropped), columns = df_dropped.columns)
normalizer_merged_df.insert(loc = 0, column = 'Date', value = df_merged['Date'])
normalizer_merged_df

Unnamed: 0,Date,index,NY_cgas,US_cgas,WTI_crude_oil,Brent_crude_oil,NY_diesel,US__diesel,LA_diesel,LA_gas,NY_hoil,US_jet_fuel,TX_propane
0,2006-06-14,-1.730724,-0.199180,0.005882,-0.105839,-0.416149,-0.170916,-0.030454,-0.001119,0.071327,-0.354853,-0.083801,0.444677
1,2006-06-15,-1.729860,-0.158731,0.004259,-0.077321,-0.395983,-0.169509,-0.040522,-0.036176,0.104172,-0.346284,-0.075459,0.444677
2,2006-06-16,-1.728996,-0.215359,-0.055776,-0.078618,-0.435927,-0.193430,-0.126815,-0.061008,0.144838,-0.384846,-0.126900,0.450259
3,2006-06-19,-1.728132,-0.233157,-0.039550,-0.101951,-0.382022,-0.259564,-0.218862,-0.112132,0.086967,-0.441976,-0.217270,0.430721
4,2006-06-20,-1.727268,-0.208887,-0.015212,-0.098062,-0.336649,-0.249715,-0.208794,-0.134043,0.102608,-0.430550,-0.206148,0.422348
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3966,2022-05-24,1.731287,2.853907,2.861612,1.770738,1.532564,2.470243,2.226125,2.524446,2.656740,2.533043,2.080909,0.743324
3967,2022-05-25,1.732151,2.933187,2.941118,1.784997,1.557383,2.587034,2.388645,2.560964,2.958606,2.651586,2.226891,0.743324
3968,2022-05-26,1.733015,2.980107,3.012511,1.928020,1.689236,2.540599,2.538221,2.743553,2.991452,2.604454,2.340896,0.743324
3969,2022-05-27,1.733879,3.263250,3.239672,1.874873,1.742753,2.527935,2.568423,2.939287,3.258908,2.520188,2.381215,0.743324


In [11]:
#save dataframe
df_merged.to_csv('energy_data.csv')