# Machine Learning Model

## Supervised ML 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## import dataset

In [2]:
supply_demand = pd.read_csv('../cleaned_data/supply_demand.csv')
import_export = pd.read_csv('../cleaned_data/import_export.csv')
price_by_type = pd.read_csv('../cleaned_data/price_by_type.csv')
gas_storage = pd.read_csv('../cleaned_data/storage_vol.csv')

## Preprocessing Data

In [3]:
supply_demand = supply_demand.drop(columns='Unnamed: 0')
import_export = import_export.drop(columns='Unnamed: 0')

In [4]:
supply_demand.head()

Unnamed: 0,Year,Month,Gas_Procution(Mmcf),Gas_Consumption(Mmcf)
0,2022,2,2856356,3040029.0
1,2022,1,3180818,3591557.0
2,2021,12,3266272,2979653.0
3,2021,11,3161306,2659971.0
4,2021,10,3219612,2237715.0


In [5]:
supply_demand=supply_demand.rename(columns={'Gas_Procution(Mmcf)': 'Gas_Production(Mmcf)'})

In [6]:
import_export.head()

Unnamed: 0,Year,Month,Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf)
0,2022,2,5.62,8.22,259389,545563
1,2022,1,6.87,7.04,296179,610102
2,2021,12,4.74,7.4,252626,620886
3,2021,11,5.18,8.1,242405,556982
4,2021,10,4.79,7.97,228203,545055


In [7]:
price_by_type.head()

Unnamed: 0,Year,Month,all_grades(Price_per_Gallon),regular(Price_per_Gallon),midgrade(Price_per_Gallon),premium(Price_per_Gallon),diesel(Price_per_Gallon)
0,2001,1,1.487,1.447,1.541,1.63,1.524
1,2001,2,1.49,1.45,1.544,1.635,1.492
2,2001,3,1.45,1.409,1.506,1.596,1.399
3,2001,4,1.591,1.552,1.646,1.732,1.422
4,2001,5,1.738,1.702,1.785,1.869,1.496


In [8]:
#change columns name
price_by_type = price_by_type.rename(columns={
    'all_grades(Price_per_Gallon)':'all_grades($/Gallon)',
    'regular(Price_per_Gallon)':'regular($/Gallon)','midgrade(Price_per_Gallon)':'midgrade($/Gallon)',
    'premium(Price_per_Gallon)':'premium($/Gallon)',
    'diesel(Price_per_Gallon)': 'diesel($/Gallon)'
})

In [9]:
price_by_type = price_by_type.sort_values('Year',ascending=False)
price_by_type

Unnamed: 0,Year,Month,all_grades($/Gallon),regular($/Gallon),midgrade($/Gallon),premium($/Gallon),diesel($/Gallon)
253,2022,2,3.611,3.517,3.939,4.210,4.032
252,2022,1,3.413,3.315,3.766,4.036,3.724
246,2021,7,3.231,3.136,3.577,3.823,3.339
240,2021,1,2.420,2.334,2.719,2.975,2.681
241,2021,2,2.587,2.501,2.884,3.140,2.847
...,...,...,...,...,...,...,...
8,2001,9,1.557,1.522,1.600,1.682,1.495
9,2001,10,1.357,1.315,1.409,1.499,1.348
10,2001,11,1.212,1.171,1.265,1.357,1.259
11,2001,12,1.127,1.086,1.179,1.271,1.167


In [10]:
gas_storage.head()

Unnamed: 0,Year,Month,Volume_Mmcf
0,2022,2,5997164.0
1,2022,1,6653327.0
2,2021,12,7647859.0
3,2021,11,7971480.0
4,2021,10,8103211.0


In [11]:
gas_storage = gas_storage.rename(columns={'Volume_Mmcf': 'Volumn(Mmcf)'})

In [12]:
## create new df contains all data we need
new_df = supply_demand.merge(import_export, how='right')
new_df1 = new_df.merge(price_by_type,how='right')
us_gas_data = new_df1.merge(gas_storage,how='right')
us_gas_data

Unnamed: 0,Year,Month,Gas_Production(Mmcf),Gas_Consumption(Mmcf),Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf),all_grades($/Gallon),regular($/Gallon),midgrade($/Gallon),premium($/Gallon),diesel($/Gallon),Volumn(Mmcf)
0,2022,2,2856356,3040029.0,5.62,8.22,259389,545563,3.611,3.517,3.939,4.210,4.032,5997164.0
1,2022,1,3180818,3591557.0,6.87,7.04,296179,610102,3.413,3.315,3.766,4.036,3.724,6653327.0
2,2021,12,3266272,2979653.0,4.74,7.40,252626,620886,3.406,3.307,3.771,4.034,3.641,7647859.0
3,2021,11,3161306,2659971.0,5.18,8.10,242405,556982,3.491,3.395,3.836,4.098,3.727,7971480.0
4,2021,10,3219612,2237715.0,4.79,7.97,228203,545055,3.384,3.291,3.723,3.979,3.612,8103211.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,2001,5,1763141,1522382.0,4.95,5.50,321878,28981,1.738,1.702,1.785,1.869,1.496,5749464.0
250,2001,4,1703310,1807170.0,5.35,5.65,318573,23637,1.591,1.552,1.646,1.732,1.422,5252851.0
251,2001,3,1766754,2246633.0,5.42,4.93,358103,32121,1.450,1.409,1.506,1.596,1.399,5041971.0
252,2001,2,1582557,2309464.0,6.45,5.80,328289,26882,1.490,1.450,1.544,1.635,1.492,5240820.0


In [13]:
annual_gas_production = us_gas_data.groupby(['Year']).mean()['Gas_Production(Mmcf)']
annual_gas_consumption = us_gas_data.groupby(['Year']).mean()['Gas_Consumption(Mmcf)']
annual_gas_import_p = us_gas_data.groupby(['Year']).mean()['Import_price($/Mcf)']
annual_gas_export_p = us_gas_data.groupby(['Year']).mean()['Export_price($/Mcf)']
annual_total_import = us_gas_data.groupby(['Year']).mean()['Total Imports (Mmcf)']
annual_total_export = us_gas_data.groupby(['Year']).mean()['Total Exports (Mmcf)']
annual_avg_all_grades_p = us_gas_data.groupby(['Year']).mean()['all_grades($/Gallon)']
annual_avg_reg_p = us_gas_data.groupby(['Year']).mean()['regular($/Gallon)']
annual_avg_midg_p = us_gas_data.groupby(['Year']).mean()['midgrade($/Gallon)']
annual_avg_pre_p = us_gas_data.groupby(['Year']).mean()['premium($/Gallon)']
annual_avg_dis_p = us_gas_data.groupby(['Year']).mean()['diesel($/Gallon)']
annual_avg_vol = us_gas_data.groupby(['Year']).mean()['Volumn(Mmcf)']

In [19]:
annual_gas_summary = pd.DataFrame({
    'production(Mmcf)': annual_gas_production,
    'consumption(Mmcf)': annual_gas_consumption,
    'vol(Mmcf)': annual_avg_vol,
    'total_import(Mmcf)': annual_total_import,
    'toal_export(Mmcf)': annual_total_export,
    'import_price($/Mmcf)': annual_gas_import_p,
    'export_price($/Mmcf)': annual_gas_export_p,
    'all_grades_p($/Gallon)': annual_avg_all_grades_p,
    'reg_p($/Gallon)': annual_avg_reg_p,
    'midg_p($/Gallon)': annual_avg_midg_p,
    'pre_p($/Gallon)': annual_avg_pre_p,
    'dis_p($/Gallon)': annual_avg_dis_p 
})

annual_gas_summary.head()

Unnamed: 0_level_0,production(Mmcf),consumption(Mmcf),vol(Mmcf),total_import(Mmcf),toal_export(Mmcf),import_price($/Mmcf),export_price($/Mmcf),all_grades_p($/Gallon),reg_p($/Gallon),midg_p($/Gallon),pre_p($/Gallon),dis_p($/Gallon)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2001,1714191.0,1853219.0,6335558.0,331411.5,31106.583333,4.3575,4.383333,1.466167,1.426,1.518833,1.607917,1.404083
2002,1657065.0,1918918.0,6715545.0,334622.0,43019.333333,3.138333,3.339167,1.381833,1.340333,1.43525,1.525833,1.315333
2003,1664530.0,1856375.0,6256805.0,328645.75,56660.25,5.180833,5.5825,1.601083,1.559167,1.656333,1.746,1.508333
2004,1626458.0,1866879.0,6460054.0,354879.916667,71178.083333,5.779167,6.069167,1.89125,1.84875,1.946917,2.038417,1.8075
2005,1577258.0,1834536.0,6492884.0,361752.833333,60716.666667,8.085,8.010833,2.312167,2.268167,2.367583,2.465917,2.398667


In [23]:
annual_gas_summary.columns

Index(['Year', 'production(Mmcf)', 'consumption(Mmcf)', 'vol(Mmcf)',
       'total_import(Mmcf)', 'toal_export(Mmcf)', 'import_price($/Mmcf)',
       'export_price($/Mmcf)', 'all_grades_p($/Gallon)', 'reg_p($/Gallon)',
       'midg_p($/Gallon)', 'pre_p($/Gallon)', 'dis_p($/Gallon)'],
      dtype='object')

In [20]:
annual_gas_summary = annual_gas_summary.reset_index()

In [24]:
test = annual_gas_summary.drop(columns = ['reg_p($/Gallon)',
       'midg_p($/Gallon)', 'pre_p($/Gallon)', 'dis_p($/Gallon)'])
test

Unnamed: 0,Year,production(Mmcf),consumption(Mmcf),vol(Mmcf),total_import(Mmcf),toal_export(Mmcf),import_price($/Mmcf),export_price($/Mmcf),all_grades_p($/Gallon)
0,2001,1714191.0,1853219.0,6335558.0,331411.5,31106.583333,4.3575,4.383333,1.466167
1,2002,1657065.0,1918918.0,6715545.0,334622.0,43019.333333,3.138333,3.339167,1.381833
2,2003,1664530.0,1856375.0,6256805.0,328645.75,56660.25,5.180833,5.5825,1.601083
3,2004,1626458.0,1866879.0,6460054.0,354879.916667,71178.083333,5.779167,6.069167,1.89125
4,2005,1577258.0,1834536.0,6492884.0,361752.833333,60716.666667,8.085,8.010833,2.312167
5,2006,1617473.0,1808256.0,6860307.0,348856.833333,60329.75,6.866667,6.831667,2.615083
6,2007,1683029.0,1925316.0,6837505.0,383965.166667,68537.833333,6.8725,6.905833,2.8455
7,2008,1759338.0,1939751.0,6592182.0,332008.416667,80271.75,8.774167,8.774167,3.305083
8,2009,1803995.0,1909173.0,7052343.0,312613.333333,89363.083333,4.136667,4.354167,2.396667
9,2010,1865156.0,2007233.0,7052461.0,311730.0,94732.416667,4.464167,5.03,2.833583


In [25]:
consumer_factors = pd.read_csv('Resources/Datasets/cleaned_data/consumer_factors.csv')
consumer_factors = consumer_factors.sort_values(by='Year')
consumer_df = consumers_factors.drop(columns='all_conusmers(Mmcf)')
consumer_df

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/Datasets/cleaned_data/consumer_factors.csv'