# Machine Learning Model

## Supervised ML 

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## import dataset

In [30]:
supply_demand = pd.read_csv('../cleaned_data/supply_demand.csv')
import_export = pd.read_csv('../cleaned_data/import_export.csv')
price_by_type = pd.read_csv('../cleaned_data/price_by_type.csv')
gas_storage = pd.read_csv('../cleaned_data/storage_vol.csv')

## Preprocessing Data

In [31]:
supply_demand = supply_demand.drop(columns='Unnamed: 0')
import_export = import_export.drop(columns='Unnamed: 0')

In [32]:
supply_demand.head()

Unnamed: 0,Year,Month,Gas_Procution(Mmcf),Gas_Consumption(Mmcf)
0,2022,2,2856356,3040029.0
1,2022,1,3180818,3591557.0
2,2021,12,3266272,2979653.0
3,2021,11,3161306,2659971.0
4,2021,10,3219612,2237715.0


In [33]:
supply_demand=supply_demand.rename(columns={'Gas_Procution(Mmcf)': 'Gas_Production(Mmcf)'})

In [34]:
import_export.head()

Unnamed: 0,Year,Month,Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf)
0,2022,2,5.62,8.22,259389,545563
1,2022,1,6.87,7.04,296179,610102
2,2021,12,4.74,7.4,252626,620886
3,2021,11,5.18,8.1,242405,556982
4,2021,10,4.79,7.97,228203,545055


In [35]:
price_by_type.head()

Unnamed: 0,Year,Month,all_grades(Price_per_Gallon),regular(Price_per_Gallon),midgrade(Price_per_Gallon),premium(Price_per_Gallon),diesel(Price_per_Gallon)
0,2001,1,1.487,1.447,1.541,1.63,1.524
1,2001,2,1.49,1.45,1.544,1.635,1.492
2,2001,3,1.45,1.409,1.506,1.596,1.399
3,2001,4,1.591,1.552,1.646,1.732,1.422
4,2001,5,1.738,1.702,1.785,1.869,1.496


In [36]:
#change columns name
price_by_type = price_by_type.rename(columns={
    'all_grades(Price_per_Gallon)':'all_grades($/Gallon)',
    'regular(Price_per_Gallon)':'regular($/Gallon)','midgrade(Price_per_Gallon)':'midgrade($/Gallon)',
    'premium(Price_per_Gallon)':'premium($/Gallon)',
    'diesel(Price_per_Gallon)': 'diesel($/Gallon)'
})

In [37]:
price_by_type = price_by_type.sort_values('Year',ascending=False)
price_by_type

Unnamed: 0,Year,Month,all_grades($/Gallon),regular($/Gallon),midgrade($/Gallon),premium($/Gallon),diesel($/Gallon)
253,2022,2,3.611,3.517,3.939,4.210,4.032
252,2022,1,3.413,3.315,3.766,4.036,3.724
246,2021,7,3.231,3.136,3.577,3.823,3.339
240,2021,1,2.420,2.334,2.719,2.975,2.681
241,2021,2,2.587,2.501,2.884,3.140,2.847
...,...,...,...,...,...,...,...
8,2001,9,1.557,1.522,1.600,1.682,1.495
9,2001,10,1.357,1.315,1.409,1.499,1.348
10,2001,11,1.212,1.171,1.265,1.357,1.259
11,2001,12,1.127,1.086,1.179,1.271,1.167


In [38]:
gas_storage.head()

Unnamed: 0,Year,Month,Volume_Mmcf
0,2022,2,5997164.0
1,2022,1,6653327.0
2,2021,12,7647859.0
3,2021,11,7971480.0
4,2021,10,8103211.0


In [39]:
gas_storage = gas_storage.rename(columns={'Volume_Mmcf': 'Volumn(Mmcf)'})

In [40]:
## create new df contains all data we need
new_df = supply_demand.merge(import_export, how='right')
new_df1 = new_df.merge(price_by_type,how='right')
us_gas_data = new_df1.merge(gas_storage,how='right')
us_gas_data

Unnamed: 0,Year,Month,Gas_Production(Mmcf),Gas_Consumption(Mmcf),Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf),all_grades($/Gallon),regular($/Gallon),midgrade($/Gallon),premium($/Gallon),diesel($/Gallon),Volumn(Mmcf)
0,2022,2,2856356,3040029.0,5.62,8.22,259389,545563,3.611,3.517,3.939,4.210,4.032,5997164.0
1,2022,1,3180818,3591557.0,6.87,7.04,296179,610102,3.413,3.315,3.766,4.036,3.724,6653327.0
2,2021,12,3266272,2979653.0,4.74,7.40,252626,620886,3.406,3.307,3.771,4.034,3.641,7647859.0
3,2021,11,3161306,2659971.0,5.18,8.10,242405,556982,3.491,3.395,3.836,4.098,3.727,7971480.0
4,2021,10,3219612,2237715.0,4.79,7.97,228203,545055,3.384,3.291,3.723,3.979,3.612,8103211.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,2001,5,1763141,1522382.0,4.95,5.50,321878,28981,1.738,1.702,1.785,1.869,1.496,5749464.0
250,2001,4,1703310,1807170.0,5.35,5.65,318573,23637,1.591,1.552,1.646,1.732,1.422,5252851.0
251,2001,3,1766754,2246633.0,5.42,4.93,358103,32121,1.450,1.409,1.506,1.596,1.399,5041971.0
252,2001,2,1582557,2309464.0,6.45,5.80,328289,26882,1.490,1.450,1.544,1.635,1.492,5240820.0


In [41]:
us_gas_data.to_csv('us_gas_data.csv', index=False)

In [42]:
# define features and target
y = us_gas_data['Month']
X = us_gas_data.drop(columns = 'Month')

In [43]:
# split data 
X_train,X_test, y_train, y_test = train_test_split(X, y, random_state=42,stratify=y)
X_train.shape()

TypeError: 'tuple' object is not callable

In [None]:
# use logistic Regression Model
classifier = LogistricRegreesion(solver='lbfgs', random_state=42)
