In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
us_gas_data = pd.read_csv('Resources/Datasets/cleaned_data/us_gas_data.csv')
us_gas_data

Unnamed: 0,Year,Month,Gas_Production(Mmcf),Gas_Consumption(Mmcf),Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf),all_grades($/Gallon),regular($/Gallon),midgrade($/Gallon),premium($/Gallon),diesel($/Gallon),Volumn(Mmcf)
0,2022,2,2856356,3040029.0,5.62,8.22,259389,545563,3.611,3.517,3.939,4.210,4.032,5997164.0
1,2022,1,3180818,3591557.0,6.87,7.04,296179,610102,3.413,3.315,3.766,4.036,3.724,6653327.0
2,2021,12,3266272,2979653.0,4.74,7.40,252626,620886,3.406,3.307,3.771,4.034,3.641,7647859.0
3,2021,11,3161306,2659971.0,5.18,8.10,242405,556982,3.491,3.395,3.836,4.098,3.727,7971480.0
4,2021,10,3219612,2237715.0,4.79,7.97,228203,545055,3.384,3.291,3.723,3.979,3.612,8103211.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,2001,5,1763141,1522382.0,4.95,5.50,321878,28981,1.738,1.702,1.785,1.869,1.496,5749464.0
250,2001,4,1703310,1807170.0,5.35,5.65,318573,23637,1.591,1.552,1.646,1.732,1.422,5252851.0
251,2001,3,1766754,2246633.0,5.42,4.93,358103,32121,1.450,1.409,1.506,1.596,1.399,5041971.0
252,2001,2,1582557,2309464.0,6.45,5.80,328289,26882,1.490,1.450,1.544,1.635,1.492,5240820.0


In [3]:
us_gas_data.columns

Index(['Year', 'Month', 'Gas_Production(Mmcf)', 'Gas_Consumption(Mmcf)',
       'Import_price($/Mcf)', 'Export_price($/Mcf)', 'Total Imports (Mmcf)',
       'Total Exports (Mmcf)', 'all_grades($/Gallon)', 'regular($/Gallon)',
       'midgrade($/Gallon)', 'premium($/Gallon)', 'diesel($/Gallon)',
       'Volumn(Mmcf)'],
      dtype='object')

In [4]:
us_gas_data.dtypes

Year                       int64
Month                      int64
Gas_Production(Mmcf)       int64
Gas_Consumption(Mmcf)    float64
Import_price($/Mcf)      float64
Export_price($/Mcf)      float64
Total Imports (Mmcf)       int64
Total Exports (Mmcf)       int64
all_grades($/Gallon)     float64
regular($/Gallon)        float64
midgrade($/Gallon)       float64
premium($/Gallon)        float64
diesel($/Gallon)         float64
Volumn(Mmcf)             float64
dtype: object

In [6]:
us_gas_data = us_gas_data.astype({'Gas_Production(Mmcf)':float,
                                  'Total Imports (Mmcf)' : float,
                                  'Total Exports (Mmcf)': float})
us_gas_data.dtypes

Year                       int64
Month                      int64
Gas_Production(Mmcf)     float64
Gas_Consumption(Mmcf)    float64
Import_price($/Mcf)      float64
Export_price($/Mcf)      float64
Total Imports (Mmcf)     float64
Total Exports (Mmcf)     float64
all_grades($/Gallon)     float64
regular($/Gallon)        float64
midgrade($/Gallon)       float64
premium($/Gallon)        float64
diesel($/Gallon)         float64
Volumn(Mmcf)             float64
dtype: object

In [7]:
us_gas_data.describe()

Unnamed: 0,Year,Month,Gas_Production(Mmcf),Gas_Consumption(Mmcf),Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf),all_grades($/Gallon),regular($/Gallon),midgrade($/Gallon),premium($/Gallon),diesel($/Gallon),Volumn(Mmcf)
count,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0,254.0
mean,2011.086614,6.46063,2130387.0,2127122.0,4.459764,4.955591,288559.976378,168158.531496,2.629945,2.562823,2.753433,2.909291,2.798303,6906497.0
std,6.121334,3.473847,507150.7,474056.9,2.161772,1.964109,58264.662838,146897.706073,0.718712,0.713631,0.736901,0.761305,0.8441,803170.7
min,2001.0,1.0,1400941.0,1368369.0,1.51,2.04,174225.0,23637.0,1.127,1.086,1.179,1.271,1.152,5041971.0
25%,2006.0,3.0,1694616.0,1743070.0,2.7325,3.51,238368.75,64439.5,2.1985,2.10925,2.28775,2.43075,2.31125,6288639.0
50%,2011.0,6.0,1994746.0,2068744.0,4.035,4.45,282064.5,118918.5,2.6405,2.555,2.786,2.9505,2.8395,7002317.0
75%,2016.0,9.0,2428004.0,2428177.0,5.6275,6.015,333876.25,224495.75,3.124,3.062,3.24775,3.47025,3.374,7545798.0
max,2022.0,12.0,3266272.0,3591557.0,11.99,12.68,426534.0,620886.0,4.114,4.062,4.181,4.298,4.703,8384087.0


In [9]:
#group by data 'year'- all gas data
annual_gas_production = us_gas_data.groupby(['Year']).mean()['Gas_Production(Mmcf)']
annual_gas_consumption = us_gas_data.groupby(['Year']).mean()['Gas_Consumption(Mmcf)']
annual_gas_import_p = us_gas_data.groupby(['Year']).mean()['Import_price($/Mcf)']
annual_gas_export_p = us_gas_data.groupby(['Year']).mean()['Export_price($/Mcf)']
annual_total_import = us_gas_data.groupby(['Year']).mean()['Total Imports (Mmcf)']
annual_total_export = us_gas_data.groupby(['Year']).mean()['Total Exports (Mmcf)']
annual_avg_all_grades_p = us_gas_data.groupby(['Year']).mean()['all_grades($/Gallon)']
annual_avg_reg_p = us_gas_data.groupby(['Year']).mean()['regular($/Gallon)']
annual_avg_midg_p = us_gas_data.groupby(['Year']).mean()['midgrade($/Gallon)']
annual_avg_pre_p = us_gas_data.groupby(['Year']).mean()['premium($/Gallon)']
annual_avg_dis_p = us_gas_data.groupby(['Year']).mean()['diesel($/Gallon)']
annual_avg_vol = us_gas_data.groupby(['Year']).mean()['Volumn(Mmcf)']

In [10]:
# create annual summary, reorder columns
annual_gas_summary = pd.DataFrame({
    'avg_production(Mmcf)': annual_gas_production,
    'avg_consumption(Mmcf)': annual_gas_consumption,
    'avg_vol(Mmcf)': annual_avg_vol,
    'avg_total_import(Mmcf)': annual_total_import,
    'avg_toal_export(Mmcf)': annual_total_export,
    'avg_import_price($/Mmcf)': annual_gas_import_p,
    'avg_export_price($/Mmcf)': annual_gas_export_p,
    'avg_all_grades_p($/Gallon)': annual_avg_all_grades_p,
    'avg_reg_p($/Gallon)': annual_avg_reg_p,
    'avg_midg_p($/Gallon)': annual_avg_midg_p,
    'avg_pre_p($/Gallon)': annual_avg_pre_p,
    'avg_dis_p($/Gallon)': annual_avg_dis_p 
})

annual_gas_summary.head()

Unnamed: 0_level_0,avg_production(Mmcf),avg_consumption(Mmcf),avg_vol(Mmcf),avg_total_import(Mmcf),avg_toal_export(Mmcf),avg_import_price($/Mmcf),avg_export_price($/Mmcf),avg_all_grades_p($/Gallon),avg_reg_p($/Gallon),avg_midg_p($/Gallon),avg_pre_p($/Gallon),avg_dis_p($/Gallon)
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2001,1714191.0,1853219.0,6335558.0,331411.5,31106.583333,4.3575,4.383333,1.466167,1.426,1.518833,1.607917,1.404083
2002,1657065.0,1918918.0,6715545.0,334622.0,43019.333333,3.138333,3.339167,1.381833,1.340333,1.43525,1.525833,1.315333
2003,1664530.0,1856375.0,6256805.0,328645.75,56660.25,5.180833,5.5825,1.601083,1.559167,1.656333,1.746,1.508333
2004,1626458.0,1866879.0,6460054.0,354879.916667,71178.083333,5.779167,6.069167,1.89125,1.84875,1.946917,2.038417,1.8075
2005,1577258.0,1834536.0,6492884.0,361752.833333,60716.666667,8.085,8.010833,2.312167,2.268167,2.367583,2.465917,2.398667


In [11]:
four_more_factors_yrly = pd.read_csv('Resources/Datasets/cleaned_data/4_more_factors.csv')
four_more_factors_yrly

Unnamed: 0,Year,inflation(%),stock_to_GDP(%),working_population,Interest_Rate(%)
0,2001,2.826171,132.148,181476647,3.41
1,2002,1.586032,101.0791,183792729,1.173333
2,2003,2.270095,124.5066,186939817,2.104167
3,2004,2.677237,133.6506,188763071,2.395833
4,2005,3.392747,130.4083,191024953,4.25
5,2006,3.225944,141.6542,193219398,6.020833
6,2007,2.852672,137.8527,195663562,5.791667
7,2008,3.8391,78.7766,196691536,2.166667
8,2009,-0.355546,104.3488,197897475,0.5
9,2010,1.640043,115.2841,199183839,0.729167


In [12]:
annual_gas_summary_more = annual_gas_summary.merge(four_more_factors_yrly, how='inner', on='Year')
annual_gas_summary_more

Unnamed: 0,Year,avg_production(Mmcf),avg_consumption(Mmcf),avg_vol(Mmcf),avg_total_import(Mmcf),avg_toal_export(Mmcf),avg_import_price($/Mmcf),avg_export_price($/Mmcf),avg_all_grades_p($/Gallon),avg_reg_p($/Gallon),avg_midg_p($/Gallon),avg_pre_p($/Gallon),avg_dis_p($/Gallon),inflation(%),stock_to_GDP(%),working_population,Interest_Rate(%)
0,2001,1714191.0,1853219.0,6335558.0,331411.5,31106.583333,4.3575,4.383333,1.466167,1.426,1.518833,1.607917,1.404083,2.826171,132.148,181476647,3.41
1,2002,1657065.0,1918918.0,6715545.0,334622.0,43019.333333,3.138333,3.339167,1.381833,1.340333,1.43525,1.525833,1.315333,1.586032,101.0791,183792729,1.173333
2,2003,1664530.0,1856375.0,6256805.0,328645.75,56660.25,5.180833,5.5825,1.601083,1.559167,1.656333,1.746,1.508333,2.270095,124.5066,186939817,2.104167
3,2004,1626458.0,1866879.0,6460054.0,354879.916667,71178.083333,5.779167,6.069167,1.89125,1.84875,1.946917,2.038417,1.8075,2.677237,133.6506,188763071,2.395833
4,2005,1577258.0,1834536.0,6492884.0,361752.833333,60716.666667,8.085,8.010833,2.312167,2.268167,2.367583,2.465917,2.398667,3.392747,130.4083,191024953,4.25
5,2006,1617473.0,1808256.0,6860307.0,348856.833333,60329.75,6.866667,6.831667,2.615083,2.569167,2.673,2.775917,2.705,3.225944,141.6542,193219398,6.020833
6,2007,1683029.0,1925316.0,6837505.0,383965.166667,68537.833333,6.8725,6.905833,2.8455,2.798583,2.905667,3.0095,2.881667,2.852672,137.8527,195663562,5.791667
7,2008,1759338.0,1939751.0,6592182.0,332008.416667,80271.75,8.774167,8.774167,3.305083,3.251833,3.372917,3.49175,3.806833,3.8391,78.7766,196691536,2.166667
8,2009,1803995.0,1909173.0,7052343.0,312613.333333,89363.083333,4.136667,4.354167,2.396667,2.3435,2.4645,2.58125,2.463667,-0.355546,104.3488,197897475,0.5
9,2010,1865156.0,2007233.0,7052461.0,311730.0,94732.416667,4.464167,5.03,2.833583,2.780333,2.900917,3.020083,2.989333,1.640043,115.2841,199183839,0.729167


In [None]:
# annual_gas_summary_more.to_csv('Resources/cleaned_data/14_variables.csv', index=False)

In [13]:
annual_gas_summary.dtypes

avg_production(Mmcf)          float64
avg_consumption(Mmcf)         float64
avg_vol(Mmcf)                 float64
avg_total_import(Mmcf)        float64
avg_toal_export(Mmcf)         float64
avg_import_price($/Mmcf)      float64
avg_export_price($/Mmcf)      float64
avg_all_grades_p($/Gallon)    float64
avg_reg_p($/Gallon)           float64
avg_midg_p($/Gallon)          float64
avg_pre_p($/Gallon)           float64
avg_dis_p($/Gallon)           float64
dtype: object

# Multilinear Regression with 4 more factors 
### Inflation, Working population, Stock market to GDP Percentage, and Interest_rate

In [14]:
#initialize X, y
X = annual_gas_summary_more.drop(columns = ['avg_all_grades_p($/Gallon)'])
y = annual_gas_summary_more['avg_all_grades_p($/Gallon)'].values.reshape(-1,1)

In [15]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
#Scale data
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()
X_train_s = data_scaler.fit_transform(X_train)
X_test_s = data_scaler.fit_transform(X_test)

In [17]:
#create instance of model
model = LinearRegression()

In [18]:
#train the model
model.fit(X_train_s, y_train)

LinearRegression()

In [22]:
#create prediction to test
y_pred = model.predict(X_test_s)

In [23]:
#results
print('Multi-Linear_Regression Model Result:')
print(y_pred.shape)
print('---------------------------------------')
print('intercept: \n', model.intercept_)
print('Coefficients: \n', model.coef_)

Multi-Linear_Regression Model Result:
(5, 1)
---------------------------------------
intercept: 
 [2.63623333]
Coefficients: 
 [[ 0.02169941 -0.01666174 -0.01625995 -0.00268678  0.01466605 -0.0228708
   0.05621146 -0.06697506  0.32975795  0.23605075  0.06660794  0.03536866
   0.00961803  0.00816742  0.00710507 -0.0106583 ]]


In [24]:
import sklearn.metrics as skm
print('MAE =', round(skm.mean_absolute_error(y_test, y_pred), 2))
print("MSE =", round(skm.mean_squared_error(y_test, y_pred), 2)) 
print("MAE =", round(skm.median_absolute_error(y_test, y_pred), 2)) 
print("EVS =", round(skm.explained_variance_score(y_test, y_pred), 2)) 
print("R2 score =", round(skm.r2_score(y_test, y_pred), 2))

MAE = 0.15
MSE = 0.02
MAE = 0.14
EVS = 1.0
R2 score = 0.95


In [28]:
import statsmodels.api as sm

In [29]:
#initialize X, y
X = annual_gas_summary_more.drop(columns = ['avg_all_grades_p($/Gallon)'])
y = annual_gas_summary_more['avg_all_grades_p($/Gallon)']

In [30]:
X = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [31]:
sm_model = sm.OLS(y, X).fit()

In [32]:
predictions = sm_model.predict(X)

In [33]:
print(sm_model.summary())

                                OLS Regression Results                                
Dep. Variable:     avg_all_grades_p($/Gallon)   R-squared:                       1.000
Model:                                    OLS   Adj. R-squared:                  1.000
Method:                         Least Squares   F-statistic:                 5.064e+04
Date:                        Wed, 25 May 2022   Prob (F-statistic):           1.27e-07
Time:                                20:00:32   Log-Likelihood:                 104.56
No. Observations:                          20   AIC:                            -175.1
Df Residuals:                               3   BIC:                            -158.2
Df Model:                                  16                                         
Covariance Type:                    nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------

## Visualize with the effective factors (only 2)

In [None]:
data = annual_gas_summary_more[['avg_all_grades_p($/Gallon)', 'avg_reg_p($/Gallon)','avg_pre_p($/Gallon)']]
sns.pairplot(data, hue = 'avg_all_grades_p($/Gallon)',palette='husl')

# Multilinear regression without the dummy variables

In [34]:
annual_gas_summary_less = annual_gas_summary_more.drop(columns=["avg_reg_p($/Gallon)","avg_midg_p($/Gallon)",
                                                                "avg_pre_p($/Gallon)","avg_dis_p($/Gallon)"])
annual_gas_summary_less

Unnamed: 0,Year,avg_production(Mmcf),avg_consumption(Mmcf),avg_vol(Mmcf),avg_total_import(Mmcf),avg_toal_export(Mmcf),avg_import_price($/Mmcf),avg_export_price($/Mmcf),avg_all_grades_p($/Gallon),inflation(%),stock_to_GDP(%),working_population,Interest_Rate(%)
0,2001,1714191.0,1853219.0,6335558.0,331411.5,31106.583333,4.3575,4.383333,1.466167,2.826171,132.148,181476647,3.41
1,2002,1657065.0,1918918.0,6715545.0,334622.0,43019.333333,3.138333,3.339167,1.381833,1.586032,101.0791,183792729,1.173333
2,2003,1664530.0,1856375.0,6256805.0,328645.75,56660.25,5.180833,5.5825,1.601083,2.270095,124.5066,186939817,2.104167
3,2004,1626458.0,1866879.0,6460054.0,354879.916667,71178.083333,5.779167,6.069167,1.89125,2.677237,133.6506,188763071,2.395833
4,2005,1577258.0,1834536.0,6492884.0,361752.833333,60716.666667,8.085,8.010833,2.312167,3.392747,130.4083,191024953,4.25
5,2006,1617473.0,1808256.0,6860307.0,348856.833333,60329.75,6.866667,6.831667,2.615083,3.225944,141.6542,193219398,6.020833
6,2007,1683029.0,1925316.0,6837505.0,383965.166667,68537.833333,6.8725,6.905833,2.8455,2.852672,137.8527,195663562,5.791667
7,2008,1759338.0,1939751.0,6592182.0,332008.416667,80271.75,8.774167,8.774167,3.305083,3.8391,78.7766,196691536,2.166667
8,2009,1803995.0,1909173.0,7052343.0,312613.333333,89363.083333,4.136667,4.354167,2.396667,-0.355546,104.3488,197897475,0.5
9,2010,1865156.0,2007233.0,7052461.0,311730.0,94732.416667,4.464167,5.03,2.833583,1.640043,115.2841,199183839,0.729167


In [37]:
#initialize X, y
X = annual_gas_summary_less.drop(columns = ['avg_all_grades_p($/Gallon)'])
y = annual_gas_summary_less['avg_all_grades_p($/Gallon)']

In [38]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [39]:
#Scale data
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()
X_train_s = data_scaler.fit_transform(X_train)
X_test_s = data_scaler.fit_transform(X_test)

In [40]:
#create instance of model
model = LinearRegression()

In [41]:
#train the model
model.fit(X_train_s, y_train)

LinearRegression()

In [43]:
#create prediction to test
y_pred = model.predict(X_test_s)

In [44]:
#results
print('Multi-Linear_Regression Model Result:')
print(y_pred.shape)
print('---------------------------------------')
print('intercept: \n', model.intercept_)
print('Coefficients: \n', model.coef_)

Multi-Linear_Regression Model Result:
(5,)
---------------------------------------
intercept: 
 2.778238888888772
Coefficients: 
 [-11.62512608   0.95862902  -0.66515831  -0.27185754  -0.9046697
   3.19858058   2.475043    -2.25451351   0.40395543   0.72926297
   8.16887592  -0.60303027]


In [45]:
import sklearn.metrics as skm
print('MAE =', round(skm.mean_absolute_error(y_test, y_pred), 2))
print("MSE =", round(skm.mean_squared_error(y_test, y_pred), 2)) 
print("MAE =", round(skm.median_absolute_error(y_test, y_pred), 2)) 
print("EVS =", round(skm.explained_variance_score(y_test, y_pred), 2)) 
print("R2 score =", round(skm.r2_score(y_test, y_pred), 2))

MAE = 0.82
MSE = 0.86
MAE = 0.83
EVS = -0.13
R2 score = -1.79


In [53]:
X = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [54]:
sm_model = sm.OLS(y, X).fit()

In [55]:
predictions = sm_model.predict(X)

In [56]:
print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.887
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     4.569
Date:                Sun, 22 May 2022   Prob (F-statistic):             0.0265
Time:                        16:53:43   Log-Likelihood:                 1.2739
No. Observations:                  20   AIC:                             23.45
Df Residuals:                       7   BIC:                             36.40
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

# Regression model with original cleaned data - us_gas_data.csv

In [98]:
us_gas_data = pd.read_csv('Resources/cleaned_data/us_gas_data.csv')
us_gas_data

Unnamed: 0,Year,Month,Gas_Procution(Mmcf),Gas_Consumption(Mmcf),Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf),all_grades($/Gallon),regular($/Gallon),midgrade($/Gallon),premium($/Gallon),diesel($/Gallon),Volumn(Mmcf)
0,2022,2,2856356,3040029.0,5.62,8.22,259389,545563,3.611,3.517,3.939,4.210,4.032,5997164.0
1,2022,1,3180818,3591557.0,6.87,7.04,296179,610102,3.413,3.315,3.766,4.036,3.724,6653327.0
2,2021,12,3266272,2979653.0,4.74,7.40,252626,620886,3.406,3.307,3.771,4.034,3.641,7647859.0
3,2021,11,3161306,2659971.0,5.18,8.10,242405,556982,3.491,3.395,3.836,4.098,3.727,7971480.0
4,2021,10,3219612,2237715.0,4.79,7.97,228203,545055,3.384,3.291,3.723,3.979,3.612,8103211.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,2001,5,1763141,1522382.0,4.95,5.50,321878,28981,1.738,1.702,1.785,1.869,1.496,5749464.0
250,2001,4,1703310,1807170.0,5.35,5.65,318573,23637,1.591,1.552,1.646,1.732,1.422,5252851.0
251,2001,3,1766754,2246633.0,5.42,4.93,358103,32121,1.450,1.409,1.506,1.596,1.399,5041971.0
252,2001,2,1582557,2309464.0,6.45,5.80,328289,26882,1.490,1.450,1.544,1.635,1.492,5240820.0


In [100]:
#initialize X, y
X = us_gas_data.drop(columns = ['all_grades($/Gallon)'])
y = us_gas_data['all_grades($/Gallon)'].values.reshape(-1,1)

In [101]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [102]:
#Scale data
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()
X_train_s = data_scaler.fit_transform(X_train)
X_test_s = data_scaler.fit_transform(X_test)

In [103]:
#create instance of model
model = LinearRegression()

In [104]:
#train the model
model.fit(X_train_s, y_train)

LinearRegression()

In [105]:
#create prediction to test
y_pred = model.predict(X_train_s)

In [106]:
#results
print('Multi-Linear_Regression Model Result:')
print(y_pred.shape)
print('---------------------------------------')
print('intercept: \n', model.intercept_)
print('Coefficients: \n', model.coef_)

Multi-Linear_Regression Model Result:
(190, 1)
---------------------------------------
intercept: 
 [2.64834737]
Coefficients: 
 [[ 8.20380693e-03  1.58964056e-03 -6.65370864e-04 -6.87580030e-04
   5.83311572e-03 -4.69345287e-03 -9.77198795e-04  5.93297343e-04
   8.40873255e-01 -5.70915364e-01  4.53692822e-01 -7.27674098e-03
  -1.00277176e-03]]


In [107]:
X = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [108]:
sm_model = sm.OLS(y, X).fit()

In [109]:
predictions = sm_model.predict(X)

In [110]:
print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.892e+05
Date:                Sun, 22 May 2022   Prob (F-statistic):               0.00
Time:                        21:10:24   Log-Likelihood:                 950.71
No. Observations:                 254   AIC:                            -1873.
Df Residuals:                     240   BIC:                            -1824.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -3.16

# Regression with original data but without the dummies

In [46]:
us_gas_data_no_dummies = us_gas_data.drop(columns=["regular($/Gallon)","midgrade($/Gallon)","premium($/Gallon)",
                                                  "diesel($/Gallon)"])
us_gas_data_no_dummies

Unnamed: 0,Year,Month,Gas_Production(Mmcf),Gas_Consumption(Mmcf),Import_price($/Mcf),Export_price($/Mcf),Total Imports (Mmcf),Total Exports (Mmcf),all_grades($/Gallon),Volumn(Mmcf)
0,2022,2,2856356.0,3040029.0,5.62,8.22,259389.0,545563.0,3.611,5997164.0
1,2022,1,3180818.0,3591557.0,6.87,7.04,296179.0,610102.0,3.413,6653327.0
2,2021,12,3266272.0,2979653.0,4.74,7.40,252626.0,620886.0,3.406,7647859.0
3,2021,11,3161306.0,2659971.0,5.18,8.10,242405.0,556982.0,3.491,7971480.0
4,2021,10,3219612.0,2237715.0,4.79,7.97,228203.0,545055.0,3.384,8103211.0
...,...,...,...,...,...,...,...,...,...,...
249,2001,5,1763141.0,1522382.0,4.95,5.50,321878.0,28981.0,1.738,5749464.0
250,2001,4,1703310.0,1807170.0,5.35,5.65,318573.0,23637.0,1.591,5252851.0
251,2001,3,1766754.0,2246633.0,5.42,4.93,358103.0,32121.0,1.450,5041971.0
252,2001,2,1582557.0,2309464.0,6.45,5.80,328289.0,26882.0,1.490,5240820.0


In [49]:
#initialize X, y
X = us_gas_data_no_dummies.drop(columns = ['all_grades($/Gallon)'])
y = us_gas_data_no_dummies['all_grades($/Gallon)']

In [50]:
#Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [51]:
#Scale data
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()
X_train_s = data_scaler.fit_transform(X_train)
X_test_s = data_scaler.fit_transform(X_test)

In [52]:
#create instance of model
model = LinearRegression()

In [53]:
#train the model
model.fit(X_train_s, y_train)

LinearRegression()

In [54]:
#create prediction to test
y_pred = model.predict(X_test_s)

In [55]:
#results
print('Multi-Linear_Regression Model Result:')
print(y_pred.shape)
print('---------------------------------------')
print('intercept: \n', model.intercept_)
print('Coefficients: \n', model.coef_)

Multi-Linear_Regression Model Result:
(64,)
---------------------------------------
intercept: 
 2.6507999999999834
Coefficients: 
 [ 0.97432131  0.03026545 -0.32562536 -0.1117177   0.25085793  0.08023885
 -0.12294204 -0.27945977  0.0487995 ]


In [56]:
X = sm.add_constant(X)

  x = pd.concat(x[::order], 1)


In [57]:
sm_model = sm.OLS(y, X).fit()

In [58]:
predictions = sm_model.predict(X)

In [59]:
print(sm_model.summary())

                             OLS Regression Results                             
Dep. Variable:     all_grades($/Gallon)   R-squared:                       0.539
Model:                              OLS   Adj. R-squared:                  0.522
Method:                   Least Squares   F-statistic:                     31.68
Date:                  Wed, 25 May 2022   Prob (F-statistic):           2.09e-36
Time:                          20:25:09   Log-Likelihood:                -177.70
No. Observations:                   254   AIC:                             375.4
Df Residuals:                       244   BIC:                             410.8
Df Model:                             9                                         
Covariance Type:              nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const     