In [73]:
import numpy as np # math arrays manipulation
import pandas as pd # for data management
import os 
from matplotlib import pyplot as plt # for plot

In [74]:
path = os.getcwd() + "/Rogue_242.csv" # make sure you are in the correct directory
rogue = pd.read_csv(path)

In [75]:
rogue.info()
rogue.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MonthNumeric  162 non-null    int64  
 1   MonthFactor   162 non-null    object 
 2   Year          162 non-null    int64  
 3   RogueSales    162 non-null    int64  
 4   Unemployment  162 non-null    float64
 5   RogueQueries  162 non-null    int64  
 6   CPIAll        162 non-null    float64
 7   CPIEnergy     162 non-null    float64
dtypes: float64(3), int64(4), object(1)
memory usage: 10.2+ KB


Unnamed: 0,MonthNumeric,MonthFactor,Year,RogueSales,Unemployment,RogueQueries,CPIAll,CPIEnergy
0,1,January,2008,5435,5.0,18,212.174,226.775
1,2,February,2008,5223,4.9,11,212.687,229.731
2,3,March,2008,6873,5.1,18,213.448,233.349
3,4,April,2008,5814,5.0,17,213.942,234.778
4,5,May,2008,7467,5.4,17,215.208,243.924


In [76]:
rogue['MonthFactor'].unique()

array(['January', 'February', 'March', 'April', 'May', 'June', 'July',
       'August', 'Septeber', 'October', 'November', 'Decemeber'],
      dtype=object)

In [77]:
dummy = pd.get_dummies(rogue['MonthFactor'], drop_first=True)
rogue_encoded = pd.concat([rogue, dummy], axis=1).drop('MonthFactor', axis=1)

In [78]:
rogue_encoded.head()

Unnamed: 0,MonthNumeric,Year,RogueSales,Unemployment,RogueQueries,CPIAll,CPIEnergy,August,Decemeber,February,January,July,June,March,May,November,October,Septeber
0,1,2008,5435,5.0,18,212.174,226.775,0,0,0,1,0,0,0,0,0,0,0
1,2,2008,5223,4.9,11,212.687,229.731,0,0,1,0,0,0,0,0,0,0,0
2,3,2008,6873,5.1,18,213.448,233.349,0,0,0,0,0,0,1,0,0,0,0
3,4,2008,5814,5.0,17,213.942,234.778,0,0,0,0,0,0,0,0,0,0,0
4,5,2008,7467,5.4,17,215.208,243.924,0,0,0,0,0,0,0,1,0,0,0


In [79]:
import statsmodels.api as sm

rogue_train = rogue_encoded.sample(frac=0.8, random_state=1)
rogue_test = rogue_encoded.drop(rogue_train.index)

# Choose the features to be used
cols = rogue_encoded.columns.drop('RogueSales')
X_train = rogue_train[cols]
y_train = rogue_train['RogueSales']

# We must add an intercept as the standard model doesn't automatically fit one
X_train = sm.add_constant(X_train)

# fit the data to the model
model = sm.OLS(y_train, X_train).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             RogueSales   R-squared:                       0.856
Model:                            OLS   Adj. R-squared:                  0.836
Method:                 Least Squares   F-statistic:                     42.03
Date:                Sun, 02 Oct 2022   Prob (F-statistic):           5.37e-40
Time:                        22:11:32   Log-Likelihood:                -1261.7
No. Observations:                 130   AIC:                             2557.
Df Residuals:                     113   BIC:                             2606.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -1.399e+07   2.18e+06     -6.407   

In [80]:
coef_df = pd.DataFrame(model.summary().tables[1].data)
coef_df.columns = coef_df.iloc[0]
coef_df.index = coef_df[""]
coef_df = coef_df.iloc[1:,1:]
coef_df

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
,,,,,,
const,-13990000.0,2180000.0,-6.407,0.0,-18300000.0,-9660000.0
MonthNumeric,-254600.0,39800.0,-6.397,0.0,-333000.0,-176000.0
Year,7656.5576,1196.873,6.397,0.0,5285.337,10000.0
Unemployment,-1523.4585,244.798,-6.223,0.0,-2008.447,-1038.47
RogueQueries,202.0501,72.618,2.782,0.006,58.18,345.92
CPIAll,-1806.6805,338.857,-5.332,0.0,-2478.017,-1135.344
CPIEnergy,126.3576,34.947,3.616,0.0,57.122,195.594
August,1024000.0,159000.0,6.425,0.0,708000.0,1340000.0
Decemeber,2046000.0,319000.0,6.411,0.0,1410000.0,2680000.0


In [81]:
y_test = rogue_test['RogueSales']
X_test = rogue_test[cols]

X_test = sm.add_constant(X_test)

pred = model.predict(X_test)

In [82]:
RSS = 0
TSS = 0

y_values = y_test.values
pred_values = pred.values

def mean(L) :
    mean = 0
    n = len(L)
    for i in range(n) :
        mean += L[i]
    return mean/n

m_bar = mean(y_values)
m = len(pred_values)
for i in range(m) :
    d = pred_values[i] - y_values[i]
    t = pred_values[i] - m_bar
    RSS += d*d
    TSS += t*t

print('OSR is {}'.format(1-RSS/TSS*(m-1)/(m-len(cols)-1)))
    



OSR is 0.6040900300737349


In [83]:

from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = [variance_inflation_factor(X_train.values, i) for i in range(1, X_train.shape[1])] 
vif

  vif = 1. / (1. - r_squared_i)


[inf,
 152.92456280194384,
 2.149760490544606,
 25.23648034074959,
 187.10095757226904,
 4.497812729527373,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf,
 inf]

In [84]:
#After computing VIF I droped Year
cols_adjusted = cols.drop('MonthNumeric')

In [85]:
cols_adjusted_2 = cols_adjusted.drop('Year')

In [86]:
X_train = rogue_train[cols_adjusted_2]
X_train = sm.add_constant(X_train)
vif = [variance_inflation_factor(X_train.values, i) for i in range(1, X_train.shape[1])]
vif

[1.9885657223787512,
 23.687256010779503,
 18.781268300733856,
 1.5208798815279958,
 1.605535164255354,
 1.761247945188605,
 1.6081244337029712,
 1.6653856758687586,
 1.9267117664608824,
 1.8032789635381177,
 1.617558053644134,
 1.864605665325457,
 1.709612060500973,
 1.6475760547879386,
 1.5266262008226623]

In [87]:
# Let's try with this training set :

model = sm.OLS(y_train, X_train).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             RogueSales   R-squared:                       0.804
Model:                            OLS   Adj. R-squared:                  0.778
Method:                 Least Squares   F-statistic:                     31.18
Date:                Sun, 02 Oct 2022   Prob (F-statistic):           2.35e-33
Time:                        22:11:42   Log-Likelihood:                -1281.7
No. Observations:                 130   AIC:                             2595.
Df Residuals:                     114   BIC:                             2641.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -2.113e+04   2.33e+04     -0.907   

In [89]:
y_test = rogue_test['RogueSales']
X_test = rogue_test[cols_adjusted_2]

X_test = sm.add_constant(X_test)

pred = model.predict(X_test)


RSS = 0
TSS = 0

y_values = y_test.values
pred_values = pred.values

def mean(L) :
    mean = 0
    n = len(L)
    for i in range(n) :
        mean += L[i]
    return mean/n

m_bar = mean(y_values)
m = len(pred_values)
for i in range(m) :
    d = pred_values[i] - y_values[i]
    t = pred_values[i] - m_bar
    RSS += d*d
    TSS += t*t

print('OSR is {}'.format(1-RSS/TSS))
    

OSR is 0.727047020042509


In [226]:
rogue_train['CPIA/CPIE'] = rogue_train['CPIAll']-rogue_train['CPIEnergy']

In [300]:
cols = rogue_train.columns
new_cols = cols.drop(['CPIA/CPIE','Decemeber','March','MonthNumeric','January','February','May','June','July','August','Septeber','November','October','CPIAll','Unemployment','RogueSales'])

In [301]:
X_train = rogue_train[new_cols]
vif = [variance_inflation_factor(X_train.values, i) for i in range(1, X_train.shape[1])] 
vif

[4.317954428179216, 100.39309647586295]

In [302]:
model = sm.OLS(y_train, X_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,RogueSales,R-squared (uncentered):,0.925
Model:,OLS,Adj. R-squared (uncentered):,0.924
Method:,Least Squares,F-statistic:,524.6
Date:,"Sun, 02 Oct 2022",Prob (F-statistic):,2.44e-71
Time:,23:41:05,Log-Likelihood:,-1313.7
No. Observations:,130,AIC:,2633.0
Df Residuals:,127,BIC:,2642.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,3.5243,2.781,1.267,0.207,-1.978,9.026
RogueQueries,330.3519,21.270,15.531,0.000,288.262,372.442
CPIEnergy,-12.7638,24.048,-0.531,0.597,-60.351,34.824

0,1,2,3
Omnibus:,4.919,Durbin-Watson:,2.094
Prob(Omnibus):,0.085,Jarque-Bera (JB):,4.704
Skew:,0.32,Prob(JB):,0.0952
Kurtosis:,3.677,Cond. No.,101.0


In [303]:
y_test = rogue_test['RogueSales']
X_test = rogue_test[new_cols]
pred = model.predict(X_test)


RSS = 0
TSS = 0

y_values = y_test.values
pred_values = pred.values

m_bar = mean(y_values)
m = len(pred_values)
for i in range(m) :
    d = pred_values[i] - y_values[i]
    t = pred_values[i] - m_bar
    RSS += d*d
    TSS += t*t

print('OSR is {}'.format(1-RSS/TSS))

OSR is 0.6789134637663196


In [295]:
col = rogue.columns
rogue_normed = rogue
for i in range(len(col)) :
    rogue_normed[col] = (rogue[col]-rogue[col].mean())/np.sqrt(rogue[col].var())
rogue_normed.cov()

Unnamed: 0,MonthNumeric,MonthFactor,RogueSales,Unemployment,RogueQueries,CPIAll,CPIEnergy
MonthNumeric,1.0,-0.166129,0.018573,0.951788,0.776689,-0.520535,0.985993
MonthFactor,-0.166129,1.0,0.023861,-0.284251,-0.281817,0.063876,-0.288663
RogueSales,0.018573,0.023861,1.0,-0.027966,0.01025,0.005178,-0.055626
Unemployment,0.951788,-0.284251,-0.027966,1.0,0.834824,-0.606273,0.955366
RogueQueries,0.776689,-0.281817,0.01025,0.834824,1.0,-0.708006,0.806593
CPIAll,-0.520535,0.063876,0.005178,-0.606273,-0.708006,1.0,-0.519502
CPIEnergy,0.985993,-0.288663,-0.055626,0.955366,0.806593,-0.519502,1.0
