In [1]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

## Models:
* Sales ~ TV
* Sales ~ Newspaper
* Sales ~ Radio
* Sales ~ TV + Newspaper
* Sales ~ Newspaper + Radio
* Sales ~ TV + Radio
* Sales ~ TV + Newspaper + Radio

## First model: $\text{Sales} = \beta_0 + \beta_1 \cdot \text{TV}$

In [2]:
# Read the advertising dataset; the path is relative to the notebook's directory.
data = pd.read_csv("../../datasets/ads/Advertising.csv")

# Ordinary least squares fit of Sales on the single predictor TV.
lm = smf.ols(
    formula="Sales~TV",
    data=data,
).fit()

In [3]:
# Estimated coefficients of the Sales~TV fit (intercept and TV slope).
lm.params

Intercept    7.032594
TV           0.047537
dtype: float64

### The model is:    Sales = 7.032594 + 0.047537 * TV

In [4]:
# NOTE(review): this repeats the `lm.params` display of the previous cell and
# looks redundant — confirm before removing.
lm.params

Intercept    7.032594
TV           0.047537
dtype: float64

In [5]:
# p-value of each coefficient of the Sales~TV fit.
lm.pvalues

Intercept    1.406300e-35
TV           1.467390e-42
dtype: float64

In [6]:
# R-squared of the Sales~TV fit.
lm.rsquared

0.611875050850071

## Second model: $\text{Sales} = \beta_0 + \beta_1 \cdot \text{TV} + \beta_2 \cdot \text{Newspaper}$

In [7]:
# Ordinary least squares fit of Sales on two predictors: TV and Newspaper.
lm2 = smf.ols(
    formula="Sales~TV + Newspaper",
    data=data,
).fit()

In [8]:
# Estimated coefficients of the TV + Newspaper fit (intercept and one slope per predictor).
lm2.params

Intercept    5.774948
TV           0.046901
Newspaper    0.044219
dtype: float64

### The model is:    Sales = 5.774948 + 0.046901 * TV + 0.044219 * Newspaper

In [9]:
# p-value of each coefficient of the TV + Newspaper fit.
lm2.pvalues

Intercept    3.145860e-22
TV           5.507584e-44
Newspaper    2.217084e-05
dtype: float64

In [10]:
# R-squared of the TV + Newspaper fit.
lm2.rsquared

0.6458354938293271

In [11]:
# Adjusted R-squared of the TV + Newspaper fit (reported as "Adj. R-squared" by summary()).
lm2.rsquared_adj

0.6422399150864777

In [12]:
# In-sample predictions of the TV + Newspaper model, passing only its two predictor columns.
sales_pred = lm2.predict(data.loc[:, ["TV", "Newspaper"]])

In [13]:
# Sum of squared differences between the observed Sales and the model's predictions.
# Series.sum() does the reduction in vectorized code instead of looping over the
# elements with Python's built-in sum(); skipna=False keeps the built-in's behavior
# of yielding NaN if any residual is missing rather than silently dropping it.
SSD = ((data["Sales"] - sales_pred) ** 2).sum(skipna=False)

In [14]:
# Residual standard error: square root of SSD divided by the residual degrees of freedom.
# lm2.df_resid (197 in the model summary) replaces the hard-coded `len(data) - 3`,
# so the divisor stays correct if the formula gains or loses terms.
RSE = np.sqrt(SSD / lm2.df_resid)

In [15]:
# Display the residual standard error computed in the previous cell.
RSE

3.120719860252886

In [16]:
# Relative error: the residual standard error as a fraction of the mean observed Sales.
error = RSE / data["Sales"].mean()
error

0.22255089037282122

In [17]:
# Full regression report for the TV + Newspaper fit (fit statistics, coefficient table, diagnostics).
lm2.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.646
Model:,OLS,Adj. R-squared:,0.642
Method:,Least Squares,F-statistic:,179.6
Date:,"Thu, 28 Jun 2018",Prob (F-statistic):,3.9499999999999996e-45
Time:,21:39:12,Log-Likelihood:,-509.89
No. Observations:,200,AIC:,1026.0
Df Residuals:,197,BIC:,1036.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.7749,0.525,10.993,0.000,4.739,6.811
TV,0.0469,0.003,18.173,0.000,0.042,0.052
Newspaper,0.0442,0.010,4.346,0.000,0.024,0.064

0,1,2,3
Omnibus:,0.658,Durbin-Watson:,1.969
Prob(Omnibus):,0.72,Jarque-Bera (JB):,0.415
Skew:,-0.093,Prob(JB):,0.813
Kurtosis:,3.122,Cond. No.,410.0


## Third model: $\text{Sales} = \beta_0 + \beta_1 \cdot \text{TV} + \beta_2 \cdot \text{Newspaper} + \beta_3 \cdot \text{Radio}$

In [18]:
# Ordinary least squares fit of Sales on all three predictors: TV, Newspaper and Radio.
lm3 = smf.ols(
    formula="Sales~TV + Newspaper + Radio",
    data=data,
).fit()

In [19]:
# Estimated coefficients of the TV + Newspaper + Radio fit.
lm3.params

Intercept    2.938889
TV           0.045765
Newspaper   -0.001037
Radio        0.188530
dtype: float64

### The model is:    Sales = 2.938889 + 0.045765 * TV - 0.001037 * Newspaper + 0.188530 * Radio

In [20]:
# p-value of each coefficient of the TV + Newspaper + Radio fit.
lm3.pvalues

Intercept    1.267295e-17
TV           1.509960e-81
Newspaper    8.599151e-01
Radio        1.505339e-54
dtype: float64

In [21]:
# Full regression report for the TV + Newspaper + Radio fit.
lm3.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Thu, 28 Jun 2018",Prob (F-statistic):,1.58e-96
Time:,21:39:12,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
Newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011
Radio,0.1885,0.009,21.893,0.000,0.172,0.206

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [22]:
# In-sample predictions of the three-predictor model, passing only its predictor columns.
sales_predict = lm3.predict(data.loc[:, ["TV", "Newspaper", "Radio"]])

In [23]:
# Sum of squared differences between the observed Sales and the three-predictor model's predictions.
# Series.sum() does the reduction in vectorized code instead of looping over the
# elements with Python's built-in sum(); skipna=False keeps the built-in's behavior
# of yielding NaN if any residual is missing rather than silently dropping it.
SSD = ((data["Sales"] - sales_predict) ** 2).sum(skipna=False)

In [24]:
# Residual standard error: square root of SSD divided by the residual degrees of freedom.
# lm3.df_resid (196 in the model summary) replaces the hard-coded `len(data) - 4`,
# so the divisor stays correct if the formula gains or loses terms.
RSE = np.sqrt(SSD / lm3.df_resid)

In [25]:
# Relative error: the residual standard error as a fraction of the mean observed Sales.
error = RSE / data["Sales"].mean()
error

0.1202004188564624

## Fourth model: $\text{Sales} = \beta_0 + \beta_1 \cdot \text{TV} + \beta_2 \cdot \text{Radio}$

In [26]:
# Ordinary least squares fit of Sales on TV and Radio (Newspaper left out).
lm4 = smf.ols(
    formula="Sales~TV + Radio",
    data=data,
).fit()

In [27]:
# Estimated coefficients of the TV + Radio fit.
lm4.params

Intercept    2.921100
TV           0.045755
Radio        0.187994
dtype: float64

### The model is:    Sales = 2.921100 + 0.045755 * TV  + 0.187994 * Radio

In [28]:
# Full regression report for the TV + Radio fit.
lm4.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,859.6
Date:,"Thu, 28 Jun 2018",Prob (F-statistic):,4.83e-98
Time:,21:39:13,Log-Likelihood:,-386.2
No. Observations:,200,AIC:,778.4
Df Residuals:,197,BIC:,788.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9211,0.294,9.919,0.000,2.340,3.502
TV,0.0458,0.001,32.909,0.000,0.043,0.048
Radio,0.1880,0.008,23.382,0.000,0.172,0.204

0,1,2,3
Omnibus:,60.022,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148.679
Skew:,-1.323,Prob(JB):,5.19e-33
Kurtosis:,6.292,Cond. No.,425.0


## Multicollinearity

In [29]:
# Pairwise correlations between the dataset's columns, as a first look at
# how strongly the predictors are related to each other.
data.corr()

Unnamed: 0,TV,Radio,Newspaper,Sales
TV,1.0,0.054809,0.056648,0.782224
Radio,0.054809,1.0,0.354104,0.576223
Newspaper,0.056648,0.354104,1.0,0.228299
Sales,0.782224,0.576223,0.228299,1.0


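 Auxiliary regressions used to measure multicollinearity: each predictor is regressed on the other two, and the resulting $R^2$ gives its variance inflation factor, $VIF = 1/(1-R^2)$:
 * Newspaper ~ TV + Radio -> $R^2$ -> $VIF = 1/(1-R^2)$
 * TV ~ Newspaper + Radio -> $R^2$ -> $VIF = 1/(1-R^2)$
 * Radio ~ Newspaper + TV -> $R^2$ -> $VIF = 1/(1-R^2)$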

* $VIF = 1$: the predictor is not correlated with the other predictors
* $1 < VIF < 5$: moderate correlation
* $VIF > 5$: high correlation; one of the correlated variables should be removed from the model

In [40]:
# Auxiliary regressions: each predictor is regressed on the other two predictors.
lm_news = smf.ols(
    formula="Newspaper~TV + Radio",
    data=data,
).fit()
lm_tv = smf.ols(
    formula="TV~Radio + Newspaper",
    data=data,
).fit()
lm_radio = smf.ols(
    formula="Radio~TV + Newspaper",
    data=data,
).fit()

In [41]:
# R-squared of each auxiliary regression, in the order Newspaper, TV, Radio.
r2_news, r2_tv, r2_radio = (
    aux_fit.rsquared for aux_fit in (lm_news, lm_tv, lm_radio)
)

In [42]:
# One R-squared per line: Newspaper, TV, Radio.
print(r2_news, r2_tv, r2_radio, sep="\n")

0.12678045656223502
0.00458962317423961
0.1266008772420567


In [43]:
# Variance inflation factor of each predictor: 1 / (1 - R^2) of its auxiliary regression.
VIF_news, VIF_tv, VIF_radio = (
    1 / (1 - r2) for r2 in (r2_news, r2_tv, r2_radio)
)

In [44]:
# One VIF per line: Newspaper, TV, Radio.
print(VIF_news, VIF_tv, VIF_radio, sep="\n")

1.1451873787239288
1.0046107849396502
1.1449519171055353
