In [290]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm

In [291]:
advert = pd.read_csv('Advertising.csv', index_col='Unnamed: 0')
advert.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [292]:
#cleaning
print(advert.TV.unique())
print(advert.radio.unique())
#no na values (excel sort&filter also helps)

[230.1  44.5  17.2 151.5 180.8   8.7  57.5 120.2   8.6 199.8  66.1 214.7
  23.8  97.5 204.1 195.4  67.8 281.4  69.2 147.3 218.4 237.4  13.2 228.3
  62.3 262.9 142.9 240.1 248.8  70.6 292.9 112.9  97.2 265.6  95.7 290.7
 266.9  74.7  43.1 228.  202.5 177.  293.6 206.9  25.1 175.1  89.7 239.9
 227.2  66.9 100.4 216.4 182.6 262.7 198.9   7.3 136.2 210.8 210.7  53.5
 261.3 239.3 102.7 131.1  69.   31.5 139.3 216.8 199.1 109.8  26.8 129.4
 213.4  16.9  27.5 120.5   5.4 116.   76.4 239.8  75.3  68.4 213.5 193.2
  76.3 110.7  88.3 134.3  28.6 217.7 250.9 107.4 163.3 197.6 184.9 289.7
 135.2 222.4 296.4 280.2 187.9 238.2 137.9  25.   90.4  13.1 255.4 225.8
 241.7 175.7 209.6  78.2  75.1 139.2 125.7  19.4 141.3  18.8 224.  123.1
 229.5  87.2   7.8  80.2 220.3  59.6   0.7 265.2   8.4 219.8  36.9  48.3
  25.6 273.7  43.   73.4 193.7 220.5 104.6  96.2 140.3 243.2  38.   44.7
 280.7 121.  171.3 187.8   4.1  93.9 149.8  11.7 131.7 172.5  85.7 188.4
 163.5 117.2 234.5  17.9 206.8 215.4 284.3  50.  16

In [293]:
from sklearn.preprocessing import scale
for i in advert.columns:
    advert[i+'_s'] = scale(advert[i], with_mean=True, with_std=True)

In [294]:
advert = advert.drop(columns='sales_s')
advert.head()

Unnamed: 0,TV,radio,newspaper,sales,TV_s,radio_s,newspaper_s
1,230.1,37.8,69.2,22.1,0.969852,0.981522,1.778945
2,44.5,39.3,45.1,10.4,-1.197376,1.082808,0.669579
3,17.2,45.9,69.3,9.3,-1.516155,1.528463,1.783549
4,151.5,41.3,58.5,18.5,0.05205,1.217855,1.286405
5,180.8,10.8,58.4,12.9,0.394182,-0.841614,1.281802


In [295]:
#vif
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

advQuant = add_constant(advert.drop(columns=['TV', 'radio', 'newspaper', 'sales']))
vif = pd.DataFrame()
vif['Feature_vars'] = advQuant.columns
vif['VIF'] = [variance_inflation_factor(advQuant, i) for i in range (len(advQuant.columns))]
vif

Unnamed: 0,Feature_vars,VIF
0,const,1.0
1,TV_s,1.004611
2,radio_s,1.144952
3,newspaper_s,1.145187


In [296]:
#no columns dropped
fit1 = sm.ols('sales~TV_s+radio_s+newspaper_s', data=advert).fit()
fit1.summary() #newspaper insignificant

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Wed, 12 Apr 2023",Prob (F-statistic):,1.58e-96
Time:,13:02:03,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,14.0225,0.119,117.655,0.000,13.787,14.258
TV_s,3.9193,0.119,32.809,0.000,3.684,4.155
radio_s,2.7921,0.128,21.893,0.000,2.541,3.044
newspaper_s,-0.0225,0.128,-0.177,0.860,-0.274,0.229

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,1.46


In [297]:
fit2 = sm.ols('sales~TV_s+radio_s', data=advert).fit()
fit2.summary() #Rsq = 89.6

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,859.6
Date:,"Wed, 12 Apr 2023",Prob (F-statistic):,4.83e-98
Time:,13:02:03,Log-Likelihood:,-386.2
No. Observations:,200,AIC:,778.4
Df Residuals:,197,BIC:,788.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,14.0225,0.119,117.945,0.000,13.788,14.257
TV_s,3.9184,0.119,32.909,0.000,3.684,4.153
radio_s,2.7841,0.119,23.382,0.000,2.549,3.019

0,1,2,3
Omnibus:,60.022,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148.679
Skew:,-1.323,Prob(JB):,5.19e-33
Kurtosis:,6.292,Cond. No.,1.06


In [298]:
from sklearn.model_selection import train_test_split
np.random.seed(0)
trainIndex = np.random.choice(advert.shape[0], 160, replace=False)

In [299]:
advert.shape

(200, 7)

In [300]:
trainIndex

array([ 18, 170, 107,  98, 177, 182,   5, 146,  12, 152,  61, 125, 180,
       154,  80,   7,  33, 130,  37,  74, 183, 145,  45, 159,  60, 123,
       179, 185, 122,  44,  16,  55, 150, 111,  22, 189, 129,   4,  83,
       106, 134,  66,  26, 113, 168,  63,   8,  75, 118, 143,  71, 124,
       184,  97, 149,  24,  30, 160,  40,  56, 131,  96, 181,  19, 153,
        92,  54, 163,  51,  86, 139,  90, 137, 101, 144,  89, 109,  14,
        27, 141, 187,  46, 138, 195, 108,  62,   2,  59, 136, 197,  43,
        10, 194,  73, 196, 178, 175, 126,  93, 112, 158, 191,  50,   0,
        94, 110,  95,  64, 167,  41,  69,  49,  48,  85,  13, 161,  23,
       186, 135,  20,  15,  78, 104,  52, 100,  76,   3, 116, 164, 198,
         6,  68,  84, 121, 155, 171, 156,  91, 199,  11, 119, 102,  35,
        57,  65,   1, 120, 162,  42, 105, 132, 173,  17,  38, 133,  53,
       157, 128,  34,  28])

In [301]:
select = np.in1d(range(advert.shape[0]), trainIndex) #True = in training
training = advert.iloc[select]

In [302]:
testing = advert[~select]
print(training.head())
print(testing.head())

      TV  radio  newspaper  sales      TV_s   radio_s  newspaper_s
1  230.1   37.8       69.2   22.1  0.969852  0.981522     1.778945
2   44.5   39.3       45.1   10.4 -1.197376  1.082808     0.669579
3   17.2   45.9       69.3    9.3 -1.516155  1.528463     1.783549
4  151.5   41.3       58.5   18.5  0.052050  1.217855     1.286405
5  180.8   10.8       58.4   12.9  0.394182 -0.841614     1.281802
       TV  radio  newspaper  sales      TV_s   radio_s  newspaper_s
10  199.8    2.6       21.2   10.6  0.616043 -1.395307    -0.430582
22  237.4    5.1       23.5   12.5  1.055093 -1.226498    -0.324708
26  262.9    3.5       19.5   12.0  1.352854 -1.334536    -0.508836
30   70.6   16.0       40.8   10.5 -0.892610 -0.490491     0.471642
32  112.9   17.4       38.6   11.9 -0.398678 -0.395958     0.370372


In [303]:
fit3 = sm.ols('sales~TV_s+radio_s', data=training).fit()
fit3.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.902
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,722.8
Date:,"Wed, 12 Apr 2023",Prob (F-statistic):,6.2700000000000005e-80
Time:,13:02:04,Log-Likelihood:,-306.94
No. Observations:,160,AIC:,619.9
Df Residuals:,157,BIC:,629.1
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,14.0118,0.132,106.545,0.000,13.752,14.272
TV_s,4.0236,0.131,30.636,0.000,3.764,4.283
radio_s,2.6871,0.135,19.914,0.000,2.421,2.954

0,1,2,3
Omnibus:,57.251,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,159.363
Skew:,-1.454,Prob(JB):,2.48e-35
Kurtosis:,6.93,Cond. No.,1.09


In [304]:
#slope TV = 3.787
#slope radio = 2.81
#intercept = 14
#using it on testing x...

y_pred = 3.787*testing.TV_s + 2.81*testing.radio_s+14
ytest = testing.sales

In [305]:
np.sum(np.square(ytest-y_pred))

115.47040029994935