In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [3]:
# Load the advertising data and discard the 'Unnamed: 0' column (the name
# pandas gives to a column whose header is blank). Chaining the drop keeps the
# load a single expression instead of mutating the frame in place.
df = (
    pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv')
    .drop('Unnamed: 0', axis=1)
)
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [5]:
# Predictor columns and the regression target used by every model below.
features = ['TV', 'radio', 'newspaper']
X, y = df[features], df['sales']

In [6]:
# Baseline: fit on every row and report R^2 on those same rows
# (an in-sample score, not an estimate of generalisation).
lr = LinearRegression().fit(X, y)
lr.score(X=X, y=y)

0.89721063817895208

In [8]:
# Hold out a test set using the library's default proportions; the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

In [9]:
# Refit on the training split only, then report the training R^2.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X=X_train, y=y_train)

0.89664455276014987

In [10]:
# R^2 on the held-out test split.
lr.score(X=X_test, y=y_test)

0.89351633201636571

In [15]:
# Predict sales for the held-out rows.
predictions = lr.predict(X_test)

# R^2 computed directly from the predictions. `r2_score` is imported in the
# notebook's top import cell rather than here, so every dependency is declared
# in one place.
r2_score(y_test, predictions)

In [21]:
# Work on an independent copy so X_test itself is never modified.
submission = X_test.copy(deep=True)

In [24]:
# Attach the row label and the model's prediction as new columns, then
# preview just those two columns.
submission = submission.assign(
    Id=submission.index,
    Prediction=predictions,
)
submission.loc[:, ['Id', 'Prediction']].head()

Unnamed: 0,Id,Prediction
95,95,16.383482
15,15,20.92435
30,30,21.614954
158,158,10.4907
128,128,22.176905


In [29]:
# Three-fold cross-validated R^2 of a fresh linear model on the full data.
cross_val_score(
    LinearRegression(),
    X,
    y,
    cv=3,
)

array([ 0.90438089,  0.86536009,  0.9051984 ])

In [33]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [49]:
# Manual 3-fold cross-validation, written out to show what cross_val_score
# does when given this splitter.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# KFold.split yields *positional* row indices, so rows are selected with
# .iloc. Label-based selection (.loc / y[...]) only gives the same rows while
# the index happens to be the default 0..n-1 range, and would silently pick
# the wrong rows (or raise) after any filtering or re-indexing.
for train, test in kf.split(X, y):
    X_train_fold = X.iloc[train]
    X_test_fold = X.iloc[test]
    y_train_fold = y.iloc[train]
    y_test_fold = y.iloc[test]

    # Fit a fresh model per fold and report its R^2 on the held-out fold.
    lr_fold = LinearRegression()
    lr_fold.fit(X_train_fold, y_train_fold)
    print(lr_fold.score(X_test_fold, y_test_fold))

0.860687654909
0.930825782529
0.873980909944


In [50]:
# Same evaluation via cross_val_score, reusing the shuffled KFold splitter.
cross_val_score(
    LinearRegression(),
    X,
    y,
    cv=kf,
)

array([ 0.86068765,  0.93082578,  0.87398091])

# Ideal workflow

In [53]:
# Step 1: split first, so every later fitting step sees training rows only.
# Fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

In [60]:
# Step 2: learn the scaling statistics from the training split only, then
# apply that same fitted transform to both splits.
ss = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = ss.transform(X_train), ss.transform(X_test)

In [62]:
# Mean cross-validated R^2 on the scaled training split.
# cv is passed explicitly because scikit-learn's default fold count changed
# between releases (3 -> 5 in 0.22), which made this number depend on the
# installed version. 3 matches the other cross-validation cells in this
# notebook; presumably it was also the default in effect when the recorded
# output was produced -- confirm if that output must be reproduced exactly.
cross_val_score(LinearRegression(), X_train_scaled, y_train, cv=3).mean()

0.88362229974667927

In [65]:
# Step 3: fit the final model on the scaled training data and report its
# training R^2.
lr = LinearRegression().fit(X_train_scaled, y_train)
lr.score(X=X_train_scaled, y=y_train)

0.89664455276014976

In [66]:
# Step 4: R^2 on the test split, scaled with the training-set statistics.
lr.score(X=X_test_scaled, y=y_test)

0.89351633201636593

In [69]:
# Standardise the TV column by hand. Both splits are centred and scaled with
# the *training* mean and standard deviation, so no test-set information is
# used. The statistics are computed once and reused.
# NOTE: Series.std() defaults to the sample standard deviation (ddof=1),
# whereas StandardScaler divides by the population standard deviation
# (ddof=0), so these values differ slightly from the scaler's output.
tv_mean = X_train['TV'].mean()
tv_std = X_train['TV'].std()

# Bind both results to names: a bare expression that is not the last line of
# a cell is evaluated and then thrown away.
tv_train_scaled = (X_train['TV'] - tv_mean) / tv_std
tv_test_scaled = (X_test['TV'] - tv_mean) / tv_std

tv_test_scaled

114   -0.878487
173    0.181460
5     -1.695186
126   -1.705762
117   -0.899639
73    -0.276832
140   -0.934892
98     1.606865
172   -1.567100
96     0.524591
169    1.543409
97     0.375353
31    -0.470724
12    -1.517745
35     1.618616
119   -1.569450
42     1.652694
189   -1.577676
90    -0.219252
136   -1.496593
51    -0.617613
127   -0.854985
162    0.416482
41     0.282519
118   -0.320311
113    0.665604
26    -0.118192
139    0.375353
100    0.816018
111    1.042814
         ...   
48     0.872423
88    -0.759801
21     0.992284
57    -0.196924
160    0.229639
192   -1.595302
129   -1.097057
37    -0.919615
157   -0.037110
193    0.162658
1     -1.274498
52     0.745512
149   -1.272148
130   -1.789195
151   -0.375541
103    0.410606
99    -0.208676
116   -0.161671
87    -0.496577
74     0.710258
121   -1.576500
199    0.930003
20     0.769014
188    1.563386
71    -0.507153
106   -1.503644
14     0.600973
92     0.760788
179    0.148557
102    1.495230
Name: TV, Length: 150, d