In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [68]:
# Load the housing dataset from CSV (path is relative to the working directory).
df = pd.read_csv('data/housing.csv')

In [69]:
# Features are every column except the target. Select by name rather than by
# position: a later cell appends a PREDICTION column to df, so "all but the
# last column" would pull PRICE into the features if this cell were re-run.
# errors='ignore' lets the same line work before PREDICTION exists.
X = df.drop(columns=['PRICE', 'PREDICTION'], errors='ignore')
# Target: the house price.
y = df['PRICE']

In [70]:
# Preview the first rows of the raw (unscaled) feature matrix.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [71]:
# Standardize each feature column: subtract the column mean, then divide by
# the column standard deviation (pandas' default sample std, ddof=1).
# NOTE(review): these statistics use every row, including rows that the later
# train_test_split call assigns to the test set - confirm that is intended.
X = X.sub(X.mean(), axis='columns')
X = X.div(X.std(), axis='columns')
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419367,0.284548,-1.286636,-0.272329,-0.144075,0.413263,-0.119895,0.140075,-0.981871,-0.665949,-1.457558,0.440616,-1.074499
1,-0.416927,-0.48724,-0.592794,-0.272329,-0.73953,0.194082,0.366803,0.556609,-0.867024,-0.986353,-0.302794,0.440616,-0.491953
2,-0.416929,-0.48724,-0.592794,-0.272329,-0.73953,1.281446,-0.265549,0.556609,-0.867024,-0.986353,-0.302794,0.396035,-1.207532
3,-0.416338,-0.48724,-1.305586,-0.272329,-0.834458,1.015298,-0.809088,1.076671,-0.752178,-1.105022,0.11292,0.415751,-1.360171
4,-0.412074,-0.48724,-1.305586,-0.272329,-0.834458,1.227362,-0.510674,1.076671,-0.752178,-1.105022,0.11292,0.440616,-1.025487


In [72]:
# Linear regression estimator with scikit-learn's default settings.
lr = LinearRegression()

In [73]:
# Fit on the full standardized dataset (no train/test split at this point).
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [74]:
# In-sample R^2: the model is scored on the same rows it was fit on.
lr.score(X, y)

0.7406426641094095

In [75]:
def simple_scale(X):
    """Standardize ``X`` to zero mean and unit standard deviation.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Numeric data to scale. The input object is not modified.

    Returns
    -------
    Same type as ``X``
        The centered data divided by its standard deviation (pandas' default
        sample standard deviation, ``ddof=1``), computed per column for a
        DataFrame. A column whose standard deviation is exactly zero is
        returned as its centered values (all zeros) instead of NaN.
    """
    # Subtraction builds a new object, so the caller's data is left untouched.
    centered = X - X.mean()
    scale = centered.std()
    # A constant column has std == 0 and would turn into NaN (0/0) when
    # divided; use a divisor of 1 there so it stays at its centered value.
    if isinstance(scale, pd.Series):
        scale = scale.replace(0, 1)
    elif scale == 0:
        scale = 1
    return centered / scale

In [76]:
# Sanity check of the standardization: each column should show mean ~0 and std 1.
X.describe().round(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.419,-0.487,-1.556,-0.272,-1.464,-3.876,-2.333,-1.266,-0.982,-1.313,-2.705,-3.903,-1.53
25%,-0.411,-0.487,-0.867,-0.272,-0.912,-0.568,-0.837,-0.805,-0.637,-0.767,-0.488,0.205,-0.799
50%,-0.39,-0.487,-0.211,-0.272,-0.144,-0.108,0.317,-0.279,-0.522,-0.464,0.275,0.381,-0.181
75%,0.007,0.049,1.015,-0.272,0.598,0.482,0.906,0.662,1.66,1.529,0.806,0.433,0.602
max,9.924,3.8,2.42,3.665,2.73,3.552,1.116,3.957,1.66,1.796,1.637,0.441,3.545


In [77]:
def coefs_df(cols, coefs):
    """Return a one-column DataFrame of ``coefs`` indexed by ``cols``.

    The single column is named 'Coefficients'; each row label comes from
    the corresponding entry of ``cols``.
    """
    data = {'Coefficients': coefs}
    return pd.DataFrame(data, index=cols)

In [78]:
# Label each fitted coefficient with the feature column it belongs to.
coefs = coefs_df(cols=X.columns, coefs=lr.coef_)

In [79]:
# Transpose so the coefficients display as one wide row, one column per feature.
coefs.T

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
Coefficients,-0.929065,1.082639,0.141039,0.682414,-2.058754,2.676877,0.019485,-3.107116,2.664852,-2.078837,-2.062646,0.850109,-3.747332


In [80]:
# Linear part of the prediction only: standardized features times the fitted
# coefficients. The model was fit with an intercept, so lr.predict(X) equals
# these values plus lr.intercept_.
X @ coefs

Unnamed: 0,Coefficients
0,7.471037
1,2.492756
2,8.034790
3,6.074230
4,5.410718
...,...
501,1.000534
502,-0.157087
503,5.094620
504,3.595160


In [81]:
# Shape check: X is (n_samples, n_features) and coefs.T is (1, n_features),
# so X @ coefs produces one value per sample.
X.shape, coefs.T.shape

((506, 13), (1, 13))

In [82]:
# Store the model's predictions alongside the actual prices.
# Note: this adds a PREDICTION column to df in place, after PRICE.
df['PREDICTION'] = lr.predict(X)

In [83]:
# lr.predict(X)

In [84]:
# Residual sum of squares of the fitted model: sum of (actual - predicted)^2.
model_cost = (df['PRICE'] - df['PREDICTION']).pow(2).sum()

In [85]:
# Total sum of squares: the cost of always predicting the mean price.
naive_cost = (df['PRICE'] - df['PRICE'].mean()).pow(2).sum()

In [86]:
# R^2 = 1 - SS_res / SS_tot, computed by hand; it matches the value returned
# by lr.score(X, y) earlier in the notebook.
1 - (model_cost / naive_cost)

0.7406426641094095

In [87]:
from sklearn.model_selection import train_test_split

In [88]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [89]:
# Preview the training features; the row labels are out of order because the
# split shuffles the rows.
X_train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
192,-0.410029,1.442231,-1.121922,-0.272329,-1.015684,1.271483,-1.501833,1.274989,-0.522484,-0.060741,-1.503749,0.370404,-1.369973
138,-0.39106,-0.48724,1.567444,-0.272329,0.598087,-0.608631,1.052444,-1.009846,-0.637331,0.170662,1.267684,0.387382,1.213676
251,-0.395212,0.456057,-0.76917,-0.272329,-1.067462,0.218278,-2.119976,1.710424,-0.292791,-0.464213,0.297683,0.223408,-1.269148
13,-0.346887,-0.48724,-0.436826,-0.272329,-0.144075,-0.477692,-0.240681,0.433325,-0.637331,-0.600682,1.175303,0.440616,-0.615184
256,-0.418314,3.371702,-1.076734,-0.272329,-1.386765,1.6643,-1.221183,1.206746,-0.752178,-0.974487,-1.180415,0.324947,-1.336365


In [90]:
# Refit on the training split only.
# NOTE(review): this reuses the same `lr` object, so the earlier full-data fit
# is overwritten; `coefs` and df['PREDICTION'] computed above still reflect
# the previous fit.
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [91]:
# Coefficients of the model refit on the training split, one per feature
# column.
lr.coef_

array([-0.99749067,  1.0990688 ,  0.05666515,  0.82142824, -1.93360315,
        2.72903982, -0.30675065, -3.24552134,  2.5530408 , -2.25939818,
       -1.9620824 ,  0.80414656, -3.26863414])

In [92]:
# Out-of-sample R^2 on the held-out test rows.
lr.score(X_test, y_test)

0.7263451459702508