In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [24]:
housing = fetch_california_housing()
print(housing.data.shape)
print(housing.data)


(20640, 8)
[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]


# convert to pd dataframe, data = housing.data, columns = housing.feature_names

In [27]:
df = pd.DataFrame(housing.data, columns=housing.feature_names)
print(df.head())
df["target"]=housing.target
print(df.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85 

# create the feature matrix, and target vector



In [86]:
X = df.loc[:,["AveOccup","MedInc","Latitude","Longitude","HouseAge","AveRooms","AveBedrms","Population"]].values
X.shape

(20640, 8)

In [87]:
y = df.loc[:,"target"].values
y.shape

(20640,)

# now split the data into train and test


In [88]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=3)

# create model instance and train

In [89]:
reg = LinearRegression(fit_intercept=True)
reg.fit(X_train,y_train)

LinearRegression()

# check score, here R squared

In [90]:
score = reg.score(X_train,y_train)
print(score)

0.6082313600304428


In [91]:
coefs = reg.coef_
print(coefs)

[-3.43119685e-03  4.49450628e-01 -4.20976916e-01 -4.35175412e-01
  1.00707075e-02 -1.26928942e-01  8.15851637e-01 -1.18106966e-06]


In [92]:
intercept = reg.intercept_
print(intercept)

-37.18539362226713


In [93]:
test_score = reg.score(X_test,y_test)
print(test_score)

0.5945098144732655


In [94]:
print("target = {:.2f}* AveOccup + {:.2f}*MedInc + {:.2f}*Latitude + {:.2f}*Longitude + {:.2f}*HouseAge + {:.2f}*AveRooms + {:.2f}*AveBedrms + {:.2f}*Population".format(coefs[0],coefs[1],coefs[2],coefs[3],coefs[4],coefs[5],coefs[6],coefs[7]) )

target = -0.00* AveOccup + 0.45*MedInc + -0.42*Latitude + -0.44*Longitude + 0.01*HouseAge + -0.13*AveRooms + 0.82*AveBedrms + -0.00*Population


In [95]:
bias = reg.predict(X_test) - y_test

In [100]:
print(bias.sum())
print(bias.mean())
print(bias.max())
print(bias.min())
import numpy as np

np.median(bias)

23.84470828383406
0.004621067496867066
10.495431323156218
-3.8895311327700064


0.12459734487863339

In [102]:
print(y_test.mean())
print(np.median(y_test))

2.0615421279069768
1.7835
