In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [21]:
#load the dataset 
from sklearn.datasets import fetch_california_housing

In [22]:
dataset = fetch_california_housing()

In [23]:
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [24]:
dataset.data

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [25]:
d1 = pd.DataFrame(dataset.data,columns=dataset.feature_names)
d2 = pd.DataFrame(dataset.target,columns=['price'])

In [26]:
d1

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [27]:
d2

Unnamed: 0,price
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


In [28]:
df = pd.concat([d1,d2],axis=1)

In [29]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [33]:
#divided dependent and independent feature
x = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [34]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [35]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: price, Length: 20640, dtype: float64

In [87]:
#train_test_split dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=0)

In [88]:
#check x dataset shape
x_train.shape,x_test.shape

((14448, 8), (6192, 8))

In [89]:
#check shape of y dataset
y_train.shape,y_test.shape

((14448,), (6192,))

In [90]:
#scale the tarin dataset
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

In [91]:
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

In [92]:
#model train
from sklearn.linear_model import LinearRegression
regrassor = LinearRegression()

In [93]:
regrassor.fit(x_train,y_train)

In [94]:
#check intercept
regrassor.intercept_

2.0682462451550454

In [95]:
regrassor.coef_

array([ 0.84489085,  0.1156853 , -0.27019687,  0.29078838, -0.0107715 ,
       -0.02805796, -0.8753289 , -0.84959869])

In [96]:
y_pred = regrassor.predict(x_test)

In [97]:
y_pred

array([2.2702672 , 2.79059912, 1.90984782, ..., 3.56895295, 0.98163812,
       2.72552317])

In [98]:
y_test

14740    1.369
10101    2.413
20566    2.007
2670     0.725
15709    4.600
         ...  
19681    0.740
12156    1.773
10211    3.519
2445     0.925
17914    2.983
Name: price, Length: 6192, dtype: float64

In [99]:
#check error
from sklearn.metrics import mean_absolute_error,mean_squared_error

mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
rmse = np.sqrt(mse)

In [100]:
print(mse,mae,rmse)

0.5431489670037237 0.5361818140641844 0.7369864089681191


In [101]:
#check performance of model
from sklearn.metrics import r2_score

score = r2_score(y_test,y_pred)
score

0.5926087785518777

In [102]:
1 - (1-score)*(len(y_test))/(len(y_test)-x_test.shape[1]-1)

0.5920157782295369

In [103]:
import pickle

In [105]:
pickle.dump(regrassor,open('regrassor.pkl','wb'))
pickle.dump(scale,open('scale.pkl','wb'))

In [106]:
r = pickle.load(open('regrassor.pkl','rb'))
s = pickle.load(open('scale.pkl','rb'))

In [107]:
r.predict(s.transform([[8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23]]))

array([4.1637558])