**Import the packages**

In [43]:
import pandas as pd
import numpy as np 
import pickle 
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score,mean_squared_error

**Create the dataset**

In [5]:
housing=fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]], shape=(20640, 8)),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': 

In [11]:
df=pd.DataFrame(housing.data,columns=housing.feature_names)
df['MedHouseVal']=housing['target']
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


**4. Data Quick checks**

- Shape - [shape]
- Size - [size]
- Length - [len]
- Data type - [dtypes]
- columns


In [15]:
#shape
print(f'Shape of Dataset:{df.shape}')
print('----------------------------')
#size 
print(f'Size of dataset:{df.size}')
print('----------------------------')
#length
print(f'Length of Dataset:{len(df)}')
print('----------------------------')
#dtypes
print(df.dtypes)
print('----------------------------')
#columns
print(df.columns)


Shape of Dataset:(20640, 9)
----------------------------
Size of dataset:185760
----------------------------
Length of Dataset:20640
----------------------------
MedInc         float64
HouseAge       float64
AveRooms       float64
AveBedrms      float64
Population     float64
AveOccup       float64
Latitude       float64
Longitude      float64
MedHouseVal    float64
dtype: object
----------------------------
Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal'],
      dtype='object')


$ML-Model$_$Development$

- We divide data into two parts i.e input data and output data

- input data = X; output data=y

- Again we divide input data into two parts i.e train and test

- input train data= x_train; input test data= x_test

- similarly we divide output data into two parts i.e train and test

- output train data= y_train; output test data= y_test

- Model development happens on train data i.e x_train and y_train

- Model will predict by passing x_test data, these are called y_predictions

- y_predictions will compare with y_test , this is called test accuracy/ test error

**Divide the dataset into X and y**

In [20]:
X=df.drop('MedHouseVal',axis=1)
y=df['MedHouseVal']

**Split the dataset into train and test**

In [24]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1234,test_size=0.30)

- Quick data cheak

In [27]:
print('the shape of data frame is :',df.shape)
print('the shape of X_train is :',X_train.shape)
print('the shape of X_test is :',X_test.shape)
print('the shape of y_train is :',y_train.shape)
print('the shape of y_test is :',y_test.shape)

the shape of data frame is : (20640, 9)
the shape of X_train is : (14448, 8)
the shape of X_test is : (6192, 8)
the shape of y_train is : (14448,)
the shape of y_test is : (6192,)


**Implement the Model**

In [30]:
LR=LinearRegression()
LR.fit(X_train,y_train)

**Make Predictions**

In [35]:
y_predictions=LR.predict(X_test)
y_predictions

array([2.60474013, 2.94398545, 2.59138626, ..., 3.44254726, 1.63598042,
       2.24182191], shape=(6192,))

**New Dataframe**

In [39]:
df1=pd.DataFrame()
df1=X_test
df1['y']=y
df1['y_predictions']=y_predictions

In [41]:
df1

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,y,y_predictions
15712,3.6812,52.0,4.848315,1.078652,614.0,1.724719,37.79,-122.44,5.000,2.604740
4119,5.5524,52.0,6.129032,1.041935,1842.0,2.970968,34.14,-118.20,2.939,2.943985
15725,3.8750,44.0,4.739264,1.024540,561.0,1.720859,37.78,-122.44,4.125,2.591386
14616,3.4706,20.0,5.274254,1.033582,1822.0,3.399254,32.80,-117.17,1.576,1.920864
13238,7.7234,10.0,8.016901,1.019718,1221.0,3.439437,34.13,-117.67,3.041,3.040696
...,...,...,...,...,...,...,...,...,...,...
7665,4.1250,5.0,5.734177,1.183544,1453.0,9.196203,33.93,-118.07,1.719,2.017527
2196,3.8056,29.0,6.391304,1.060870,1072.0,3.107246,36.83,-119.91,1.654,1.554616
18162,6.8753,26.0,6.987288,0.974576,1241.0,2.629237,37.36,-122.04,4.030,3.442547
16305,3.5982,15.0,4.462562,0.996672,1520.0,2.529118,38.01,-121.35,0.944,1.635980


In [45]:
ip1=[1,2,3,4,5,6,7,8]
LR.predict([ip1])

array([-39.69036637])

**Model evaluation**

In [48]:
R2_score=r2_score(y_test,y_predictions)
MSE=mean_squared_error(y_test,y_predictions)
RMSE=np.sqrt(MSE)

In [50]:
print(f'R2_score : {R2_score}')
print(f'Mean Square Error : {MSE}')
print(f'Root Mean Square Error : {RMSE}')

R2_score : 0.6017428753930177
Mean Square Error : 0.5307212239067881
Root Mean Square Error : 0.7285061591412856


In [52]:
X_test

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,y,y_predictions
15712,3.6812,52.0,4.848315,1.078652,614.0,1.724719,37.79,-122.44,5.000,2.604740
4119,5.5524,52.0,6.129032,1.041935,1842.0,2.970968,34.14,-118.20,2.939,2.943985
15725,3.8750,44.0,4.739264,1.024540,561.0,1.720859,37.78,-122.44,4.125,2.591386
14616,3.4706,20.0,5.274254,1.033582,1822.0,3.399254,32.80,-117.17,1.576,1.920864
13238,7.7234,10.0,8.016901,1.019718,1221.0,3.439437,34.13,-117.67,3.041,3.040696
...,...,...,...,...,...,...,...,...,...,...
7665,4.1250,5.0,5.734177,1.183544,1453.0,9.196203,33.93,-118.07,1.719,2.017527
2196,3.8056,29.0,6.391304,1.060870,1072.0,3.107246,36.83,-119.91,1.654,1.554616
18162,6.8753,26.0,6.987288,0.974576,1241.0,2.629237,37.36,-122.04,4.030,3.442547
16305,3.5982,15.0,4.462562,0.996672,1520.0,2.529118,38.01,-121.35,0.944,1.635980


**11. Save the model**

In [57]:
pickle.dump(LR,open('Linear_housing_model.pkl','wb'))