In [155]:
#Simple Linear Regression for boston dataset

import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn import datasets
from sklearn.model_selection import train_test_split
# Load the Boston house-prices dataset bundled with scikit-learn and print its description text.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older scikit-learn releases — confirm the pinned version.
data = datasets.load_boston()
print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [156]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's
# feature names, then preview the first five rows.
df = pd.DataFrame(data=data.data, columns=data.feature_names)
print(df.head(n=5))

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  
0     15.3  396.90   4.98  
1     17.8  396.90   9.14  
2     17.8  392.83   4.03  
3     18.7  394.63   2.94  
4     18.7  396.90   5.33  


In [157]:
# Put the target vector into a one-column frame named 'MEDV' and show it.
target = pd.DataFrame({'MEDV': data.target})
print(target)

     MEDV
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
..    ...
501  22.4
502  20.6
503  23.9
504  22.0
505  11.9

[506 rows x 1 columns]


In [158]:
# Check whether the predictor (RM) or the target (MEDV) column contains NULLs:
# each print shows the rows where the value is missing (an empty frame means none).
rm_is_null = df['RM'].isnull()
medv_is_null = target['MEDV'].isnull()
print('NUM in RM', df.loc[rm_is_null])
print('NUM in MEDV', target.loc[medv_is_null])

NUM in RM Empty DataFrame
Columns: [CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT]
Index: []
NUM in MEDV Empty DataFrame
Columns: [MEDV]
Index: []


In [159]:
# Keep a single predictor (RM) and the response (MEDV), each as a one-column DataFrame.
X = df['RM'].to_frame()
y = target['MEDV'].to_frame()

In [160]:
# Show the predictor frame followed by the response frame.
print(X, y, sep='\n')

        RM
0    6.575
1    6.421
2    7.185
3    6.998
4    7.147
..     ...
501  6.593
502  6.120
503  6.976
504  6.794
505  6.030

[506 rows x 1 columns]
     MEDV
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
..    ...
501  22.4
502  20.6
503  23.9
504  22.0
505  11.9

[506 rows x 1 columns]


In [161]:
# Persist the feature frame to a CSV file in the current working directory.
df.to_csv(path_or_buf='BOSTON_HOUSING2.csv')

In [162]:
# Baseline: ordinary least squares of MEDV on RM fitted on the full dataset
# (no hold-out), so the R square printed here is an in-sample score.
X = df['RM'].to_frame()
y = target['MEDV'].to_frame()
lm = linear_model.LinearRegression()
model = lm.fit(X, y)
for label, value in (
    ('coeficent', lm.coef_),
    ('Intercept', lm.intercept_),
    ('R square', lm.score(X, y)),
):
    print(label, value)

coeficent [[9.10210898]]
Intercept [-34.67062078]
R square 0.4835254559913343


In [163]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [164]:
# Report the shape of the full data and of each train/test part.
for label, frame in (
    ('X.shape', X),
    ('X_train.shape : ', X_train),
    ('X_test.shape : ', X_test),
    ('\n y.shape', y),
    ('y_train.shape : ', y_train),
    ('y_test.shape: ', y_test),
):
    print(label, frame.shape)

X.shape (506, 1)
X_train.shape :  (404, 1)
X_test.shape :  (102, 1)

 y.shape (506, 1)
y_train.shape :  (404, 1)
y_test.shape:  (102, 1)


In [165]:
# Refit the regression on the training split only. The R square printed here is
# computed on the training rows, not on the held-out rows.
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
for label, value in (
    ('coeficent', lm.coef_),
    ('Intercept', lm.intercept_),
    ('R square', lm.score(X_train, y_train)),
):
    print(label, value)

# Predict the held-out rows and preview the first five predictions.
y_pred = lm.predict(X_test)
print('y_pred : ', y_pred[:5])

coeficent [[9.37638431]]
Intercept [-36.47618963]
R square 0.4970800097843844
y_pred :  [[22.90445223]
 [21.80741526]
 [23.2795076 ]
 [13.67809006]
 [21.95743741]]


In [166]:
# Display the test-set predictions as a nested Python list.
# NOTE(review): this dumps every prediction; the first five are already printed
# by the cell that computes `y_pred`.
y_pred.tolist()

[[22.904452229344507],
 [21.80741526467247],
 [23.27950760188196],
 [13.678090064923232],
 [21.95743741368745],
 [18.22563645693981],
 [18.89135974319379],
 [19.482071954940274],
 [10.15256956307119],
 [23.57955189991192],
 [19.37893172749247],
 [25.94240074689786],
 [22.979463303852],
 [7.1427501984581525],
 [36.67836078578237],
 [29.542932323257382],
 [22.051201256821805],
 [31.849522864362704],
 [24.901622088106436],
 [18.37565860595479],
 [23.776455970494084],
 [18.65695013535788],
 [20.972917060776638],
 [24.301533492046516],
 [16.800426041297506],
 [-0.25521702484305564],
 [19.453942801999965],
 [17.05358841776028],
 [39.18185539746987],
 [20.64474360980637],
 [21.51674735095594],
 [20.541603382358574],
 [21.901179107806826],
 [20.672872762746678],
 [23.682692127359722],
 [17.93496854322329],
 [19.660223256895563],
 [18.610068213790704],
 [23.045097994046053],
 [21.301090511746906],
 [20.972917060776638],
 [17.77557000989487],
 [21.13231559410506],
 [14.409448041371249],
 [24.845

In [175]:
# Start the error-analysis frame with one column holding the test-set predictions
# (`y_pred` is a single-column 2-D array, flattened here to a 1-D column).
error = pd.DataFrame({'y_pred': y_pred.ravel()})

In [176]:
# Display the frame of test-set predictions.
error

Unnamed: 0,y_pred
0,22.904452
1,21.807415
2,23.279508
3,13.678090
4,21.957437
...,...
97,23.373271
98,24.704718
99,17.353633
100,23.298260


In [197]:
# Copy the held-out MEDV values into a plain Python list (this drops the
# DataFrame index carried by y_test), then display the list.
B = list(y_test['MEDV'])
B

[22.6,
 50.0,
 23.0,
 8.3,
 21.2,
 19.9,
 20.6,
 18.7,
 16.1,
 18.6,
 8.8,
 17.2,
 14.9,
 10.5,
 50.0,
 29.0,
 23.0,
 33.3,
 29.4,
 21.0,
 23.8,
 19.1,
 20.4,
 29.1,
 19.3,
 23.1,
 19.6,
 19.4,
 38.7,
 18.7,
 14.6,
 20.0,
 20.5,
 20.1,
 23.6,
 16.8,
 5.6,
 50.0,
 14.5,
 13.3,
 23.9,
 20.0,
 19.8,
 13.8,
 16.5,
 21.6,
 20.3,
 17.0,
 11.8,
 27.5,
 15.6,
 23.1,
 24.3,
 42.8,
 15.6,
 21.7,
 17.1,
 17.2,
 15.0,
 21.7,
 18.6,
 21.0,
 33.1,
 31.5,
 20.1,
 29.8,
 15.2,
 15.0,
 27.5,
 22.6,
 20.0,
 21.4,
 23.5,
 31.2,
 23.7,
 7.4,
 48.3,
 24.4,
 22.6,
 18.3,
 23.3,
 17.1,
 27.9,
 44.8,
 50.0,
 23.0,
 21.4,
 10.2,
 23.3,
 23.2,
 18.9,
 13.4,
 21.9,
 24.8,
 11.9,
 24.3,
 13.8,
 24.7,
 14.1,
 18.7,
 28.1,
 19.8]

In [198]:
# Attach the actual test values next to the predictions; wrapping the list in a
# fresh Series gives it a 0..n-1 index for the column assignment.
error = error.assign(y_test=pd.Series(B))

In [220]:
# Display predictions next to the actual test values.
# NOTE(review): the recorded output already contains the 'y- ŷ' column, which is only
# assigned in a later cell — this cell was executed out of order, so a fresh
# top-to-bottom run will show just 'y_pred' and 'y_test' here.
error

Unnamed: 0,y_pred,y_test,y- ŷ
0,22.904452,22.6,-0.304452
1,21.807415,50.0,28.192585
2,23.279508,23.0,-0.279508
3,13.678090,8.3,-5.378090
4,21.957437,21.2,-0.757437
...,...,...,...
97,23.373271,24.7,1.326729
98,24.704718,14.1,-10.604718
99,17.353633,18.7,1.346367
100,23.298260,28.1,4.801740


In [224]:
# Residual columns used by the error metrics below (RMSE, MAPE, RMSPE):
#   'y- ŷ'      actual minus predicted
#   '(y- ŷ)^2'  squared residual
import math

error['y- ŷ'] = error['y_test'].sub(error['y_pred'])
error['(y- ŷ)^2'] = error['y- ŷ'].pow(2)

In [225]:
# Display the frame including the residual and squared-residual columns.
error

Unnamed: 0,y_pred,y_test,y- ŷ,(y- ŷ)^2
0,22.904452,22.6,-0.304452,0.092691
1,21.807415,50.0,28.192585,794.821834
2,23.279508,23.0,-0.279508,0.078124
3,13.678090,8.3,-5.378090,28.923853
4,21.957437,21.2,-0.757437,0.573711
...,...,...,...,...
97,23.373271,24.7,1.326729,1.760209
98,24.704718,14.1,-10.604718,112.460044
99,17.353633,18.7,1.346367,1.812705
100,23.298260,28.1,4.801740,23.056703


In [230]:
# RMSE: square root of the mean squared residual over the test rows.
# The row count is taken from the frame itself instead of a hard-coded 102,
# so the metric stays correct if test_size or the dataset changes.
math.sqrt(error['(y- ŷ)^2'].sum() / len(error))

6.848894190115315

In [242]:
# MAPE: mean of |residual / actual| over the test rows, reported as a fraction
# (not multiplied by 100).
# The row count is taken from the frame itself instead of a hard-coded 102,
# so the metric stays correct if test_size or the dataset changes.

error['|(y- ŷ)/y|'] = abs(error['y- ŷ']/error['y_test'])
print('MAPE : ', error['|(y- ŷ)/y|'].sum() / len(error))

MAPE :  0.23863369521986164


In [256]:
# RMSPE: square root of the mean squared relative residual over the test rows,
# reported as a fraction (not multiplied by 100).
# The row count is taken from the frame itself instead of a hard-coded 102,
# so the metric stays correct if test_size or the dataset changes.

RMSPE = math.sqrt((1 / len(error)) * ((error['y- ŷ'] / error['y_test']) ** 2).sum())
RMSPE

0.4146421036885942