* CRIM - per capita crime rate by town
* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS - proportion of non-retail business acres per town.
* CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
* NOX - nitric oxides concentration (parts per 10 million)
* RM - average number of rooms per dwelling
* AGE - proportion of owner-occupied units built prior to 1940
* DIS - weighted distances to five Boston employment centres
* RAD - index of accessibility to radial highways
* TAX - full-value property-tax rate per $10,000
* PTRATIO - pupil-teacher ratio by town
* B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
* LSTAT - % lower status of the population
* MEDV - Median value of owner-occupied homes in $1000's

In [1]:
import numpy as np
import pandas as pd
import sklearn

column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
bos1 = pd.read_csv('housing.csv', delimiter=r"\s+", names=column_names)
bos1.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [2]:
bos1.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

Splitting model data with 70% for training

In [3]:
from sklearn.model_selection import train_test_split

X = np.array(bos1.iloc[:,0:13])
Y = np.array(bos1["MEDV"])

#testing data size is 30% of entire data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=5)

Using Linear Regression Model

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

#Load first data model
lr = LinearRegression()

#train the model on training data
lr.fit(x_train, y_train)

#predict the testing data so that we can later evaluate the model
pred_lr = lr.predict(x_test)

Model Evaluation

In [5]:
#Error for linear regression

import sklearn.metrics

mse_lr = sklearn.metrics.root_mean_squared_error(y_test, pred_lr)
print("Error for Linear Regression = {}".format(mse_lr))

Error for Linear Regression = 5.540490745781328


Using KNN Algo

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

#train the model using training data
lr.fit(x_train, y_train)

#load the KNN model
Nn = KNeighborsRegressor(3)
Nn.fit(x_train, y_train)
pred_Nn = Nn.predict(x_test)

In [7]:
#Hyperparameter Tuning

import sklearn
import sklearn.metrics

for i in range(1,50):
    model = KNeighborsRegressor(i)
    model.fit(x_train, y_train)
    pred_y = model.predict(x_test)
    mse = sklearn.metrics.root_mean_squared_error(y_test, pred_y)
    print("{} error for k = {}".format(mse, i))

7.97154478854566 error for k = 1
7.159484875618533 error for k = 2
7.014927171138291 error for k = 3
7.004019640065342 error for k = 4
7.036131375752027 error for k = 5
7.103650686103268 error for k = 6
7.249246229196143 error for k = 7
7.278466403768686 error for k = 8
7.490296733721186 error for k = 9
7.573928228851226 error for k = 10
7.580880154071545 error for k = 11
7.620709624858009 error for k = 12
7.702433441773159 error for k = 13
7.745706188130712 error for k = 14
7.855546909761407 error for k = 15
7.970845764140948 error for k = 16
8.00708692880329 error for k = 17
8.05951400020052 error for k = 18
8.105972848197592 error for k = 19
8.171623447622684 error for k = 20
8.208766061680672 error for k = 21
8.266010100575647 error for k = 22
8.280897264278922 error for k = 23
8.326448746059764 error for k = 24
8.38105978099617 error for k = 25
8.410954693047014 error for k = 26
8.478704509976565 error for k = 27
8.50999986845734 error for k = 28
8.538275555508479 error for k = 29

In [8]:
#Model evaluation

mse_Nn = sklearn.metrics.root_mean_squared_error(y_test, pred_Nn)
print("Error for K-NN = {}".format(mse_Nn))

Error for K-NN = 7.014927171138291
