In [13]:
import numpy as np
import pandas as pd

In [14]:
data=pd.read_csv('HousingData.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [15]:
data.shape

(506, 14)

In [16]:
# - CRIM - per capita crime rate by town
# - ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
# - INDUS - proportion of non-retail business acres per town.
# - CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# - NOX - nitric oxides concentration (parts per 10 million)
# - RM - average number of rooms per dwelling
# - AGE - proportion of owner-occupied units built prior to 1940
# - DIS - weighted distances to five Boston employment centres
# - RAD - index of accessibility to radial highways
# - TAX - full-value property-tax rate per $10,000
# - PTRATIO - pupil-teacher ratio by town
# - B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# - LSTAT - % lower status of the population
# - MEDV - Median value of owner-occupied homes in $1000's

In [17]:
data.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [6]:
data[data['CRIM'].isnull()]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
53,,21.0,5.64,0.0,0.439,5.998,21.4,6.8147,4,243,16.8,396.9,8.43,23.4
115,,0.0,10.01,0.0,0.547,5.928,88.2,2.4631,6,432,17.8,344.91,15.76,18.3
183,,0.0,2.46,0.0,0.488,6.563,95.6,2.847,3,193,17.8,396.9,5.68,32.5
191,,45.0,3.44,0.0,0.437,6.739,30.8,6.4798,5,398,15.2,389.71,4.69,30.5
192,,45.0,3.44,0.0,0.437,7.178,26.3,6.4798,5,398,15.2,390.49,2.87,36.4
196,,80.0,1.52,0.0,0.404,7.287,34.1,7.309,2,329,12.6,396.9,4.08,33.3
229,,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8,307,17.4,380.34,3.76,31.5
236,,0.0,6.2,1.0,0.507,6.631,76.5,4.148,8,307,17.4,388.45,9.54,25.1
241,,30.0,4.93,0.0,0.428,6.095,65.1,6.3361,6,300,16.6,394.62,12.4,20.1
262,,20.0,3.97,0.0,0.647,8.398,91.5,2.2885,5,264,13.0,386.86,5.91,48.8


In [18]:
for i in data:
    if data[i].isnull().any():
        data[i]=data[i].fillna(data[i].mean())

In [19]:
data.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [9]:
for i in data:
    if data[i].isnull().any():
        data[i]=data[i].fillna(data[i].mean())

In [10]:
data.isnull().sum().sum()

0

In [20]:
X = data.drop(columns=['MEDV'])
y = data['MEDV']

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
st=StandardScaler()
x_scaled=st.fit_transform(X)

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.2,random_state=42)

In [26]:
from sklearn.neighbors import KNeighborsRegressor

In [41]:
knn = KNeighborsRegressor(n_neighbors=7)

In [42]:
knn.fit(x_train, y_train)


In [43]:
y_pred = knn.predict(x_test)
y_pred

array([24.77142857, 31.54285714, 17.01428571, 23.98571429, 17.04285714,
       22.9       , 19.41428571, 16.44285714, 19.8       , 19.52857143,
       23.18571429, 20.52857143, 10.08571429, 23.15714286, 19.        ,
       21.07142857, 19.07142857, 10.14285714, 35.1       , 14.9       ,
       23.17142857, 26.48571429, 15.5       , 20.57142857, 16.14285714,
       15.42857143, 23.38571429, 11.85714286, 22.21428571, 19.57142857,
       23.12857143, 22.17142857, 15.34285714, 17.94285714, 15.48571429,
       17.68571429, 31.75714286, 20.07142857, 23.38571429, 24.14285714,
       19.5       , 30.77142857, 35.47142857, 20.65714286, 26.62857143,
       14.57142857, 15.51428571, 24.85714286, 18.82857143, 27.38571429,
       21.08571429, 30.05714286, 15.98571429, 22.17142857, 39.98571429,
       21.38571429, 14.94285714, 30.74285714, 24.37142857, 18.34285714,
       23.5       , 30.9       , 26.5       , 20.28571429, 24.21428571,
       22.21428571, 13.82857143, 23.31428571, 28.61428571, 14.1 

In [44]:
y_pred_train = knn.predict(x_train)
y_pred_train

array([ 9.94285714, 19.3       , 22.07142857, 11.97142857, 19.01428571,
       23.92857143, 21.25714286, 26.31428571,  9.32857143, 15.44285714,
       21.        , 26.05714286, 29.58571429, 12.82857143, 36.41428571,
       15.94285714, 20.85714286, 24.32857143, 19.87142857, 24.94285714,
       10.7       , 20.01428571, 23.71428571, 20.71428571, 26.5       ,
       32.        , 16.5       , 40.24285714, 18.64285714, 22.84285714,
       17.2       , 19.3       , 15.72857143, 13.47142857, 20.78571429,
       30.95714286, 26.01428571, 15.98571429, 17.17142857, 25.        ,
       22.37142857, 22.21428571,  9.95714286, 23.84285714, 19.44285714,
       17.35714286, 15.98571429, 42.98571429, 13.2       , 15.6       ,
       25.12857143, 20.37142857, 21.72857143, 21.01428571, 19.5       ,
       19.91428571, 27.97142857,  8.8       , 25.24285714, 19.87142857,
       21.01428571, 21.88571429, 28.48571429, 19.52857143, 42.2       ,
       13.28571429, 16.64285714, 19.78571429, 17.74285714, 20.85

In [45]:
from sklearn.metrics import r2_score

In [46]:
r2_score(y_train,y_pred_train)

0.7873411417839534

In [47]:
r2_score(y_test,y_pred)

0.6836200788401865