<a href="https://colab.research.google.com/github/AdebanjiAdelowo/Machine-Learning/blob/main/HousePricePredictionUsingBoston_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import urllib.request
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
 prices and the demand for clean air', J. Environ. Economics & Management,
 vol.5, 81-102, 1978. 

In [2]:
# WEb Scraping
link = 'http://lib.stat.cmu.edu/datasets/boston'

with urllib.request.urlopen(link) as f:
  html_bytes = f.read()
  html = html_bytes.decode("utf-8")

In [3]:
data = html.split('\n')

In [4]:
data = data[22:]

In [5]:
dataset = []

for i in range(0, len(data)-1, 2):

  d = data[i:i+2]
  
  a = ''.join(d)

  f = a.split(' ')

  f = [float(i) for i in f if i]

  dataset.append(f)




```
# This is formatted as code
 Variables in order

 CRIM     per capita crime rate by town
 ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
 INDUS    proportion of non-retail business acres per town
 CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
 NOX      nitric oxides concentration (parts per 10 million)
 RM       average number of rooms per dwelling
 AGE      proportion of owner-occupied units built prior to 1940
 DIS      weighted distances to five Boston employment centres
 RAD      index of accessibility to radial highways
 TAX      full-value property-tax rate per $10,000
 PTRATIO  pupil-teacher ratio by town
 B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
 LSTAT     lower status of the population
 MEDV     Median value of owner-occupied homes in $1000
```





In [6]:
columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM' ,'AGE', 'DIS', 'AD','TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataset = pd.DataFrame(dataset, columns = columns)

In [7]:
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,AD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [8]:
X = dataset.loc[:, 'CRIM':'LSTAT']
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,AD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [9]:
Y = dataset.loc[:,'MEDV']

In [10]:
from sklearn.model_selection import train_test_split

num_test = 0.10

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=num_test, random_state=23)

In [11]:
# Linear Regression Cross Validation

from sklearn.linear_model import LinearRegression

kfold = KFold(n_splits=10, shuffle = True, random_state=7)

model = LinearRegression()

results = cross_val_score(model, X_train.values, Y_train.values, cv=kfold)

results.mean()

0.7251771746891866

In [12]:
# Decision Tree Regression Cross Validation

from sklearn.tree import DecisionTreeRegressor

kfold = KFold(n_splits=10, shuffle = True, random_state=7)

model = DecisionTreeRegressor(random_state = 1)

results = cross_val_score(model, X_train.values, Y_train.values, cv=kfold)

results.mean()

0.772636724918282

In [13]:
# Random Forest Regressor Cross Validation

from sklearn.ensemble import RandomForestRegressor
kfold = KFold(n_splits=10, shuffle = True, random_state=7)
model = RandomForestRegressor(random_state=0)
results = cross_val_score(model, X_train.values, Y_train.values, cv=kfold)
results.mean()

0.8732542408766317

In [14]:
# Parameter Tuning

from sklearn.metrics import make_scorer, mean_squared_error, r2_score

from sklearn.model_selection import GridSearchCV

 
regr = RandomForestRegressor()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion' : ['squared_error', 'absolute_error', 'poisson'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(r2_score)

# Run the grid search
grid_obj = GridSearchCV(regr, parameters, scoring=acc_scorer)

grid_obj = grid_obj.fit(X_train, Y_train)

In [15]:
grid_obj.best_score_

0.8819713634996169

In [16]:
grid_obj.best_params_

{'criterion': 'squared_error',
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 6}

In [17]:
grid_obj.scorer_

make_scorer(r2_score)

In [18]:
# Set the clf to the best combination of parameters
regr = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
regr.fit(X_train, Y_train)

RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=6)

In [19]:
Y_pred = regr.predict(X_test)

In [20]:
r2_score(Y_pred, Y_test)

0.6728885599994512