# Supervised Learning
### Boston Housing Dataset

In [1]:
 # Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

##### Load the Boston dataset

In [2]:
# Read the Boston housing data from a CSV file given by relative path
boston = pd.read_csv(filepath_or_buffer="BostonHousing.csv")

# Preview the first 5 rows of the loaded frame
boston.head(5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


##### EDA

In [3]:
# Show column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
boston.info()

# Display the shape/size of the dataset
boston.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       501 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
None


(506, 14)

In [4]:
# Count missing values per column before cleaning
print(boston.isna().sum())

# Drop every row that contains a missing value. The name is rebound to the
# new frame (instead of inplace=True) so the frame object produced by the
# loading cell is not mutated behind the reader's back.
boston = boston.dropna()

# Confirm that no missing values remain
print(boston.isna().sum())

# Display the shape/size of the cleaned dataset
boston.shape

crim       0
zn         0
indus      0
chas       0
nox        0
rm         5
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64


(501, 14)

In [5]:
# Per-column summary statistics: count, mean, std, min, quartiles and max
summary_stats = boston.describe()
summary_stats

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
count,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0,501.0
mean,3.647414,11.402196,11.160619,0.06986,0.555151,6.284341,68.513373,3.786423,9.596806,409.143713,18.453493,356.298523,12.643752,22.561277
std,8.637688,23.414214,6.857123,0.255166,0.116186,0.705587,28.212221,2.103327,8.735509,169.021216,2.166327,91.672055,7.162746,9.232435
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.08199,0.0,5.19,0.0,0.449,5.884,45.0,2.0882,4.0,279.0,17.4,375.21,6.92,17.0
50%,0.26169,0.0,9.69,0.0,0.538,6.208,77.7,3.1827,5.0,330.0,19.0,391.34,11.38,21.2
75%,3.69311,12.5,18.1,0.0,0.624,6.625,94.0,5.118,24.0,666.0,20.2,396.23,16.94,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [6]:
# Pairwise Pearson correlation between all columns (features and target alike)
correlation = boston.corr(method="pearson")
correlation

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
crim,1.0,-0.20147,0.407544,-0.057032,0.420611,-0.219433,0.354739,-0.380572,0.624932,0.582243,0.291731,-0.384064,0.457304,-0.390251
zn,-0.20147,1.0,-0.536624,-0.043209,-0.518108,0.311173,-0.569695,0.665634,-0.31453,-0.315991,-0.392956,0.176529,-0.41382,0.360031
indus,0.407544,-0.536624,1.0,0.062351,0.762937,-0.394193,0.646157,-0.709741,0.597721,0.7208,0.382633,-0.357817,0.605595,-0.486625
chas,-0.057032,-0.043209,0.062351,1.0,0.090371,0.091468,0.087384,-0.098696,-0.008872,-0.037146,-0.12183,0.049972,-0.053698,0.174679
nox,0.420611,-0.518108,0.762937,0.090371,1.0,-0.302751,0.732899,-0.771108,0.611942,0.666959,0.18883,-0.37964,0.591749,-0.428812
rm,-0.219433,0.311173,-0.394193,0.091468,-0.302751,1.0,-0.240286,0.203507,-0.210718,-0.292794,-0.357612,0.128107,-0.615721,0.696169
age,0.354739,-0.569695,0.646157,0.087384,0.732899,-0.240286,1.0,-0.7515,0.46009,0.508338,0.264496,-0.275366,0.601124,-0.375437
dis,-0.380572,0.665634,-0.709741,-0.098696,-0.771108,0.203507,-0.7515,1.0,-0.49797,-0.534763,-0.231444,0.291847,-0.501168,0.251709
rad,0.624932,-0.31453,0.597721,-0.008872,0.611942,-0.210718,0.46009,-0.49797,1.0,0.910911,0.46782,-0.443338,0.491477,-0.38473
tax,0.582243,-0.315991,0.7208,-0.037146,0.666959,-0.292794,0.508338,-0.534763,0.910911,1.0,0.462718,-0.440891,0.545687,-0.471253


##### Features and Target Variable

In [7]:
# 'medv' is the prediction target; every remaining column is used as a feature
target_y = boston['medv']
features_x = boston.drop(columns='medv')

##### Splitting Dataset into Training and testing sets

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features_x,
    target_y,
    test_size=0.2,
    random_state=42,
)

# Keep the feature names around: they are needed later to label the coefficients
column_names = X_train.columns
print(column_names)

# Standardize the features. The scaler learns its statistics from the training
# split only and then applies that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat'],
      dtype='object')


##### Linear Regression

In [9]:
# Fit an ordinary linear regression on the scaled training data,
# then predict house prices for the held-out test rows
linear_r = LinearRegression().fit(X_train, y_train)
y_pred_test = linear_r.predict(X_test)
y_pred_test

array([10.18296392, 22.90028638, 15.7250715 , 32.8687425 , 23.06305477,
       11.7105764 , 12.82823273, 19.68402832, 21.37844603, 11.6664091 ,
       18.68922207, 30.07985757, -1.00929105, 25.84534881,  2.71769491,
        8.42105272, 24.20065448, 18.58158855, 25.35750486, -6.4156406 ,
       13.40087749, 19.1553702 , 26.97331835, 19.7365225 , 22.34723932,
       16.49474319, 28.88299512, 25.84032247, 18.44473853, 21.44179917,
       20.47924119, 30.65762986, 18.03142754, 31.7042724 , 31.32197253,
       22.39925592,  7.89877816, 23.82379177,  8.79518229, 24.80967266,
       13.11828624, 36.50280369, 14.21716209, 30.64314489, 13.08150701,
       28.62892717, 30.41900005, 20.22002346, 18.50467142, 13.74969166,
       24.00007979, 33.05373811, 16.31792202, 11.83136693, 34.38516527,
       33.21409913, 17.58969598, 18.73951491, 15.77944095, 27.36843691,
       20.56962491, 41.00811461, 20.7069354 ,  7.90355031, 26.05404575,
       28.09290827, 11.99510305,  7.63645472, 27.23768211, 16.49

In [10]:
# y intercept of the fitted linear model
print(linear_r.intercept_)

# Tabulate one coefficient per feature.
# The frame is built column-wise from a dict so that 'Coefficients' keeps its
# float dtype; stacking names and values in a list and transposing would turn
# both columns into generic objects and break numeric sorting/rounding.
coeffcients = pd.DataFrame({
    'Attribute': list(column_names),
    'Coefficients': linear_r.coef_,
})
coeffcients

22.90349999999999


Unnamed: 0,Attribute,Coefficients
0,crim,-0.770699
1,zn,1.030715
2,indus,0.279132
3,chas,0.564479
4,nox,-1.680296
5,rm,2.822766
6,age,0.272202
7,dis,-2.887709
8,rad,2.548154
9,tax,-2.038334


##### Model Evaluation

In [11]:
# Predict on the same rows the model was fitted on
y_pred_train = linear_r.predict(X_train)

# Training-set error (RMSE) and goodness of fit (R-squared)
lr_train_mse = metrics.mean_squared_error(y_train, y_pred_train)
lr_train_r2 = metrics.r2_score(y_train, y_pred_train)
print("Linear Regression RMSE: ", np.sqrt(lr_train_mse))
print("Linear Regression R-squared: ", lr_train_r2)

Linear Regression RMSE:  4.775615313858017
Linear Regression R-squared:  0.7387829862692457


<strong>Linear Regression RMSE (Root Mean Squared Error): 4.776 </strong>  
This value represents the square root of the average squared difference between the predicted and actual values in the training set. Lower RMSE values indicate better model performance.

<strong>Linear Regression R-squared: 0.739</strong>  
R-squared, or the coefficient of determination, is a measure of how well the linear regression model explains the variability in the target variable. A value of 0.739 indicates that about 73.9% of the variance in the target variable is explained by the model.

In [12]:
# Test-set error (RMSE) and goodness of fit (R-squared) for the linear model
lr_test_scores = {
    "Linear Regression RMSE: ": np.sqrt(metrics.mean_squared_error(y_test, y_pred_test)),
    "Linear Regression R-squared: ": metrics.r2_score(y_test, y_pred_test),
}
for label, value in lr_test_scores.items():
    print(label, value)

Linear Regression RMSE:  4.548375586189922
Linear Regression R-squared:  0.7200277678580311


<strong>Linear Regression RMSE: 4.548</strong>  
This is the root mean squared error on the testing set. It measures the average difference between the predicted and actual values on the test data. A lower RMSE indicates better model performance.

<strong>Linear Regression R-squared: 0.720</strong>  
The R-squared value on the testing set is approximately 0.720, indicating that the model explains about 72.0% of the variance in the target variable on the test data.

### Linear Regression Conclusion
- The evaluation scores on the test data closely match those on the training data, so the model generalizes well to unseen data and shows no sign of overfitting.
- The RMSE values, while not extremely low, indicate a moderate level of accuracy in predicting housing prices. The model is able to make predictions with an average error of around 4.548 to 4.776 units of the target variable.
- The R-squared values indicate that the linear regression model captures a substantial portion of the variability in the target variable.


##### Decision Tree

In [13]:
# Fit a regression tree with default settings (seeded for repeatability),
# then predict house prices for the held-out test rows
decision_t = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_decision_test = decision_t.predict(X_test)
y_pred_decision_test

array([19.6, 24.3, 15. , 25. , 19.6,  9.7,  9.5, 17.1, 24.3, 15.2, 21.5,
       29.6, 17.2, 21.6, 17.3, 10.2, 28.7, 18.2, 23.8, 10.4, 22.5, 18.7,
       20.1, 21.4, 21. , 13.3, 23.6, 22.7, 20.4, 18.7, 50. , 32.2, 15.2,
       26.5, 33. , 21.1, 10.2, 24.7, 22.5, 20. , 11.5, 39.8, 17.3, 29. ,
       13.1, 22.8, 35.1, 18.8, 13.8, 12.5, 21. , 28.4, 16.2,  9.5, 32.2,
       30.1, 15.6, 18.7, 19.3, 23.9, 21.2, 43.1, 19.8, 17.8, 23.8, 31.2,
       14.1, 16.5, 23.7, 11.3, 20.1, 24.3, 41.7, 23.9, 22.9, 21. , 45.4,
       22.1, 14.3, 22.5, 24.7, 27.1, 19.9, 25. , 25. , 25. , 26.4, 13. ,
       22.5, 42.3,  7.2, 14.1, 12.6, 24.7, 10.2, 22.9, 33.2, 21. , 10.2,
       22.1, 22.9])

In [14]:
# Predict on the same rows the tree was fitted on
y_pred_decision_train = decision_t.predict(X_train)

# Training-set error (RMSE) and goodness of fit (R-squared)
dt_train_mse = metrics.mean_squared_error(y_train, y_pred_decision_train)
dt_train_r2 = metrics.r2_score(y_train, y_pred_decision_train)
print("Decision Tree RMSE: ", np.sqrt(dt_train_mse))
print("Decision Tree R-squared: ", dt_train_r2)

Decision Tree RMSE:  0.0
Decision Tree R-squared:  1.0


In [15]:
# Test-set error (RMSE) and goodness of fit (R-squared) for the default tree
dt_test_scores = {
    "Decision Tree RMSE: ": np.sqrt(metrics.mean_squared_error(y_test, y_pred_decision_test)),
    "Decision Tree R-squared: ": metrics.r2_score(y_test, y_pred_decision_test),
}
for label, value in dt_test_scores.items():
    print(label, value)

Decision Tree RMSE:  4.131058407279717
Decision Tree R-squared:  0.7690462645279531


### Decision Tree Conclusion
The Decision Tree model fits the training data perfectly (RMSE 0.0, R-squared 1.0), which is a strong sign of overfitting; even so, its performance on the testing data is reasonably good (RMSE 4.131, R-squared 0.769), indicating that it still generalizes to new, unseen data.

However, to address the likely overfitting, <strong>hyperparameter tuning</strong> can be performed to find the optimal parameters for the decision tree model. This involves adjusting parameters such as max_depth, min_samples_split, and min_samples_leaf to limit overfitting and improve generalization.

### Improving the Decision Tree Model with Hyperparameter Tuning

In [16]:
# Importing necessary libraries
from sklearn.model_selection import GridSearchCV

In [17]:
# Candidate values explored by the grid search, one list per tree hyperparameter
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

In [18]:
# Instantiate the Decision Tree model with a fixed seed (the same seed the
# baseline tree uses) so that the search selects the same hyperparameters and
# reports the same scores on every Restart & Run All.
decision_tree_model = DecisionTreeRegressor(random_state=42)

# 5-fold cross-validated grid search over param_grid, scored by negative MSE
grid_search = GridSearchCV(decision_tree_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [19]:
# Report the parameter combination that scored best in the fitted grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [20]:
# Take the best estimator found by the search and predict on both splits
best_decision_tree_model = grid_search.best_estimator_
y_pred_best_decision_train, y_pred_best_decision_test = (
    best_decision_tree_model.predict(split) for split in (X_train, X_test)
)


In [21]:
# Training-set error (RMSE) and goodness of fit (R-squared) for the tuned tree
tuned_train_mse = metrics.mean_squared_error(y_train, y_pred_best_decision_train)
tuned_train_r2 = metrics.r2_score(y_train, y_pred_best_decision_train)
print("Decision Tree RMSE: ", np.sqrt(tuned_train_mse))
print("Decision Tree R-squared: ", tuned_train_r2)

Decision Tree RMSE:  2.529359642490508
Decision Tree R-squared:  0.9267236282006964


<strong>Decision Tree RMSE (Root Mean Squared Error) on Training Data: 2.5294</strong>  
The RMSE measures the average magnitude of the errors between predicted and actual values. A lower RMSE indicates better model performance. In this case, the training RMSE is relatively low at 2.5294, suggesting that the model is doing a good job of fitting the training data.

<strong>Decision Tree R-squared: 0.9267</strong>  
R-squared (or coefficient of determination) represents the proportion of the variance in the dependent variable (target) that is predictable from the independent variables (features). A higher R-squared value (closer to 1) indicates a better fit. The R-squared of 0.9267 for the training set is quite high, suggesting that the model explains a significant amount of the variance in the training data.

In [22]:
# Test-set error (RMSE) and goodness of fit (R-squared) for the tuned tree
tuned_test_scores = {
    "Decision Tree RMSE: ": np.sqrt(metrics.mean_squared_error(y_test, y_pred_best_decision_test)),
    "Decision Tree R-squared: ": metrics.r2_score(y_test, y_pred_best_decision_test),
}
for label, value in tuned_test_scores.items():
    print(label, value)

Decision Tree RMSE:  4.053774675167974
Decision Tree R-squared:  0.7776067862538305


<strong>Decision Tree RMSE: 4.0538</strong>  
The testing RMSE is higher than the training RMSE, which is expected. It measures the average magnitude of errors on the testing set. The RMSE of 4.0538 indicates that, on average, the model's predictions on the test data have larger errors compared to the training data.

<strong>Decision Tree R-squared: 0.7776</strong>  
The R-squared value for the testing set is 0.7776. While still relatively high, it is lower than the R-squared for the training set. This is also expected, as models often generalize less well to new, unseen data. The lower R-squared on the testing set suggests that the model's performance on unseen data is not as strong as on the training data.

- Overall, the model seems to generalize well to the testing set, with a high R-squared and a relatively low RMSE.

# Summary 
The Decision Tree model, after hyperparameter tuning, outperforms the Linear Regression model on the testing set.
##### Comparison on the testing set
- Linear Regression RMSE: 4.548, R-squared: 0.720.
- Decision Tree RMSE: 4.054, R-squared: 0.778.
