In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
file_path = r'C:\Users\BP\OneDrive - BCIT\SEM4\AIM\W5 CrossValidation\House_Prices.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Record,Sale_amount,Sale_date,Beds,Baths,Sqft_home,Sqft_lot,Type,Build_year,Town,University,Type2
0,1,1,295000.0,42521,5,3.0,2020,38332.8,3,1976,1,10,3
1,2,2,240000.0,42541,4,2.0,1498,54014.4,3,2002,1,10,3
2,3,3,385000.0,42521,5,4.0,4000,85813.2,3,2001,1,10,3
3,4,4,268000.0,42472,3,2.5,2283,118918.8,3,1972,1,10,3
4,5,5,186000.0,42465,3,1.25,1527,15681.6,3,1975,1,10,3


# Step 1

In [3]:
# data cleaning
df = df.drop(columns =['Unnamed: 0','University','Record','Type2'])

print('The number of rows: ',df.shape[0])
print('The number of columns excluding row markers: ',df.shape[1])
df.head()

The number of rows:  10659
The number of columns excluding row markers:  9


Unnamed: 0,Sale_amount,Sale_date,Beds,Baths,Sqft_home,Sqft_lot,Type,Build_year,Town
0,295000.0,42521,5,3.0,2020,38332.8,3,1976,1
1,240000.0,42541,4,2.0,1498,54014.4,3,2002,1
2,385000.0,42521,5,4.0,4000,85813.2,3,2001,1
3,268000.0,42472,3,2.5,2283,118918.8,3,1972,1
4,186000.0,42465,3,1.25,1527,15681.6,3,1975,1


Why are the dates just numbers?  Why is this ok?

*when importing csv file containing dates into a padas DataFrame, the dates are often represented as timestamps. It's acceptable that the dates appear numbers as pandas can work with datetime data. The computer can compute numeric values easily, however, it's important to convert the numbers to a human-readable format for interpretation.*

# Step 2

*Reference:* https://www.analyticsvidhya.com/blog/2021/06/tune-hyperparameters-with-gridsearchcv.

In [4]:
# Create X and Y
X = df.drop(columns=['Town'])
Y = df['Town']

# Test/train Split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=2023)

### Model1: Decision Tree

In [5]:
tree_param = {
    'max_depth': range(1, 20),
    'min_samples_split': range(2, 11),
    'min_samples_leaf': range(1, 11)
}

tree_model = DecisionTreeClassifier()
tree_grid_search = GridSearchCV(tree_model, tree_param, cv=5)
tree_grid_search.fit(X_train, Y_train)

# Get the best model and its parameters
best_tree_model = tree_grid_search.best_estimator_
tree_accuracy = tree_grid_search.best_score_
 
print(best_tree_model)
print('accuracy score: ',tree_accuracy)

DecisionTreeClassifier(max_depth=11, min_samples_leaf=7, min_samples_split=5)
accuracy score:  0.2886135873731835


### Model2: KNN

In [6]:
knn_param = {
    'n_neighbors': range(1,15),
    'p':[1,2,3]
}

knn_model = KNeighborsClassifier()

knn_grid_search = GridSearchCV(knn_model, knn_param, cv=5)
knn_grid_search.fit(X_train, Y_train)

best_knn_model = knn_grid_search.best_estimator_
knn_accuracy = knn_grid_search.best_score_
 
print(best_knn_model)
print('accuracy score: ',knn_accuracy)

KNeighborsClassifier(n_neighbors=14, p=1)
accuracy score:  0.1430748814774833


# Define the best model

In [7]:
model_scores = {'Decision Tree': tree_accuracy,
                'KNN': knn_accuracy}

# Find the model with the lowest accuracy score
best_model = max(model_scores, key=model_scores.get)
print('The best model is: ',best_model)

The best model is:  Decision Tree


# Test the model with test set

In [8]:
# Fit the best model on the training data
best_tree_model.fit(X_train, Y_train)

# Predict on training and test data
Y_pred_train = best_tree_model.predict(X_train)
Y_pred_test = best_tree_model.predict(X_test)

train_accuracy = accuracy_score(Y_train,Y_pred_train)
test_accuracy = accuracy_score(Y_test,Y_pred_test)

print ("Training Accuracy is ", train_accuracy)
print ("Testing Accuracy is ", test_accuracy)

Training Accuracy is  0.4496305851999531
Testing Accuracy is  0.3044090056285178


In [9]:
# training accuracy
# testing accuracy
diff = train_accuracy - test_accuracy
print(diff * 100, '%')

# Check for overfitting or underfitting
if 0 < diff <= 0.02:
    print("The model is likely a good fit.")
elif 0.02 < diff <= 0.10:
    print("The model is acceptably good fit.")
else:
    print("The model might be overfitting.")

14.522157957143527 %
The model might be overfitting.


*Reference:* https://machinelearningmastery.com/overfitting-and-underfitting-with-machine-learning-algorithms/

Overfitting happens when the model performs better on the training data compared to the testing data. It commonly occurs with non-parametric and non-linear models. Even though Decision Tree model is defined as the best model, it tends to overfitting training data.

In summary, either overfitting and underfitting might lead to poor model performance, but overfitting is more common. To limit overfitting, it is neccesary to conduct test/train split and cross-validation.

# Prediction

*Lee purchased a 1,450 sq ft Single Family home (coded as 3) on 2018-04-11, (43201) for $350,000.  The house has 3 bedrooms and 2 baths.  It was built in 1992, and is on a 40,000 square foot lot. What town is it?*

In [10]:
# Prepare the new data in the same format as the training data
# Sale_amount, Sale_date, Beds, Baths, Sqft_home, Sqft_lot, Type, Build_year
new_house_features = [350000, 43201, 3, 2, 1450, 40000, 3, 1992]
new_house_features = np.array(new_house_features).reshape(1, -1)  # Reshape to a 2D array

# Predict the town for the new house using the trained model
predicted_town = best_tree_model.predict(new_house_features)

print("Predicted Town:", predicted_town)

Predicted Town: [47]


