In [140]:
import pandas as pd
import plotly.express as px
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.datasets import  fetch_california_housing

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler

1	Id	To count the records.

2	MSSubClass	 Identifies the type of dwelling involved in the sale.

3	MSZoning	Identifies the general zoning classification of the sale.

4	LotArea	 Lot size in square feet.

5	LotConfig	Configuration of the lot

6	BldgType	Type of dwelling

7	OverallCond	Rates the overall condition of the house

8	YearBuilt	Original construction year

9	YearRemodAdd	Remodel date (same as construction date if no remodeling or additions).

10	Exterior1st	Exterior covering on house

11	BsmtFinSF2	Type 2 finished square feet.

12	TotalBsmtSF	Total square feet of basement area

13	SalePrice	To be predicted

# Read and clean the data

In [141]:
# read the data
data = pd.read_excel('Data/HousePricePrediction.xlsx')
data = data.dropna()
data = data.drop(['Id'], axis=1)

### Numerical Dataframe & Categorical dataframe

In [142]:
# numerical data
numerical_data = data.select_dtypes(include=['int64', 'float64'])

# categorical data
categorical_data = data.select_dtypes(include=['object'])


In [186]:
# Calculate the correlation matrix
correlation_matrix = numerical_data.corr()
correlation_matrix

Unnamed: 0,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice
MSSubClass,1.0,-0.139781,-0.059316,0.02785,0.040581,-0.065649,-0.238518,-0.084284
LotArea,-0.139781,1.0,-0.005636,0.014228,0.013788,0.11117,0.260833,0.263843
OverallCond,-0.059316,-0.005636,1.0,-0.375983,0.073741,0.040229,-0.171098,-0.077856
YearBuilt,0.02785,0.014228,-0.375983,1.0,0.592855,-0.049107,0.391452,0.522897
YearRemodAdd,0.040581,0.013788,0.073741,0.592855,1.0,-0.067759,0.291066,0.507101
BsmtFinSF2,-0.065649,0.11117,0.040229,-0.049107,-0.067759,1.0,0.10481,-0.011378
TotalBsmtSF,-0.238518,0.260833,-0.171098,0.391452,0.291066,0.10481,1.0,0.613581
SalePrice,-0.084284,0.263843,-0.077856,0.522897,0.507101,-0.011378,0.613581,1.0


### Drop irrelevent coloumns 

In [50]:
data.columns

Index(['MSSubClass', 'MSZoning', 'LotArea', 'LotConfig', 'BldgType',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'Exterior1st', 'BsmtFinSF2',
       'TotalBsmtSF', 'SalePrice'],
      dtype='object')

In [51]:
categorical_data.columns

Index(['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st'], dtype='object')

In [52]:
numerical_data.columns

Index(['MSSubClass', 'LotArea', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'BsmtFinSF2', 'TotalBsmtSF', 'SalePrice'],
      dtype='object')

In [53]:
from scipy.stats import chi2_contingency

# Calculate chi-square test for each categorical column
chi2_results = {}
for column in categorical_data.columns:
    contingency_table = pd.crosstab(categorical_data[column], categorical_data['Exterior1st'])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    chi2_results[column] = {'chi2': chi2, 'p_value': p_value}

# Print chi-square test results
for column, result in chi2_results.items():
    print(f"\nChi-square test for {column}:")
    print(f"Chi-square statistic: {result['chi2']:.4f}")
    print(f"P-value: {result['p_value']:.4f}")




Chi-square test for MSZoning:
Chi-square statistic: 240.3641
P-value: 0.0000

Chi-square test for LotConfig:
Chi-square statistic: 72.0282
P-value: 0.0733

Chi-square test for BldgType:
Chi-square statistic: 210.1977
P-value: 0.0000

Chi-square test for Exterior1st:
Chi-square statistic: 20440.0000
P-value: 0.0000


### Choose important variables to be trained and tested

In [93]:
data_chose = data[['MSSubClass', 'MSZoning', 'LotArea', 'LotConfig', 'BldgType',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'Exterior1st', 'BsmtFinSF2',
       'TotalBsmtSF', 'SalePrice']]


In [94]:
## Define features and target variable

x = data_chose.drop('SalePrice', axis=1)
y = data_chose['SalePrice']

### encoding categorical columns

In [95]:

# Apply pd.get_dummies() only to the existing columns
X_encoded = pd.get_dummies(x, columns=categorical_data.columns)
X_encoded

Unnamed: 0,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,MSZoning_C (all),MSZoning_FV,MSZoning_RH,...,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing
0,60,8450,5,2003,2003,0.0,856.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,20,9600,8,1976,1976,0.0,1262.0,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,60,11250,5,2001,2002,0.0,920.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,70,9550,5,1915,1970,0.0,756.0,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,60,14260,5,2000,2000,0.0,1145.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,7917,5,1999,2000,0.0,953.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1456,20,13175,6,1978,1988,163.0,1542.0,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1457,70,9042,9,1941,2006,0.0,1152.0,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1458,20,9717,6,1950,1996,1029.0,1078.0,False,False,False,...,False,False,False,True,False,False,False,False,False,False


### Perform Train Test Split

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Model Selection - KNeighborsRegressor

In [188]:
# Train and evaluate KNN Regressor on standardized data

y_pred_standardized = knn_standardized.predict(X_test_standardized)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_standardized = mean_squared_error(y_test, y_pred_standardized)
mae_standardized = mean_absolute_error(y_test, y_pred_standardized)
r2_standardized = r2_score(y_test, y_pred_standardized)

mse_standardized, mae_standardized, r2_standardized

(3016578724.410959, 33175.69452054794, 0.6067209342955436)

In [189]:
# Train and evaluate KNN Regressor on normalized data
knn_normalized = KNeighborsRegressor(n_neighbors=5)
knn_normalized.fit(X_train_normalized, y_train)
y_pred_normalized = knn_normalized.predict(X_test_normalized)
mse_normalized = mean_squared_error(y_test, y_pred_normalized)
mae_normalized = mean_absolute_error(y_test, y_pred_normalized)
r2_normalized = r2_score(y_test, y_pred_normalized)

mse_normalized, mae_normalized, r2_normalized

(3264400143.8260274, 35084.68219178082, 0.5744118234805617)

In [190]:

# Train and evaluate KNN Regressor on non-transformed data
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
mse_non_transformed = mean_squared_error(y_test, y_pred)
mae_non_transformed = mean_absolute_error(y_test, y_pred)
r2_non_transformed = r2_score(y_test, y_pred)

mse_non_transformed, mae_non_transformed, r2_non_transformed

(3662324916.580684, 40176.6897260274, 0.5225333554720174)

In [191]:

# Display results
results = {
    'Data Transformation': ['Non-Transformed', 'Standardized', 'Normalized'],
    'MSE': [mse_non_transformed, mse_standardized, mse_normalized],
    'MAE': [mae_non_transformed, mae_standardized, mae_normalized],
    'R^2': [r2_non_transformed, r2_standardized, r2_normalized]
}
results_df = pd.DataFrame(results)

# Calculate the improvement percentages compared to the non-transformed data
mse_non_transformed = results_df.loc[results_df['Data Transformation'] == 'Non-Transformed', 'MSE'].values[0]
mae_non_transformed = results_df.loc[results_df['Data Transformation'] == 'Non-Transformed', 'MAE'].values[0]
r2_non_transformed = results_df.loc[results_df['Data Transformation'] == 'Non-Transformed', 'R^2'].values[0]

# Improvements for Standardized data
mse_standardized_improvement = (mse_non_transformed - results_df.loc[results_df['Data Transformation'] == 'Standardized', 'MSE'].values[0]) / mse_non_transformed * 100
mae_standardized_improvement = (mae_non_transformed - results_df.loc[results_df['Data Transformation'] == 'Standardized', 'MAE'].values[0]) / mae_non_transformed * 100
r2_standardized_improvement = (results_df.loc[results_df['Data Transformation'] == 'Standardized', 'R^2'].values[0] - r2_non_transformed) / r2_non_transformed * 100

# Improvements for Normalized data
mse_normalized_improvement = (mse_non_transformed - results_df.loc[results_df['Data Transformation'] == 'Normalized', 'MSE'].values[0]) / mse_non_transformed * 100
mae_normalized_improvement = (mae_non_transformed - results_df.loc[results_df['Data Transformation'] == 'Normalized', 'MAE'].values[0]) / mae_non_transformed * 100
r2_normalized_improvement = (results_df.loc[results_df['Data Transformation'] == 'Normalized', 'R^2'].values[0] - r2_non_transformed) / r2_non_transformed * 100

# Display improvement percentages
improvements = {
    'Data Transformation': ['Standardized', 'Normalized'],
    'MSE Improvement (%)': [mse_standardized_improvement, mse_normalized_improvement],
    'MAE Improvement (%)': [mae_standardized_improvement, mae_normalized_improvement],
    'R^2 Improvement (%)': [r2_standardized_improvement, r2_normalized_improvement]
}
improvements_df = pd.DataFrame(improvements)

pd.DataFrame(results)

Unnamed: 0,Data Transformation,MSE,MAE,R^2
0,Non-Transformed,3662325000.0,40176.689726,0.522533
1,Standardized,3016579000.0,33175.694521,0.606721
2,Normalized,3264400000.0,35084.682192,0.574412


In [184]:
improvements_df

Unnamed: 0,Data Transformation,MSE Improvement (%),MAE Improvement (%),R^2 Improvement (%)
0,Standardized,17.632138,17.425515,16.111427
1,Normalized,10.86536,12.674035,9.92826
