$\textbf{HOUSE SALE PRICES}$

This notebook shows a submission to the [Housing Prices competition on Kaggle](https://www.kaggle.com/competitions/home-data-for-ml-course/overview), which aims to predict the final price of homes based on various features such as overall quality, year built, total area, and neighborhood.

The entire workflow — from data loading and preprocessing to model training, tuning, and evaluation — is performed in this single Jupyter notebook.

In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt


Importing the training and testing data

In [52]:
# Read the competition files that sit next to the notebook.
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

# Show a header line followed by the first rows of each frame.
for banner, frame in (
    ("TRAINING DATA:\n ", train_data),
    ("\n\nTESTING DATA: \n", test_data),
):
    print(banner)
    print(frame.head())

TRAINING DATA:
 
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   

$\textbf{Data Preprocessing:}$

In [9]:
# FOR TRAINING DATA
# Keep only the rows that actually have a target value.
train_labeled = train_data.dropna(axis=0, subset=['SalePrice'])

# Create the target series, then build the feature frame without the target
# and the row identifier. drop() returns a new frame, so no intermediate
# result is mutated in place and train_data itself stays untouched.
y_train_full = train_labeled['SalePrice']
X_train_full = train_labeled.drop(columns=['SalePrice', 'Id'])

# Split the full data into training (80%) and validation (20%) data;
# random_state is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, random_state=0, train_size=0.8
)

# SIMILARLY FOR TESTING DATA
# The test file has no target column, so only the identifier is removed.
X_test = test_data.drop(columns=['Id'])



Checking the cardinality of object columns:

In [10]:
# Columns of the training split stored with the 'object' dtype are treated
# as categorical.
categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

# Number of distinct values per categorical column, in column order.
cardinality_by_column = {col: X_train[col].nunique() for col in categorical_columns}

# Last expression of the cell: displayed as a list of (column, cardinality) pairs.
list(cardinality_by_column.items())

[('MSZoning', 5),
 ('Street', 2),
 ('Alley', 2),
 ('LotShape', 4),
 ('LandContour', 4),
 ('Utilities', 2),
 ('LotConfig', 5),
 ('LandSlope', 3),
 ('Neighborhood', 25),
 ('Condition1', 9),
 ('Condition2', 6),
 ('BldgType', 5),
 ('HouseStyle', 8),
 ('RoofStyle', 6),
 ('RoofMatl', 7),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('MasVnrType', 4),
 ('ExterQual', 4),
 ('ExterCond', 5),
 ('Foundation', 6),
 ('BsmtQual', 4),
 ('BsmtCond', 4),
 ('BsmtExposure', 4),
 ('BsmtFinType1', 6),
 ('BsmtFinType2', 6),
 ('Heating', 6),
 ('HeatingQC', 5),
 ('CentralAir', 2),
 ('Electrical', 5),
 ('KitchenQual', 4),
 ('Functional', 6),
 ('FireplaceQu', 5),
 ('GarageType', 6),
 ('GarageFinish', 3),
 ('GarageQual', 5),
 ('GarageCond', 5),
 ('PavedDrive', 3),
 ('PoolQC', 3),
 ('Fence', 4),
 ('MiscFeature', 3),
 ('SaleType', 9),
 ('SaleCondition', 6)]

Here, I choose to discard all categorical columns whose cardinality is greater than 10.

In [11]:
# Separating the low- (nunique<=10) and high-cardinality categorical columns.
# Both lists are built with comprehensions so they keep the column order of
# `categorical_columns` (a set difference would give an arbitrary order).
low_cardinality_cols = [col for col in categorical_columns if X_train[col].nunique() <= 10]
high_cardinality_cols = [col for col in categorical_columns if col not in low_cardinality_cols]

# Selecting the numeric columns. The dtypes are read from X_train itself,
# the same frame the categorical check uses, and any numeric dtype qualifies
# rather than only int64/float64.
numeric_cols = X_train.select_dtypes(include='number').columns.tolist()

# Features kept for modelling: every numeric column plus the low-cardinality
# categorical ones. The same selection is applied to all three splits.
data_cols = numeric_cols + low_cardinality_cols
X_train_reduced = X_train[data_cols].copy()
X_val_reduced = X_val[data_cols].copy()
X_test_reduced = X_test[data_cols].copy()

In [12]:
# Numeric features: missing entries are filled with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical features: missing entries are filled with the most frequent
# category, then the column is one-hot encoded.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numerical_transformer, numeric_cols),
    ('cat', categorical_transformer, low_cardinality_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


$\textbf{Building Models:}$

In [32]:
# Candidate regressors, keyed by the name used in the printed reports.
# Every model gets the same fixed seed.
models = {
    'RandomForest': RandomForestRegressor(random_state=0),
    'XGBoost': XGBRegressor(random_state=0),
    'DecisionTree': DecisionTreeRegressor(random_state=0),
}

# Hyper-parameter grids, one per model. The 'model__' prefix addresses the
# pipeline step named 'model'.
random_forest_grid = {'model__n_estimators': [50, 100, 200, 300, 400]}
xgboost_grid = {
    'model__n_estimators': [50, 100, 200, 300, 400],
    'model__learning_rate': [0.03, 0.05, 0.08, 0.1],
}
decision_tree_grid = {'model__max_leaf_nodes': [10, 25, 50, 100, 200]}

# Keys must match the keys of `models`.
grid_params = {
    'RandomForest': random_forest_grid,
    'XGBoost': xgboost_grid,
    'DecisionTree': decision_tree_grid,
}

In [None]:
# Running record of the winner of the model comparison:
#   best_model           - name of the model with the lowest cross-validated MAE
#   best_mae             - that model's cross-validated MAE
#   best_model_estimator - {best_model: fitted pipeline with the best grid params}
best_model_estimator = {}
best_model = None
best_mae = None

# Fitting the models
for name, model in models.items():
    print("Grid search running for {}".format(name))

    # A pipeline to run the models through: preprocessing followed by the regressor
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Searching for the best fit among the grid_params values (5-fold CV)
    my_grid = GridSearchCV(my_pipeline, param_grid=grid_params[name],
                           cv=5, n_jobs=None, scoring='neg_mean_absolute_error')
    my_grid.fit(X_train_reduced, y_train)

    # The scorer reports the *negated* MAE, so flip the sign once and reuse it
    cv_mae = -my_grid.best_score_

    print("Best params for {} is: {}".format(name, my_grid.best_params_))
    print("Best MAE for {} is: {}".format(name, cv_mae))

    # Keep only the single best estimator seen so far; the first model always
    # becomes the initial best, later ones must have a strictly lower MAE.
    if best_model is None or cv_mae < best_mae:
        best_mae = cv_mae
        best_model = name
        best_model_estimator = {name: my_grid.best_estimator_}
    

Grid search running for RandomForest
Best params for RandomForest is: {'model__n_estimators': 300}
Best MAE for RandomForest is: 17787.56865868212
Grid search running for XGBoost
Best params for XGBoost is: {'model__learning_rate': 0.08, 'model__n_estimators': 300}
Best MAE for XGBoost is: 16856.659375
Grid search running for DecisionTree
Best params for DecisionTree is: {'model__max_leaf_nodes': 200}
Best MAE for DecisionTree is: 26049.260363094265


In [None]:
# Testing the best model on the validation dataset, which was held out
# from the grid search.
final_model = best_model_estimator[best_model]
pred_y_val = final_model.predict(X_val_reduced)

# mean_absolute_error expects (y_true, y_pred); MAE is symmetric, so the
# value is the same either way, but the call now matches the documented order.
mae_final = mean_absolute_error(y_val, pred_y_val)
print("The MAE on the validaton set is : {}".format(mae_final))

The MAE on the validaton set is : 17317.220703125


$\textbf{Final predictions on test dataset:}$

In [47]:
# Predict sale prices for the competition test set.
y_pred_final = final_model.predict(X_test_reduced)

# Exporting the data to a csv: one row per test house, identified by its Id.
export_data = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': y_pred_final,
})
export_data.to_csv('./sale_price_prediction.csv', index=False)


In [None]:
# Rebuild the submission frame (Id + predicted SalePrice) and write it to a
# second file.
export_data = test_data[['Id']].assign(SalePrice=y_pred_final)
export_data.to_csv('./sale_price_prediction_2.csv', index=False)