# Housing: Model testing for competition

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    cohen_kappa_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# The training set

## Data reading & splitting

In [19]:
# Load the labelled competition training data.
housing = pd.read_csv('housing_classification_competition_train_set.csv')

# TODO: decide whether duplicated rows should be removed
# (housing.drop_duplicates()) and whether rows with a missing 'Expensive'
# label must be dropped first (housing.dropna(subset=['Expensive'])).
# Neither step is applied at the moment.

# Build X as a new frame instead of aliasing `housing`, so the raw data keeps
# its target column and is not mutated when the features are prepared.
X = housing.drop(columns="Expensive")
y = housing["Expensive"]

In [20]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [21]:
# Columns excluded from modelling, each listed exactly once.
# NOTE(review): presumably dropped because they are mostly missing — confirm.
COLUMNS_TO_DROP = [
    'FireplaceQu',
    'GarageYrBlt',
    'Alley',
    'MasVnrType',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'LotFrontage',
]
X = X.drop(columns=COLUMNS_TO_DROP)

In [22]:
# Use the house Id as the row index so it is not treated as a feature.
# The guard makes re-running this cell a no-op instead of a KeyError, and
# rebinding X (rather than inplace=True) keeps the step explicit.
if "Id" in X.columns:
    X = X.set_index("Id")

In [23]:
# Show the feature frame after the column drop and re-indexing on Id.
X

Unnamed: 0_level_0,LotArea,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,...,HeatingQC,Electrical,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,8450,856,3,0,0,2,0,0,RL,Norm,...,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,9600,1262,3,1,0,2,298,0,RL,Feedr,...,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,11250,920,3,1,0,2,0,0,RL,Norm,...,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
4,9550,756,3,1,0,3,0,0,RL,Norm,...,Gd,SBrkr,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
5,14260,1145,4,1,0,3,192,0,RL,Norm,...,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,7917,953,3,1,0,2,0,0,RL,Norm,...,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1457,13175,1542,3,2,0,2,349,0,RL,Norm,...,TA,SBrkr,Min1,Attchd,Unf,TA,TA,Y,WD,Normal
1458,9042,1152,4,2,0,1,0,0,RL,Norm,...,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1459,9717,1078,2,0,0,1,366,0,RL,Norm,...,Gd,FuseA,Typ,Attchd,Unf,TA,TA,Y,WD,Normal


In [24]:
# Preview the first five target labels.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Expensive, dtype: int64

In [25]:
# Hold out 20% of the labelled rows for a final check; the fixed seed keeps
# the split reproducible across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# 'Expensive' classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Setting up preprocessor

In [9]:
# Split the feature names by dtype: numeric columns vs. everything else.
num_columns = X.select_dtypes(include="number").columns
cat_columns = X.select_dtypes(exclude="number").columns

# Numeric branch: fill gaps with the column mean, then standardise.
num_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

# Categorical branch: replace missing values with the literal "N_A" category
# (emitted as a DataFrame so column names survive), then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_imputer = SimpleImputer(strategy="constant", fill_value="N_A")
categorical_imputer.set_output(transform="pandas")
cat_pipe = make_pipeline(
    categorical_imputer,
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Route each group of columns through its own branch.
preprocessor = make_column_transformer(
    (num_pipe, num_columns),
    (cat_pipe, cat_columns),
)

## XGBoost

In [10]:
# Pipeline: preprocessor -> XGBoost.
# The preprocessor already standardises the numeric columns, and gradient
# boosted trees do not depend on feature scale, so no further StandardScaler
# is placed in front of the classifier (it would only rescale the one-hot
# indicator columns).
pipe = make_pipeline(preprocessor, XGBClassifier())

# Single-candidate grid: GridSearchCV is used here for its cross-validated
# score and its automatic refit on the whole training split. Add more values
# to the lists to turn this into an actual search.
grid = {
    "xgbclassifier__learning_rate": [0.05],
    "xgbclassifier__n_estimators": [300],
    "xgbclassifier__max_depth": [5],
    "xgbclassifier__min_child_weight": [1],
    "xgbclassifier__subsample": [1.0],
    "xgbclassifier__colsample_bytree": [1.0]
}

# Cross-validation.
# NOTE(review): cv=50 leaves very few rows per validation fold and is slow;
# 5 or 10 folds is the usual choice — confirm the 50 is intentional.
xgb_model = GridSearchCV(pipe, grid, cv=50, scoring='accuracy', verbose=1)
xgb_model.fit(X_train, y_train)

# Mean cross-validated accuracy of the best (here: the only) candidate.
score = round(xgb_model.best_score_, 4)
score

Fitting 50 folds for each of 1 candidates, totalling 50 fits


0.9479

In [11]:
# Predict both splits with the refitted best pipeline.
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Accuracy per split, rounded to three decimals.
train_accuracy = round(accuracy_score(y_train, y_train_pred), 3)
test_accuracy = round(accuracy_score(y_test, y_test_pred), 3)

# Collect both numbers in one dictionary for display.
results = dict(
    zip(
        ("Training Accuracy", "Testing Accuracy"),
        (train_accuracy, test_accuracy),
    )
)

results

{'Training Accuracy': 1.0, 'Testing Accuracy': 0.973}

In [12]:
# Hyper-parameters of the best candidate found by the grid search.
xgb_model.best_params_

{'xgbclassifier__colsample_bytree': 1.0,
 'xgbclassifier__learning_rate': 0.05,
 'xgbclassifier__max_depth': 5,
 'xgbclassifier__min_child_weight': 1,
 'xgbclassifier__n_estimators': 300,
 'xgbclassifier__subsample': 1.0}

In [13]:
# Work on the pipeline that GridSearchCV refitted on the full training split.
best_pipeline = xgb_model.best_estimator_

# Importance of every column fed to the classifier, in input order.
feature_importances = best_pipeline.named_steps['xgbclassifier'].feature_importances_

# Numeric columns are imputed and scaled, so their names are taken unchanged.
num_features = num_columns

# Categorical columns are expanded by the one-hot encoder; ask the fitted
# encoder for the expanded names. Index 1 is the second transformer registered
# in the column transformer (the categorical pipeline in this notebook).
cat_transformer = best_pipeline.named_steps['columntransformer'].transformers_[1][1]
cat_features = cat_transformer.named_steps['onehotencoder'].get_feature_names_out(cat_columns)

# Names in the same order the column transformer emits them: numeric first,
# then the one-hot columns.
all_features = np.concatenate([num_features, cat_features])

# zip() would silently truncate or misalign if the two sequences differed in
# length (e.g. if a preprocessing step dropped a column), so fail loudly.
if len(all_features) != len(feature_importances):
    raise ValueError(
        f"Feature name count ({len(all_features)}) does not match the number "
        f"of importances ({len(feature_importances)})."
    )

# Pair feature names with their importances.
feature_importance_pairs = list(zip(all_features, feature_importances))

# Names of the features the model actually used (importance greater than 0).
significant_features = [
    feature
    for feature, importance in feature_importance_pairs
    if importance > 0
]

# The test set

In [14]:
# Load the unlabelled competition test set and display it.
test = pd.read_csv(
    filepath_or_buffer='housing_classification_competition_test_set.csv'
)
test

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
1455,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [15]:
# Index the test rows by house Id. The guard makes re-running this cell a
# no-op instead of a KeyError, and rebinding `test` (rather than
# inplace=True) keeps the step explicit.
if "Id" in test.columns:
    test = test.set_index("Id")
test

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1462,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
1463,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
1464,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
1465,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
2916,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
2917,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
2918,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [16]:
# Predict on the test frame without writing the result back into it, so the
# model input stays the same no matter how often this cell is re-run.
# NOTE(review): relies on the fitted ColumnTransformer selecting its columns
# by name, so test columns that were dropped from X are ignored — confirm.
test_predictions = xgb_model.predict(test)

# Assemble the submission as its own two-column frame: Id and predicted label.
submission = test.reset_index()[['Id']].assign(Expensive=test_predictions)
submission.to_csv('housing_submission2.csv', index=False)