Change scoring to balanced accuracy scoring, run all the models, and figure out voting classifier


## 1 Data Reading

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn import set_config

# reading
url = "https://drive.google.com/file/d/1JsZV_7u1TGiQA863-YtYxlia1VNxAJZj/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = df = pd.read_csv(path)

# X and y creation
## Removing ID as it is unique to each property, and removing MSSubClass as it is a categorization
## of property types, but there is no way to tell which one is better / worse
X = data
y = X.pop("Expensive")
Id = X.pop("Id")

# # Feature Engineering
# X.loc[:, "Cabin"] = X.Cabin.str[0]

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 2 Preprocessing

In [2]:
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline

# 0. Set the config so that we can view our preprocessor
set_config(display="diagram")

# 1. defining categorical & ordinal columns
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

# 2. numerical pipeline , default strategy we search for the best parameters later
numeric_pipe = make_pipeline(
    SimpleImputer("mean"))

# 3. categorical pipeline

# # 3.1 defining ordinal & onehot columns
# .get_indexer() get's the index to solve the problem described above about losing column names
ordinal_cols = X_cat.columns.get_indexer(["Street",
                                          "CentralAir",
                                          "ExterQual",
                                          "ExterCond",
                                          "BsmtQual",
                                          "BsmtCond",
                                          "BsmtExposure",
                                          "BsmtFinType1",
                                          "KitchenQual",
                                          "FireplaceQu",
                                          "Alley",
                                          "LotShape",
                                          "Utilities",
                                          "LandSlope",
                                          "RoofMatl",
                                          "BsmtFinType2",
                                          "Functional",
                                          "GarageType",
                                          "GarageFinish",
                                          "GarageQual",
                                          "GarageCond",
                                          "PavedDrive",
                                          "PoolQC",
                                          "Fence"])


onehot_cols = X_cat.columns.get_indexer(["MSZoning",
                                         "Condition1",
                                         "Heating",
                                         "Foundation",
                                         "LotShape",
                                         "LotCongig",
                                         "Neighborhood",
                                         "Condition2",
                                         "BldgType",
                                         "HouseStyle",
                                         "LandContour",
                                         "LotConfig",
                                         "Neighborhood",
                                         "Condition2",
                                         "BldgType",
                                         "HouseStyle",
                                         "RoofStyle",
                                         "Exterior1st",
                                         "Exterior2nd",
                                         "MasVnrType",
                                         "HeatingQc",
                                         "Electrical",
                                         "MiscFeature",
                                         "SaleType",
                                         "SaleCondition"])


# # 3.2. defining the categorical encoder

Street_cats = [ "NA", "Pave", "Grvl"]
CentralAir_cats = ["NA","N", "Y" ]
ExterQual_cats = ["NA","Po","Fa","TA","Gd","Ex"]
ExterCond_cats = ["NA","Po","Fa","TA","Gd","Ex"]
BsmtQual_cats = ["NA","Po","Fa","TA","Gd","Ex"]
BsmtCond_cats = ["NA","Po","Fa","TA","Gd","Ex"]
BsmtExposure_cats = ["NA", "No","Mn","Av","Gd"]
BsmtFinType1_cats = ["NA", "Unf","LwQ","Rec","BLQ","ALQ","GLQ"]
KitchenQual_cats = ["NA","Po","Fa","TA","Gd","Ex"]
FireplaceQu_cats = ["NA","Po","Fa","TA","Gd","Ex"]
Alley_cats = [ "NA", "Pave", "Grvl"]
LotShape_cats = ["NA", "IR3","IR2","IR1","Reg"]
Utilities_cats = ["NA", "ELO","NoSeWa","NoSewr","AllPub"]
LandSlope_cats = ["NA", "Sev","Mod","Gtl"]
RoofMatl_cats = ["NA","WdShngl","WdShake","Tar&Grv","Roll","Metal","Membran","CompShg","ClyTile"]
BsmtFinType2_cats = ["NA", "Unf","LwQ","Rec","BLQ","ALQ","GLQ"]
Functional_cats = ["NA","Sal","Sev","Maj2","Maj1","Mod","Min2","Min1","Typ"]
GarageType_cats = ["NA", "Detchd","CarPort","BuiltIn","Basment","Attchd","2Types"]
GarageFinish_cats = ["NA", "Unf","RFn","Fin"]
GarageQual_cats = ["NA","Po","Fa","TA","Gd","Ex"]
GarageCond_cats = ["NA","Po","Fa","TA","Gd","Ex"]
PavedDrive_cats = ["NA","N","P","Y"]
PoolQC_cats = ["NA","Fa","TA","Gd","Ex"]
Fence_cats = ["NA", "MnWw","GdWo","MnPrv","GdPrv"]



# # # 3.2.1. we manually establish the order of the categories for our ordinal feature (Cabin) from worst to best, including "N_A"

ordinal_cats = [Street_cats,
                CentralAir_cats,
                ExterQual_cats,
                ExterCond_cats,
                BsmtQual_cats,
                BsmtCond_cats,
                BsmtExposure_cats,
                BsmtFinType1_cats,
                KitchenQual_cats,
                FireplaceQu_cats,
                Alley_cats,
                LotShape_cats,
                Utilities_cats,
                LandSlope_cats,
                RoofMatl_cats,
                BsmtFinType2_cats,
                Functional_cats,
                GarageType_cats,
                GarageFinish_cats,
                GarageQual_cats,
                GarageCond_cats,
                PavedDrive_cats,
                PoolQC_cats,
                Fence_cats]

ordinal_encoder = OrdinalEncoder(categories=ordinal_cats)

# # # 3.2.2. defining the categorical encoder: a ColumnTransformer with 2 branches: ordinal & onehot
categorical_encoder = ColumnTransformer(
    transformers=[
        ("cat_ordinal", ordinal_encoder, ordinal_cols),
        ("cat_onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), onehot_cols),
    ]
)

# # 3.3. categorical pipeline = "N_A" imputer + categorical encoder
categorical_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 4. full preprocessing: a ColumnTransformer with 2 branches: numeric & categorical
full_preprocessing = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categorical_pipe, X_cat.columns),
    ]
)

full_preprocessing

## 3 LazyPredict


In [3]:
# pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


In [8]:
import lazypredict
from lazypredict.Supervised import LazyClassifier


# Initialize and fit LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print the performance of each model
models

 97%|█████████▋| 28/29 [00:06<00:00,  6.45it/s]

[LightGBM] [Info] Number of positive: 169, number of negative: 999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3215
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 162
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.144692 -> initscore=-1.776856
[LightGBM] [Info] Start training from score -1.776856


100%|██████████| 29/29 [00:06<00:00,  4.45it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Perceptron,0.95,0.92,0.92,0.95,0.1
XGBClassifier,0.97,0.91,0.91,0.96,0.25
BernoulliNB,0.86,0.9,0.9,0.87,0.16
LGBMClassifier,0.96,0.89,0.89,0.96,0.26
LinearSVC,0.95,0.88,0.88,0.94,0.18
ExtraTreesClassifier,0.96,0.87,0.87,0.95,0.3
AdaBoostClassifier,0.95,0.87,0.87,0.95,1.14
LogisticRegression,0.95,0.87,0.87,0.94,0.17
BaggingClassifier,0.95,0.86,0.86,0.95,0.38
LinearDiscriminantAnalysis,0.93,0.86,0.86,0.93,0.2


The top 5 models based on Accuracy are
1. XGBClassifier
2. LGBMClassifier
3. ExtraTreesClassifier
4. Perceptron
5. LogisticRegression.

## 4 Modelling


### 4.1 XGB Classifier

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


# XGBoost pipeline
xgb_full_pipeline = make_pipeline(full_preprocessing,
                                  StandardScaler(),
                                  XGBClassifier())

# Parameter grid for XGBoost
xgb_param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "xgbclassifier__n_estimators": [100, 150, 200],
    "xgbclassifier__learning_rate": [0.01, 0.1, 0.2],
    "xgbclassifier__max_depth": [3, 5, 7],
    "xgbclassifier__min_child_weight": [1, 3, 5]
}

# GridSearchCV for XGBoost
xgb_search = GridSearchCV(xgb_full_pipeline,
                          xgb_param_grid,
                          cv=5,
                          verbose=1)


In [13]:
xgb_search.fit(X_train, y_train)

print(f"The best average score in cross validation was {xgb_search.best_score_}")
print(f"The best parameters are {xgb_search.best_params_}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
The best average score in cross validation was 0.9503539855471186
The best parameters are {'columntransformer__num_pipe__simpleimputer__strategy': 'median', 'xgbclassifier__learning_rate': 0.1, 'xgbclassifier__max_depth': 5, 'xgbclassifier__min_child_weight': 1, 'xgbclassifier__n_estimators': 100}


In [14]:
accuracy_score(xgb_search.predict(X_test), y_test)

0.9623287671232876

### 4.2 LGBMClassifier

In [15]:
# pip install lightgbm



In [72]:
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV


#  pipeline
lgb_full_pipeline = make_pipeline(full_preprocessing,
                                  StandardScaler(),
                                  LGBMClassifier())

# Parameter grid
lgb_param_grid = {
    'columntransformer__num_pipe__simpleimputer__strategy': ["mean", "median"],
    'lgbmclassifier__learning_rate': [0.01, 0.05, 0.1],
    'lgbmclassifier__n_estimators': [50, 100, 200],
    'lgbmclassifier__max_depth': [3, 5, 7],
    'lgbmclassifier__min_child_samples': [10, 20, 30],
    'lgbmclassifier__subsample': [0.8, 0.9, 1.0],
    'lgbmclassifier__colsample_bytree': [0.8, 0.9, 1.0],
    'lgbmclassifier__reg_alpha': [0.0, 0.1, 0.5],
    'lgbmclassifier__reg_lambda': [0.0, 0.1, 0.5],
    'lgbmclassifier__importance_type': ['split', 'gain'],
    'lgbmclassifier__bagging_fraction': [0.8, 0.9, 1.0],
    'lgbmclassifier__bagging_freq': [0, 5, 10],
    'lgbmclassifier__boosting_type': ['gbdt', 'dart']
}

# RandomizedSearchCV - GridSearchCV would be over 2 million fits
lgb_search = RandomizedSearchCV(lgb_full_pipeline,
                          lgb_param_grid,
                          cv=5,
                          verbose=1)

In [None]:
lgb_search.fit(X_train, y_train)

print(f"The best average score in cross validation was {lgb_search.best_score_}")

In [74]:
accuracy_score(lgb_search.predict(X_test), y_test)



0.9657534246575342

### 4.3 ExtraTreesClassifier


In [60]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import randint


# Create the ExtraTreesClassifier pipeline
ext_full_pipeline = make_pipeline(
    full_preprocessing,
    StandardScaler(),
    ExtraTreesClassifier(random_state=42)  # Set any other parameters you need
)

# Parameter grid
ext_param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "extratreesclassifier__n_estimators": randint(50, 100).rvs(10),
    "extratreesclassifier__max_depth": [None] + list(randint(5, 30).rvs(10))
}

# GridSearchCV
ext_search = GridSearchCV(
    ext_full_pipeline,
    ext_param_grid,
    cv=5,
    verbose=1
)

# Fit the grid search
ext_search.fit(X_train, y_train)


Fitting 5 folds for each of 220 candidates, totalling 1100 fits


In [61]:
print(f"The best average score in cross validation was {ext_search.best_score_}")

The best average score in cross validation was 0.9357873885770882


In [62]:
accuracy_score(ext_search.predict(X_test), y_test)

0.9452054794520548

### 4.4 Perceptron

In [34]:
from sklearn.linear_model import Perceptron
from scipy.stats import uniform
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Pipeline
per_full_pipeline = make_pipeline(full_preprocessing, Perceptron())

# define parameter grid
per_param_grid = {
    "perceptron__alpha": uniform(0.0001, 0.1).rvs(10),
    "perceptron__max_iter": [10, 50, 100],
    "perceptron__tol": uniform(0.001, 0.01).rvs(10),
    "perceptron__eta0": uniform(0.01, 1.0).rvs(10),
}

# define GridSearchCV
per_search = GridSearchCV(per_full_pipeline,
                          per_param_grid,
                          cv=5,
                          verbose=1)



In [35]:
per_search.fit(X_train, y_train)

print(f"The best average score in cross-validation was {per_search.best_score_}")
print(f"The best parameters are {per_search.best_params_}")


Fitting 5 folds for each of 3000 candidates, totalling 15000 fits
The best average score in cross-validation was 0.8878617805656432
The best parameters are {'perceptron__alpha': 0.048253501600573095, 'perceptron__eta0': 0.7403442286745863, 'perceptron__max_iter': 10, 'perceptron__tol': 0.010883503986712461}


In [36]:
accuracy_score(per_search.predict(X_test), y_test)

0.9075342465753424

### 4.5 LogisticRegression

In [37]:
from sklearn.linear_model import LogisticRegression

#  pipeline
log_full_pipeline = make_pipeline(full_preprocessing,
                                  StandardScaler(),
                                  LogisticRegression())

# Parameter grid
log_param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "logisticregression__C": uniform(0.1, 1.0).rvs(10),
    "logisticregression__penalty": ['l1', 'l2'],
    "logisticregression__solver": ['liblinear', 'saga'],
    "logisticregression__max_iter": [100, 200, 500, 1000],
}

# GridSearchCV
log_search = GridSearchCV(log_full_pipeline,
                          log_param_grid,
                          cv=5,
                          verbose=1)


In [38]:
log_search.fit(X_train, y_train)

print(f"The best average score in cross validation was {log_search.best_score_}")

Fitting 5 folds for each of 320 candidates, totalling 1600 fits
The best average score in cross validation was 0.9494956164484061


In [39]:
accuracy_score(log_search.predict(X_test), y_test)

0.934931506849315

### 4.6 Best Parameters

In [75]:
parameters = {"xgb" : xgb_search.best_params_,
              "lgbm": lgb_search.best_params_,
              "ext": ext_search.best_params_,
              "per": per_search.best_params_,
              "log": log_search.best_params_}


In [76]:
parameters

{'xgb': {'columntransformer__num_pipe__simpleimputer__strategy': 'median',
  'xgbclassifier__learning_rate': 0.1,
  'xgbclassifier__max_depth': 5,
  'xgbclassifier__min_child_weight': 1,
  'xgbclassifier__n_estimators': 100},
 'lgbm': {'lgbmclassifier__subsample': 0.8,
  'lgbmclassifier__reg_lambda': 0.1,
  'lgbmclassifier__reg_alpha': 0.1,
  'lgbmclassifier__n_estimators': 50,
  'lgbmclassifier__min_child_samples': 20,
  'lgbmclassifier__max_depth': 5,
  'lgbmclassifier__learning_rate': 0.1,
  'lgbmclassifier__importance_type': 'split',
  'lgbmclassifier__colsample_bytree': 0.8,
  'lgbmclassifier__boosting_type': 'gbdt',
  'lgbmclassifier__bagging_freq': 10,
  'lgbmclassifier__bagging_fraction': 0.8,
  'columntransformer__num_pipe__simpleimputer__strategy': 'median'},
 'ext': {'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
  'extratreesclassifier__max_depth': 12,
  'extratreesclassifier__n_estimators': 54},
 'per': {'perceptron__alpha': 0.048253501600573095,
  'percep

In [77]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression

# Initialize your models with the best parameters
xgb_model = XGBClassifier(learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100)
lgbm_model = LGBMClassifier(subsample = 0.8,reg_lambda = 0.1,reg_alpha= 0.1,n_estimators= 50,min_child_samples= 20, max_depth= 5, learning_rate= 0.1, importance_type = 'split', colsample_bytree= 0.8,boosting_type= 'gbdt',bagging_freq= 10, bagging_fraction= 0.8,)
extra_model = ExtraTreesClassifier(n_estimators=54, max_depth=12, random_state=42)
per_model = Perceptron(alpha=0.0482535, eta0 = 0.7403442286745863, max_iter = 10, tol = 0.010883503986712461 )
log_model = LogisticRegression(C = 0.7009413280452148, max_iter = 100, penalty = "l1", solver ="liblinear" )

# Create a dictionary of your models
models = {
    'xgb': xgb_model,
    'lgbm': lgbm_model,
    'ext': extra_model,
    'per': per_model,
    'log': log_model
}

voting_classifier = make_pipeline(
    full_preprocessing,
    VotingClassifier(
        estimators=[(name, model) for name, model in models.items()],
        voting='hard'
    )
)

# Fit the ensemble model on your training data
voting_classifier.fit(X_train, y_train)

# Predict using the ensemble model
voting_predictions = voting_classifier.predict(X_test)

# Calculate accuracy
final_accuracy = accuracy_score(y_test, voting_predictions)


[LightGBM] [Info] Number of positive: 169, number of negative: 999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3240
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 171
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.144692 -> initscore=-1.776856
[LightGBM] [Info] Start training from score -1.776856


In [78]:
final_accuracy

0.952054794520548