In [140]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# models
import xgboost as xgb  # pip install xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (
    BaggingClassifier,
    GradientBoostingClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.ensemble import (
    GradientBoostingRegressor,
    BaggingRegressor,
    StackingRegressor,
    VotingRegressor,
)

from sklearn.model_selection import cross_val_score, learning_curve, validation_curve

In [125]:
# load data
columns = [
    "Overall Qual",
    "Overall Cond",
    "Gr Liv Area",
    "Central Air",
    "Total Bsmt SF",
    "SalePrice",
]

df = pd.read_csv("AmesHousing.txt", sep="\t", usecols=columns)

In [126]:
# Clean Data
df["Central_Air_Binary"] = df["Central Air"].map({"N": 0, "Y": 1})
df = df.dropna(axis=0)
y = df["SalePrice"]
X = df.drop(["Central Air", "SalePrice"], axis=1)

In [127]:
# train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [128]:
# init models
xgb_model = xgb.XGBRegressor()
# xgb_model =xgb.XGBRegressor(base_score=None, booster=None, callbacks=None,
#                               colsample_bylevel=None, colsample_bynode=None,
#                               colsample_bytree=1.0, device=None,
#                               early_stopping_rounds=None,
#                               enable_categorical=False, eval_metric=None,
#                               feature_types=None, gamma=0.5, grow_policy=None,
#                               importance_type=None,
#                               interaction_constraints=None, learning_rate=0.01,
#                               max_bin=None, max_cat_threshold=None,
#                               max_cat_to_onehot=None, max_delta_step=None,
#                               max_depth=5, max_leaves=None,
#                               min_child_weight=None, monotone_constraints=None, multi_strategy=None,
#                               n_estimators=300, n_jobs=None,
#                               num_parallel_tree=None)
rfr_model = RandomForestRegressor(
    max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=300
)
svr_model = SVR(C=10, epsilon=0.2)
lass_model = Lasso(alpha=10)
eln_model = ElasticNet(alpha=0.01, l1_ratio=0.95)
lr_model = LinearRegression()
# scaler
scaler = MinMaxScaler()

# Create the pipeline
pipeline_xgb = Pipeline([("scaler", scaler), ("xgb", xgb_model)])

pipeline_rfr = Pipeline([("scaler", scaler), ("rfr", rfr_model)])


pipeline_svr = Pipeline([("scaler", scaler), ("svr", svr_model)])

pipeline_lass = Pipeline([("scaler", scaler), ("lass", lass_model)])

pipeline_eln = Pipeline([("scaler", scaler), ("eln", eln_model)])

pipeline_lr = Pipeline([("scaler", scaler), ("lr", lr_model)])

In [129]:
# fit models and display accuracy
pipeline_xgb.fit(X_train, y_train)
pipeline_rfr.fit(X_train, y_train)
pipeline_svr.fit(X_train, y_train)
pipeline_lass.fit(X_train, y_train)
pipeline_eln.fit(X_train, y_train)
pipeline_lr.fit(X_train, y_train)

ypred_xgb = pipeline_xgb.predict(X_test)
ypred_rfr = pipeline_rfr.predict(X_test)
ypred_svr = pipeline_svr.predict(X_test)
ypred_lass = pipeline_lass.predict(X_test)
ypred_eln = pipeline_eln.predict(X_test)
ypred_lr = pipeline_lr.predict(X_test)

r2_xgb = r2_score(y_test, ypred_xgb)
r2_rfr = r2_score(y_test, ypred_rfr)
r2_svr = r2_score(y_test, ypred_svr)
r2_lass = r2_score(y_test, ypred_lass)
r2_eln = r2_score(y_test, ypred_eln)
r2_lr = r2_score(y_test, ypred_lr)

print(f"r2_xgb: {r2_xgb}")
print(f"r2_rfr: {r2_rfr}")
print(f"r2_svr: {r2_svr}")
print(f"r2_lass: {r2_lass}")
print(f"r2_eln: {r2_eln}")
print(f"r2_lr: {r2_lr}")

r2_xgb: 0.8762718300369723
r2_rfr: 0.8361987683778861
r2_svr: -0.019077327120919696
r2_lass: 0.6712619895673891
r2_eln: 0.6730473184293602
r2_lr: 0.6708452385884899


r2_xgb: 0.8762718300369723

r2_rfr: 0.8637605575807634

r2_svr: -0.046523512399574196

r2_lass: 0.6708874959350789

r2_eln: 0.11013549601961803

r2_lr: 0.6708452385884899


In [130]:
# parameter grids for models
# XGBoost Regressor
xgb_params = {
    "xgb__n_estimators": [100, 200, 300],  # Number of boosting rounds
    "xgb__max_depth": [3, 5, 7],  # Maximum depth of a tree
    "xgb__learning_rate": [
        0.1,
        0.01,
        0.001,
    ],  # Step size shrinkage used in update to prevents overfitting
    "xgb__subsample": [0.8, 0.9, 1.0],  # Subsample ratio of the training instance
    "xgb__colsample_bytree": [
        0.8,
        0.9,
        1.0,
    ],  # Subsample ratio of columns when constructing each tree
    "xgb__gamma": [
        0,
        0.25,
        0.5,
    ],  # Minimum loss reduction required to make a further partition on a leaf node of the tree
}

# Random Forest Regressor
rfr_params = {
    "rfr__n_estimators": [100, 200, 300],  # Number of trees in the forest
    "rfr__max_depth": [None, 10, 20],  # Maximum depth of the tree
    "rfr__min_samples_split": [
        2,
        5,
        10,
    ],  # Minimum number of samples required to split an internal node
    "rfr__min_samples_leaf": [
        1,
        2,
        4,
    ],  # Minimum number of samples required to be at a leaf node
}

# Support Vector Regressor
svr_params = {
    "svr__kernel": ["linear", "rbf"],  # Kernel type
    "svr__C": [
        0.1,
        1,
        10,
    ],  # Regularization parameter. The strength of the regularization is inversely proportional to C
    "svr__gamma": [
        "scale",
        "auto",
    ],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
}

# Lasso Regressor
lasso_params = {
    "lass__alpha": [0.01, 0.1, 1, 10],  # Constant that multiplies the L1 term.
}

# Elastic Net Regressor
elasticnet_params = {
    "eln__alpha": [0.01, 0.1, 1, 10],  # Constant that multiplies the penalty terms
    "eln__l1_ratio": [
        0.1,
        0.5,
        0.7,
        0.9,
        0.95,
        0.99,
        1,
    ],  # The ElasticNet mixing parameter
}

In [131]:
# RandomSearchCV
random_search_xgb = RandomizedSearchCV(
    pipeline_xgb, param_distributions=xgb_params, cv=5, scoring="r2", n_jobs=3
)
random_search_rfr = RandomizedSearchCV(
    pipeline_rfr, param_distributions=rfr_params, cv=5, scoring="r2", n_jobs=3
)
random_search_svr = RandomizedSearchCV(
    pipeline_svr, param_distributions=svr_params, cv=5, scoring="r2", n_jobs=3
)
random_search_lass = RandomizedSearchCV(
    pipeline_lass, param_distributions=lasso_params, cv=5, scoring="r2", n_jobs=3
)
random_search_eln = RandomizedSearchCV(
    pipeline_eln, param_distributions=elasticnet_params, cv=5, scoring="r2", n_jobs=3
)

In [132]:
random_search_xgb.fit(X_train, y_train)
random_search_rfr.fit(X_train, y_train)
random_search_svr.fit(X_train, y_train)
random_search_lass.fit(X_train, y_train)
random_search_eln.fit(X_train, y_train)

best_xgb = random_search_xgb.best_estimator_
best_rfr = random_search_rfr.best_estimator_
best_svr = random_search_svr.best_estimator_
best_lass = random_search_lass.best_estimator_
best_eln = random_search_eln.best_estimator_

print(f"best_xgb: {best_xgb}")
print(f"best_rfr: {best_rfr}")
print(f"best_svr: {best_svr}")
print(f"best_lass: {best_lass}")
print(f"best_eln: {best_eln}")

best_xgb: Pipeline(steps=[('scaler', MinMaxScaler()),
                ('xgb',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=1.0, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=0, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.1,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=3, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, multi_strategy=None,
                



best_xgb: Pipeline(steps=[('scaler', MinMaxScaler()),
                ('xgb',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=1.0, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=0.5, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=0.01,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=5, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, multi_strategy=None,
                              n_estimators=300, n_jobs=None,
                              num_parallel_tree=None, random_state=None, ...))])


best_rfr: Pipeline(steps=[('scaler', MinMaxScaler()),
                ('rfr',
                 RandomForestRegressor(max_depth=10, min_samples_leaf=4,
                                       min_samples_split=10,
                                       n_estimators=300))])


best_svr: Pipeline(steps=[('scaler', MinMaxScaler()), ('svr', SVR(C=10, epsilon=0.2))])


best_lass: Pipeline(steps=[('scaler', MinMaxScaler()), ('lass', Lasso(alpha=10))])


best_eln: Pipeline(steps=[('scaler', MinMaxScaler()),
                ('eln', ElasticNet(alpha=0.01, l1_ratio=0.95))])

In [133]:
# Bagging
bagging_model = BaggingRegressor(estimator=xgb_model, n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred = bagging_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"xgb_model: {r2}")

xgb_model: 0.8884924653504968


In [134]:
# Boosting
booster = GradientBoostingRegressor().fit(X_train, y_train)
y_pred = booster.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"booster_model: {r2}")

booster_model: 0.8933634174946422


In [135]:
# Stacking
level1_models = [("svr", svr_model), ("lass", lass_model), ("rfr", rfr_model)]
# Define the final estimator (meta-learner) for the second level
final_estimator = xgb_model  # for example. Can use anything else - maybe try some hyperparameter tuning first?

stacking_model = StackingRegressor(
    estimators=level1_models, final_estimator=final_estimator, cv=5
)
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"Stacking Model R2: {r2:.2f}")

Stacking Model Accuracy: 0.86


In [142]:
# voting
voting_model = VotingRegressor(estimators=level1_models)
voting_model.fit(X_train, y_train)
y_pred = voting_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"Majority Voting Model R2: {r2:.2f}")

Majority Voting Model Accuracy: 0.69


In [137]:
# learning curve

In [138]:
# Validation curve