In [1]:
import pandas as pd
import joblib

In [2]:
df = joblib.load('my_dataframe.joblib')

In [3]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

In [4]:
# Encoding categorical columns 
#categorical_columns=df.select_dtypes(include=['object']).columns.tolist()
#X=pd.get_dummies(X, columns=categorical_columns)

In [5]:
from sklearn.preprocessing import LabelEncoder
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

label_encoders = {}

# Apply Label Encoding to each categorical column
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
gb_model = GradientBoostingRegressor(
    n_estimators=100,    # Number of trees (weak learners)
    learning_rate=0.1,   # Learning rate
    max_depth=3,         # Maximum depth of the individual trees
    random_state=42
)

In [30]:
# Train the model
gb_model.fit(X_train, y_train)

In [31]:
# Make predictions on the test set
y_pred = gb_model.predict(X_test)

In [32]:
# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse:.2f}")

RMSE: 20497.32




In [19]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8992961555352058

In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
from scipy.stats import uniform, randint
# Define the parameter distributions
param_dist = {
    'n_estimators': randint(100, 500),  # Randomly sample from 100 to 500 trees
    'learning_rate': uniform(0.01, 0.3),  # Randomly sample learning rates between 0.01 and 0.3
    'max_depth': randint(3, 10),  # Randomly sample max_depth between 3 and 10
    'min_samples_split': randint(2, 10),  # Randomly sample between 2 and 10
    'min_samples_leaf': randint(1, 5),  # Randomly sample between 1 and 5
    'subsample': uniform(0.7, 0.3)  # Randomly sample subsample values between 0.7 and 1.0
}

In [15]:
# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of random combinations to try
    scoring='neg_mean_squared_error',  # Use MSE for regression
    cv=5,  # 5-fold cross-validation
    verbose=2,
    n_jobs=-1,  # Use all CPU cores
    random_state=42
)

In [16]:
# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002624127F910>,
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002624127FD00>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_fro...bject at 0x00000262437F9280>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000262437F9040>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026242818DF0>,
                                        'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000262437F94C0>},
                   random_state=42, scoring='neg_mean_squared_error',
               

In [17]:
# Get the best parameters from the random search
best_params_random = random_search.best_params_
print(f"Best parameters from RandomizedSearchCV: {best_params_random}")

Best parameters from RandomizedSearchCV: {'learning_rate': 0.17837302775431035, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 230, 'subsample': 0.9282355145850691}


In [18]:
# Predict using the best model found by RandomizedSearchCV
best_model_random = random_search.best_estimator_
y_pred_random = best_model_random.predict(X_test)

# Evaluate the performance of the RandomizedSearchCV model
rmse_random = mean_squared_error(y_test, y_pred_random,squared=False)
print(f'Mean Squared Error (RandomizedSearchCV): {rmse_random}')

Mean Squared Error (RandomizedSearchCV): 26721.018977240106


In [19]:
from sklearn.model_selection import GridSearchCV
# Define a more focused parameter grid based on the results of RandomizedSearchCV
param_grid = {
    'n_estimators': [best_params_random['n_estimators'] - 50, best_params_random['n_estimators'], best_params_random['n_estimators'] + 50],
    'learning_rate': [best_params_random['learning_rate'] * 0.8, best_params_random['learning_rate'], best_params_random['learning_rate'] * 1.2],
    'max_depth': [best_params_random['max_depth'] - 1, best_params_random['max_depth'], best_params_random['max_depth'] + 1],
    'min_samples_split': [best_params_random['min_samples_split'] - 1, best_params_random['min_samples_split'], best_params_random['min_samples_split'] + 1],
    'min_samples_leaf': [best_params_random['min_samples_leaf'] - 1, best_params_random['min_samples_leaf'], best_params_random['min_samples_leaf'] + 1],
    'subsample': [best_params_random['subsample'] * 0.9, best_params_random['subsample'], best_params_random['subsample'] * 1.1]
}

In [20]:
# Set up the GridSearchCV
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,  # 5-fold cross-validation
    verbose=2,
    n_jobs=-1  # Use all available CPU cores
)

In [21]:
# Fit GridSearchCV
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan             nan
             nan             nan             nan -6.55373768e+08
 -7.39943730e+08             nan -6.56414340e+08 -7.38959407e+08
             nan -6.57075823e+08 -7.37141647e+08             nan
 -7.01870310e+08 -6.98618627e+08             nan -7.01210903e+08
 -6.96034412e+08             nan -7.01012000e+08 -6.97322862e+08
             nan -7.39847230e+08 -7.14821061e+08             nan
 -7.39254082e+08 -7.15766938e+08             nan -7.40972436e+08
 -7.15537409e+08             nan -6.72126952e+08 -7.71831962e+08
             nan -6.74362429e+08 -7.73660607e+08             nan
 -6.74188452e+08 -7.71993508e+08             nan -6.65061991e+08
 -8.34306885e+08         

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
             n_jobs=-1,
             param_grid={'learning_rate': [0.14269842220344828,
                                           0.17837302775431035,
                                           0.21404763330517243],
                         'max_depth': [4, 5, 6], 'min_samples_leaf': [0, 1, 2],
                         'min_samples_split': [7, 8, 9],
                         'n_estimators': [180, 230, 280],
                         'subsample': [0.8354119631265623, 0.9282355145850691,
                                       1.021059066043576]},
             scoring='neg_mean_squared_error', verbose=2)

In [22]:
# Get the best parameters from the grid search
best_params_grid = grid_search.best_params_
print(f"Best parameters from GridSearchCV: {best_params_grid}")



Best parameters from GridSearchCV: {'learning_rate': 0.14269842220344828, 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 180, 'subsample': 0.8354119631265623}


In [23]:
# Predict using the best model found by GridSearchCV
best_model_grid = grid_search.best_estimator_
y_pred_grid = best_model_grid.predict(X_test)

# Evaluate the performance of the GridSearchCV model
mse_grid = mean_squared_error(y_test, y_pred_grid, squared=False)
print(f'Root Mean Squared Error (GridSearchCV): {mse_grid}')

Mean Squared Error (GridSearchCV): 26837.975894342708


# XGBoost

In [1]:
import xgboost as xgb

In [13]:
# Initialize and fit the model
xg_model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')
xg_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = xg_model.predict(X_test)
rmse_xgb = mean_squared_error(y_test, y_pred,squared=False)
print(f'XGBoost RMSE: {rmse_xgb}')

XGBoost RMSE: 26650.804059091068


# LigthGBM

In [14]:
import lightgbm as lgb

In [15]:
# Initialize and fit the model
lgb_model = lgb.LGBMRegressor(objective='regression', metric='rmse')
lgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = lgb_model.predict(X_test)
rmse_lgb = mean_squared_error(y_test, y_pred,squared=False)
print(f'LightGBM RMSE: {rmse_lgb}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2540
[LightGBM] [Info] Number of data points in the train set: 1153, number of used features: 58
[LightGBM] [Info] Start training from score 179212.566349
LightGBM RMSE: 24775.5764942313


# CatBoost

In [8]:
from catboost import CatBoostRegressor

In [9]:
# Initialize and fit the model
cat_model = CatBoostRegressor(learning_rate=0.1, depth=6, iterations=1000, eval_metric='RMSE', verbose=0)
cat_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = cat_model.predict(X_test)
rmse_cat = mean_squared_error(y_test, y_pred,squared=False)
print(f'CatBoost RMSE: {rmse_cat}')

CatBoost RMSE: 19906.694552317414




In [10]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np
# 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(cat_model, X, y, cv=kf, scoring='neg_root_mean_squared_error')

# Convert negative RMSE scores to positive
rmse_scores = -cv_scores

# Calculate mean RMSE
mean_rmse = np.mean(rmse_scores)
print(f'Cross-validated RMSE: {mean_rmse:.2f}')

# Relative RMSE as % of the mean SalePrice
relative_rmse = (mean_rmse / np.mean(y)) * 100
print(f'Relative RMSE: {relative_rmse:.2f}%')

Cross-validated RMSE: 17582.09
Relative RMSE: 10.33%


In [11]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8992961555352058

In [12]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [13]:
# Define the hyperparameter grid for Random Search
random_grid = {
    'depth': np.arange(4, 10),
    'learning_rate': np.logspace(-3, 0, 10),  # Exponentially spaced values from 0.001 to 1
    'iterations': np.arange(500, 2000, 100),
    'l2_leaf_reg': np.arange(1, 10)
}

In [14]:
# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=cat_model, param_distributions=random_grid, 
                                   n_iter=50, scoring='neg_mean_squared_error', 
                                   cv=5, verbose=2, random_state=42, n_jobs=-1)

In [15]:
# Fit the random search model
random_search.fit(X_train, y_train)

# Output the best parameters from RandomizedSearchCV
print("Best parameters from RandomizedSearchCV:", random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters from RandomizedSearchCV: {'learning_rate': 0.021544346900318832, 'l2_leaf_reg': 2, 'iterations': 1800, 'depth': 4}


In [16]:
# Define a smaller grid based on RandomizedSearchCV results
best_params = random_search.best_params_

param_grid = {
    'depth': [best_params['depth'] - 1, best_params['depth'], best_params['depth'] + 1],
    'learning_rate': [best_params['learning_rate'] * 0.8, best_params['learning_rate'], best_params['learning_rate'] * 1.2],
    'iterations': [best_params['iterations'] - 100, best_params['iterations'], best_params['iterations'] + 100],
    'l2_leaf_reg': [best_params['l2_leaf_reg'] - 1, best_params['l2_leaf_reg'], best_params['l2_leaf_reg'] + 1]
}

In [17]:
# Setup GridSearchCV
grid_search = GridSearchCV(estimator=cat_model, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5, verbose=2, n_jobs=-1)


In [19]:
# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Output the best parameters from GridSearchCV
print("Best parameters from GridSearchCV:", grid_search.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters from GridSearchCV: {'depth': 5, 'iterations': 1700, 'l2_leaf_reg': 1, 'learning_rate': 0.017235477520255067}


In [20]:
# Define CatBoost model with the best hyperparameters
best_catboost_model = CatBoostRegressor(
    depth=5,
    iterations=1700,
    l2_leaf_reg=1,
    learning_rate=0.017235477520255067,
    verbose=0
)

# Fit the model to the full training set
best_catboost_model.fit(X_train, y_train)

# Evaluate on the test set
y_pred = best_catboost_model.predict(X_test)

# Optionally, calculate evaluation metrics (e.g., RMSE)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")


RMSE: 19087.981269342836
