In [None]:
!pip install optuna
!pip install lightgbm
!pip install -U scikit-learn lightgbm

import optuna
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import seaborn as sns
!pip install --upgrade shap
import matplotlib.pyplot as plt
import shap
import pandas as pd
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


# Read the training CSV into a pandas DataFrame
train = pd.read_csv(
    "/content/drive/MyDrive/CS-301/house-prices-advanced-regression-techniques/train.csv"
)

# Columns used as model inputs
feature_columns = [
    "MSSubClass",
    "LotFrontage",
    "LotArea",
    "YearBuilt",
    "YearRemodAdd",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
]

# X holds the selected feature columns; y is a one-column frame with the target
X = train[feature_columns]
y = train[["SalePrice"]]

# Fit an XGBoost regressor with its default settings on all rows
model = xgb.XGBRegressor()
model.fit(X, y)

# Build a SHAP explainer around the fitted XGBoost model
explainer = shap.Explainer(model)

# Compute SHAP values for every row of the feature matrix
shap_values = explainer(X)

def objective(trial):
    """Score one Optuna trial of LightGBM hyperparameters.

    Samples a configuration from ``trial``, builds an ``LGBMRegressor`` from
    it, and evaluates it with 5-fold cross-validation on the module-level
    ``X`` and ``y``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean of the per-fold ``neg_root_mean_squared_error`` scores. This is
        the *negated* RMSE, so values closer to zero mean a better model.
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 2, 64),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples from the same log-uniform range.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        # Fixed (not tuned) settings
        'random_state': 42,
        'objective': 'regression',
        'metric': 'rmse',
    }
    lgbm = lgb.LGBMRegressor(**params)
    fold_scores = cross_val_score(
        lgbm, X, y, cv=5, scoring='neg_root_mean_squared_error'
    )
    return fold_scores.mean()

# `objective` returns scikit-learn's negated RMSE, where a larger value
# (closer to zero) is a better model. The study must therefore MAXIMIZE it;
# minimizing would select the configuration with the largest error.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print('Best Hyperparameters:', study.best_params)
# Flip the sign back so the reported number is a plain (positive) RMSE.
print('Best RMSE:', -study.best_value)
best_params = study.best_params

# Refit on all data with the winning hyperparameters. best_params contains
# only the sampled values, so the seed used during tuning is passed
# explicitly to keep the final model consistent with the tuned ones.
best_lgbm = lgb.LGBMRegressor(**best_params, random_state=42)
best_lgbm.fit(X, y)




[I 2025-01-08 00:24:06,284] A new study created in memory with name: no-name-e473857c-cd10-4e75-9c95-b8acd2032457
[I 2025-01-08 00:24:09,907] Trial 0 finished with value: -35585.10100099388 and parameters: {'num_leaves': 127, 'max_depth': 15, 'learning_rate': 0.05180128328416788, 'n_estimators': 907, 'min_child_samples': 97, 'reg_alpha': 8.364072802684044e-09, 'reg_lambda': 4.370222482098478e-07}. Best is trial 0 with value: -35585.10100099388.
[I 2025-01-08 00:24:28,909] Trial 1 finished with value: -37828.234228476926 and parameters: {'num_leaves': 120, 'max_depth': 55, 'learning_rate': 0.376519233557186, 'n_estimators': 809, 'min_child_samples': 6, 'reg_alpha': 0.0003040682191885695, 'reg_lambda': 1.8287181878044783e-09}. Best is trial 1 with value: -37828.234228476926.
[I 2025-01-08 00:24:31,653] Trial 2 finished with value: -34253.78591868202 and parameters: {'num_leaves': 43, 'max_depth': 60, 'learning_rate': 0.010359555054476739, 'n_estimators': 734, 'min_child_samples': 57, 're

Best Hyperparameters: {'num_leaves': 3, 'max_depth': 25, 'learning_rate': 0.001243493292392687, 'n_estimators': 50, 'min_child_samples': 68, 'reg_alpha': 0.013091008028497922, 'reg_lambda': 2.164063718486514e-09}
Best RMSE: 77185.49461071179
