In [84]:
import pandas as pd
import numpy as np
import shap
import optuna
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import mlflow
import mlflow.sklearn
import mlflow.lightgbm

from sklearn.datasets import fetch_california_housing # ONLY TESTING

In [85]:
# Select the MLflow experiment by name; the call stays the last expression
# so the cell still displays the returned Experiment object.
EXPERIMENT_NAME = "house_price_prediction"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='file:///home/jgomacor/projects/Dream_Team_2024/analysis/training/mlruns/638427604679944752', creation_time=1731768523700, experiment_id='638427604679944752', last_update_time=1731768523700, lifecycle_stage='active', name='house_price_prediction', tags={}>

In [86]:
def load_data(data_path, target_col='MedHouseVal'):
    """Load the dataset and split it into train and test partitions.

    Args:
        data_path: Path to a CSV file. When empty (``""`` or ``None``) the
            California housing frame from scikit-learn is used instead,
            which is the testing fallback this notebook currently runs on.
        target_col: Name of the column to predict. Defaults to
            ``'MedHouseVal'``, the target column of the fallback dataset.

    Returns:
        The result of ``train_test_split``: ``X_train, X_test, y_train,
        y_test`` with 20% of the rows held out and ``random_state=42``.

    Raises:
        KeyError: If ``target_col`` is not a column of the loaded frame.
    """
    if data_path:
        df = pd.read_csv(data_path)
    else:
        # ONLY TESTING: no path supplied, fall back to the bundled dataset.
        df = fetch_california_housing(as_frame=True).frame

    if target_col not in df.columns:
        raise KeyError(
            f"target column {target_col!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # One-hot encode categorical columns (if any); numeric columns pass through.
    X = pd.get_dummies(X, drop_first=True)

    return train_test_split(X, y, test_size=0.2, random_state=42)


In [87]:
# Enhanced evaluation metrics with cross-validation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Score a fitted regressor on the test split and by 5-fold CV on train.

    Returns a dict with keys ``rmse``, ``mse``, ``mae`` and ``r2`` (computed
    from ``model.predict(X_test)`` against ``y_test``) and ``cv_rmse`` (square
    root of the mean cross-validated MSE on the training split).
    """
    y_pred = model.predict(X_test)

    # Hold-out metrics
    holdout_mse = mean_squared_error(y_test, y_pred)
    holdout_rmse = np.sqrt(holdout_mse)
    holdout_mae = mean_absolute_error(y_test, y_pred)
    holdout_r2 = r2_score(y_test, y_pred)

    # The scorer reports negated MSE per fold, so flip the sign of the mean
    # before taking the root.
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )

    return dict(
        rmse=holdout_rmse,
        mse=holdout_mse,
        mae=holdout_mae,
        r2=holdout_r2,
        cv_rmse=np.sqrt(-fold_scores.mean()),
    )

In [88]:
def _log_current_figure(filename):
    """Save the current matplotlib figure to ``filename``, log it, close it."""
    fig = plt.gcf()
    try:
        fig.tight_layout()
        fig.savefig(filename)
        mlflow.log_artifact(filename)
    finally:
        # Close exactly the figure that was saved, even if saving/logging fails.
        plt.close(fig)


def analyze_shap(model, X_train, X_test, model_type='tree'):
    """Generate SHAP plots for ``model`` and log them to MLflow.

    Side effects: writes ``shap_summary.png`` and one
    ``shap_dependence_<feature>.png`` per top feature to the working
    directory, logs each as an MLflow artifact, and logs
    ``feature_importance.json`` via ``mlflow.log_dict``.

    Args:
        model: Fitted model to explain.
        X_train: Training features; only used to sample 100 background rows
            for the kernel explainer.
        X_test: Features to compute SHAP values for (a DataFrame; its
            ``columns`` provide the feature names).
        model_type: ``'tree'`` uses ``shap.TreeExplainer``; any other value
            uses ``shap.KernelExplainer`` on ``model.predict``.

    Returns:
        Dict mapping each column of ``X_test`` to its mean absolute SHAP
        value, as plain Python floats.
    """
    if model_type == 'tree':
        explainer = shap.TreeExplainer(model)
    else:
        explainer = shap.KernelExplainer(model.predict, shap.sample(X_train, 100))

    # Calculate SHAP values for test set
    shap_values = explainer.shap_values(X_test)

    # Summary plot: drawn on a figure opened here, then saved and closed.
    plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)
    _log_current_figure('shap_summary.png')

    # When the explainer returns a list of arrays, keep the first one.
    if isinstance(shap_values, list):
        shap_values = shap_values[0]
    feature_importance = np.abs(shap_values).mean(0)
    top_features = X_test.columns[np.argsort(-feature_importance)[:5]]

    for feature in top_features:
        # No figure is opened beforehand: dependence_plot opens its own when
        # no axis is passed, so a pre-created figure would never be closed.
        shap.dependence_plot(feature, shap_values, X_test, show=False)
        _log_current_figure(f'shap_dependence_{feature}.png')

    # Cast numpy scalars to plain floats so the dict is JSON-serialisable
    # whatever dtype the explainer produced.
    feature_importance_dict = {
        name: float(value)
        for name, value in zip(X_test.columns, feature_importance)
    }
    mlflow.log_dict(feature_importance_dict, "feature_importance.json")

    return feature_importance_dict

In [89]:
def objective_sklearn_rf(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for RandomForestRegressor.

    Samples hyperparameters from ``trial``, fits a forest on the training
    split inside a nested MLflow run, logs the parameters and the metrics
    from ``evaluate_model``, and returns the cross-validated RMSE (the
    quantity being optimised).
    """
    with mlflow.start_run(nested=True):
        search_point = dict(
            n_estimators=trial.suggest_int("n_estimators", 50, 300),
            max_depth=trial.suggest_int("max_depth", 3, 25),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 15),
            min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
            random_state=42,
        )
        mlflow.log_params(search_point)

        forest = RandomForestRegressor(**search_point)
        forest.fit(X_train, y_train)

        scores = evaluate_model(forest, X_train, X_test, y_train, y_test)
        mlflow.log_metrics(scores)

        cv_rmse = scores['cv_rmse']

    return cv_rmse

In [90]:
def objective_lightgbm(trial, X_train, X_test, y_train, y_test):
    """Optuna objective for LGBMRegressor.

    Samples hyperparameters from ``trial`` (learning rate on a log scale),
    fits a booster on the training split inside a nested MLflow run, logs
    the parameters and the metrics from ``evaluate_model``, and returns the
    cross-validated RMSE.
    """
    with mlflow.start_run(nested=True):
        search_point = dict(
            n_estimators=trial.suggest_int("n_estimators", 50, 300),
            max_depth=trial.suggest_int("max_depth", 3, 25),
            num_leaves=trial.suggest_int("num_leaves", 20, 100),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            random_state=42,
        )
        mlflow.log_params(search_point)

        booster = LGBMRegressor(**search_point)
        booster.fit(X_train, y_train)

        scores = evaluate_model(booster, X_train, X_test, y_train, y_test)
        mlflow.log_metrics(scores)

        cv_rmse = scores['cv_rmse']

    return cv_rmse

In [91]:
def optimize_and_train(model_type, X_train, X_test, y_train, y_test, n_trials=50):
    """Tune a model with Optuna, refit it with the best parameters, log all.

    Everything runs inside one parent MLflow run named
    ``"<model_type>_optimization"``; each trial opens its own nested run.

    Args:
        model_type: ``"sklearn_rf"`` selects RandomForestRegressor; any other
            value selects LGBMRegressor.
        X_train, X_test, y_train, y_test: Train/test split passed to the
            objective, the final fit, the evaluation and the SHAP analysis.
        n_trials: Number of Optuna trials to run.

    Returns:
        Tuple ``(final_model, metrics, feature_importance, study)``.
    """
    study = optuna.create_study(direction="minimize")

    if model_type == "sklearn_rf":
        objective_fn = objective_sklearn_rf
        model_class = RandomForestRegressor
        mlflow_flavor = mlflow.sklearn
    else:  # lightgbm
        objective_fn = objective_lightgbm
        model_class = LGBMRegressor
        mlflow_flavor = mlflow.lightgbm

    def objective(trial):
        return objective_fn(trial, X_train, X_test, y_train, y_test)

    # Run optimization
    with mlflow.start_run(run_name=f"{model_type}_optimization"):
        study.optimize(objective, n_trials=n_trials)

        # study.best_params only holds the values suggested through the trial,
        # so the fixed seed every trial was trained with must be re-applied;
        # otherwise the final model is unseeded and differs from what was scored.
        best_params = {**study.best_params, "random_state": 42}
        final_model = model_class(**best_params)
        final_model.fit(X_train, y_train)

        # Log best parameters and metrics
        mlflow.log_params(best_params)
        metrics = evaluate_model(final_model, X_train, X_test, y_train, y_test)
        mlflow.log_metrics(metrics)

        # Log model
        mlflow_flavor.log_model(final_model, "model")

        # Generate and log SHAP analysis
        feature_importance = analyze_shap(
            final_model, X_train, X_test,
            model_type='tree'
        )

        # Log optimization history. Trials without an objective value
        # (value is None) are skipped: they cannot be logged as a metric.
        optuna_metrics = {
            f"trial_{trial.number}_value": trial.value
            for trial in study.trials
            if trial.value is not None
        }
        if optuna_metrics:
            mlflow.log_metrics(optuna_metrics)

        # Save optimization plots. These are Plotly figures written directly
        # to disk, so no matplotlib figure is opened for them.
        optuna.visualization.plot_optimization_history(study).write_image('optimization_history.png')
        mlflow.log_artifact('optimization_history.png')

        optuna.visualization.plot_param_importances(study).write_image('param_importances.png')
        mlflow.log_artifact('param_importances.png')

        return final_model, metrics, feature_importance, study


In [92]:
# Build the train/test partitions once; every later cell reuses these four names.
X_train, X_test, y_train, y_test = load_data(data_path="")

In [93]:
# Fixed (non-tuned) hyperparameter sets, one per model family.
# NOTE(review): neither dict is referenced by the cells visible here —
# confirm whether they are still needed.
sklearn_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

lightgbm_params = dict(
    n_estimators=100,
    max_depth=10,
    num_leaves=31,
    random_state=42,
)

In [94]:
# Tune and train the random forest, then report its metrics and the five
# features with the largest mean |SHAP| value.
rf_model, rf_metrics, rf_importance, rf_study = optimize_and_train(
    "sklearn_rf", X_train, X_test, y_train, y_test, n_trials=50
)
print("Best RF Metrics:", rf_metrics)

rf_ranked = sorted(rf_importance.items(), key=lambda item: item[1], reverse=True)
print("\nTop 5 Important Features:", dict(rf_ranked[:5]))

[I 2024-11-16 16:39:27,803] A new study created in memory with name: no-name-5e88a1db-ae88-4bd5-9fcd-abf8eebd0a60
[I 2024-11-16 16:42:05,712] Trial 0 finished with value: 0.538161057868983 and parameters: {'n_estimators': 234, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.538161057868983.
[W 2024-11-16 16:42:50,687] Trial 1 failed with parameters: {'n_estimators': 113, 'max_depth': 11, 'min_samples_split': 14, 'min_samples_leaf': 1} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/jgomacor/projects/Dream_Team_2024/venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_99218/3658379325.py", line 5, in <lambda>
    objective = lambda trial: objective_sklearn_rf(trial, X_train, X_test, y_train, y_test)
  File "/tmp/ipykernel_99218/3605679111.py", line 16, in objective_sklearn_rf
    metrics = evaluate_

KeyboardInterrupt: 

In [None]:
# Tune and train LightGBM (a single trial here), then report its metrics and
# the five features with the largest mean |SHAP| value.
lgb_model, lgb_metrics, lgb_importance, lgb_study = optimize_and_train(
    "lightgbm", X_train, X_test, y_train, y_test, n_trials=1
)
print("Best LightGBM Metrics:", lgb_metrics)

lgb_ranked = sorted(lgb_importance.items(), key=lambda item: item[1], reverse=True)
print("\nTop 5 Important Features:", dict(lgb_ranked[:5]))

[I 2024-11-16 16:42:56,928] A new study created in memory with name: no-name-39e1f6fa-37f4-4147-80cb-e7f6af495e52


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 8
[LightGBM] [Info] Start training from score 2.067432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000666 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 

[I 2024-11-16 16:43:05,182] Trial 0 finished with value: 0.476266801128126 and parameters: {'n_estimators': 228, 'max_depth': 22, 'num_leaves': 58, 'learning_rate': 0.0222712342861298}. Best is trial 0 with value: 0.476266801128126.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 8
[LightGBM] [Info] Start training from score 2.067432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 8
[LightGBM] [Info] Start traini

