In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import mlflow
import mlflow.sklearn
import mlflow.lightgbm
import os

from sklearn.datasets import fetch_california_housing # ONLY TESTING

In [14]:
# Make "house_price_prediction" the active MLflow experiment for the runs started below
mlflow.set_experiment(experiment_name="house_price_prediction")

<Experiment: artifact_location='file:///home/jgomacor/projects/Dream_Team_2024/analysis/training/mlruns/993656704908748596', creation_time=1731771951114, experiment_id='993656704908748596', last_update_time=1731771951114, lifecycle_stage='active', name='house_price_prediction', tags={}>

In [18]:
def load_data(data_path):
    """Build a train/test split from the California housing frame.

    NOTE(review): ``data_path`` is accepted but never read. The frame comes
    from ``fetch_california_housing`` (a testing stand-in); the CSV-based
    loading is disabled.

    Parameters
    ----------
    data_path : str
        Currently unused.

    Returns
    -------
    The result of ``train_test_split(features, target, test_size=0.2,
    random_state=42)``, unpacked by callers as
    ``X_train, X_test, y_train, y_test``.
    """
    target_column = 'MedHouseVal'

    # Disabled real-data path: pd.read_csv(data_path) with the target column
    # 'Listing.Price.ClosePrice'.
    housing = fetch_california_housing(as_frame=True).frame

    features = housing.drop(columns=[target_column])
    target = housing[target_column]

    # One-hot encode categorical columns (if any), dropping the first level
    encoded_features = pd.get_dummies(features, drop_first=True)

    return train_test_split(encoded_features, target, test_size=0.2, random_state=42)

def evaluate_model(model, X_test, y_test):
    """Score ``model`` on held-out data with standard regression metrics.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : features passed to ``model.predict``
    y_test : true target values compared against the predictions

    Returns
    -------
    dict
        Keys ``"rmse"``, ``"mse"``, ``"mae"`` and ``"r2"`` (in that order).
    """
    y_pred = model.predict(X_test)
    squared_error = mean_squared_error(y_test, y_pred)

    scores = {}
    scores["rmse"] = np.sqrt(squared_error)  # root of the MSE computed above
    scores["mse"] = squared_error
    scores["mae"] = mean_absolute_error(y_test, y_pred)
    scores["r2"] = r2_score(y_test, y_pred)
    return scores

In [19]:
def train_sklearn_rf(X_train, X_test, y_train, y_test, params):
    """Fit a ``RandomForestRegressor`` inside an MLflow run named "sklearn_rf".

    Within the run, in order: ``params`` are logged, the forest is fitted on
    the training split, the metrics from ``evaluate_model`` on the test split
    are logged, and the fitted estimator is logged under the artifact path
    "model".

    Parameters
    ----------
    X_train, y_train : training features / target
    X_test, y_test : evaluation features / target
    params : dict
        Keyword arguments forwarded to ``RandomForestRegressor``.

    Returns
    -------
    tuple
        ``(fitted_forest, metrics_dict)``
    """
    with mlflow.start_run(run_name="sklearn_rf"):
        mlflow.log_params(params)

        forest = RandomForestRegressor(**params)
        forest.fit(X_train, y_train)

        scores = evaluate_model(forest, X_test, y_test)
        mlflow.log_metrics(scores)

        mlflow.sklearn.log_model(forest, "model")

    return forest, scores

def train_lightgbm(X_train, X_test, y_train, y_test, params):
    """Fit an ``LGBMRegressor`` inside an MLflow run named "lightgbm".

    Within the run, in order: ``params`` are logged, the regressor is fitted
    on the training split, the metrics from ``evaluate_model`` on the test
    split are logged, and the fitted estimator is logged under the artifact
    path "model".

    Parameters
    ----------
    X_train, y_train : training features / target
    X_test, y_test : evaluation features / target
    params : dict
        Keyword arguments forwarded to ``LGBMRegressor``.

    Returns
    -------
    tuple
        ``(fitted_regressor, metrics_dict)``
    """
    with mlflow.start_run(run_name="lightgbm"):
        mlflow.log_params(params)

        regressor = LGBMRegressor(**params)
        regressor.fit(X_train, y_train)

        scores = evaluate_model(regressor, X_test, y_test)
        mlflow.log_metrics(scores)

        mlflow.lightgbm.log_model(regressor, "model")

    return regressor, scores

In [None]:
# Example usage, step by step.

# Step 1: build the train/test split
X_train, X_test, y_train, y_test = load_data('df_train.csv')

# Step 2: hyperparameters for each model
sklearn_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

lightgbm_params = dict(
    n_estimators=100,
    max_depth=10,
    num_leaves=31,
    random_state=42,
)

# Step 3 (currently disabled): Random Forest run
# sklearn_model, sklearn_metrics = train_sklearn_rf(
#     X_train, X_test, y_train, y_test, sklearn_params
# )
# print("Sklearn RF Metrics:", sklearn_metrics)

# Step 4: LightGBM run
lightgbm_model, lightgbm_metrics = train_lightgbm(X_train, X_test, y_train, y_test, lightgbm_params)
print("LightGBM Metrics:", lightgbm_metrics)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947




LightGBM Metrics: {'rmse': np.float64(0.4630272361128509), 'mse': np.float64(0.2143942213823058), 'mae': np.float64(0.308491698784304), 'r2': 0.8363913671668293}


In [None]:
from joblib import dump, load
import os
from datetime import datetime

def save_model(model, scaler=None, metrics=None, model_dir='models'):
    """
    Save the LightGBM model, and optionally a scaler and metrics, using joblib

    Each artifact is written to ``model_dir`` as ``<prefix>_<timestamp>.joblib``
    with the prefixes ``lightgbm_model``, ``scaler`` and ``metrics``. The
    timestamp has one-second resolution, so two calls within the same second
    that target the same directory write to the same file names.

    Parameters:
    -----------
    model : LGBMRegressor
        Trained LightGBM model
    scaler : StandardScaler, optional
        Fitted scaler used for preprocessing; skipped when None
    metrics : dict, optional
        Model performance metrics; skipped when None
    model_dir : str
        Directory to save the model files (created if missing)

    Returns:
    --------
    str : the timestamp identifying this saved version
    """
    # Create timestamp for versioning
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # exist_ok=True replaces the separate exists() check, so a directory
    # created between the check and the makedirs call cannot raise.
    os.makedirs(model_dir, exist_ok=True)

    # Save model
    model_path = os.path.join(model_dir, f'lightgbm_model_{timestamp}.joblib')
    dump(model, model_path)

    # Save scaler if provided
    if scaler is not None:
        scaler_path = os.path.join(model_dir, f'scaler_{timestamp}.joblib')
        dump(scaler, scaler_path)

    # Save metrics if provided
    if metrics is not None:
        metrics_path = os.path.join(model_dir, f'metrics_{timestamp}.joblib')
        dump(metrics, metrics_path)

    print(f"Model saved in {model_dir} with timestamp {timestamp}")
    return timestamp

def load_saved_model(timestamp, model_dir='models'):
    """
    Read back the artifacts stored for one saved model version

    The model file ``lightgbm_model_<timestamp>.joblib`` is always loaded.
    The scaler and metrics files are loaded only when they exist on disk.

    NOTE(review): joblib files are pickle-based; only load files from a
    trusted source.

    Parameters:
    -----------
    timestamp : str
        Timestamp of the saved model version
    model_dir : str
        Directory where model files are saved

    Returns:
    --------
    tuple : (model, scaler, metrics) - scaler / metrics are None when their
        file is absent
    """
    def _load_if_present(path):
        # Optional artifact: a missing file yields None instead of an error
        return load(path) if os.path.exists(path) else None

    model = load(os.path.join(model_dir, f'lightgbm_model_{timestamp}.joblib'))
    scaler = _load_if_present(os.path.join(model_dir, f'scaler_{timestamp}.joblib'))
    metrics = _load_if_present(os.path.join(model_dir, f'metrics_{timestamp}.joblib'))

    return model, scaler, metrics

# Example workflow: train, persist, reload.

# Step 1: train the LightGBM model (this starts another "lightgbm" MLflow run)
lightgbm_model, lightgbm_metrics = train_lightgbm(X_train, X_test, y_train, y_test, lightgbm_params)

# Step 2: persist the model and its metrics; the returned timestamp names this version
timestamp = save_model(lightgbm_model, metrics=lightgbm_metrics, model_dir='house_price_models')

# Step 3: reload that version later (no scaler is passed to save_model above)
loaded_model, loaded_scaler, loaded_metrics = load_saved_model(timestamp, model_dir='house_price_models')

# Step 4: prediction helper for a reloaded model
def predict_price(X_new, model, scaler=None):
    """
    Run ``model.predict`` on new data, scaling it first when a scaler is given

    Parameters:
    -----------
    X_new : pd.DataFrame
        New data to predict on
    model : LGBMRegressor
        Loaded model (any object exposing ``predict``)
    scaler : StandardScaler, optional
        Loaded scaler if used in training; when None the data is passed
        to the model unchanged

    Returns:
    --------
    Whatever ``model.predict`` returns for the (optionally transformed) input
    """
    features = X_new if scaler is None else scaler.transform(X_new)
    return model.predict(features)

# Example: score the first training row (a one-row DataFrame) with the reloaded model
new_data = X_train.iloc[:1]  # stand-in for real new data
predictions = predict_price(new_data, loaded_model, scaler=loaded_scaler)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947




Model saved in house_price_models with timestamp 20241116_174445


ValueError: Expected 2D array, got scalar array instead:
array=<pandas.core.indexing._iLocIndexer object at 0x7a44075c8220>.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [22]:
# Persist the trained model and its metrics under 'production_models' (rebinds `timestamp`)
timestamp = save_model(lightgbm_model, metrics=lightgbm_metrics, model_dir='production_models')


Model saved in production_models with timestamp 20241116_174133
