In [1]:
# Add the project root directory to sys.path
import os
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Adjust to your project's structure
sys.path.append(project_root)

import pickle

from src.analysis import *

In [2]:
# Import necessary libraries
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, accuracy_score

# Set up MLflow tracking URI
mlflow.set_tracking_uri("sqlite:///mlflow.db")  # Replace with your MLflow server URI
mlflow.set_experiment("MVP Prediction 00")  # Name your experiment
mlflow.set_tag("developer", "christophe")
mlflow.sklearn.autolog()
# Function to initialize an MLflow run
def log_baseline_model(model, X_train, X_test, y_train, y_test):
    with mlflow.start_run(nested=True):
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        signature = infer_signature(X_test, y_pred)
        
        # Evaluate the model
        rmse = root_mean_squared_error(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred.round())  # Example for classification
        
        # Log model, parameters, and metrics
        mlflow.log_param("model_type", model.__class__.__name__)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, "model")

        model_name = type(model).__name__
        # with open(f'/Users/cb/src/nba-ml/gradient-boosted-trees-scoring-basket/models/{model_name}_mlflow.bin', 'wb') as f_out:
        #     pickle.dump((model), f_out)

        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="sklearn-model",
            signature=signature,
            registered_model_name=f"sk-learn-{model_name}",
        )
        
        print(f"Logged model: {model.__class__.__name__}, RMSE: {rmse}, Accuracy: {accuracy}")

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np

from mlflow.models import infer_signature

# Load or create a dataset
# X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)  # Replace with your dataset

file_path = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/final_stacked_data.csv"  # Update with your file path
X, y = load_and_preprocess_data(file_path, target_column='mvp')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = y_train.to_numpy().flatten() # Ensure y_train is a 1D array
y_test = y_test.to_numpy().flatten() # Ensure y_test is a 1D array

# Baseline Model 1: Linear Regression
linear_model = LinearRegression()
log_baseline_model(linear_model, X_train, X_test, y_train, y_test)

# Baseline Model 2: Decision Tree Regressor
tree_model = DecisionTreeRegressor(random_state=42)
log_baseline_model(tree_model, X_train, X_test, y_train, y_test)

Registered model 'sk-learn-LinearRegression' already exists. Creating a new version of this model...
Created version '3' of model 'sk-learn-LinearRegression'.


Logged model: LinearRegression, RMSE: 0.5167480328007441, Accuracy: 0.7457627118644068




Logged model: DecisionTreeRegressor, RMSE: 0.31889640207164033, Accuracy: 0.8983050847457628


Registered model 'sk-learn-DecisionTreeRegressor' already exists. Creating a new version of this model...
Created version '3' of model 'sk-learn-DecisionTreeRegressor'.


In [4]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestRegressor

# Define the objective function for Hyperopt
def objective(params):
    # Ensure parameters are cast to appropriate types
    params = {
        "n_estimators": int(params["n_estimators"]),
        "max_depth": int(params["max_depth"]),
        "min_samples_split": int(params["min_samples_split"]),
    }
    
    # Start a new MLflow run for this trial
    with mlflow.start_run(nested=True):
        # Initialize the model with the current parameters
        model = RandomForestRegressor(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            min_samples_split=params["min_samples_split"],
            random_state=42
        )

        model_name = type(model).__name__
        
        # Train and evaluate the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        signature = infer_signature(X_test, y_pred)
        
        rmse = root_mean_squared_error(y_test, y_pred)
        
        # Log parameters and metrics to MLflow
        mlflow.log_param("model_type", model.__class__.__name__)
        mlflow.log_params(params)
        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="sklearn-model",
            signature=signature,
            registered_model_name=f"sk-learn-{model_name}",
        )
        
        # Return the objective for Hyperopt
        return {"loss": rmse, "status": STATUS_OK}

# Define the hyperparameter search space
space = {
    "n_estimators": hp.quniform("n_estimators", 50, 200, 10),
    "max_depth": hp.quniform("max_depth", 3, 15, 1),
    "min_samples_split": hp.quniform("min_samples_split", 2, 10, 1),
}

# Run the Hyperopt optimization
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=trials
)

print("Best Parameters:", best_params)

  3%|▉                           | 1/30 [00:06<03:11,  6.61s/trial, best loss: 0.21666763609066178]

Successfully registered model 'sk-learn-RandomForestRegressor'.
Created version '1' of model 'sk-learn-RandomForestRegressor'.


  7%|█▊                          | 2/30 [00:12<02:52,  6.17s/trial, best loss: 0.21666763609066178]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '2' of model 'sk-learn-RandomForestRegressor'.


 10%|██▊                         | 3/30 [00:18<02:48,  6.22s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '3' of model 'sk-learn-RandomForestRegressor'.


 13%|███▋                        | 4/30 [00:25<02:48,  6.49s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '4' of model 'sk-learn-RandomForestRegressor'.


 17%|████▋                       | 5/30 [00:32<02:41,  6.47s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '5' of model 'sk-learn-RandomForestRegressor'.


 20%|█████▌                      | 6/30 [00:38<02:31,  6.32s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '6' of model 'sk-learn-RandomForestRegressor'.


 23%|██████▌                     | 7/30 [00:44<02:25,  6.33s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '7' of model 'sk-learn-RandomForestRegressor'.


 27%|███████▍                    | 8/30 [00:51<02:22,  6.46s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '8' of model 'sk-learn-RandomForestRegressor'.


 30%|████████▍                   | 9/30 [00:58<02:18,  6.59s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '9' of model 'sk-learn-RandomForestRegressor'.


 33%|█████████                  | 10/30 [01:04<02:13,  6.67s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '10' of model 'sk-learn-RandomForestRegressor'.


 37%|█████████▉                 | 11/30 [01:11<02:03,  6.53s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '11' of model 'sk-learn-RandomForestRegressor'.


 40%|██████████▊                | 12/30 [01:17<01:56,  6.49s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '12' of model 'sk-learn-RandomForestRegressor'.


 43%|███████████▋               | 13/30 [01:24<01:51,  6.53s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '13' of model 'sk-learn-RandomForestRegressor'.


 47%|████████████▌              | 14/30 [01:29<01:40,  6.29s/trial, best loss: 0.21422285420041032]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '14' of model 'sk-learn-RandomForestRegressor'.


 50%|█████████████▌             | 15/30 [01:36<01:34,  6.28s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '15' of model 'sk-learn-RandomForestRegressor'.


 53%|██████████████▍            | 16/30 [01:42<01:29,  6.38s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '16' of model 'sk-learn-RandomForestRegressor'.


 57%|███████████████▎           | 17/30 [01:49<01:22,  6.35s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '17' of model 'sk-learn-RandomForestRegressor'.


 60%|████████████████▏          | 18/30 [01:54<01:14,  6.23s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '18' of model 'sk-learn-RandomForestRegressor'.


 63%|█████████████████          | 19/30 [02:01<01:09,  6.32s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '19' of model 'sk-learn-RandomForestRegressor'.


 67%|██████████████████         | 20/30 [02:07<01:01,  6.15s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '20' of model 'sk-learn-RandomForestRegressor'.


 70%|██████████████████▉        | 21/30 [02:13<00:55,  6.13s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '21' of model 'sk-learn-RandomForestRegressor'.


 73%|███████████████████▊       | 22/30 [02:19<00:49,  6.15s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '22' of model 'sk-learn-RandomForestRegressor'.


 77%|████████████████████▋      | 23/30 [02:25<00:43,  6.21s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '23' of model 'sk-learn-RandomForestRegressor'.


 80%|█████████████████████▌     | 24/30 [02:32<00:37,  6.18s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '24' of model 'sk-learn-RandomForestRegressor'.


 83%|██████████████████████▌    | 25/30 [02:38<00:31,  6.27s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '25' of model 'sk-learn-RandomForestRegressor'.


 87%|███████████████████████▍   | 26/30 [02:44<00:25,  6.28s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '26' of model 'sk-learn-RandomForestRegressor'.


 90%|████████████████████████▎  | 27/30 [02:50<00:18,  6.20s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '27' of model 'sk-learn-RandomForestRegressor'.


 93%|█████████████████████████▏ | 28/30 [02:57<00:12,  6.25s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '28' of model 'sk-learn-RandomForestRegressor'.


 97%|██████████████████████████ | 29/30 [03:03<00:06,  6.30s/trial, best loss: 0.21356049241578562]

Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '29' of model 'sk-learn-RandomForestRegressor'.


100%|███████████████████████████| 30/30 [03:10<00:00,  6.34s/trial, best loss: 0.21356049241578562]
Best Parameters: {'max_depth': 13.0, 'min_samples_split': 3.0, 'n_estimators': 110.0}


Registered model 'sk-learn-RandomForestRegressor' already exists. Creating a new version of this model...
Created version '30' of model 'sk-learn-RandomForestRegressor'.


In [5]:
import optuna
from sklearn.ensemble import GradientBoostingRegressor

# Define objective function for Optuna
def optuna_objective(trial):
    with mlflow.start_run(nested=True):
        # Suggest hyperparameters
        n_estimators = trial.suggest_int("n_estimators", 50, 200)
        max_depth = trial.suggest_int("max_depth", 3, 15)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        
        # Initialize model
        model = GradientBoostingRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42
        )

        model_name = type(model).__name__
        
        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        signature = infer_signature(X_test, y_pred)
        
        # Evaluate model
        rmse = root_mean_squared_error(y_test, y_pred)
        
        # Log to MLflow
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("model_type", model.__class__.__name__)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="sklearn-model",
            signature=signature,
            registered_model_name=f"sk-learn-{model_name}",
        )        
        return rmse

# Create and optimize the study
study = optuna.create_study(direction="minimize")
study.optimize(optuna_objective, n_trials=30)

print("Best Parameters:", study.best_params)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

[I 2024-12-02 17:27:43,555] A new study created in memory with name: no-name-acf25abb-5d74-4a10-8964-5a3671209ab0
Successfully registered model 'sk-learn-GradientBoostingRegressor'.
Created version '1' of model 'sk-learn-GradientBoostingRegressor'.
[I 2024-12-02 17:27:49,694] Trial 0 finished with value: 0.2770955783974073 and parameters: {'n_estimators': 120, 'max_depth': 7, 'learning_rate': 0.2281238036913382}. Best is trial 0 with value: 0.2770955783974073.
Registered model 'sk-learn-GradientBoostingRegressor' already exists. Creating a new version of this model...
Created version '2' of model 'sk-learn-GradientBoostingRegressor'.
[I 2024-12-02 17:27:56,371] Trial 1 finished with value: 0.2813157676686083 and parameters: {'n_estimators': 151, 'max_depth': 14, 'learning_rate': 0.13563574521411031}. Best is trial 0 with value: 0.2770955783974073.
Registered mo

Best Parameters: {'n_estimators': 97, 'max_depth': 5, 'learning_rate': 0.2876550428888992}


In [6]:
# Best Parameters: {'max_depth': 3.0, 'min_samples_split': 5.0, 'n_estimators': 140.0}

# Best Parameters: {'n_estimators': 73, 'max_depth': 3, 'learning_rate': 0.15121675856543731}