In [1]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Point MLflow at a SQLite-backed tracking store. The URI uses a relative
# path, so the database file is resolved against the kernel's working directory.
mlflow.set_tracking_uri("sqlite:///mlflow.db")  # Change if using a centralized server

# Select the experiment that the runs logged below are recorded under.
mlflow.set_experiment("MVP Prediction Models")

# NOTE(review): no run is explicitly open at this point. Per the MLflow fluent
# API, set_tag then starts a run implicitly and leaves it active; presumably
# that implicit run is the parent the later `nested=True` trial runs attach to.
# Confirm this is intended: the visible cells never end that run explicitly.
mlflow.set_tag("developer", "christophe")

In [3]:
import os
import sys
# Make the parent of the current working directory importable so the `src`
# package resolves. This assumes the kernel was started one level below the
# project root — TODO confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Adjust to your project's structure
sys.path.append(project_root)

# NOTE(review): the wildcard import hides where names come from. The visible
# cells use `load_and_preprocess_data` and `write_season`, which presumably
# both come from here; an explicit import list would make that verifiable.
from src.analysis import *

# Load your cleaned dataset
# NOTE(review): absolute, machine-specific path; the notebook will not run
# unchanged on another checkout. Consider deriving it from `project_root`.
data_path = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/final_stacked_data.csv"

# Returns the feature matrix and the target. The helper's internals are not
# visible here; the trailing note below is the original author's claim.
X, y = load_and_preprocess_data(data_path, remove_excess_features=True) # X will be normalized

# Train-test split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import root_mean_squared_error, accuracy_score, precision_score, recall_score, r2_score

from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def optimize_and_log_model(model_name, space, max_evals=200):
    """Tune a RandomForestClassifier with Hyperopt, logging every trial to MLflow.

    Each trial fits a forest on the notebook-level ``X_train``/``y_train``,
    evaluates it on ``X_test``/``y_test`` and records its parameters, metrics
    and a confusion-matrix image in a nested MLflow run.

    Args:
        model_name: Label stored as the ``model_name`` parameter of each run.
        space: Hyperopt search space providing ``n_estimators``, ``max_depth``
            and ``criterion`` entries.
        max_evals: Number of Hyperopt trials to run. Defaults to 200, the
            value that used to be hard-coded.

    Returns:
        The dict produced by ``fmin``: the best point in Hyperopt's raw
        encoding (``hp.choice`` entries are option indices and ``quniform``
        entries are floats), so the caller has to decode it before building
        an estimator from it.
    """
    artifact_dir = "tmp"
    confusion_matrix_path = "tmp/confusion_matrix.png"

    def objective(params):
        """Fit and score one candidate; return the negated test accuracy."""
        # Re-cast so the estimator never receives float-valued hyperparameters,
        # regardless of how the caller built `space`.
        params = {
            "n_estimators": int(params["n_estimators"]),
            "max_depth": int(params["max_depth"]),
            "criterion": params["criterion"],
        }

        with mlflow.start_run(run_name="Best Random Forest Model", nested=True):
            # Instantiate the model with parameters from Hyperopt
            model = RandomForestClassifier(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # NOTE(review): trials are ranked by accuracy on X_test, so the
            # held-out set doubles as the model-selection set and the logged
            # test metrics are optimistic. Consider a validation split or
            # cross-validation on the training data instead.
            test_score = model.score(X_test, y_test)
            metrics = {
                "test_score": test_score,
                "rmse": root_mean_squared_error(y_test, y_pred),
                "r2": r2_score(y_test, y_pred),
                "accuracy": accuracy_score(y_test, y_pred),
                # zero_division=0 yields the same value as the default but
                # avoids a warning for every trial that predicts no positives.
                "precision": precision_score(
                    y_test, y_pred, average="weighted", zero_division=0
                ),
                "recall": recall_score(y_test, y_pred, average="weighted"),
            }

            # Log parameters and scores in bulk rather than one call per item.
            mlflow.log_param("model_name", model_name)
            mlflow.log_params(params)
            mlflow.log_metrics(metrics)

            # The artifact directory may not exist on a fresh checkout.
            os.makedirs(artifact_dir, exist_ok=True)
            disp = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
            try:
                disp.figure_.savefig(confusion_matrix_path)
                mlflow.log_artifact(confusion_matrix_path)
            finally:
                # Always release the figure so repeated trials do not
                # accumulate open figures, even if saving or logging fails.
                plt.close(disp.figure_)

            return -test_score  # Hyperopt minimizes the objective

    trials = Trials()
    best_params = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
    )
    print("Best Parameters:", best_params)

    return best_params

In [5]:
# Split criteria offered to the search. Kept as a list because fmin reports the
# winning hp.choice option as an index into it, which is decoded further down.
criterion = ["gini", "entropy"]

# Integer-valued dimensions: sampled on a step-1 grid, then cast to int.
n_estimators_dim = scope.int(hp.quniform("n_estimators", 1, 200, 1))
max_depth_dim = scope.int(hp.quniform("max_depth", 5, 30, 1))
# Categorical dimension over the split criteria above.
criterion_dim = hp.choice("criterion", criterion)

space = dict(
    n_estimators=n_estimators_dim,
    max_depth=max_depth_dim,
    criterion=criterion_dim,
)

best_params = optimize_and_log_model(model_name="Random Forest", space=space)

100%|████████████████| 200/200 [01:08<00:00,  2.93trial/s, best loss: -0.9661016949152542]
Best Parameters: {'criterion': 0, 'max_depth': 15.0, 'n_estimators': 137.0}


In [6]:
# Decode Hyperopt's raw result into constructor arguments. A new dict is built
# instead of editing best_params in place so that re-running this cell is safe:
# the in-place version raised a TypeError on a second run because 'criterion'
# had already been replaced by its string name.
criterion_choice = best_params['criterion']
best_params = {
    'max_depth': int(best_params['max_depth']),
    'n_estimators': int(best_params['n_estimators']),
    # fmin reports an hp.choice value as an index into the options list.
    'criterion': (
        criterion_choice
        if isinstance(criterion_choice, str)
        else criterion[criterion_choice]
    ),
}

# Train the final model with the best parameters
# A fixed seed (matching the train/test split) makes the final forest, and the
# evaluation cells that depend on it, reproducible across re-runs.
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

In [7]:
subset_indexes = y_test.index

# Predictions get a fresh 0..n-1 index so they line up row-by-row with the
# reset-index view of y_test built below.
y_pred = pd.Series(best_model.predict(X_test))

# After reset_index the original row labels sit in the 'index' column and the
# target in 'mvp'.
labelled = y_test.reset_index()
actual = labelled['mvp']

# One frame per confusion-matrix bucket.
true_positive = labelled.loc[(actual == 1) & (y_pred == 1)]
false_positive = labelled.loc[(actual == 0) & (y_pred == 1)]
false_negative = labelled.loc[(actual == 1) & (y_pred == 0)]
true_negative = labelled.loc[(actual == 0) & (y_pred == 0)]

# True negatives are computed but deliberately not printed here.
for bucket_name, bucket_rows in (
    ("true_positive", true_positive),
    ("false_positive", false_positive),
    ("false_negative", false_negative),
):
    print(f"{bucket_name}:\n {bucket_rows}")

list(true_positive['index'])

true_positive:
     index  mvp
20    143    1
21    167    1
25      5    1
34    197    1
false_positive:
     index  mvp
18    228    0
false_negative:
     index  mvp
26    238    1
46     25    1


[143, 167, 5, 197]

In [8]:
y_true = y_test.reset_index()['mvp']

# Boolean masks for each side of the confusion matrix.
actual_pos, actual_neg = y_true == 1, y_true == 0
predicted_pos, predicted_neg = y_pred == 1, y_pred == 0

# Confusion-matrix counts.
tp = (actual_pos & predicted_pos).sum()  # True Positives
fp = (actual_neg & predicted_pos).sum()  # False Positives
fn = (actual_pos & predicted_neg).sum()  # False Negatives
tn = (actual_neg & predicted_neg).sum()  # True Negatives

# Precision and recall fall back to 0 when their denominator is empty.
predicted_pos_total = tp + fp
actual_pos_total = tp + fn

if predicted_pos_total > 0:
    precision = tp / predicted_pos_total
else:
    precision = 0

if actual_pos_total > 0:
    recall = tp / actual_pos_total
else:
    recall = 0

# Share of all test rows that were classified correctly.
accuracy = (tp + tn) / (tp + tn + fp + fn)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

Precision: 0.8
Recall: 0.6666666666666666
Accuracy: 0.9491525423728814


In [12]:
# NOTE(review): absolute, machine-specific path; the cell only runs as-is on
# the author's machine.
cross_refs_path = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/player_index_mapping.csv"
cross_refs = pd.read_csv(cross_refs_path)


def print_player_seasons(heading, row_positions):
    """Print ``heading`` followed by one '<season> <player>' line per row.

    Args:
        heading: Text printed before the listing.
        row_positions: Iterable of integer positions into ``cross_refs``.
    """
    print(heading)
    for position in row_positions:
        # Fetch the row once instead of once per column.
        row = cross_refs.iloc[position]
        player = row['Player']
        season = write_season(int(row['SEASON_ID']))

        print(f'{season} {player}')


# NOTE(review): the 'index' values are y_test's original row labels and are
# used here as positions into the mapping file. That assumes the mapping has
# the same row order as the dataset the labels came from — confirm.
outcome_groups = [
    ('\nCorrectly predicted as MVP', true_positive),
    ('\nIncorrectly predicted as MVP', false_positive),
    ('\nIncorrectly predicted as non-MVP', false_negative),
    ('\nCorrectly predicted as non-MVP', true_negative),
]

for heading, group in outcome_groups:
    print_player_seasons(heading, list(group['index']))


Correctly predicted as MVP
2017-18 JAMES HARDEN
2015-16 STEPHEN CURRY
2016-17 RUSSELL WESTBROOK
1990-91 MICHAEL JORDAN

Incorrectly predicted as MVP
2018-19 JAMES HARDEN

Incorrectly predicted as non-MVP
2020-21 NIKOLA JOKIĆ
1998-99 KARL MALONE

Correctly predicted as non-MVP
2011-12 TONY PARKER
1982-83 BUCK WILLIAMS
2002-03 ALLEN IVERSON
2021-22 LUKA DONČIĆ
2003-04 BEN WALLACE
2006-07 KOBE BRYANT
1983-84 ADRIAN DANTLEY
1999-00 KEVIN GARNETT
2022-23 DOMANTAS SABONIS
1995-96 DAVID ROBINSON
1992-93 DAVID ROBINSON
1996-97 TIM HARDAWAY
1991-92 MARK PRICE
1986-87 MICHAEL JORDAN
2003-04 JERMAINE O'NEAL
1983-84 KAREEM ABDUL-JABBAR
1983-84 JULIUS ERVING
2020-21 JOEL EMBIID
1989-90 PATRICK EWING
1992-93 PATRICK EWING
1995-96 GARY PAYTON
2000-01 CHRIS WEBBER
1984-85 MOSES MALONE
2005-06 DWYANE WADE
1987-88 LARRY BIRD
1990-91 DAVID ROBINSON
2004-05 TIM DUNCAN
2008-09 DWYANE WADE
1986-87 LARRY BIRD
2012-13 KOBE BRYANT
1987-88 CHARLES BARKLEY
2019-20 ANTHONY DAVIS
2022-23 JAYSON TATUM
1997-98 SHAQ

In [10]:
# Display the dimensions of the feature matrix X returned by
# load_and_preprocess_data.
# NOTE(review): this cell's execution count (10) is lower than that of the
# cell above it (12), so the notebook was last run out of order; verify it
# with Restart Kernel -> Run All.
X.shape

(295, 24)