In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import mlflow.pytorch
import numpy as np

import mlflow

import os
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Adjust to your project's structure
sys.path.append(project_root)

from src.analysis import *

In [2]:
# Set MLFlow tracking URI (local or server-based)
mlflow.set_tracking_uri("sqlite:///mlflow.db")  # Change if using a centralized server

# Define the experiment name
mlflow.set_experiment("MVP Prediction NN")

mlflow.set_tag("developer", "christophe")

In [23]:
# Load your cleaned dataset
data_path = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/final_stacked_data.csv"

_X, _y = load_and_preprocess_data(data_path, remove_excess_features=True) # X will be normalized


# Example input data
np.random.seed(42)
X =_X.to_numpy().astype(np.float32)
y = _y.to_numpy().astype(np.int64)  # Binary labels

# Determine sizes for train, validation, and test splits
train_size = int(0.7 * len(X))  # 70% for training
val_size = int(0.15 * len(X))   # 15% for validation
test_size = len(X) - train_size - val_size  # Remaining 15% for testing

# Split the datase
X_train = torch.tensor(X[:train_size])
y_train = torch.tensor(y[:train_size])

X_val= torch.tensor(X[train_size:train_size + val_size])
y_val= torch.tensor(y[train_size:train_size + val_size])

X_test = torch.tensor(X[train_size + val_size:])
y_test = torch.tensor(y[train_size + val_size:])

_y_test = _y[train_size + val_size:]

# Check the shapes of each split
print("Train set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

# Create DataLoaders
batch_size = 32
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Train set: torch.Size([206, 24]) torch.Size([206])
Validation set: torch.Size([44, 24]) torch.Size([44])
Test set: torch.Size([45, 24]) torch.Size([45])


In [4]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define a simple neural network
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.softmax(x)

# Hyperparameters
input_size = 24
hidden_size = 64
output_size = 2
learning_rate = 0.001
num_epochs = 20

# Initialize the model, loss function, and optimizer
model = SimpleMLP(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [5]:
with mlflow.start_run(nested=True):
    # Log hyperparameters
    mlflow.log_param("input_size", input_size)
    mlflow.log_param("hidden_size", hidden_size)
    mlflow.log_param("output_size", output_size)
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("num_epochs", num_epochs)

    mlflow.log_param("model_name", 'neural network')

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_train_loss:.4f}")
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # Validation loop
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = 100 * correct / total
        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.2f}%")

        mlflow.log_metric("val_loss", avg_val_loss, step=epoch)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)

    # Log the trained model
    mlflow.pytorch.log_model(model, "model")

    print("Model training complete and logged in MLflow.")

Epoch [1/20], Loss: 0.6965
Validation Loss: 0.6490, Accuracy: 70.45%
Epoch [2/20], Loss: 0.6409
Validation Loss: 0.6010, Accuracy: 79.55%
Epoch [3/20], Loss: 0.5935
Validation Loss: 0.5623, Accuracy: 84.09%
Epoch [4/20], Loss: 0.5598
Validation Loss: 0.5320, Accuracy: 86.36%
Epoch [5/20], Loss: 0.5288
Validation Loss: 0.5091, Accuracy: 86.36%
Epoch [6/20], Loss: 0.5017
Validation Loss: 0.4922, Accuracy: 86.36%
Epoch [7/20], Loss: 0.4936
Validation Loss: 0.4799, Accuracy: 86.36%
Epoch [8/20], Loss: 0.4733
Validation Loss: 0.4700, Accuracy: 88.64%
Epoch [9/20], Loss: 0.4691
Validation Loss: 0.4629, Accuracy: 88.64%
Epoch [10/20], Loss: 0.4615
Validation Loss: 0.4570, Accuracy: 88.64%
Epoch [11/20], Loss: 0.4488
Validation Loss: 0.4505, Accuracy: 86.36%
Epoch [12/20], Loss: 0.4440
Validation Loss: 0.4468, Accuracy: 86.36%
Epoch [13/20], Loss: 0.4413
Validation Loss: 0.4428, Accuracy: 86.36%
Epoch [14/20], Loss: 0.4297
Validation Loss: 0.4404, Accuracy: 86.36%
Epoch [15/20], Loss: 0.4356
V



Model training complete and logged in MLflow.


In [19]:
model.eval()

# Pass the input tensor to the model
with torch.no_grad(): 
    predictions = model(X_test)

y_pred = torch.argmax(predictions, dim=1)

In [54]:
y_true = y_test

# Calculate True Positives, False Positives, and False Negatives
tp = ((y_true == 1) & (y_pred == 1)).sum()  # True Positives
fp = ((y_true == 0) & (y_pred == 1)).sum()  # False Positives
fn = ((y_true == 1) & (y_pred == 0)).sum()  # False Negatives
tn = ((y_true == 0) & (y_pred == 0)).sum()  # True Negatives

# Precision and Recall
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

# Accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

Precision: 1.0
Recall: 0.7142857313156128
Accuracy: 0.9555555582046509


In [50]:
# subset_indexes = _y_test.index

y_pred_np = y_pred.numpy()

true_positive = _y_test.reset_index().loc[(_y_test.reset_index()['mvp'] == 1) & (y_pred_np == 1)]
false_positive = _y_test.reset_index().loc[(_y_test.reset_index()['mvp'] == 0) & (y_pred_np == 1)]
false_negative = _y_test.reset_index().loc[(_y_test.reset_index()['mvp'] == 1) & (y_pred_np == 0)]
true_negative = _y_test.reset_index().loc[(_y_test.reset_index()['mvp'] == 0) & (y_pred_np == 0)]

# print(f"true_positive:\n {true_positive}")
# print(f"false_positive:\n {false_positive}")
# print(f"false_negative:\n {false_negative}")

cross_refs_path = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/player_index_mapping.csv"
cross_refs = pd.read_csv(cross_refs_path)

print('\nCorrectly predicted as MVP')
for i in list(true_positive['index']):
    player = cross_refs.iloc[i]['Player']
    season = write_season(int(cross_refs.iloc[i]['SEASON_ID']))
    
    print(f'{season} {player}')

print('\nIncorrectly predicted as MVP')
for i in list(false_positive['index']):
    player = cross_refs.iloc[i]['Player']
    season = write_season(int(cross_refs.iloc[i]['SEASON_ID']))
    
    print(f'{season} {player}')

print('\nIncorrectly predicted as non-MVP')
for i in list(false_negative['index']):
    player = cross_refs.iloc[i]['Player']
    season = write_season(int(cross_refs.iloc[i]['SEASON_ID']))
    
    print(f'{season} {player}')

print('\nCorrectly predicted as non-MVP')
for i in list(true_negative['index']):
    player = cross_refs.iloc[i]['Player']
    season = write_season(int(cross_refs.iloc[i]['SEASON_ID']))
    
    print(f'{season} {player}')


Correctly predicted as MVP
1984-85 LARRY BIRD
1994-95 DAVID ROBINSON
2003-04 KEVIN GARNETT
2011-12 LEBRON JAMES
1999-00 SHAQUILLE O'NEAL

Incorrectly predicted as MVP

Incorrectly predicted as non-MVP
2022-23 JOEL EMBIID
2001-02 TIM DUNCAN

Correctly predicted as non-MVP
1984-85 MAGIC JOHNSON
1984-85 MOSES MALONE
1984-85 TERRY CUMMINGS
1994-95 CHARLES BARKLEY
1994-95 HAKEEM OLAJUWON
1994-95 KARL MALONE
1994-95 PATRICK EWING
1994-95 SCOTTIE PIPPEN
1994-95 SHAQUILLE O'NEAL
2003-04 BEN WALLACE
2003-04 JERMAINE O'NEAL
2003-04 KOBE BRYANT
2003-04 SHAQUILLE O'NEAL
2003-04 TIM DUNCAN
2011-12 CHRIS PAUL
2011-12 DWIGHT HOWARD
2011-12 KEVIN DURANT
2011-12 KEVIN LOVE
2011-12 KOBE BRYANT
2011-12 TONY PARKER
1999-00 ALLEN IVERSON
1999-00 ALONZO MOURNING
1999-00 GARY PAYTON
1999-00 KARL MALONE
1999-00 KEVIN GARNETT
1999-00 TIM DUNCAN
2022-23 DOMANTAS SABONIS
2022-23 DONOVAN MITCHELL
2022-23 GIANNIS ANTETOKOUNMPO
2022-23 JAYSON TATUM
2022-23 NIKOLA JOKIĆ
2022-23 SHAI GILGEOUS-ALEXANDER
2001-02 CHRIS

In [52]:
y_true

tensor([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])