In [1]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import mlflow
import mlflow.pytorch
from mlflow.models import infer_signature
from sklearn.metrics import accuracy_score

import numpy as np
import pandas as pd

import os
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # Adjust to your project's structure
sys.path.append(project_root)
from src.analysis import *

In [2]:
# Set MLFlow tracking URI (local or server-based)
mlflow.set_tracking_uri("sqlite:///mlflow.db")  # Change if using a centralized server

# Define the experiment name
mlflow.set_experiment("MVP Prediction (NN)")

mlflow.set_tag("developer", "cbrown")

In [3]:
# Load your cleaned dataset
data_path = "/Users/cb/src/nba_mvp_ml/data/processed/by_season/fully_merged/final_stacked_data.csv"

_X, _y = load_and_preprocess_data(data_path, remove_excess_features=True) # X will be normalized


# Example input data
np.random.seed(42)
X =_X.to_numpy().astype(np.float32)
y = _y.to_numpy().astype(np.int64)  # Binary labels

# Determine sizes for train, validation, and test splits
train_size = int(0.7 * len(X))  # 70% for training
val_size = int(0.15 * len(X))   # 15% for validation
test_size = len(X) - train_size - val_size  # Remaining 15% for testing

# Split the datase
X_train = torch.tensor(X[:train_size])
y_train = torch.tensor(y[:train_size])

X_val= torch.tensor(X[train_size:train_size + val_size])
y_val= torch.tensor(y[train_size:train_size + val_size])

X_test = torch.tensor(X[train_size + val_size:])
y_test = torch.tensor(y[train_size + val_size:])

_y_test = _y[train_size + val_size:]

# Check the shapes of each split
print("Train set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

# Create DataLoaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

Train set: torch.Size([206, 24]) torch.Size([206])
Validation set: torch.Size([44, 24]) torch.Size([44])
Test set: torch.Size([45, 24]) torch.Size([45])


In [4]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define a simple neural network
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_pct=0):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # self.relu = nn.ReLU()
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.01) 
        self.dropout = nn.Dropout(p=dropout_pct)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
        self.bn1 = nn.BatchNorm1d(hidden_size)


    def forward(self, x):
        x = self.fc1(x)
        # x = self.relu(x)
        x = self.leaky_relu(x)
        x = self.bn1(x)  # Batch normalization
        x = self.fc2(x)
        return self.softmax(x)

In [5]:
# Define the objective function
def objective(params):
    input_size = len(_X.columns)
    hidden_size = int(params["hidden_size"])
    output_size = 2
    learning_rate = params["learning_rate"]
    batch_size = int(params["batch_size"])
    num_epochs = int(params["num_epochs"])
    dropout_pct = int(params["dropout_pct"])
    
    # Update DataLoaders with new batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize the model
    model = SimpleMLP(input_size, hidden_size, output_size).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    
    # Validation loop
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, targets).item()
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    
    val_accuracy = correct / total
    avg_val_loss = val_loss / len(val_loader) 
    
    # Log parameters and metrics to MLflow
    with mlflow.start_run(run_name=f"{X.shape[1]} features", nested=True):
        y_pred = torch.argmax(model(X_test), dim=1).numpy()

        y_true = y_test

        # Calculate True Positives, False Positives, and False Negatives
        tp = ((y_true == 1) & (y_pred == 1)).sum()  # True Positives
        fp = ((y_true == 0) & (y_pred == 1)).sum()  # False Positives
        fn = ((y_true == 1) & (y_pred == 0)).sum()  # False Negatives
        tn = ((y_true == 0) & (y_pred == 0)).sum()  # True Negatives
        
        # Precision and Recall
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        
        # Accuracy
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("accuracy", accuracy)
        
        mlflow.log_params(params)
        mlflow.log_metric("val_loss", avg_val_loss)
        mlflow.log_metric("val_accuracy", val_accuracy)
        mlflow.pytorch.log_model(
            pytorch_model=model,
            signature=infer_signature(X_test.numpy(), y_pred),
            artifact_path="nn-model",
        )
    
    # Return validation loss for Hyperopt to minimize
    return {"loss": -val_accuracy, "status": STATUS_OK}

# Define the hyperparameter search space
search_space = {
    "hidden_size": hp.quniform("hidden_size", 16, 128, 16),  # Discrete range
    "learning_rate": hp.loguniform("learning_rate", -6, -1),  # Log scale
    "batch_size": hp.quniform("batch_size", 4, 64, 4),
    "num_epochs": hp.quniform("num_epochs", 20, 500, 20),
    "dropout_pct": hp.quniform("dropout_pct", 0, 70, 1)
}

# Run Hyperopt optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,  # Number of evaluations
    trials=trials
)

print("Best hyperparameters:", best)

100%|███████████████████| 20/20 [01:52<00:00,  5.62s/trial, best loss: -0.9545454545454546]
Best hyperparameters: {'batch_size': 36.0, 'dropout_pct': 26.0, 'hidden_size': 48.0, 'learning_rate': 0.0033595262605951083, 'num_epochs': 80.0}


In [8]:
list(_X.columns)

['FGM',
 'BPM',
 'DRBPct',
 'DWS',
 'OBPM',
 'PER',
 'TOVPct',
 'VORP',
 'WS/48_x',
 'Rk_opp_pg',
 '2P%_opp_pg',
 'DRB_opp_pg',
 'SRS',
 'ORtg',
 'sentiment_1',
 'sentiment_2',
 'sentiment_3',
 'sentiment_5',
 'sentiment_6',
 'sentiment_8',
 'sentiment_13',
 'sentiment_14',
 'sentiment_avg',
 'WS']

In [10]:
# #maybe do k fold?

# from sklearn.model_selection import KFold
# import numpy as np

# # Number of folds for cross-validation
# k_folds = 5
# kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# # Prepare the data
# X = _X.to_numpy().astype(np.float32)
# y = _y.to_numpy().astype(np.int64)

# # Hyperparameters
# input_size = len(_X.columns)
# hidden_size = 64
# output_size = 2
# learning_rate = 0.001
# num_epochs = 100
# batch_size = 32

# # Metrics to track across folds
# fold_results = []

# for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
#     print(f"\n--- Fold {fold + 1}/{k_folds} ---")
    
#     # Split data into training and validation sets for this fold
#     X_train, y_train = torch.tensor(X[train_idx]), torch.tensor(y[train_idx])
#     X_val, y_val = torch.tensor(X[val_idx]), torch.tensor(y[val_idx])
    
#     train_dataset = TensorDataset(X_train, y_train)
#     val_dataset = TensorDataset(X_val, y_val)
    
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#     # Initialize the model, optimizer, and loss function for this fold
#     model = SimpleMLP(input_size, hidden_size, output_size).to(device)
#     optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#     criterion = nn.CrossEntropyLoss()

#     # Training loop
#     for epoch in range(num_epochs):
#         model.train()
#         running_loss = 0.0

#         for inputs, targets in train_loader:
#             inputs, targets = inputs.to(device), targets.to(device)

#             optimizer.zero_grad()
#             outputs = model(inputs)
#             loss = criterion(outputs, targets)
#             loss.backward()
#             optimizer.step()

#             running_loss += loss.item()

#         avg_train_loss = running_loss / len(train_loader)

#         # Validation loop
#         model.eval()
#         val_loss = 0.0
#         correct = 0
#         total = 0

#         with torch.no_grad():
#             for inputs, targets in val_loader:
#                 inputs, targets = inputs.to(device), targets.to(device)

#                 outputs = model(inputs)
#                 loss = criterion(outputs, targets)
#                 val_loss += loss.item()

#                 _, predicted = torch.max(outputs, 1)
#                 total += targets.size(0)
#                 correct += (predicted == targets).sum().item()

#         avg_val_loss = val_loss / len(val_loader)
#         val_accuracy = 100 * correct / total

#         if epoch % 10 == 0 or epoch == num_epochs - 1:
#             print(f"Epoch [{epoch + 1}/{num_epochs}] - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")
    
#     # Log fold results
#     fold_results.append({
#         "fold": fold + 1,
#         "val_loss": avg_val_loss,
#         "val_accuracy": val_accuracy
#     })

#     # Log metrics to MLFlow
#     with mlflow.start_run(nested=True):
#         mlflow.log_param("fold", fold + 1)
#         mlflow.log_param("input_size", input_size)
#         mlflow.log_param("hidden_size", hidden_size)
#         mlflow.log_param("learning_rate", learning_rate)
#         mlflow.log_metric("val_loss", avg_val_loss)
#         mlflow.log_metric("val_accuracy", val_accuracy)

# # Summarize cross-validation results
# mean_val_accuracy = np.mean([result["val_accuracy"] for result in fold_results])
# std_val_accuracy = np.std([result["val_accuracy"] for result in fold_results])

# print("\n--- Cross-Validation Results ---")
# for result in fold_results:
#     print(f"Fold {result['fold']}: Val Loss = {result['val_loss']:.4f}, Val Accuracy = {result['val_accuracy']:.2f}%")
# print(f"Mean Val Accuracy: {mean_val_accuracy:.2f}%, Std Val Accuracy: {std_val_accuracy:.2f}%")

# # Log overall cross-validation results
# with mlflow.start_run(nested=True):
#     mlflow.log_metric("mean_val_accuracy", mean_val_accuracy)
#     mlflow.log_metric("std_val_accuracy", std_val_accuracy)