In [1]:
# set directory 1 level up
import sys
sys.path.append('..')

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import logging
import os
from nba_career_predictor import NBACareerPredictor
from parser import setup_parser
import config
import shap
from sklearn.metrics import roc_curve, auc
import plotly.offline as pyo

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Initialize results tracking
# Initialize metrics_df with specific column names and data types
metrics_df = pd.DataFrame({
    'Model': pd.Series(dtype='str'),
    'Accuracy': pd.Series(dtype='float'),
    'Precision': pd.Series(dtype='float'),
    'Recall': pd.Series(dtype='float'),
    'F1': pd.Series(dtype='float')
})

fig = go.Figure()

# Load data
logging.info("Loading data...")
df = pd.read_csv(config.DATA_PATH)

# Initialize predictor
predictor = NBACareerPredictor()
predictors_dict = {}

# Add features
logging.info("Adding smart features...")
enhanced_df = predictor.add_features(df)

# Preprocess data
X_train, X_test, y_train, y_test = predictor.preprocess_data(enhanced_df)

In [None]:
model_name = "Logistic Regression"
# Create experiment directory
experiment_name = f"{model_name}_variance_testing"
experiment_dir = os.path.join(config.RESULTS_DIR, experiment_name)
os.makedirs(experiment_dir, exist_ok=True)

seeds = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90]

# Train and evaluate each model
for seed_number in seeds:
    logging.info("-" * 50)
    logging.info(" " * 20)
    logging.info(f"Processing {model_name}...")
    logging.info("-" * 20)
    predictors_dict[model_name] = NBACareerPredictor(model_type=model_name, seed=seed_number)
    
    # Train and evaluate model, now returns more metrics
    metrics, final_score, fpr, tpr, thresholds, youden_index, optimal_threshold, optimal_fpr, optimal_tpr = predictors_dict[model_name].train_and_test_model(
        X_train, y_train, X_test, y_test
    )
    
    # Add metrics to the table
    new_metrics_row = pd.DataFrame([{
        'Model': model_name,
        'Accuracy': metrics['accuracy'],
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1': metrics['f1']
    }])

    metrics_df = pd.concat([metrics_df, new_metrics_row], ignore_index=True)

    # Plot ROC curve for model
    fig.add_trace(go.Scatter(
        x=fpr, y=tpr, mode='lines',
        name=model_name,
        hovertemplate=("FPR: %{x:.2f}<br>"+"TPR: %{y:.2f}<br>"+"Threshold: %{customdata:.2f}<extra></extra>"),
        customdata=thresholds  # This adds the thresholds to the hover data
    ))

# Save metrics and ROC plot
fig.update_layout(title=f"ROC Curves of experiment {experiment_name}", xaxis_title="False Positive Rate", yaxis_title="True Positive Rate")

# Display metrics table and ROC curves plot
logging.info("Final metrics table:\n" + str(metrics_df))
pyo.plot(fig)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits


'temp-plot.html'