In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve


ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Folder (relative path) containing the per-season CSV files; it is passed to
# preprocess_all_seasons, which builds the individual file paths from it.
data_folder = "processed_data"

def preprocess_mip_data(current_season, previous_season):
    """Pair each player's current-season row with their previous-season row.

    The two frames are inner-joined on the 'Player' column; every other
    column shared by both frames gets a '_curr' / '_prev' suffix.

    Args:
        current_season: DataFrame with at least 'Player', 'PTS', 'TRB', 'AST'.
        previous_season: DataFrame with the same columns for the prior season.

    Returns:
        The merged DataFrame with four extra columns: 'PTS_improvement',
        'TRB_improvement', 'AST_improvement' (current minus previous) and
        'Improvement_Score' (the sum of those three deltas).
    """
    merged = pd.merge(
        left=current_season,
        right=previous_season,
        on='Player',
        suffixes=('_curr', '_prev'),
    )

    # Year-over-year delta for each tracked stat, accumulated in the order
    # PTS, TRB, AST to form the overall score.
    running_total = None
    for stat in ('PTS', 'TRB', 'AST'):
        delta = merged[f'{stat}_curr'] - merged[f'{stat}_prev']
        merged[f'{stat}_improvement'] = delta
        running_total = delta if running_total is None else running_total + delta

    merged['Improvement_Score'] = running_total
    return merged


In [None]:
def preprocess_all_seasons(data_folder, start_year, end_year):
    """Build year-over-year improvement rows for every consecutive season pair.

    For each year in ``range(start_year, end_year)`` (``end_year`` itself is
    excluded) a file named
    ``{data_folder}/nba_player_stats_{year}-{yy}_processed.csv`` is expected,
    where ``yy`` is the last two digits of ``year + 1``. Each adjacent pair of
    files that both exist is passed to ``preprocess_mip_data``; pairs with a
    missing file are reported on stdout and skipped.

    Args:
        data_folder: Directory containing the processed season CSV files.
        start_year: Starting year of the first season to consider.
        end_year: Exclusive upper bound on the season starting year.

    Returns:
        A single DataFrame concatenating the results for all processed pairs,
        with a fresh 0..n-1 index.

    Raises:
        ValueError: If no consecutive season pair could be loaded (no files
            found, or the year range yields fewer than two seasons).
    """
    season_files = [
        f"{data_folder}/nba_player_stats_{year}-{str(year+1)[-2:]}_processed.csv"
        for year in range(start_year, end_year)
    ]
    processed_dfs = []

    # The "current" file of one pair is the "previous" file of the next pair.
    # Remember the last frame read so each CSV is parsed only once. This is
    # safe because preprocess_mip_data merges into a new frame and does not
    # modify its inputs.
    cached_path, cached_df = None, None

    for previous_file, current_file in zip(season_files, season_files[1:]):
        if not (os.path.exists(current_file) and os.path.exists(previous_file)):
            print(f"Files {current_file} or {previous_file} not found. Skipping.")
            continue

        if cached_path == previous_file:
            previous_season = cached_df
        else:
            previous_season = pd.read_csv(previous_file)
        current_season = pd.read_csv(current_file)

        processed_dfs.append(preprocess_mip_data(current_season, previous_season))
        cached_path, cached_df = current_file, current_season

    if not processed_dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but an actionable message.
        raise ValueError(
            f"No consecutive season pairs found in '{data_folder}' "
            f"for years {start_year}-{end_year}."
        )

    return pd.concat(processed_dfs, ignore_index=True)

In [None]:

# Training rows come from season files whose starting year is 1980..2014;
# test rows from files starting 2016..2023 (end_year is exclusive).
# NOTE(review): with these bounds the 2015-16 season file is never read by
# either split -- confirm that this gap is intentional.
train_df = preprocess_all_seasons(data_folder, start_year=1980, end_year=2015)
test_df = preprocess_all_seasons(data_folder, start_year=2016, end_year=2024)

# Quick sanity check of the merged schema for both splits.
for split_name, split_df in (("train_df", train_df), ("test_df", test_df)):
    print(f"Columns in {split_name}:", split_df.columns)

# Function to mark the top-N (default 5) MIP candidates per season
def mark_mip_candidates(df, top_n=5):
    """Flag the most improved players of every season.

    Adds (or overwrites) an integer 'MIP_Class' column: 1 for the ``top_n``
    rows with the largest 'Improvement_Score' within each distinct
    'Season_curr' value, 0 for every other row.

    Note:
        ``df`` is modified in place and also returned, so callers may either
        reassign the result or rely on the mutation.

    Args:
        df: DataFrame with 'Season_curr' and 'Improvement_Score' columns.
        top_n: Number of rows to flag per season. Defaults to 5.

    Returns:
        The same DataFrame object, with the 'MIP_Class' column set.
    """
    # Gather the winning index labels season by season before touching df,
    # then assign the labels in a single pass.
    winner_labels = []
    for season in df['Season_curr'].unique():
        season_rows = df[df['Season_curr'] == season]
        winner_labels.extend(season_rows.nlargest(top_n, 'Improvement_Score').index)

    df['MIP_Class'] = 0
    if winner_labels:
        df.loc[winner_labels, 'MIP_Class'] = 1
    return df

In [None]:


# Label the top improvers of each season in both datasets
train_df = mark_mip_candidates(train_df)
test_df = mark_mip_candidates(test_df)

print("Training Data MIP_Class Distribution:")
print(train_df['MIP_Class'].value_counts(normalize=True))

# NOTE(review): MIP_Class is derived from Improvement_Score (see
# mark_mip_candidates) and that column is also a feature here. The
# DummyClassifier baseline ignores its inputs, but a real model trained on
# these features would see the label directly.
features = ['PTS_improvement', 'TRB_improvement', 'AST_improvement', 'Improvement_Score']
X = train_df[features]
y = train_df['MIP_Class']

# Stratified 80/20 split so both parts keep the same positive-class ratio
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42, stratify=y)

# Handling missing values (imputer is fitted on the training part only)
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns
)

# Standardise features; fitted on a DataFrame, so the scaler records column names
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)

# Handling class imbalance with SMOTE
class_counts = y_train.value_counts()
print(f"Class distribution before SMOTE: {class_counts}")

# .get avoids a KeyError when the training split contains no positive rows,
# so that case falls through to the "skip oversampling" branch below.
minority_class_size = class_counts.get(1, 0)
if minority_class_size > 1:
    # SMOTE needs k_neighbors < number of minority samples
    k_neighbors = min(5, minority_class_size - 1)
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
    print(f"Class distribution after SMOTE: {pd.Series(y_train_balanced).value_counts()}")
else:
    print("Not enough samples in the minority class for SMOTE. Skipping oversampling.")
    X_train_balanced, y_train_balanced = X_train_scaled, y_train

# Training a DummyClassifier (random baseline that samples labels from the
# class distribution of the data it was fitted on)
dummy_clf = DummyClassifier(strategy='stratified', random_state=42)
dummy_clf.fit(X_train_balanced, y_train_balanced)




In [None]:
# Score every player of the 2023-24 season with the fitted baseline
season_2024 = test_df[test_df['Season_curr'] == '2023-24'].copy()

if season_2024.empty:
    print("No data available for the 2023–24 season.")
else:
    X_2024 = season_2024[features]

    # Reuse the imputer/scaler fitted on the training split (no refitting)
    X_2024_imputed = pd.DataFrame(imputer.transform(X_2024), columns=X_2024.columns)
    X_2024_scaled = scaler.transform(X_2024_imputed)

    # Column 1 of predict_proba is the positive (MIP_Class == 1) class
    pred_proba = dummy_clf.predict_proba(X_2024_scaled)[:, 1]
    season_2024['MIP_Probability'] = pred_proba

    # Show the top 5 predictions
    top_5 = season_2024.nlargest(5, 'MIP_Probability')[
        ['Player', 'Team_curr', 'MIP_Probability', 'PTS_improvement', 'TRB_improvement', 'AST_improvement', 'Improvement_Score']
    ]
    print("\nTop 5 Predicted MIP Candidates for 2023–24 Season:")
    print(top_5.round(3))

    # Highest-probability row is reported as the predicted winner
    mip_winner = top_5.iloc[0]
    print("\nPredicted Most Improved Player for 2023–24:")
    print(f"Player: {mip_winner['Player']}")
    print(f"Team: {mip_winner['Team_curr']}")
    print(f"Probability: {mip_winner['MIP_Probability']:.3f}")
    print(f"PTS Improvement: {mip_winner['PTS_improvement']}")
    print(f"TRB Improvement: {mip_winner['TRB_improvement']}")
    print(f"AST Improvement: {mip_winner['AST_improvement']}")


# Validation metrics. The held-out features go through the same
# impute -> DataFrame -> scale path as above so the scaler (fitted on a
# DataFrame) receives matching column names instead of a bare array.
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_val_scaled = scaler.transform(X_val_imputed)

y_pred = dummy_clf.predict(X_val_scaled)
# ROC-AUC is a ranking metric: feed it positive-class scores, not hard labels
y_val_proba = dummy_clf.predict_proba(X_val_scaled)[:, 1]

print("\nValidation Set Performance:")
print(f"Recall Score: {recall_score(y_val, y_pred):.3f}")
print(f"ROC-AUC Score: {roc_auc_score(y_val, y_val_proba):.3f}")



In [None]:
# # Plotting ROC Curve
# fpr, tpr, thresholds = roc_curve(y_val, y_pred)
# roc_auc = roc_auc_score(y_val, y_pred)

# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.3f})')
# plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
# plt.xlabel('False Positive Rate (FPR)')
# plt.ylabel('True Positive Rate (TPR)')
# plt.title('Receiver Operating Characteristic (ROC) Curve')
# plt.legend(loc="lower right")
# plt.grid(True)
# plt.show()