In [None]:
# Import libraries
import pandas as pd
import os

# Where the processed CSV lives, relative to this notebook
data_path, season_file = "../processed-data", "combined_data_w_target.csv"
file_path = os.path.join(data_path, season_file)

# Read the combined dataset into a DataFrame
combined_data_w_target = pd.read_csv(file_path)

# Echo the frame as the cell output to confirm that the load worked
combined_data_w_target

In [None]:
# Show the column labels of the loaded frame so the available fields are visible
combined_data_w_target.columns

In [None]:
# Create SMOY Winner column
# Flag (1/0) every row whose 'Awards' text contains a first-place '6MOY' entry,
# i.e. the token '6MOY-1'.
# The negative lookahead (?!\d) keeps '6MOY-10', '6MOY-11', ... from being counted
# as winners, which a plain substring test on '6MOY-1' would do.
# NOTE(review): assumes award entries are written as '<award>-<rank>' — confirm
# against the actual contents of the 'Awards' column.
combined_data_w_target['SMOY_Winner'] = (
    combined_data_w_target['Awards']
    .astype(str)  # missing values become text (e.g. 'nan'), which never matches
    .str.contains(r'6MOY-1(?!\d)', regex=True)
    .astype(int)
)

# Confirm the new column is present and only holds the expected label values
print(combined_data_w_target.columns)
combined_data_w_target['SMOY_Winner'].unique()


In [4]:
# Predictor columns used by the model, in the order they are fed to it.
# The disabled (commented-out) variant of this list additionally contained the
# columns 'Pos_C', 'Pos_PF', 'Pos_PG', 'Pos_SF', 'Pos_SG' and 'SMOY_Eligible'.
# The list is built in three chunks purely for readability.
features = (
    ['Age', 'G', 'GS', 'MP']
    + ['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%',
       'eFG%', 'FT', 'FTA', 'FT%', 'TS%']
    + ['EFF', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
)

# Label column, kept as a one-element list
target = ['SMOY_Winner']

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE  # For oversampling
from sklearn.model_selection import StratifiedKFold

# Defining target (y) and features (X)
# Only the columns named in `features` are used as predictors; Player, Team, Season,
# Award (not useful unless transformed) and candidate/winner (leakage) are left out.
X = combined_data_w_target[features]
y = combined_data_w_target[target]  # Target; a one-column DataFrame because `target` is a list

# Data Splits: hold out 20% for testing. stratify=y keeps the class proportions
# the same in the train and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature Scaling: the scaler is fitted on the training portion only and then
# reused on the test portion, so no test-set statistics enter training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Estimators expect a 1-D label vector; handing them the one-column frame makes
# scikit-learn emit DataConversionWarning. Flatten the training labels once here.
y_train_1d = y_train.to_numpy().ravel()

# Handle class imbalance with SMOTE (oversampling the minority class).
# Only the training data is resampled; the test set keeps its original class mix.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train_1d)

# Model Creation
# NOTE(review): after SMOTE the training classes should already be equal in size,
# so class_weight='balanced' likely adds little or nothing — confirm both are wanted.
rfc_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Fit model on resampled data
rfc_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the untouched (scaled, not resampled) test set
y_pred = rfc_model.predict(X_test_scaled)

# Accuracy (with a rare positive class this is dominated by the majority class,
# so read it together with the per-class report below)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Classification report (precision, recall, F1-score, etc.)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Visualize the confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Confusion matrix visualization
# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted class)
# even if a class is missing from y_test / y_pred, so the two tick labels and the
# cm[i, j] lookups below are always valid.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=['Non-SMOY', 'SMOY Winner'],
    yticklabels=['Non-SMOY', 'SMOY Winner'],
    ax=ax,  # draw on the axes created above rather than on the implicit current axes
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

# Derive the headline numbers from the matrix itself so the printed findings
# always match the current data and model instead of a past run.
true_neg, false_pos, false_neg, true_pos = (int(v) for v in cm.ravel())
n_test = true_neg + false_pos + false_neg + true_pos
overall_accuracy_pct = (true_pos + true_neg) / n_test * 100 if n_test > 0 else 0.0
majority_share_pct = (true_neg + false_pos) / n_test * 100 if n_test > 0 else 0.0
minority_recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0.0
minority_precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0.0

# Key Insights
print("\nKey Findings:")
print(f"1. Overall accuracy is {overall_accuracy_pct:.2f}%; the majority class (Non-SMOY) makes up {majority_share_pct:.2f}% of the test set.")
print(f"2. Minority class (SMOY Winner): recall {minority_recall:.2f}, precision {minority_precision:.2f}.")
print(f"3. Confusion matrix shows {false_neg} false negatives (SMOY Winner misclassified) and {false_pos} false positives (Non-SMOY misclassified).")

# Calculate and display percentage of correct predictions for the SMOY Winner class
# (this is the recall of the positive class, expressed as a percentage)
correct_smooy_winner = cm[1, 1]  # True positives
total_smooy_winner = cm[1, 0] + cm[1, 1]  # Total actual SMOY Winners
smooy_winner_accuracy = correct_smooy_winner / total_smooy_winner * 100 if total_smooy_winner > 0 else 0

print(f"\nSMOY Winner Prediction Accuracy: {smooy_winner_accuracy:.2f}%")