# Coach of the Year (COTY) Prediction Pipeline

This notebook implements a pipeline to predict the winner of the "Coach of the Year" award. 
The award is typically given to the coach with the best record or the most improved team.

## Steps
1. **Data Preparation**: Load data and filter for COTY awards.
2. **Feature Engineering**: Calculate `win_diff` (improvement) and other metrics.
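3. **Modeling**: Train and compare several candidate models (Logistic Regression, Random Forest, SVM, and an XGBoost pairwise ranker), then rank the coaches within each season by predicted score to pick the winner.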

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Notebook display: never truncate the column list when rendering a DataFrame.
pd.options.display.max_columns = None

## 1. Data Loading

In [None]:
# Read the three raw CSV tables used by the pipeline.
coaches = pd.read_csv('../../initial_data/coaches.csv')
awards = pd.read_csv('../../initial_data/awards_players.csv')
teams = pd.read_csv('../../initial_data/teams.csv')

# Quick sanity check on the size of each table.
for label, frame in (("Coaches:", coaches), ("Awards:", awards), ("Teams:", teams)):
    print(label, frame.shape)

## 2. Entity Resolution & Stint Handling
We filter for "Coach of the Year" awards and handle coaches who had multiple stints in a single year.

In [None]:
# Keep only the "Coach of the Year" rows. The awards table stores the recipient
# under `playerID`, so rename it to `coachID` to line up with the coaches table,
# and tag every remaining row as a positive example.
coty_awards = (
    awards.loc[awards['award'] == 'Coach of the Year']
    .rename(columns={'playerID': 'coachID'})
    .assign(is_coty=1)
)

# Stint handling: reduce to one row per (coachID, year).
# Rows are ordered by games coached (won + lost), most first, so keeping the
# first duplicate retains the stint with the most games. Stats from the other
# stints are dropped, not summed.
coaches['games'] = coaches['won'] + coaches['lost']
coaches_agg = (
    coaches
    .sort_values(by='games', ascending=False)
    .drop_duplicates(subset=['coachID', 'year'], keep='first')
)

print("Unique Coach-Seasons:", len(coaches_agg))

## 3. Feature Engineering
We calculate key predictors:
- **win_diff**: Improvement from the previous season (Current Wins - Previous Wins).
- **conf_rank**: Conference rank (lower is better).

In [None]:
# Team-season metrics: win percentage plus the change in wins versus the
# team's previous row (rows are ordered by tmID, then year).
teams_metrics = (
    teams[['tmID', 'year', 'won', 'lost', 'rank', 'seeded']]
    .assign(win_pct=lambda t: t['won'] / (t['won'] + t['lost']))
    .sort_values(['tmID', 'year'])
)

# `shift(1)` within each team yields the wins of that team's preceding row.
# A team's first row has no predecessor, so its win_diff is set to 0.
teams_metrics['prev_won'] = teams_metrics.groupby('tmID')['won'].shift(1)
teams_metrics['win_diff'] = (teams_metrics['won'] - teams_metrics['prev_won']).fillna(0)

# Attach team metrics to each coach-season. Columns present in both frames
# keep the coach value under the plain name and the team value under `*_team`.
df = coaches_agg.merge(
    teams_metrics,
    on=['tmID', 'year'],
    how='left',
    suffixes=('', '_team'),
)

# Attach the target: coach-seasons without an award row become 0.
df = df.merge(
    coty_awards[['coachID', 'year', 'is_coty']],
    on=['coachID', 'year'],
    how='left',
)
df['is_coty'] = df['is_coty'].fillna(0)

# Friendlier aliases for the team-level columns.
df = df.assign(
    team_won=df['won_team'],
    team_lost=df['lost_team'],
    conf_rank=df['rank'],
)

# Coach-seasons that found no matching team row cannot be used.
df = df.dropna(subset=['team_won', 'conf_rank'])

print("Final Dataset Shape:", df.shape)
df[['coachID', 'year', 'tmID', 'won', 'win_diff', 'conf_rank', 'is_coty']].head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, average_precision_score
import xgboost as xgb

features = ['won', 'lost', 'win_diff', 'conf_rank']
target = 'is_coty'

# Chronological hold-out: everything from `test_year` onwards (i.e. the last
# two seasons present in `df`) is kept out of training.
max_year = df['year'].max()
test_year = max_year - 1

print("Test Year:", test_year)

train_df = df.loc[df['year'].lt(test_year)].copy()
test_df = df.loc[df['year'].ge(test_year)].copy()

X_train, y_train = train_df[features], train_df[target]
X_test, y_test = test_df[features], test_df[target]

# Standardize using statistics learned from the training rows only, then apply
# the same transform to the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Shared seed for every stochastic estimator below.
random_state = 99

# Accumulates one metrics dict per classifier (filled by the model cells).
metrics_list = []

def print_metrics(y_true, y_pred, model_name):
    """
    Print and return the binary classification metrics for one model.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    model_name : label used in the printed header and in the returned record.

    Returns
    -------
    dict with keys "Model", "Accuracy", "Precision", "Recall" and "F1".
    Precision, recall and F1 are reported as 0 when they are undefined
    (`zero_division=0`).
    """
    # Keyed by the label shown in the printed report, in print order.
    report = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1-Score": f1_score(y_true, y_pred, zero_division=0),
    }

    print(f"\n--- {model_name} Metrics ---")
    for label, value in report.items():
        print(f"{label}: {value:.4f}")

    return {
        "Model": model_name,
        "Accuracy": report["Accuracy"],
        "Precision": report["Precision"],
        "Recall": report["Recall"],
        "F1": report["F1-Score"],
    }

In [None]:
# 1. Logistic Regression
# Search over regularization strength and solver; candidates are compared by
# cross-validated F1 on the scaled training rows.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)

lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=random_state,
)

grid_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_lr.fit(X_train_s, y_train)

# Evaluate the refit best estimator on the held-out rows.
best_lr = grid_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test_s)

print(f"Best LR Params: {grid_lr.best_params_}")
metrics_list.append(print_metrics(y_test, y_pred_lr, 'Logistic Regression'))

In [None]:
# 2. Random Forest
# Search over forest size, tree depth, split threshold and per-split feature
# count; candidates are compared by cross-validated F1.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestClassifier(class_weight='balanced', random_state=random_state)

grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train_s, y_train)

# Evaluate the refit best estimator on the held-out rows.
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_s)

print(f"Best RF Params: {grid_rf.best_params_}")
metrics_list.append(print_metrics(y_test, y_pred_rf, 'Random Forest'))

In [None]:
# 3. Support Vector Machine
# Search over C, gamma and kernel type; candidates are compared by
# cross-validated F1. `probability=True` is needed later for predict_proba.
param_grid_svc = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    gamma=['scale', 'auto'],
    kernel=['rbf', 'linear', 'sigmoid'],
)

svc = SVC(class_weight='balanced', probability=True, random_state=random_state)

grid_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid_svc,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_svc.fit(X_train_s, y_train)

# Evaluate the refit best estimator on the held-out rows.
best_svc = grid_svc.best_estimator_
y_pred_svc = best_svc.predict(X_test_s)

print(f"Best SVC Params: {grid_svc.best_params_}")
metrics_list.append(print_metrics(y_test, y_pred_svc, 'SVC'))

In [None]:
# 4. XGBoost
# Pairwise ranker: each season is one query group and the award winner is the
# relevant item within it.
best_xgb = xgb.XGBRanker(
    objective='rank:pairwise',
    learning_rate=0.01,
    n_estimators=100,
    max_depth=5,
    random_state=random_state,
)

# `group` is a list of consecutive group sizes, so rows belonging to the same
# season must be contiguous and appear in the same order as the sizes.
# train_df is NOT ordered by year (it inherits the games-descending order used
# for stint de-duplication), so reorder the rows by year before fitting.
# groupby('year') returns sizes in ascending year order, which matches the
# ascending stable sort below.
train_order = np.argsort(train_df['year'].to_numpy(), kind='stable')

groups_train = train_df.groupby('year').size().to_list()
groups_test = test_df.groupby('year').size().to_list()

best_xgb.fit(
    X_train_s[train_order],
    y_train.iloc[train_order],
    group=groups_train,
)

# Scoring is per-row, so the held-out rows can stay in their original order.
y_pred_xgb = best_xgb.predict(X_test_s)

In [None]:
# Registry of fitted candidates, keyed by the display name used in every
# comparison table below. The 'XGBRanker' key is matched by name in later
# cells, so it must not change.
# The SVM label is derived from the kernel the grid search actually selected
# (rbf / linear / sigmoid) instead of assuming RBF; it is still 'SVM (RBF)'
# whenever the RBF kernel wins.
models = {
    'Logistic Regression': best_lr,
    'Random Forest': best_rf,
    f'SVM ({best_svc.kernel.upper()})': best_svc,
    'XGBRanker': best_xgb
}

In [None]:
from sklearn.metrics import roc_curve, auc

predicted_res = {}

def calculate_ranking_metrics(model, X, y, df_meta):
    """
    Rank candidates within each year by model score and summarise where the
    actual award winner lands.

    Parameters
    ----------
    model : fitted estimator. `predict_proba(X)[:, 1]` is used as the score
        when the estimator has `predict_proba`; otherwise `predict(X)` is used.
    X : feature matrix whose rows line up positionally with `df_meta`.
    y : not used by the computation; kept in the signature for callers.
    df_meta : DataFrame with 'year', 'coachID' and 'is_coty' columns. It is
        copied, never modified.

    Returns
    -------
    dict with keys 'Top-1 Acc', 'Top-3 Acc', 'Avg Rank' and 'MRR', computed
    over the years that contain a winner. Every value is 0 when no year has
    a winner.
    """
    scored = df_meta.copy()
    scored['pred_score'] = (
        model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else model.predict(X)
    )

    # Rank (1 = best score) achieved by the true winner in each usable year.
    winner_ranks = []

    # sort=False keeps the years in first-appearance order.
    for _, season in scored.groupby('year', sort=False):
        # Years without any recorded winner cannot be evaluated.
        if season['is_coty'].sum() == 0:
            continue

        # If several rows are flagged, only the first one is evaluated.
        actual_winner_id = season.loc[season['is_coty'] == 1, 'coachID'].values[0]

        # Highest score first; after the reset, position p corresponds to rank p + 1.
        ordered = season.sort_values('pred_score', ascending=False).reset_index(drop=True)
        winner_ranks.append(ordered.index[ordered['coachID'] == actual_winner_id][0] + 1)

    n = len(winner_ranks)
    if n == 0:
        return {'Top-1 Acc': 0, 'Top-3 Acc': 0, 'Avg Rank': 0, 'MRR': 0}

    top1_hits = sum(1 for rank in winner_ranks if rank == 1)
    top3_hits = sum(1 for rank in winner_ranks if rank <= 3)
    return {
        'Top-1 Acc': top1_hits / n,
        'Top-3 Acc': top3_hits / n,
        'Avg Rank': np.mean(winner_ranks),
        'MRR': np.mean([1 / rank for rank in winner_ranks]),
    }

# Compare every candidate on two axes:
#   * ROC AUC on the held-out rows (generalization), and
#   * within-season ranking quality on the training rows (fit to history).
results_list = []
plt.figure(figsize=(10, 8))

# Only these columns are needed to rank candidates per season.
train_meta = train_df[['year', 'coachID', 'is_coty']].reset_index(drop=True)

for name, model in models.items():
    # The ranker has no predict_proba, so its raw output is used as the score.
    y_prob = (
        model.predict(X_test_s)
        if name == "XGBRanker"
        else model.predict_proba(X_test_s)[:, 1]
    )

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    rank_metrics = calculate_ranking_metrics(model, X_train_s, y_train, train_meta)

    # One summary row per model.
    results_list.append(dict([
        ('Model', name),
        ('AUC (Test)', roc_auc),
        ('Avg Winner Rank (Train)', rank_metrics['Avg Rank']),
        ('Top-1 Acc (Train)', rank_metrics['Top-1 Acc']),
        ('Top-3 Acc (Train)', rank_metrics['Top-3 Acc']),
        ('MRR (Train)', rank_metrics['MRR']),
    ]))

    # Add this model's ROC curve to the shared figure.
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line and figure cosmetics.
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc="lower right")
plt.show()

# --- Display Final Table ---
# Order by average winner rank on train (lower is better), breaking ties by
# test AUC (higher is better).
results_df = pd.DataFrame(results_list).sort_values(
    by=['Avg Winner Rank (Train)', 'AUC (Test)'],
    ascending=[True, False],
)

print("\n--- Comprehensive Model Evaluation ---")
# Per-column number formats for readability.
column_formats = {
    'AUC (Test)': '{:.3f}',
    'Top-1 Acc (Train)': '{:.1%}',
    'Top-3 Acc (Train)': '{:.1%}',
    'Avg Winner Rank (Train)': '{:.1f}',
    'MRR (Train)': '{:.3f}'
}
display(results_df.style.format(column_formats))

**Summary of the Strategy**
- Strong train ranking (the actual winner is ranked near the top) + high test AUC = great model. It learned the historical pattern and generalizes to unseen seasons.

- Strong train ranking + low test AUC = overfitting. It memorized the history but fails on new data.

- Weak train ranking + high test AUC = underfitting or luck. It did not learn the history well, but happened to score well on the held-out seasons.

In [None]:
# Year-by-year report on the held-out seasons: for every model, show where the
# actual winner landed in the model's ranking, then summarise per model.
# One record per model is collected and the summary frame is built once at the
# end, instead of growing a DataFrame row by row.
summary_records = []

for name, model in models.items():
    full_viz_df = test_df.copy()

    # Score (the ranker has no predict_proba; use its raw output)
    if name == "XGBRanker":
        full_viz_df['score'] = model.predict(X_test_s)
    else:
        full_viz_df['score'] = model.predict_proba(X_test_s)[:, 1]

    # Year-by-Year Analysis
    unique_years = sorted(full_viz_df['year'].unique())
    cols_to_show = ['Predicted_Rank', 'coachID', 'tmID', 'won', 'win_diff', 'score']

    # Mutually exclusive outcome buckets: rank 1 / rank 2-3 / rank 4 or worse.
    missed = 0
    top_1 = 0
    top_3 = 0
    # Winner's predicted rank for each year that actually has a winner.
    winner_ranks = []

    print(f"--- Year-by-Year Precision Analysis: {name} ---")

    for yr in unique_years:
        yr_data = full_viz_df[full_viz_df['year'] == yr].copy()

        # Create Rankings (highest score = rank 1)
        yr_data_sorted = yr_data.sort_values('score', ascending=False).reset_index(drop=True)
        yr_data_sorted['Predicted_Rank'] = yr_data_sorted.index + 1

        # Find Actual Winner
        winner_row = yr_data[yr_data['is_coty'] == 1]

        if not winner_row.empty:
            winner_id = winner_row['coachID'].values[0]
            winner_team = winner_row['tmID'].values[0]

            # Find Model's Rank
            pred_rank = yr_data_sorted[yr_data_sorted['coachID'] == winner_id]['Predicted_Rank'].values[0]

            winner_ranks.append(pred_rank)

            # Status Icon
            if pred_rank == 1:
                top_1 += 1
                status = "✅ PERFECT (Rank 1)"
            elif pred_rank <= 3:
                top_3 += 1
                status = f"⚠️ Top 3 (Rank {pred_rank})"
            else:
                missed += 1
                status = f"❌ Missed (Rank {pred_rank})"

            print(f"\nYear {yr}: \n\tWinner = {winner_id} ({winner_team}) | {status}")

            # Display Top Prediction vs Winner (if different)
            if pred_rank > 1:
                print("\tModel Pick:")
                display(yr_data_sorted.head(1)[cols_to_show])
                print("\tActual Winner:")
                display(yr_data_sorted[yr_data_sorted['coachID'] == winner_id][cols_to_show])
            else:
                display(yr_data_sorted.head(1)[cols_to_show])

        else:
            print(f"\nYear {yr}: [No Winner Recorded]")

    # Average only over years that have a winner. Dividing by every year would
    # understate the average whenever a season has no recorded winner, and
    # would raise ZeroDivisionError on an empty test set.
    avg_rank = float(np.mean(winner_ranks)) if winner_ranks else np.nan

    summary_records.append({
        'Model': name,
        'Top 1': top_1,
        'Top 3': top_3,
        'Missed': missed,
        'Avg Winner Rank': avg_rank,
    })

winning_predicts = pd.DataFrame(
    summary_records,
    columns=['Model', 'Top 1', 'Top 3', 'Missed', 'Avg Winner Rank'],
)

# Bring in the test AUC so it can act as the final tie-breaker.
winning_predicts = winning_predicts.merge(results_df[['Model', 'AUC (Test)']], on='Model', how='left')
# Best model first: most rank-1 hits, then lowest average winner rank, then highest AUC.
winning_predicts = winning_predicts.sort_values(['Top 1', 'Avg Winner Rank', 'AUC (Test)'], ascending=[False, True, False]).reset_index(drop=True)

print("\n--- Final Year-by-Year Precision Analysis ---")
display(winning_predicts)

In [None]:
# Refit the top-ranked candidate on every available season.
best_model_name = winning_predicts.iloc[0]['Model']

# Rows must be ordered by year so that the ranker's consecutive group sizes
# line up with the rows.
full_history_df = df.copy().sort_values('year').reset_index(drop=True)

# Define Features and Target
X_full = full_history_df[features]
y_full = full_history_df[target]

groups_full = full_history_df.groupby('year', sort=False).size().to_list()

# Every candidate was tuned on standardized features, so the final fit must
# see standardized features too. Refit the scaler on the full history and
# apply `final_scaler.transform` to any future inputs before predicting.
final_scaler = StandardScaler().fit(X_full)
X_full_s = final_scaler.transform(X_full)

# Train final model on an unfitted copy with the same hyper-parameters, so
# the evaluated estimator stored in `models` is not overwritten and the cells
# above stay reproducible.
final_model = clone(models[best_model_name])

# XGBRanker needs group info
if isinstance(final_model, xgb.XGBRanker):
    final_model.fit(X_full_s, y_full, group=groups_full)
else:
    final_model.fit(X_full_s, y_full)

# Report the year span actually present in the data rather than a fixed range.
first_year = int(full_history_df['year'].min())
last_year = int(full_history_df['year'].max())
print(f"Final Model ({best_model_name}) trained on full history (Years {first_year}-{last_year}). Ready for Year {last_year + 1} inputs.")