# Sixth Woman of the Year Award

In [126]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from collections import defaultdict

In [127]:
# ================================
# LOAD DATA
# ================================
# Read the raw CSV tables into module-level DataFrames. The paths are relative,
# so they resolve against the current working directory of the notebook.
# `awards`, `players_teams` and `teams` are the frames passed to the functions
# called in the cells shown below; the other frames are loaded here but are not
# referenced in the visible cells.
awards = pd.read_csv("../../initial_data/awards_players.csv")
coaches = pd.read_csv("../../initial_data/coaches.csv")
players_teams = pd.read_csv("../../initial_data/players_teams.csv")
players = pd.read_csv("../../initial_data/players.csv")
series_post = pd.read_csv("../../initial_data/series_post.csv")
teams_post = pd.read_csv("../../initial_data/teams_post.csv")
teams = pd.read_csv("../../initial_data/teams.csv")


## Feature Engineering

In [128]:
# ======================================================
# HELPER FUNCTIONS
# ======================================================
def calculate_player_score(df):
    """Return a copy of ``df`` with Game Score columns appended.

    The score is a linear combination of the box-score columns ``points``,
    ``fgMade``, ``fgAttempted``, ``ftAttempted``, ``ftMade``, ``oRebounds``,
    ``dRebounds``, ``steals``, ``assists``, ``blocks``, ``PF`` and
    ``turnovers``. Two columns are added to the returned copy:

    * ``Game_Score_Total`` - the combined score.
    * ``Game_Score_Per_Minute`` - the score divided by ``minutes``; rows with
      zero minutes get NaN instead of a division by zero.

    The input frame is not modified.
    """
    # Accumulate term by term, in the same order as the formula is usually
    # written, so floating-point results do not depend on regrouping.
    score = df["points"] + 0.4 * df["fgMade"]
    score = score - 0.7 * df["fgAttempted"]
    score = score - 0.4 * (df["ftAttempted"] - df["ftMade"])
    score = score + 0.7 * df["oRebounds"]
    score = score + 0.3 * df["dRebounds"]
    score = score + df["steals"]
    score = score + 0.7 * df["assists"]
    score = score + 0.7 * df["blocks"]
    score = score - 0.4 * df["PF"]
    score = score - df["turnovers"]

    scored = df.copy()
    # Zero minutes are mapped to NaN so the ratio is NaN rather than +/-inf.
    playable_minutes = scored["minutes"].replace(0, np.nan)
    scored.loc[:, "Game_Score_Total"] = score
    scored.loc[:, "Game_Score_Per_Minute"] = score / playable_minutes
    return scored

def build_player_features(players_teams_df, year):
    """Build one feature row per player from the season before ``year``.

    Rows of ``players_teams_df`` with ``year == year - 1`` are grouped by
    ``playerID``. Counting stats are summed across a player's rows (stints),
    and ``tmID`` is taken from the player's first row. Game Score columns are
    then added via ``calculate_player_score``.

    Returns a DataFrame with columns ``playerID``, ``tmID``, ``minutes``,
    ``Game_Score_Total`` and ``Game_Score_Per_Minute``.
    """
    summed_stats = [
        "minutes", "points", "oRebounds", "dRebounds", "assists", "steals",
        "blocks", "turnovers", "PF", "fgAttempted", "fgMade", "ftAttempted",
        "ftMade",
    ]
    # The first stint's team stands in for the player's team.
    aggregation = {"tmID": "first", **{stat: "sum" for stat in summed_stats}}

    prior_season = players_teams_df.loc[players_teams_df["year"] == year - 1].copy()
    per_player = prior_season.groupby("playerID").agg(aggregation).reset_index()
    per_player = calculate_player_score(per_player)

    feature_columns = [
        "playerID", "tmID", "minutes", "Game_Score_Total", "Game_Score_Per_Minute"
    ]
    return per_player[feature_columns]

def build_team_features(players_teams_df, teams_df, year):
    """Compute a per-team weighted Game Score from the previous season.

    For every ``tmID`` that appears in ``teams_df`` for ``year``, the rows of
    ``players_teams_df`` for that team in ``year - 1`` are scored with
    ``calculate_player_score``. The team value is the average of
    ``Game_Score_Per_Minute`` weighted by ``Game_Score_Total``.

    A team gets a score of 0 when it has no rows in the previous season or
    when its summed ``Game_Score_Total`` is 0 or NaN.

    Returns a DataFrame with columns ``tmID`` and ``Team_Weighted_Score``.
    The columns are present even when no team matches ``year``, so callers
    can always select them.
    """
    columns = ["tmID", "Team_Weighted_Score"]
    team_ids = teams_df.loc[teams_df["year"] == year, "tmID"].unique()

    # The season filter does not depend on the team, so apply it once.
    prev_season = players_teams_df[players_teams_df["year"] == year - 1]

    team_features = []
    for team in team_ids:
        df_team = prev_season[prev_season["tmID"] == team]
        weighted_score = 0
        if not df_team.empty:
            df_team = calculate_player_score(df_team)
            total_weight = df_team["Game_Score_Total"].sum()
            if total_weight != 0 and not np.isnan(total_weight):
                weighted_sum = (
                    df_team["Game_Score_Per_Minute"] * df_team["Game_Score_Total"]
                ).sum()
                weighted_score = weighted_sum / total_weight
        team_features.append({"tmID": team, "Team_Weighted_Score": weighted_score})

    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(team_features, columns=columns)

def merge_player_team_features_sixth(player_features, team_features):
    """Attach each player's team score to the player feature table.

    Only ``tmID`` and ``Team_Weighted_Score`` are taken from
    ``team_features`` (duplicate rows removed). The join is a left join on
    ``tmID``, so every player row is kept and players whose team has no
    score receive NaN.
    """
    team_scores = team_features[["tmID", "Team_Weighted_Score"]].drop_duplicates()
    return pd.merge(player_features, team_scores, how="left", on="tmID")

In [129]:
# ======================================================
# SIXTH WOMAN PRESEASON PIPELINE
# ======================================================

from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# ------------------------------
# 1️⃣ Build historical dataset
# ------------------------------
def build_sixth_woman_dataset(players_teams_df, teams_df, awards_df, start_year, end_year):
    """Build the labelled training table for Sixth Woman of the Year.

    For every season in ``start_year..end_year`` (inclusive) the function:

    1. builds per-player features from the previous season
       (``build_player_features``),
    2. builds per-team scores (``build_team_features``) and left-joins them
       on ``tmID``,
    3. adds ``Bench_Fraction`` = 1 - player minutes / summed minutes of all
       players sharing the same ``tmID`` in the feature table,
    4. adds the label ``Sixth_Woman`` (1 if the player has a
       "Sixth Woman of the Year" row in ``awards_df`` for that season,
       else 0).

    The function does not check that the award was given in a season; the
    caller chooses the year range. Seasons without a matching award row
    simply contribute only 0 labels.

    Returns the per-season tables concatenated with a fresh index.
    ``pd.concat`` raises ``ValueError`` when the year range is empty.
    """
    all_data = []

    # Filter awards for the Sixth Woman
    awards_6th = awards_df[awards_df["award"] == "Sixth Woman of the Year"]

    for year in range(start_year, end_year + 1):
        # Player features from previous season
        df_players = build_player_features(players_teams_df, year)
        # Team features
        df_team = build_team_features(players_teams_df, teams_df, year)

        # Merge
        df = df_players.merge(
            df_team[["tmID", "Team_Weighted_Score"]].drop_duplicates(),
            on="tmID",
            how="left",
        )

        # Bench fraction (approximation)
        df["Bench_Fraction"] = 1 - (df["minutes"] / df.groupby("tmID")["minutes"].transform("sum"))

        # Label: one vectorised membership test per season instead of
        # re-scanning the awards table for every player. The explicit dtype
        # keeps the label column integer even when a season has no rows.
        season_winners = awards_6th.loc[awards_6th["year"] == year, "playerID"]
        df["Sixth_Woman"] = df["playerID"].isin(season_winners).astype("int64")

        all_data.append(df)

    return pd.concat(all_data, ignore_index=True)

# ------------------------------
# 2️⃣ Train model
# ------------------------------
def train_sixth_woman_model(dataset, feature_cols):
    """Fit a random forest on the whole dataset and return it.

    ``dataset[feature_cols]`` (missing values replaced by 0) are the inputs
    and ``dataset["Sixth_Woman"]`` is the target. No train/test split is made;
    every row is used for fitting. The forest uses 500 trees, balanced class
    weights and a fixed random seed.
    """
    classifier = RandomForestClassifier(
        n_estimators=500,
        class_weight="balanced",
        random_state=42
    )

    # All rows are used for fitting (no hold-out set).
    features = dataset[feature_cols].fillna(0)
    labels = dataset["Sixth_Woman"]
    classifier.fit(features, labels)

    return classifier

# ------------------------------
# 3️⃣ Preseason prediction
# ------------------------------
def preseason_sixth_woman_prediction(model, players_teams_df, teams_df, year, feature_cols, top_n=5):
    """Rank Sixth Woman candidates for ``year`` using previous-season stats.

    Player features (``build_player_features``) and team scores
    (``build_team_features``) are built for ``year``, joined on ``tmID``,
    and extended with ``Bench_Fraction`` (1 - player minutes / summed
    minutes of players sharing the same ``tmID``). ``model.predict_proba``
    is then applied to ``feature_cols`` (NaN replaced by 0).

    Parameters:
        model: fitted classifier exposing ``predict_proba`` and ``classes_``.
        players_teams_df: per-player, per-team, per-season stats.
        teams_df: team table with ``year`` and ``tmID`` columns.
        year: season to predict for; features come from ``year - 1``.
        feature_cols: feature column names, in the order used for training.
        top_n: number of rows to return.

    Returns:
        DataFrame with ``playerID``, ``tmID`` and ``Sixth_Woman_Probability``,
        sorted by probability (descending) and cut to ``top_n`` rows. It is
        empty (with those columns) when there are no candidates.
    """
    output_cols = ["playerID", "tmID", "Sixth_Woman_Probability"]

    # Player & team features from previous season
    df_players = build_player_features(players_teams_df, year)
    df_team = build_team_features(players_teams_df, teams_df, year)

    # Merge
    df = df_players.merge(
        df_team[["tmID", "Team_Weighted_Score"]].drop_duplicates(),
        on="tmID",
        how="left",
    )

    # Nothing to score: return an empty, correctly shaped result instead of
    # handing an empty matrix to the model.
    if df.empty:
        return pd.DataFrame(columns=output_cols)

    # Bench fraction
    df["Bench_Fraction"] = 1 - (df["minutes"] / df.groupby("tmID")["minutes"].transform("sum"))

    # Predict probabilities. The column order of predict_proba follows
    # model.classes_, so look the positive class up instead of assuming it
    # sits in column 1 (a model fitted on a single class has one column only).
    X_pred = df[feature_cols].fillna(0)
    proba = model.predict_proba(X_pred)
    classes = list(model.classes_)
    if 1 in classes:
        df["Sixth_Woman_Probability"] = proba[:, classes.index(1)]
    else:
        df["Sixth_Woman_Probability"] = 0.0

    # Return top candidates
    ranked = df.sort_values("Sixth_Woman_Probability", ascending=False)
    return ranked[output_cols].head(top_n)


In [130]:
# 1️⃣ Build dataset for historical award seasons (8–10)
dataset_6th = build_sixth_woman_dataset(players_teams, teams, awards, start_year=8, end_year=10)

# 2️⃣ Define features
feature_cols_6th = ["Game_Score_Total", "Game_Score_Per_Minute", "minutes", "Team_Weighted_Score", "Bench_Fraction"]

# 3️⃣ Train model
model_6th = train_sixth_woman_model(dataset_6th, feature_cols_6th)

# 4️⃣ Preseason prediction
# NOTE(review): the call below passes year=8, i.e. it ranks candidates for
# season 8 from season-7 stats, one of the seasons the model was trained on.
# It is not a prediction for a season after the training range, although the
# printed heading says "Next Season". Confirm the intended target season.
top_candidates = preseason_sixth_woman_prediction(model_6th, players_teams, teams, year=8, feature_cols=feature_cols_6th, top_n=5)

print("\nTop 5 Sixth Woman Candidates for Next Season:")
print(top_candidates)



Top 5 Sixth Woman Candidates for Next Season:
       playerID tmID  Sixth_Woman_Probability
119  pierspl01w  DET                    0.652
169  williad01w  MIN                    0.034
159  turneba01w  SEA                    0.012
85   lovelst01w  CHI                    0.004
70   johnsvi01w  SAS                    0.002


## Model Evaluation

In [131]:
# ======================================================
# EVALUATE SIXTH WOMAN MODEL
# ======================================================

def rank_players_by_probability(df, prob_col="Sixth_Woman_Probability"):
    """Return ``df`` ordered by ``prob_col`` (highest first) with a rank column.

    The result has a fresh 0-based index and a ``Predicted_Rank`` column that
    starts at 1 for the highest probability. The input frame is not modified.
    """
    return (
        df.copy()
        .sort_values(prob_col, ascending=False)
        .reset_index(drop=True)
        .assign(Predicted_Rank=lambda ordered: ordered.index + 1)
    )
    
def evaluate_sixth_woman_model(model, players_teams, teams, awards, feature_cols, top_n=5, years=(8, 9, 10)):
    """Compare predicted rankings with the actual award winners per season.

    For every season in ``years`` all candidates are ranked with
    ``preseason_sixth_woman_prediction`` and the first
    "Sixth Woman of the Year" row of ``awards`` for that season is looked up.

    Parameters:
        model: fitted classifier passed through to the prediction function.
        players_teams, teams, awards: the raw stat, team and award tables.
        feature_cols: feature column names used by the model.
        top_n: size of the candidate list used for the hit test.
        years: seasons to evaluate (any iterable of ints).

    Returns:
        DataFrame with one row per season and the columns ``year``,
        ``actual_sixth_woman``, ``top_candidates`` (the ``top_n`` best
        player IDs), ``hit_top_n`` (winner ranked within ``top_n``) and
        ``actual_rank`` (1 = highest probability; missing when the winner is
        not among the candidates or no award row exists). An empty frame
        with these columns is returned when ``years`` is empty.

    Prints the share of seasons with a hit and the mean rank of the winners.
    """
    result_columns = ["year", "actual_sixth_woman", "top_candidates", "hit_top_n", "actual_rank"]
    results = []

    for year in years:
        # Ask for every candidate, not a fixed cut-off, so the winner's rank
        # is found wherever she sits. There cannot be more candidates than
        # rows in the stats table.
        candidate_limit = max(len(players_teams), top_n)
        df_pred = preseason_sixth_woman_prediction(
            model, players_teams, teams, year, feature_cols, top_n=candidate_limit
        )

        # Sort by probability descending
        df_pred = df_pred.sort_values("Sixth_Woman_Probability", ascending=False).reset_index(drop=True)

        # Get actual winner
        winners = awards[
            (awards["year"] == year) &
            (awards["award"] == "Sixth Woman of the Year")
        ]["playerID"].values

        if len(winners) == 0:
            actual_winner = None
            rank = None
        else:
            actual_winner = winners[0]
            # Get rank (1 = highest probability)
            rank_row = df_pred[df_pred["playerID"] == actual_winner]
            rank = int(rank_row.index[0] + 1) if not rank_row.empty else None

        # A hit means the winner is inside the top_n list that is reported,
        # not merely somewhere in the full candidate list.
        hit = rank is not None and rank <= top_n

        results.append({
            "year": year,
            "actual_sixth_woman": actual_winner,
            "top_candidates": df_pred.head(top_n)["playerID"].tolist(),
            "hit_top_n": hit,
            "actual_rank": rank
        })

    # No seasons requested: return an empty, correctly shaped table.
    if not results:
        return pd.DataFrame(columns=result_columns)

    results_df = pd.DataFrame(results, columns=result_columns)
    coverage = results_df["hit_top_n"].mean() * 100
    # to_numeric turns missing ranks into NaN so the mean also works when no
    # winner was ranked in any season.
    avg_rank = pd.to_numeric(results_df["actual_rank"], errors="coerce").mean()
    print(f"\nCoverage: {coverage:.2f}% of actual Sixth Woman winners were in top {top_n} candidates.")
    print(f"Average rank of actual winners across seasons: {avg_rank:.2f}")

    return results_df



# ======================================================
# FEATURE IMPORTANCE
# ======================================================
def sixth_woman_feature_importance(model, feature_cols):
    """Print and return the model's feature importances, largest first.

    ``feature_cols`` is paired position-by-position with
    ``model.feature_importances_``. The returned DataFrame has the columns
    ``Feature`` and ``Importance`` and keeps the original positions as index.
    """
    importance_table = pd.DataFrame(
        {"Feature": feature_cols, "Importance": model.feature_importances_}
    )
    importance_table = importance_table.sort_values("Importance", ascending=False)

    print("\nFeature Importance:")
    print(importance_table)
    return importance_table


In [132]:
# Feature columns fed to the model; the order must match the order used when
# model_6th was fitted.
feature_cols_6th = ["Game_Score_Total", "Game_Score_Per_Minute", "minutes", "Team_Weighted_Score", "Bench_Fraction"]

# Preseason prediction
# NOTE(review): year=8 ranks candidates for season 8 (from season-7 stats),
# not for an upcoming season as the printed heading suggests. Confirm intent.
top_candidates = preseason_sixth_woman_prediction(model_6th, players_teams, teams, year=8, feature_cols=feature_cols_6th, top_n=5)
print("\nTop 5 Sixth Woman Candidates for Next Season:")
print(top_candidates)

# Evaluate model historically
# NOTE(review): years 8-10 are passed here; if model_6th was fitted on the same
# seasons, this is an in-sample check rather than a held-out evaluation.
results_eval = evaluate_sixth_woman_model(model_6th, players_teams, teams, awards, feature_cols_6th, years=[8,9,10], top_n=5)
print("\nYear-by-Year Results:")
print(results_eval)

# Feature importance of the fitted model (printed and kept for later use)
feat_imp_6th = sixth_woman_feature_importance(model_6th, feature_cols_6th)





Top 5 Sixth Woman Candidates for Next Season:
       playerID tmID  Sixth_Woman_Probability
119  pierspl01w  DET                    0.652
169  williad01w  MIN                    0.034
159  turneba01w  SEA                    0.012
85   lovelst01w  CHI                    0.004
70   johnsvi01w  SAS                    0.002

Coverage: 33.33% of actual Sixth Woman winners were in top 5 candidates.
Average rank of actual winners across seasons: 1.00

Year-by-Year Results:
   year actual_sixth_woman                                     top_candidates  \
0     8         pierspl01w  [pierspl01w, williad01w, turneba01w, lovelst01...   
1     9         wiggica01w  [spencsi01w, williad01w, hoffmeb01w, haynikr01...   
2    10         bonnede01w  [langhcr01w, kellycr01w, humphta01w, houstch01...   

   hit_top_n  actual_rank  
0       True          1.0  
1      False          NaN  
2      False          NaN  

Feature Importance:
                 Feature  Importance
1  Game_Score_Per_Minute    0.334

In [133]:



# Print the ranked candidate list for each evaluated season.
for year in [8, 9, 10]:
    # Fetch the 20 most likely candidates, then add a 1-based Predicted_Rank.
    season_df = preseason_sixth_woman_prediction(model_6th, players_teams, teams, year, feature_cols_6th, top_n=20)
    season_df = rank_players_by_probability(season_df)
    print(f"\nYear {year} - Top Candidates")
    print(season_df.head(10))  # show top 10 predicted candidates


Year 8 - Top Candidates
     playerID tmID  Sixth_Woman_Probability  Predicted_Rank
0  pierspl01w  DET                    0.652               1
1  williad01w  MIN                    0.034               2
2  turneba01w  SEA                    0.012               3
3  lovelst01w  CHI                    0.004               4
4  johnsvi01w  SAS                    0.002               5
5  mabikmw01w  LAS                    0.002               6
6  taylope01w  PHO                    0.002               7
7  abrossv01w  MIN                    0.000               8
8  beardal01w  WAS                    0.000               9
9  anderam01w  CON                    0.000              10

Year 9 - Top Candidates
     playerID tmID  Sixth_Woman_Probability  Predicted_Rank
0  spencsi01w  LAS                    0.040               1
1  williad01w  SAC                    0.038               2
2  hoffmeb01w  IND                    0.012               3
3  haynikr01w  SAC                    0.006       

In [None]:
def evaluate_top_candidates_impact(model, players_teams, teams, feature_cols, years=(8, 9, 10), top_n=5,
                                   bench_threshold=0.5, gs_per_min_threshold=0.2):
    """
    Check whether the top N predicted candidates per season met the
    "high-impact bench player" criteria in that season.

    Parameters:
        model: Trained classifier for Sixth Woman (passed to the prediction function)
        players_teams (pd.DataFrame): Player stats per team per year
        teams (pd.DataFrame): Team-level info
        feature_cols (list): Columns used in the model
        years (iterable of int): Seasons to evaluate
        top_n (int): Number of top candidates to check per season
        bench_threshold (float): Minimum bench fraction, where bench fraction is
            1 - (player minutes / total minutes of the player's team that season)
        gs_per_min_threshold (float): Minimum Game_Score_Per_Minute to count as high-impact

    Returns:
        results_df (pd.DataFrame): One row per (season, candidate) with the columns
            year, playerID, tmID, Predicted_Prob, Predicted_Rank and High_Impact_Bench.
            Empty (with these columns) when no candidates were produced.

    Prints the per-season fraction of candidates flagged as high-impact.
    """
    result_columns = ["year", "playerID", "tmID", "Predicted_Prob", "Predicted_Rank", "High_Impact_Bench"]
    results = []

    def is_high_impact_bench(player_id, year):
        """Return True when the player meets both thresholds in ``year``.

        The player's rows for ``year`` are summed across stints and scored
        with ``calculate_player_score``. Players with no rows in ``year``
        are never high-impact.
        """
        df = players_teams[(players_teams["playerID"] == player_id) & (players_teams["year"] == year)]
        if df.empty:
            return False

        # Aggregate player's stints if there are multiple rows; a stat column
        # missing from the table counts as 0.
        needed_cols = [
            "points", "fgMade", "fgAttempted", "ftAttempted", "ftMade",
            "oRebounds", "dRebounds", "steals", "assists", "blocks",
            "PF", "turnovers", "minutes"
        ]
        agg = {c: (df[c].sum() if c in df.columns else 0) for c in needed_cols}
        agg["playerID"] = player_id
        agg["year"] = year
        # pick a representative tmID (first stint)
        agg["tmID"] = df["tmID"].iloc[0]

        # calculate_player_score always adds Game_Score_Per_Minute (NaN when
        # minutes are 0); NaN fails the threshold comparison below.
        player_df = calculate_player_score(pd.DataFrame([agg]))
        gs_per_min = player_df["Game_Score_Per_Minute"].iloc[0]

        # Bench fraction: 1 - player's minutes / total team minutes for that year
        team_minutes = players_teams[
            (players_teams["year"] == year) & (players_teams["tmID"] == agg["tmID"])
        ]["minutes"].sum()
        player_minutes = agg["minutes"]
        bench_frac = 1 - (player_minutes / team_minutes) if team_minutes > 0 else 0

        return bool((bench_frac >= bench_threshold) and (gs_per_min >= gs_per_min_threshold))

    for year in years:
        # Fetch at least top_n candidates so a large top_n is not cut short.
        df_pred = preseason_sixth_woman_prediction(
            model, players_teams, teams, year, feature_cols, top_n=max(50, top_n)
        )
        df_pred = rank_players_by_probability(df_pred)
        top_candidates = df_pred.head(top_n)

        # Check each candidate
        for _, row in top_candidates.iterrows():
            results.append({
                "year": year,
                "playerID": row["playerID"],
                "tmID": row["tmID"],
                "Predicted_Prob": row["Sixth_Woman_Probability"],
                "Predicted_Rank": row["Predicted_Rank"],
                "High_Impact_Bench": is_high_impact_bench(row["playerID"], year)
            })

    # Nothing to summarise: return an empty, correctly shaped table.
    if not results:
        return pd.DataFrame(columns=result_columns)

    results_df = pd.DataFrame(results, columns=result_columns)

    # Optional: summary stats
    summary = results_df.groupby("year")["High_Impact_Bench"].mean().reset_index()
    summary.rename(columns={"High_Impact_Bench": f"Fraction_Top{top_n}_HighImpact"}, inplace=True)
    print(summary)

    return results_df


# Run the high-impact check for the top 5 candidates of seasons 8-10, using the
# same thresholds as the function defaults, and print the per-candidate table.
top_candidates_eval = evaluate_top_candidates_impact(
    model_6th, 
    players_teams, 
    teams, 
    feature_cols_6th, 
    years=[8, 9, 10], 
    top_n=5,
    bench_threshold=0.5,
    gs_per_min_threshold=0.2
)

print(top_candidates_eval)

KeyError: 'Game_Score_Per_Minute'