In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict


def clean_players(players: pd.DataFrame):
    """Return ``players`` without the ``firstseason`` and ``lastseason`` columns.

    Per the original author's note, every player has 0 in both columns, so they
    carry no information. The input frame is not modified.
    """
    # A filter keeping only players that appear in players_teams was tried here
    # previously and is intentionally not applied.
    uninformative_columns = ['firstseason', 'lastseason']
    return players.drop(columns=uninformative_columns)

def clean_players_teams(players_teams: pd.DataFrame):
    """Return ``players_teams`` without ``lgID`` (noted by the author as identical on every row)."""
    return players_teams.drop(columns=['lgID'])

def clean_awards_players(awards_players: pd.DataFrame):
    """Return the awards table with the ``lgID`` column removed; the input is not modified."""
    return awards_players.drop(columns=["lgID"])

def clean_coaches(coaches: pd.DataFrame):
    """Return the coaches table with the ``lgID`` column removed; the input is not modified."""
    return coaches.drop(columns=["lgID"])

def clean_teams_post(teams_post : pd.DataFrame):
    """Return the postseason team table with the ``lgID`` column removed; the input is not modified."""
    return teams_post.drop(columns=["lgID"])

def clean_series_post(series_post : pd.DataFrame):
    """Return the playoff-series table without the winner/loser league-ID columns."""
    league_id_columns = ["lgIDLoser", "lgIDWinner"]
    return series_post.drop(columns=league_id_columns)

def clean_teams(teams : pd.DataFrame):
    """Return the teams table without the columns this analysis does not use.

    Dropped: ``lgID``, ``franchID``, ``divID``, ``arena``, ``attend``, ``min``
    and ``name``. The input frame is not modified.
    """
    unused_columns = [
        "lgID",
        "franchID",
        "divID",
        "arena",
        "attend",
        "min",
        "name",
    ]
    return teams.drop(columns=unused_columns)

def parse_player_team_data(df):
    """
    Parses the player-team-season dataset into structured Python dictionaries
    for further analysis and championship prediction.

    The caller's DataFrame is left untouched: column names are stripped of
    surrounding whitespace and missing values are replaced by 0 on a private
    copy only.

    Parameters
    ----------
    df : pd.DataFrame
        One row per player/team/season stint. After stripping whitespace from
        the column names it must contain ``playerID``, ``tmID``, ``year``,
        ``stint`` and every regular-season and postseason stat column listed
        in ``regular_season_cols`` / ``postseason_cols`` below.

    Returns
    -------
    dict
        ``{"players": ..., "teams_by_year": ...}`` where ``players`` maps
        playerID -> list of records and ``teams_by_year`` maps
        year -> teamID -> list of records. Each record is a dict holding
        ``year``, ``teamID``, ``stint`` and all stat columns; the same record
        object is stored in both structures.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    regular_season_cols = (
        "GP", "GS", "minutes", "points", "oRebounds", "dRebounds", "rebounds",
        "assists", "steals", "blocks", "turnovers", "PF", "fgAttempted",
        "fgMade", "ftAttempted", "ftMade", "threeAttempted", "threeMade", "dq",
    )
    postseason_cols = (
        "PostGP", "PostGS", "PostMinutes", "PostPoints", "PostoRebounds",
        "PostdRebounds", "PostRebounds", "PostAssists", "PostSteals",
        "PostBlocks", "PostTurnovers", "PostPF", "PostfgAttempted",
        "PostfgMade", "PostftAttempted", "PostftMade", "PostthreeAttempted",
        "PostthreeMade", "PostDQ",
    )

    # Work on a copy so the rename and the NaN fill never leak back to the caller.
    cleaned = df.copy()
    cleaned.columns = [c.strip() for c in cleaned.columns]
    cleaned = cleaned.fillna(0)

    players = defaultdict(list)
    teams_by_year = defaultdict(lambda: defaultdict(list))

    for _, row in cleaned.iterrows():
        playerID = row["playerID"]
        teamID = row["tmID"]
        year = int(row["year"])

        # Combined player record: identifiers first, then regular-season stats,
        # then postseason stats (same key order as before).
        record = {"year": year, "teamID": teamID, "stint": row["stint"]}
        for col in regular_season_cols + postseason_cols:
            record[col] = row[col]

        # Store in both structures
        players[playerID].append(record)
        teams_by_year[year][teamID].append(record)

    print(f"Parsed {len(players)} players across {len(teams_by_year)} seasons.")
    return {"players": players, "teams_by_year": teams_by_year}

# Historical seasons: read each CSV, then strip the columns the analysis ignores.
awards_players = pd.read_csv("basketballPlayoffs/awards_players.csv").pipe(clean_awards_players)
coaches = pd.read_csv("basketballPlayoffs/coaches.csv").pipe(clean_coaches)
players_teams = pd.read_csv("basketballPlayoffs/players_teams.csv").pipe(clean_players_teams)
players = pd.read_csv("basketballPlayoffs/players.csv").pipe(clean_players)
series_post = pd.read_csv("basketballPlayoffs/series_post.csv").pipe(clean_series_post)
teams_post = pd.read_csv("basketballPlayoffs/teams_post.csv").pipe(clean_teams_post)
teams = pd.read_csv("basketballPlayoffs/teams.csv").pipe(clean_teams)

# Season 11 (prediction target): loaded as-is, without the cleaning helpers.

coaches_11 = pd.read_csv("Season_11/coaches.csv")
players_teams_11 = pd.read_csv("Season_11/players_teams.csv")
teams_11 = pd.read_csv("Season_11/teams.csv")

In [2]:
# Shooting percentages on a 0-100 scale: made / attempted * 100.
for pct_col, made_col, attempted_col in [
    ("FG_Percentage", "o_fgm", "o_fga"),  # Field goal percentage
    ("FT_Percentage", "o_ftm", "o_fta"),  # Free-throw percentage
    ("3P_Percentage", "o_3pm", "o_3pa"),  # 3-point percentage
]:
    teams[pct_col] = teams[made_col] / teams[attempted_col] * 100

# Per-game averages: season total divided by games played.
for per_game_col, total_col in [
    ("O_OREBPG", "o_oreb"),  # Offensive rebounds per game
    ("O_DREBPG", "o_dreb"),  # Defensive rebounds per game
    ("TOPG", "o_to"),        # Turnovers per game
    ("PPG", "o_pts"),        # Points per game
    ("STLPG", "o_stl"),      # Steals per game
    ("BLKPG", "o_blk"),      # Blocks per game
    ("PFPG", "o_pf"),        # Personal fouls per game
    ("D_PPG", "d_pts"),      # d_pts per game (presumably points allowed - confirm)
    ("APG", "o_asts"),       # Assists per game
]:
    teams[per_game_col] = teams[total_col] / teams["GP"]

# Share of games played that were won, on a 0-100 scale.
teams["Win%"] = teams["won"] / teams["GP"] * 100

teams.loc[:, ["tmID", "FG_Percentage", "FT_Percentage", "3P_Percentage", "Win%"]].head()

Unnamed: 0,tmID,FG_Percentage,FT_Percentage,3P_Percentage,Win%
0,ATL,39.636847,74.758621,33.779264,11.764706
1,ATL,44.85173,75.364238,30.481283,52.941176
2,CHA,42.669469,74.696707,33.937824,25.0
3,CHA,41.910112,77.651515,35.747664,56.25
4,CHA,43.01676,73.906486,40.037951,56.25


In [3]:
# Fraction of decided games (wins + losses) that the team won that season.
teams["team_success"] = teams["won"].div(teams["won"].add(teams["lost"]))

# Attach each team's season success to every player-season row.
player_features = pd.merge(
    players_teams,
    teams.loc[:, ["tmID", "year", "team_success"]],
    how="left",
    on=["tmID", "year"],
)

# A player's rookie year is the earliest season they appear in the data.
rookie_year = (
    player_features.groupby("playerID", as_index=False)["year"]
    .min()
    .rename(columns={"year": "rookie_year"})
)

player_features = pd.merge(player_features, rookie_year, how="left", on="playerID")

player_features.head(10)

Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,team_success,rookie_year
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,0,0,0,0,0,0,0,0,0.375,2
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,0,0,0,0,0,0,0,0,0.3125,2
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,8,22,6,8,8,7,3,0,0.529412,2
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,7,23,8,4,2,8,2,0,0.529412,2
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,0,0,0,0,0,0,0,0,0.411765,2
5,abrossv01w,7,0,MIN,34,2,724,263,44,62,...,0,0,0,0,0,0,0,0,0.294118,2
6,abrossv01w,8,0,MIN,34,29,843,345,53,97,...,0,0,0,0,0,0,0,0,0.294118,2
7,abrossv01w,9,0,CON,6,0,107,34,3,17,...,8,24,11,4,2,5,0,0,0.617647,2
8,adamsjo01w,4,0,MIN,10,0,96,33,10,13,...,0,0,0,0,0,0,0,0,0.529412,4
9,aguilel01w,3,0,UTA,28,0,141,43,0,11,...,1,0,0,0,0,0,0,0,0.625,3


Most Valuable Player

For this award, we considered each player's career statistics (points, assists, and rebounds per game) together with a Career_Efficiency metric: the per-game average of the individual player performance score used in the first problem.

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# ==========================
# Most Valuable Player
# ==========================

def prepare_mvp_career_data(target_year, players_teams, teams_df, awards_df):
    """Build the MVP candidate table for ``target_year`` from prior-career statistics.

    Only seasons ``1 .. target_year - 1`` feed the features, so nothing from the
    target season itself (other than the roster and the award label) is used.

    Parameters
    ----------
    target_year : int
        Season whose MVP is being modelled.
    players_teams : pd.DataFrame
        Player-season rows with ``playerID``, ``tmID``, ``year`` and the
        counting stats that are summed below.
    teams_df : pd.DataFrame
        Team-season rows with ``tmID``, ``year``, ``won`` and ``lost``.
    awards_df : pd.DataFrame
        Award rows with ``playerID``, ``year`` and ``award``.

    Returns
    -------
    pd.DataFrame or None
        ``None`` when there is no earlier season (``target_year - 1 < 1``).
        Otherwise one row per (player, team) pair on the target-year roster for
        players with earlier history, holding the summed career stats,
        ``Raw_Score``, the per-game ``Career_*`` features,
        ``Prev_Team_Win_Pct`` (0.5 when the team has no previous-season row)
        and the label ``Won_MVP`` (1.0 / 0.0).
    """
    history_end_year = target_year - 1

    if history_end_year < 1:
        return None

    players_history = players_teams[players_teams["year"].between(1, history_end_year)].copy()

    summed_columns = [
        "GP", "points", "oRebounds", "dRebounds", "assists", "steals", "blocks",
        "turnovers", "fgMade", "fgAttempted", "ftMade", "ftAttempted", "PF",
    ]
    career_stats = (
        players_history.groupby("playerID")
        .agg({column: "sum" for column in summed_columns})
        .reset_index()
    )

    # Player's Score formula used in question 1 to evaluate a player's individual performance
    career_stats["Raw_Score"] = (
        career_stats["points"] + 0.4 * career_stats["fgMade"] + 0.7 * career_stats["oRebounds"] +
        0.3 * career_stats["dRebounds"] + career_stats["steals"] + 0.7 * career_stats["assists"] +
        0.7 * career_stats["blocks"] - 0.7 * career_stats["fgAttempted"] -
        0.4 * (career_stats["ftAttempted"] - career_stats["ftMade"]) -
        0.4 * career_stats["PF"] - career_stats["turnovers"]
    )

    career_stats['Career_Efficiency'] = career_stats['Raw_Score'] / career_stats['GP']
    career_stats['Career_PPG'] = career_stats['points'] / career_stats['GP']
    career_stats['Career_RPG'] = (career_stats['oRebounds'] + career_stats['dRebounds']) / career_stats['GP']
    career_stats['Career_APG'] = career_stats['assists'] / career_stats['GP']

    # Duplicates are removed per (player, team) pair, so a player who appears
    # with two teams in the target year yields two candidate rows.
    roster_target_year = players_teams[players_teams["year"] == target_year][['playerID', 'tmID']].drop_duplicates()

    # Inner merge: players without any earlier season are not candidates.
    candidates = pd.merge(roster_target_year, career_stats, on='playerID', how='inner')

    # history_end_year is also the "previous season" used for team strength.
    teams_prev = teams_df[teams_df['year'] == history_end_year][['tmID', 'won', 'lost']].copy()
    teams_prev['Prev_Team_Win_Pct'] = teams_prev['won'] / (teams_prev['won'] + teams_prev['lost'])

    candidates = pd.merge(candidates, teams_prev[['tmID', 'Prev_Team_Win_Pct']], on='tmID', how='left')
    candidates['Prev_Team_Win_Pct'] = candidates['Prev_Team_Win_Pct'].fillna(0.5)

    # Build the label table as a fresh frame (no assignment onto a filtered
    # slice) and de-duplicate it so repeated award rows cannot multiply
    # candidate rows in the left merge below.
    is_target_award = (awards_df['year'] == target_year) & (awards_df['award'] == 'Most Valuable Player')
    awards_target = awards_df.loc[is_target_award, ['playerID']].drop_duplicates().assign(Won_MVP=1)

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_MVP'] = final_df['Won_MVP'].fillna(0)

    return final_df

# Target seasons 2..9 form the MVP training set; each season contributes one
# labelled candidate table.
years_train = list(range(2, 10))
train_list = []

for y in years_train:
    df = prepare_mvp_career_data(y, players_teams, teams, awards_players)
    train_list += [df]

train_mvp = pd.concat(train_list, ignore_index=True)

# Career-level features fed to the classifier.
features_mvp = [
    'Career_Efficiency',
    'Career_PPG',
    'Career_RPG',
    'Career_APG',
]


In [5]:
# Fit the MVP classifier on the stacked training seasons (`fit` returns the estimator).
model_mvp = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(train_mvp[features_mvp], train_mvp['Won_MVP'])

# Feature importances, largest first.
importance = pd.DataFrame({'Feature': features_mvp, 'Value': model_mvp.feature_importances_})
print(importance.sort_values('Value', ascending=False))

# Year 10 is not part of the training seasons, so it serves as the evaluation season.
candidates_10_mvp = prepare_mvp_career_data(10, players_teams, teams, awards_players)

# Second predict_proba column (the Won_MVP == 1 class when both labels occur in training).
probs = model_mvp.predict_proba(candidates_10_mvp[features_mvp])[:, 1]
candidates_10_mvp['MVP_Prob'] = probs

ranking_mvp = candidates_10_mvp.loc[
    :,
    ['playerID', 'tmID', 'Career_PPG', 'Career_APG', 'Career_RPG', 'Career_Efficiency', 'MVP_Prob'],
].sort_values(by='MVP_Prob', ascending=False)

print("\n--- TOP Most Valuable Player candidates (Year 10) ---")
ranking_mvp.head(10)

             Feature     Value
0  Career_Efficiency  0.518582
1         Career_PPG  0.361762
2         Career_RPG  0.097893
3         Career_APG  0.021763

--- TOP Most Valuable Player candidates (Year 10) ---


Unnamed: 0,playerID,tmID,Career_PPG,Career_APG,Career_RPG,Career_Efficiency,MVP_Prob
71,leslili01w,LAS,17.519841,2.480159,9.353175,13.513889,0.42
20,catchta01w,IND,16.674528,3.726415,7.768868,14.523113,0.09
117,tauradi01w,PHO,20.347305,4.065868,4.287425,14.017964,0.05
137,youngso01w,SAS,15.37,1.76,6.36,11.627,0.04
3,augusse01w,MIN,21.252525,2.141414,3.878788,13.870707,0.03
91,parkeca01w,LAS,18.484848,3.424242,9.484848,16.506061,0.03
55,jacksla01w,SEA,19.417722,1.556962,7.962025,15.420675,0.02
122,thompti01w,LAS,17.636735,2.0,6.64898,11.189388,0.01
4,bakersh01w,DET,6.864865,1.716216,2.0,3.935135,0.0
0,ajavoma01w,WAS,8.029412,1.735294,1.764706,3.614706,0.0


The winner in year 10 was tauradi01w, who was predicted to finish third according to the model. However, the probability of tauradi01w winning the award is low compared to the top candidate, leslili01w

Most Valuable Player Prediction for year 11

In [6]:
# Append the season-11 rosters and teams so year 11 can be used as a prediction target.
all_players_teams = pd.concat([players_teams, players_teams_11], ignore_index=True)
all_teams = pd.concat([teams, teams_11], ignore_index=True)

# Retrain on every labelled season, now including year 10.
years_train = list(range(2, 11))
train_list = []

for y in years_train:
    df = prepare_mvp_career_data(y, players_teams, teams, awards_players)
    train_list += [df]

train_mvp = pd.concat(train_list, ignore_index=True)

features_mvp = [
    'Career_Efficiency',
    'Career_PPG',
    'Career_RPG',
    'Career_APG',
]

model_mvp = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(train_mvp[features_mvp], train_mvp['Won_MVP'])

importance = pd.DataFrame({'Feature': features_mvp, 'Value': model_mvp.feature_importances_})
print(importance.sort_values('Value', ascending=False))

# Candidates for year 11 come from the combined tables (history 1..10, roster 11).
candidates_11_mvp = prepare_mvp_career_data(11, all_players_teams, all_teams, awards_players)

probs = model_mvp.predict_proba(candidates_11_mvp[features_mvp])[:, 1]
candidates_11_mvp['MVP_Prob'] = probs

ranking_mvp = candidates_11_mvp.loc[
    :,
    ['playerID', 'tmID', 'Career_PPG', 'Career_APG', 'Career_RPG', 'Career_Efficiency', 'MVP_Prob'],
].sort_values(by='MVP_Prob', ascending=False)

print("\n--- TOP Most Valuable Player candidates (Year 11) ---")
ranking_mvp.head(10)

             Feature     Value
0  Career_Efficiency  0.529997
1         Career_PPG  0.360126
2         Career_RPG  0.091793
3         Career_APG  0.018083

--- TOP Most Valuable Player candidates (Year 11) ---


Unnamed: 0,playerID,tmID,Career_PPG,Career_APG,Career_RPG,Career_Efficiency,MVP_Prob
85,tauradi01w,PHO,20.348485,3.979798,4.515152,14.262626,0.55
99,youngso01w,SAS,16.067669,1.729323,6.398496,11.966917,0.18
2,augusse01w,MIN,21.238095,2.104762,3.895238,14.021905,0.06
61,parkeca01w,LAS,16.172414,3.051724,9.603448,14.593103,0.02
84,swoopsh01w,TUL,15.846154,3.461538,4.950226,12.50362,0.02
15,catchta01w,IND,16.45122,3.646341,7.691057,14.346341,0.02
36,jacksla01w,SEA,19.395437,1.486692,7.86692,15.331179,0.01
4,beviltu01w,SAS,4.742138,2.232704,1.981132,4.126415,0.0
8,braxtka01w,NYL,7.08805,0.779874,4.515723,4.154717,0.0
6,bobbish01w,IND,3.0,2.305085,1.644068,1.791525,0.0


Most Improved Player

For the Most Improved Player, we evaluate each player's potential in points, assists, and rebounds using per-minute statistics from the previous season. In addition, candidates are restricted to players who averaged fewer than 15 points per game in that season (low initial expectations) and who played more than 10 games.

In [7]:
# ==========================
# Most Improved Player 
# ==========================


def prepare_mip_data(target_year, player_features, awards_df):
    """Build the Most Improved Player candidate table for ``target_year``.

    Features come from the season before ``target_year``; the label comes from
    the awards of ``target_year`` itself.

    Parameters
    ----------
    target_year : int
        Season whose award is being modelled.
    player_features : pd.DataFrame
        Player-season rows with ``playerID``, ``year``, ``GP``, ``GS``,
        ``minutes``, ``points``, ``assists`` and ``rebounds``.
    awards_df : pd.DataFrame
        Award rows with ``playerID``, ``year`` and ``award``.

    Returns
    -------
    pd.DataFrame
        The previous-season rows with ``PPG < 15`` and ``GP > 10``, extended
        with ``Rookie_Year``, ``Years_Exp``, ``PPG``, ``PPM``, ``MPG``,
        ``Start_Ratio``, ``APM``, ``RPM`` and the label ``Won_MIP``
        (1.0 / 0.0). Rows are per player-season entry of the input; stints are
        not aggregated.
    """
    prev_year = target_year - 1

    base_data = player_features[player_features['year'] == prev_year].copy()

    # Earliest season seen for each player across the whole input table.
    all_history = player_features.groupby('playerID')['year'].min().reset_index().rename(columns={'year': 'Rookie_Year'})
    base_data = pd.merge(base_data, all_history, on='playerID', how='left')
    base_data['Years_Exp'] = prev_year - base_data['Rookie_Year']

    base_data['PPG'] = base_data['points'] / base_data['GP']

    # Low scorers with a meaningful number of games are the only candidates.
    candidates = base_data[
        (base_data['PPG'] < 15.0) &
        (base_data['GP'] > 10)
    ].copy()

    candidates['PPM'] = candidates['points'] / candidates['minutes'] # Points per Minute
    candidates['MPG'] = candidates['minutes'] / candidates['GP'] # Minutes per Game
    candidates['Start_Ratio'] = candidates['GS'] / candidates['GP'] # Share of games started
    candidates['APM'] = candidates['assists'] / candidates['minutes'] # Assists per Minute
    candidates['RPM'] = candidates['rebounds'] / candidates['minutes'] # Rebounds per Minute

    # Build the label table as a fresh frame (no assignment onto a filtered
    # slice) and de-duplicate it so repeated award rows cannot multiply
    # candidate rows in the left merge below.
    is_target_award = (awards_df['year'] == target_year) & (awards_df['award'] == 'Most Improved Player')
    awards_target = awards_df.loc[is_target_award, ['playerID']].drop_duplicates().assign(Won_MIP=1)

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_MIP'] = final_df['Won_MIP'].fillna(0)

    return final_df


# One candidate table per training season; target seasons 5-7 are not used here.
data_2, data_3, data_4, data_8, data_9 = (
    prepare_mip_data(season, player_features, awards_players)
    for season in (2, 3, 4, 8, 9)
)
train_mip = pd.concat([data_2, data_3, data_4, data_8, data_9], ignore_index=True)

# Features about player's potential
features_mip = ['APM', 'RPM', 'PPM', 'MPG', 'Start_Ratio']

model_mip = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(train_mip[features_mip], train_mip['Won_MIP'])

print("Features' importance for Most Improved Player:")
importance = pd.DataFrame({'Feature': features_mip, 'Value': model_mip.feature_importances_})
print(importance.sort_values('Value', ascending=False))

# Show the positive examples the model was trained on.
train_mip.loc[train_mip['Won_MIP'] == 1].head()

Features' importance for Most Improved Player:
       Feature     Value
1          RPM  0.275425
0          APM  0.248069
2          PPM  0.240707
3          MPG  0.152874
4  Start_Ratio  0.082926


Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,rookie_year,Rookie_Year,Years_Exp,PPG,PPM,MPG,Start_Ratio,APM,RPM,Won_MIP
3,arcaija01w,1,0,HOU,32,32,977,268,36,83,...,1,1,0,8.375,0.274309,30.53125,1.0,0.061412,0.121801,1.0
260,milleco01w,2,0,WAS,20,0,137,34,5,4,...,2,2,0,1.7,0.248175,6.85,0.0,0.058394,0.065693,1.0
462,snowmi01w,3,0,HOU,32,2,480,125,31,88,...,3,3,0,3.90625,0.260417,15.0,0.0625,0.027083,0.247917,1.0
570,mccarja01w,7,0,CHA,30,3,421,136,40,65,...,6,6,1,4.533333,0.32304,14.033333,0.1,0.059382,0.249406,1.0
686,hoffmeb01w,8,0,IND,34,10,582,144,40,97,...,5,5,3,4.235294,0.247423,17.117647,0.294118,0.044674,0.235395,1.0


In [8]:
# Year 10 is outside the training seasons, so it is used to check the model.
candidates_10 = prepare_mip_data(10, player_features, awards_players)

# Second predict_proba column (the Won_MIP == 1 class when both labels occur in training).
probs = model_mip.predict_proba(candidates_10[features_mip])[:, 1]
candidates_10['MIP_Prob'] = probs

ranking_mip = candidates_10.loc[
    :,
    ['playerID', 'tmID', 'PPG', 'PPM', 'APM', 'RPM', 'MPG', 'Start_Ratio', 'MIP_Prob'],
].sort_values(by='MIP_Prob', ascending=False)

print("\n--- Most Improved Player Candidates (Year 10) ---")

ranking_mip.head(10)


--- Most Improved Player Candidates (Year 10) ---


Unnamed: 0,playerID,tmID,PPG,PPM,APM,RPM,MPG,Start_Ratio,MIP_Prob
33,farriba01w,PHO,3.470588,0.213382,0.014467,0.235081,16.264706,0.235294,0.28
66,langhcr01w,WAS,4.823529,0.310019,0.028355,0.257089,15.558824,0.176471,0.19
79,mazzake01w,PHO,5.794118,0.307332,0.059282,0.102964,18.852941,0.058824,0.1
137,williad01w,SAC,6.088235,0.30131,0.029112,0.243086,20.205882,0.970588,0.05
111,sanfona01w,WAS,6.735294,0.28805,0.050314,0.242767,23.382353,0.823529,0.04
46,harpela01w,SAC,5.529412,0.335714,0.021429,0.244643,16.470588,0.029412,0.03
19,careyja01w,CON,4.151515,0.274549,0.094188,0.08016,15.121212,0.090909,0.03
90,mosbybe01w,WAS,2.0,0.262295,0.010929,0.289617,7.625,0.041667,0.02
5,balesal01w,ATL,4.823529,0.210797,0.028278,0.262211,22.882353,0.529412,0.02
94,nnamach01w,ATL,1.333333,0.190476,0.02381,0.142857,7.0,0.0,0.02


The year 10 winner was langhcr01w, whom the model ranked second.

Predicting for year 11

In [9]:
# Rebuild the training set with year 10 added before predicting year 11.
data_2, data_3, data_4, data_8, data_9, data_10 = (
    prepare_mip_data(season, player_features, awards_players)
    for season in (2, 3, 4, 8, 9, 10)
)
train_mip = pd.concat([data_2, data_3, data_4, data_8, data_9, data_10], ignore_index=True)

model_mip = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(train_mip[features_mip], train_mip['Won_MIP'])

print("Features' importance for Most Improved Player:")
importance = pd.DataFrame({'Feature': features_mip, 'Value': model_mip.feature_importances_})
print(importance.sort_values('Value', ascending=False))
train_mip.loc[train_mip['Won_MIP'] == 1].head(10)

Features' importance for Most Improved Player:
       Feature     Value
1          RPM  0.299807
0          APM  0.245705
2          PPM  0.218514
3          MPG  0.168962
4  Start_Ratio  0.067012


Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,rookie_year,Rookie_Year,Years_Exp,PPG,PPM,MPG,Start_Ratio,APM,RPM,Won_MIP
3,arcaija01w,1,0,HOU,32,32,977,268,36,83,...,1,1,0,8.375,0.274309,30.53125,1.0,0.061412,0.121801,1.0
260,milleco01w,2,0,WAS,20,0,137,34,5,4,...,2,2,0,1.7,0.248175,6.85,0.0,0.058394,0.065693,1.0
462,snowmi01w,3,0,HOU,32,2,480,125,31,88,...,3,3,0,3.90625,0.260417,15.0,0.0625,0.027083,0.247917,1.0
570,mccarja01w,7,0,CHA,30,3,421,136,40,65,...,6,6,1,4.533333,0.32304,14.033333,0.1,0.059382,0.249406,1.0
686,hoffmeb01w,8,0,IND,34,10,582,144,40,97,...,5,5,3,4.235294,0.247423,17.117647,0.294118,0.044674,0.235395,1.0
833,langhcr01w,9,0,WAS,34,6,529,164,53,83,...,9,9,0,4.823529,0.310019,15.558824,0.176471,0.028355,0.257089,1.0


In [10]:
# Year-11 candidates are described by their year-10 rows in the combined table.
candidates_11 = prepare_mip_data(11, all_players_teams, awards_players)

probs = model_mip.predict_proba(candidates_11[features_mip])[:, 1]
candidates_11['MIP_Prob'] = probs

ranking_mip = candidates_11.loc[
    :,
    ['playerID', 'tmID', 'PPG', 'PPM', 'APM', 'RPM', 'MPG', 'Start_Ratio', 'MIP_Prob'],
].sort_values(by='MIP_Prob', ascending=False)

print("\n--- Most Improved Player Candidates (Year 11) ---")
ranking_mip.head(10)


--- Most Improved Player Candidates (Year 11) ---


Unnamed: 0,playerID,tmID,PPG,PPM,APM,RPM,MPG,Start_Ratio,MIP_Prob
34,harpela01w,SAC,4.545455,0.326087,0.043478,0.245652,13.939394,0.333333,0.12
115,walkeas01w,SEA,1.769231,0.255556,0.044444,0.255556,6.923077,0.0,0.09
57,larkier01w,NYL,2.444444,0.321168,0.058394,0.226277,7.611111,0.0,0.09
6,blackch01w,CON,2.939394,0.226636,0.014019,0.25,12.969697,0.242424,0.08
104,snowmi01w,ATL,5.441176,0.367793,0.033797,0.28827,14.794118,0.058824,0.06
100,smithbr01w,PHO,1.0,0.145833,0.020833,0.243056,6.857143,0.0,0.02
86,parisco01w,SAC,4.848485,0.361991,0.049774,0.303167,13.393939,0.181818,0.01
36,haydeva01w,LAS,3.76,0.349442,0.022305,0.249071,10.76,0.04,0.01
1,anosini01w,MIN,13.2,0.441964,0.090402,0.247768,29.866667,0.966667,0.01
29,frazeme01w,SAS,5.0,0.444785,0.046012,0.257669,11.241379,0.0,0.01


Sixth Woman of the Year

Here our goal is to predict who will win Sixth Woman of the Year. We believe that evaluating each player's scoring efficiency (points per minute), together with her points, assists, and rebounds per game from the previous season, is important for understanding which players have the potential to be the best sixth player of the season. We also take into account each player's minutes per game, to assess whether she is important or influential on her team despite coming off the bench.

In [11]:
# ==========================
# Sixth Woman of the Year 
# ==========================


def prepare_6th_data(target_year, player_features, awards_df, rookie_default_mpg=15.0):
    """Build the Sixth Woman of the Year candidate table for ``target_year``.

    Candidates are the players on the target-year roster whose previous-season
    start ratio is below 0.5. Features are previous-season per-game and
    per-minute rates; players without previous-season data are flagged as
    rookies and receive imputed values.

    Parameters
    ----------
    target_year : int
        Season whose award is being modelled.
    player_features : pd.DataFrame
        Player-season rows with ``playerID``, ``tmID``, ``stint``, ``year``,
        ``GP``, ``GS``, ``minutes``, ``points``, ``rebounds`` and ``assists``.
    awards_df : pd.DataFrame
        Award rows with ``playerID``, ``year`` and ``award``.
    rookie_default_mpg : float, default 15.0
        Value used for ``Prev_MPG`` when a player has no previous-season value.
        The default reproduces the previously hard-coded constant.

    Returns
    -------
    pd.DataFrame
        One row per candidate with ``playerID``, ``tmID``, ``stint``, ``year``,
        the ``Prev_*`` features, ``Is_Rookie`` and the label ``Won_6th``
        (1.0 / 0.0).
    """
    prev_year = target_year - 1

    # One row per player on the target-year roster: after sorting by stint,
    # keep the last stint so a player who changed teams mid-season appears
    # once, with the team of her final stint.
    current_roster = (
        player_features.loc[player_features['year'] == target_year, ['playerID', 'tmID', 'stint', 'year']]
        .sort_values(by=['playerID', 'stint'])
        .drop_duplicates(subset=['playerID'], keep='last')
    )

    # Previous-season totals per player (stints are summed).
    history_raw = player_features[player_features['year'] == prev_year]
    history = history_raw.groupby('playerID').agg({
        'GP': 'sum',
        'GS': 'sum',
        'minutes': 'sum',
        'points': 'sum',
        'rebounds': 'sum',
        'assists': 'sum'
    }).reset_index()

    history['Prev_PPG'] = history['points'] / history['GP']
    history['Prev_PPM'] = history['points'] / history['minutes']
    history['Prev_MPG'] = history['minutes'] / history['GP']
    history['Prev_APG'] = history['assists'] / history['GP']
    history['Prev_RPG'] = history['rebounds'] / history['GP']
    history['Prev_Start_Ratio'] = history['GS'] / history['GP'] # Share of games played as a starter

    history_cols = ['playerID', 'Prev_PPG', 'Prev_PPM', 'Prev_MPG', 'Prev_APG', 'Prev_RPG', 'Prev_Start_Ratio']

    candidates = pd.merge(current_roster, history[history_cols], on='playerID', how='left')

    # A missing previous-season PPG marks the player as a rookie.
    candidates['Is_Rookie'] = candidates['Prev_PPG'].isna().astype(int)

    # Rookie imputation: previous-season league averages for the scoring,
    # assist and rebound rates; a fixed default for minutes per game; and a
    # start ratio of 0 so rookies always pass the bench-player filter below.
    rookie_fill = {
        'Prev_PPG': history['Prev_PPG'].mean(),
        'Prev_PPM': history['Prev_PPM'].mean(),
        'Prev_MPG': rookie_default_mpg,
        'Prev_APG': history['Prev_APG'].mean(),
        'Prev_RPG': history['Prev_RPG'].mean(),
        'Prev_Start_Ratio': 0.0,
    }
    for column, fill_value in rookie_fill.items():
        candidates[column] = candidates[column].fillna(fill_value)

    # Keep only players who started fewer than half of their games last season.
    candidates = candidates[candidates['Prev_Start_Ratio'] < 0.5].copy()

    # Build the label table as a fresh frame (no assignment onto a filtered
    # slice) and de-duplicate it so repeated award rows cannot multiply
    # candidate rows in the left merge below.
    is_target_award = (awards_df['year'] == target_year) & (awards_df['award'] == 'Sixth Woman of the Year')
    awards_target = awards_df.loc[is_target_award, ['playerID']].drop_duplicates().assign(Won_6th=1)

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_6th'] = final_df['Won_6th'].fillna(0)

    return final_df


# Training seasons for the year-10 evaluation: target years 8 and 9.
data_8, data_9 = (
    prepare_6th_data(season, player_features, awards_players)
    for season in (8, 9)
)

train_6th_10 = pd.concat([data_8, data_9], ignore_index=True)

features_6th = [
    'Prev_PPG',
    'Prev_PPM',
    'Prev_MPG',
    'Prev_APG',
    'Prev_RPG',
    'Is_Rookie',
]

# Show the positive examples in the training set.
train_6th_10.loc[train_6th_10['Won_6th'] == 1].head()

Unnamed: 0,playerID,tmID,stint,year,Prev_PPG,Prev_PPM,Prev_MPG,Prev_APG,Prev_RPG,Prev_Start_Ratio,Is_Rookie,Won_6th
67,pierspl01w,DET,0,8,6.470588,0.389381,16.617647,0.705882,3.882353,0.0,0,1.0
216,wiggica01w,MIN,0,9,6.731642,0.326749,15.0,1.495325,3.078135,0.0,1,1.0


In [12]:
# Standardise the features on the training seasons only; the same fitted
# scaler is reused for the evaluation season below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_6th_10[features_6th])
y_train = train_6th_10['Won_6th']

model_6th_log = LogisticRegression(random_state=42, class_weight='balanced').fit(X_train_scaled, y_train)

candidates_10_6th = prepare_6th_data(10, players_teams, awards_players)

X_test_scaled = scaler.transform(candidates_10_6th[features_6th])
probs = model_6th_log.predict_proba(X_test_scaled)[:, 1]

# Normalise the individual win probabilities so they sum to 1 across candidates.
candidates_10_6th['Win_Prob'] = probs
total_prob = candidates_10_6th['Win_Prob'].sum()
candidates_10_6th['Vote_Share'] = candidates_10_6th['Win_Prob'].div(total_prob)

ranking_6th = candidates_10_6th.loc[
    :,
    ['playerID', 'tmID', 'Prev_PPG', 'Prev_PPM', 'Prev_APG', 'Prev_RPG', 'Prev_MPG', 'Vote_Share'],
].sort_values(by='Vote_Share', ascending=False)

ranking_6th['Share_Pct'] = ranking_6th['Vote_Share'].apply(lambda x: f"{x:.1%}")

print("\n--- TOP Sixth Woman of the Year candidates (Year 10) ---")
# Display everything except the raw share; Share_Pct is its formatted form.
ranking_6th.drop(columns='Vote_Share').head(10)


--- TOP Sixth Woman of the Year candidates (Year 10) ---


Unnamed: 0,playerID,tmID,Prev_PPG,Prev_PPM,Prev_APG,Prev_RPG,Prev_MPG,Share_Pct
43,lyttlsa01w,ATL,8.222222,0.453988,0.888889,6.222222,18.111111,3.0%
8,braxtka01w,DET,8.939394,0.5,0.818182,5.090909,17.878788,2.9%
33,jacksti02w,NYL,8.32,0.419355,1.04,5.72,19.84,2.9%
26,haydeva01w,LAS,5.233333,0.516447,0.4,3.5,10.133333,2.9%
32,houstch01w,MIN,8.848485,0.502582,0.848485,3.69697,17.606061,2.7%
37,kellycr01w,DET,7.363636,0.441016,0.454545,3.30303,16.69697,2.6%
24,harpela01w,SAC,5.529412,0.335714,0.352941,4.029412,16.470588,2.4%
13,davenje01w,IND,4.642857,0.422078,0.285714,2.857143,11.0,2.3%
60,pierspl01w,DET,11.857143,0.511556,2.25,4.857143,23.178571,2.2%
59,philler01w,CON,5.125,0.476744,0.375,2.125,10.75,2.1%


The year 10 winner was bonnede01w, who did not appear in the model's top 10.

Predicting for year 11

In [13]:
# Training seasons for the year-11 prediction: target years 8, 9 and 10.
data_8, data_9, data_10 = (
    prepare_6th_data(season, player_features, awards_players)
    for season in (8, 9, 10)
)

train_6th = pd.concat([data_8, data_9, data_10], ignore_index=True)

features_6th = [
    'Prev_PPG',
    'Prev_PPM',
    'Prev_MPG',
    'Prev_APG',
    'Prev_RPG',
    'Is_Rookie',
]

train_6th.loc[train_6th['Won_6th'] == 1].head()

Unnamed: 0,playerID,tmID,stint,year,Prev_PPG,Prev_PPM,Prev_MPG,Prev_APG,Prev_RPG,Prev_Start_Ratio,Is_Rookie,Won_6th
67,pierspl01w,DET,0,8,6.470588,0.389381,16.617647,0.705882,3.882353,0.0,0,1.0
216,wiggica01w,MIN,0,9,6.731642,0.326749,15.0,1.495325,3.078135,0.0,1,1.0
232,bonnede01w,PHO,0,10,6.419933,0.326247,15.0,1.38058,2.981079,0.0,1,1.0


In [14]:
# Standardise the features on the training seasons only; the same fitted
# scaler is reused for the year-11 candidates below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_6th[features_6th])
y_train = train_6th['Won_6th']

model_6th_log = LogisticRegression(random_state=42, class_weight='balanced')
model_6th_log.fit(X_train_scaled, y_train)

candidates_11_6th = prepare_6th_data(11, all_players_teams, awards_players)

X_test_scaled = scaler.transform(candidates_11_6th[features_6th])
probs = model_6th_log.predict_proba(X_test_scaled)[:, 1]

# Normalise the individual win probabilities so they sum to 1 across candidates.
# (The probability column was previously assigned twice; once is enough.)
candidates_11_6th['Win_Prob'] = probs
total_prob = candidates_11_6th['Win_Prob'].sum()
candidates_11_6th['Vote_Share'] = candidates_11_6th['Win_Prob'] / total_prob

ranking_6th = candidates_11_6th[['playerID', 'tmID', 'Prev_PPG', 'Prev_PPM', 'Prev_APG', 'Prev_RPG', 'Prev_MPG', 'Vote_Share']].sort_values(by='Vote_Share', ascending=False)

# Human-readable percentage alongside the raw share.
ranking_6th['Share_Pct'] = ranking_6th['Vote_Share'].apply(lambda x: f"{x:.1%}")

print("\n--- TOP Sixth Woman of the Year candidates (Year 11) ---")
ranking_6th.head(10)


--- TOP Sixth Woman of the Year candidates (Year 11) ---


Unnamed: 0,playerID,tmID,Prev_PPG,Prev_PPM,Prev_APG,Prev_RPG,Prev_MPG,Vote_Share,Share_Pct
8,bonnede01w,PHO,11.205882,0.526243,0.382353,5.764706,21.294118,0.024406,2.4%
9,braxtka01w,PHO,9.0,0.5,1.464286,5.964286,18.0,0.021233,2.1%
73,snowmi01w,CHI,5.441176,0.367793,0.5,4.264706,14.794118,0.018651,1.9%
28,holliqu01w,NYL,4.823529,0.37788,0.147059,3.235294,12.764706,0.018126,1.8%
70,sanfona01w,PHO,6.264706,0.321267,0.647059,4.323529,19.5,0.017315,1.7%
6,bjorkan01w,CHI,7.31169,0.348347,1.515533,3.229245,15.0,0.0163,1.6%
0,adairje01w,MIN,7.31169,0.348347,1.515533,3.229245,15.0,0.0163,1.6%
4,ayimmi01w,TUL,7.31169,0.348347,1.515533,3.229245,15.0,0.0163,1.6%
1,adamsda01w,SAS,7.31169,0.348347,1.515533,3.229245,15.0,0.0163,1.6%
24,griffke01w,CON,7.31169,0.348347,1.515533,3.229245,15.0,0.0163,1.6%


Rookie of the Year

Since there isn’t much relevant data available to help predict a possible winner for this award, we had to focus on the data available for each rookie before the season starts (height, weight, position — C, G, F). In addition, we created a college score associated with each player, which evaluates whether a college is known for producing many or few players for the league (college prestige). This way, the model takes into account whether a player comes from a strong, talent-developing college

In [15]:
from datetime import datetime

# ==========================
# Rookie of the Year
# ==========================

def calculate_college_prestige(players_df):
    """Map each college to the number of rows in ``players_df`` that list it.

    Rows with a missing ``college`` are not counted (``value_counts`` drops
    NaN by default). The count equals the number of distinct players only if
    ``players_df`` holds one row per player.
    """
    return players_df['college'].value_counts().to_dict()


def prepare_rookie_data(target_year, players_df, players_teams_df, awards_df, college_prestige_map):
    """Build the pre-season feature table for the rookies of ``target_year``.

    A rookie is a player whose earliest ``year`` in ``players_teams_df`` equals
    ``target_year``.

    Args:
        target_year: season whose rookies are returned.
        players_df: player bios; reads ``bioID``, ``pos``, ``height``,
            ``weight`` and ``college``.
        players_teams_df: player-season rows; reads ``playerID`` and ``year``.
        awards_df: award rows; reads ``playerID``, ``award`` and ``year``.
        college_prestige_map: ``{college: score}``; colleges absent from the
            map get a score of 1.

    Returns:
        A new DataFrame (RangeIndex, one row per rookie profile) with columns
        ``bioID, height, weight, College_Score, Is_Guard, Is_Forward,
        Is_Center, Won_ROY`` where ``Won_ROY`` is 1.0 for the season's
        'Rookie of the Year' and 0.0 otherwise, or ``None`` when no rookie of
        that season has a row in ``players_df``.
    """
    first_seasons = players_teams_df.groupby('playerID')['year'].min()
    rookie_ids = first_seasons[first_seasons == target_year].index

    rookie_profiles = players_df[players_df['bioID'].isin(rookie_ids)].copy()
    if rookie_profiles.empty:
        return None

    rookie_profiles['College_Score'] = rookie_profiles['college'].map(college_prestige_map).fillna(1)

    # Positions can be compound (e.g. "F-C"), so each flag is a substring test.
    # na=False maps a missing position straight to 0 instead of filling an
    # object-dtype mask afterwards.
    for flag, letter in (('Is_Guard', 'G'), ('Is_Forward', 'F'), ('Is_Center', 'C')):
        rookie_profiles[flag] = rookie_profiles['pos'].str.contains(letter, na=False).astype(int)

    # Only NaN is imputed (with the mean of this season's rookies).
    # NOTE(review): the notebook output shows rookies with weight == 0, which looks
    # like a missing-value placeholder; such zeros are kept as-is here - confirm.
    for measure in ('height', 'weight'):
        rookie_profiles[measure] = rookie_profiles[measure].fillna(rookie_profiles[measure].mean())

    # Membership test instead of a merge: repeated award rows can no longer
    # duplicate a rookie, and nothing is assigned into a filtered slice.
    is_roy = (awards_df['year'] == target_year) & (awards_df['award'] == 'Rookie of the Year')
    roy_winners = awards_df.loc[is_roy, 'playerID']
    rookie_profiles['Won_ROY'] = rookie_profiles['bioID'].isin(roy_winners).astype(float)

    features_cols = ['bioID', 'height', 'weight', 'College_Score', 'Is_Guard', 'Is_Forward', 'Is_Center', 'Won_ROY']

    # reset_index returns an independent frame, so callers can add columns to the
    # result without a SettingWithCopyWarning.
    return rookie_profiles[features_cols].reset_index(drop=True)

# College prestige is computed once from the full player table and reused for every season.
college_map = calculate_college_prestige(players)
features_roy = ['height', 'weight', 'College_Score','Is_Guard', 'Is_Forward', 'Is_Center']
years_with_data = list(range(2, 10))

# Keep only the seasons that have rookies and a recorded Rookie of the Year among them.
train_list = []
for y in years_with_data:
    df = prepare_rookie_data(y, players, players_teams, awards_players, college_map)
    if df is None or df['Won_ROY'].sum() <= 0:
        continue
    train_list.append(df)

train_roy = pd.concat(train_list, ignore_index=True)

# Sanity check: the past winners and their features.
train_roy.loc[train_roy['Won_ROY'] == 1].head(10)

Unnamed: 0,bioID,height,weight,College_Score,Is_Guard,Is_Forward,Is_Center,Won_ROY
47,stileja01w,68.0,144,1.0,1,0,0,1.0
70,catchta01w,73.0,167,38.0,0,1,0,1.0
111,fordch01w,75.0,198,17.0,0,1,0,1.0
163,tauradi01w,72.0,172,26.0,1,1,0,1.0
183,johnste01w,63.0,132,16.0,1,0,0,1.0
210,augusse01w,72.0,179,16.0,0,1,0,1.0
268,pricear01w,69.0,133,3.0,1,1,0,1.0
309,parkeca01w,76.0,175,38.0,0,1,0,1.0


In [16]:
# Standardise the features and fit a class-balanced logistic regression on seasons 2-9.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_roy[features_roy])
y_train = train_roy['Won_ROY']

model_roy_log = LogisticRegression(random_state=42, class_weight='balanced').fit(X_train_scaled, y_train)

# Score the season-10 rookies with the scaler fitted on the training seasons.
candidates_10_roy = prepare_rookie_data(10, players, players_teams, awards_players, college_map)
X_test_scaled = scaler.transform(candidates_10_roy[features_roy])
probs = model_roy_log.predict_proba(X_test_scaled)[:, 1]

# Normalise the raw probabilities so they sum to 1 across the candidates ("vote share").
candidates_10_roy['ROY_Prob'] = probs
total_prob = candidates_10_roy['ROY_Prob'].sum()
candidates_10_roy['Vote_Share'] = candidates_10_roy['ROY_Prob'] / total_prob

ranking_roy = (
    candidates_10_roy[['bioID', 'College_Score', 'height', 'weight', 'Vote_Share']]
    .sort_values(by='Vote_Share', ascending=False)
    .assign(Share_Pct=lambda ranked: ranked['Vote_Share'].apply(lambda x: f"{x:.1%}"))
)

print("\n--- TOP Rookie of the Year candidates (Year 10) ---")
ranking_roy.head(10)


--- TOP Rookie of the Year candidates (Year 10) ---


Unnamed: 0,bioID,College_Score,height,weight,Vote_Share,Share_Pct
16,montgre01w,26.0,67.0,139,0.116433,11.6%
12,mccanra01w,16.0,73.0,161,0.100717,10.1%
4,colemma01w,14.0,73.0,160,0.100592,10.1%
22,wirthch01w,11.0,73.0,185,0.088062,8.8%
19,tolivkr01w,14.0,67.0,130,0.079658,8.0%
23,wisdoli01w,12.0,74.0,186,0.077374,7.7%
15,montaan01w,6.0,73.0,190,0.072599,7.3%
13,mccouan01w,1.0,73.0,160,0.053679,5.4%
1,boddiwh01w,13.0,69.0,149,0.051849,5.2%
9,januabr01w,6.0,68.0,144,0.044155,4.4%


In year 10, the actual Rookie of the Year (mccouan01w) was ranked 8th by the model.

Predicting for year 11

In [17]:
# Work on copies so the source frames are left untouched.
roster_11_ids, history_ids, players_clean = players_teams_11.copy(), players_teams.copy(), players.copy()

# A year-11 rookie is anyone on the year-11 roster who never appears in the earlier seasons.
seen_before = roster_11_ids['playerID'].isin(history_ids['playerID'])
rookie_ids_11 = roster_11_ids.loc[~seen_before, 'playerID'].unique()

rookie_profiles_11 = players_clean.loc[players_clean['bioID'].isin(rookie_ids_11)].copy()

# Same features as prepare_rookie_data, minus the (unknown) award label.
rookie_profiles_11['College_Score'] = rookie_profiles_11['college'].map(college_map).fillna(1)

for flag_col, letter in (('Is_Guard', 'G'), ('Is_Forward', 'F'), ('Is_Center', 'C')):
    rookie_profiles_11[flag_col] = rookie_profiles_11['pos'].str.contains(letter).fillna(False).astype(int)

for body_col in ('height', 'weight'):
    rookie_profiles_11[body_col] = rookie_profiles_11[body_col].fillna(rookie_profiles_11[body_col].mean())

features_roy = ['height', 'weight', 'College_Score', 'Is_Guard', 'Is_Forward', 'Is_Center']

rookie_profiles_11.head(10)

Unnamed: 0,bioID,pos,height,weight,college,collegeOther,birthDate,deathDate,College_Score,Is_Guard,Is_Forward,Is_Center
2,adairje01w,C,76.0,197,George Washington,,1986-12-19,0000-00-00,6.0,0,0,1
3,adamsda01w,F-C,73.0,239,Texas A&M,Jefferson College (JC),1989-02-19,0000-00-00,8.0,0,1,1
25,appelja01w,C,76.0,210,Stanford,,1988-05-14,0000-00-00,29.0,0,0,1
33,ayimmi01w,F,75.0,0,Pepperdine,,1988-05-06,0000-00-00,2.0,0,1,0
68,bjorkan01w,F,72.0,166,Tennessee,,1989-07-14,0000-00-00,38.0,0,1,0
99,brelaje01w,F,75.0,170,North Carolina,,1988-02-23,0000-00-00,16.0,0,1,0
128,camba01w,C,80.0,215,none,,1991-08-18,0000-00-00,8.0,0,0,1
149,charlti01w,C,76.0,198,Connecticut,,1988-12-05,0000-00-00,26.0,0,0,1
152,cheekjo01w,F,73.0,201,Duke,,1988-06-25,0000-00-00,15.0,0,1,0
154,chestfe01w,F,75.0,180,DePaul,,1988-03-24,0000-00-00,7.0,0,1,0


In [18]:
features_roy = ['height', 'weight', 'College_Score', 'Is_Guard', 'Is_Forward', 'Is_Center']
years_with_data = list(range(2, 11))

# Rebuild the training set, this time including season 10.
train_list = []
for y in years_with_data:
    df = prepare_rookie_data(y, players, players_teams, awards_players, college_map)
    if df is None or df['Won_ROY'].sum() <= 0:
        continue
    train_list.append(df)

train_roy = pd.concat(train_list, ignore_index=True)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_roy[features_roy])
y_train = train_roy['Won_ROY']

model_roy_log = LogisticRegression(random_state=42, class_weight='balanced').fit(X_train_scaled, y_train)

# Score the year-11 rookies built in the previous cell.
X_test_scaled = scaler.transform(rookie_profiles_11[features_roy])
probs = model_roy_log.predict_proba(X_test_scaled)[:, 1]

# Normalise the raw probabilities so they sum to 1 across the candidates.
rookie_profiles_11['ROY_Prob'] = probs
total_prob = rookie_profiles_11['ROY_Prob'].sum()
rookie_profiles_11['Vote_Share'] = rookie_profiles_11['ROY_Prob'] / total_prob

ranking_roy = (
    rookie_profiles_11[['bioID', 'College_Score', 'height', 'weight', 'Vote_Share']]
    .sort_values(by='Vote_Share', ascending=False)
    .assign(Share_Pct=lambda ranked: ranked['Vote_Share'].apply(lambda x: f"{x:.1%}"))
)

print("\n--- TOP Rookie of the Year candidates (Year 11) ---")
ranking_roy.head(10)


--- TOP Rookie of the Year candidates (Year 11) ---


Unnamed: 0,bioID,College_Score,height,weight,Vote_Share,Share_Pct
68,bjorkan01w,38.0,72.0,166,0.068517,6.9%
539,moorema01w,26.0,72.0,175,0.063598,6.4%
157,chriska02w,15.0,72.0,180,0.056196,5.6%
610,phillpo01w,21.0,74.0,173,0.05214,5.2%
370,ibekwif01w,5.0,74.0,0,0.050689,5.1%
152,cheekjo01w,15.0,73.0,201,0.04966,5.0%
508,mccrada01w,8.0,71.0,170,0.048767,4.9%
599,pederka01w,29.0,76.0,190,0.04757,4.8%
220,dunlavi01w,4.0,73.0,160,0.041877,4.2%
33,ayimmi01w,2.0,75.0,0,0.041857,4.2%


Defensive Player of the Year

To predict this award, we considered each player's career defensive statistics: stocks (steals plus blocks, per game and per minute) and defensive rebounds per game. Additionally, we included the points per game allowed in the previous season by the player's current team.

In [19]:
# ==========================
# Defensive Player of the Year
# ==========================

def prepare_dpoy_career_data(target_year, players_teams, teams_df, awards_df,
                             min_career_gp=15, min_career_mpg=12.0):
    """Build the Defensive Player of the Year candidate table for ``target_year``.

    Each candidate is a (playerID, tmID) pair on the ``target_year`` roster that
    has history in seasons ``1 .. target_year - 1``. Features are career
    defensive rates over that history plus the points per game allowed by the
    candidate's team in the previous season.

    Args:
        target_year: season being predicted.
        players_teams: player-season rows; reads ``playerID``, ``tmID``,
            ``year``, ``GP``, ``minutes``, ``steals``, ``blocks``,
            ``dRebounds`` and ``PF``.
        teams_df: team-season rows; reads ``year``, ``tmID``, ``d_pts``, ``GP``.
        awards_df: award rows; reads ``playerID``, ``award`` and ``year``.
        min_career_gp: candidates need strictly more career games than this.
        min_career_mpg: candidates need strictly more career minutes per game
            than this.

    Returns:
        DataFrame with the career totals, ``Career_SPG``, ``Career_BPG``,
        ``Career_DRPG``, ``Career_MPG``, ``Career_Stocks``,
        ``Career_Stocks_Per_Min``, ``Prev_Team_PA`` and ``Won_DPOY``
        (1.0 / 0.0), or ``None`` when there is no season before ``target_year``.
    """
    prev_year = target_year - 1
    if prev_year < 1:
        return None

    history = players_teams[players_teams["year"].between(1, prev_year)]
    summed = ["GP", "minutes", "steals", "blocks", "dRebounds", "PF"]
    career_stats = history.groupby("playerID").agg(dict.fromkeys(summed, "sum")).reset_index()

    # Guard the denominators: a career with 0 games/minutes is treated as 1.
    career_stats['GP'] = career_stats['GP'].replace(0, 1)
    career_stats['minutes'] = career_stats['minutes'].replace(0, 1)

    career_stats['Career_SPG'] = career_stats['steals'] / career_stats['GP']
    career_stats['Career_BPG'] = career_stats['blocks'] / career_stats['GP']
    career_stats['Career_DRPG'] = career_stats['dRebounds'] / career_stats['GP']  # defensive rebounds
    career_stats['Career_MPG'] = career_stats['minutes'] / career_stats['GP']
    # "Stocks" = steals + blocks.
    career_stats['Career_Stocks'] = career_stats['Career_SPG'] + career_stats['Career_BPG']
    career_stats['Career_Stocks_Per_Min'] = (career_stats['steals'] + career_stats['blocks']) / career_stats['minutes']

    # NOTE(review): a player listed with several teams in target_year yields one
    # row per team - confirm whether only the final stint should be kept.
    roster_target_year = players_teams.loc[players_teams["year"] == target_year, ['playerID', 'tmID']].drop_duplicates()

    # The inner join drops players without any history (e.g. rookies).
    candidates = pd.merge(roster_target_year, career_stats, on='playerID', how='inner')

    # Team defence: points allowed per game in the previous season. A team with
    # 0 games gets NaN (then the league average) rather than inf.
    teams_prev = teams_df.loc[teams_df['year'] == prev_year, ['tmID', 'd_pts', 'GP']].copy()
    teams_prev['Prev_Team_PA'] = teams_prev['d_pts'] / teams_prev['GP'].replace(0, np.nan)

    candidates = pd.merge(candidates, teams_prev[['tmID', 'Prev_Team_PA']], on='tmID', how='left')
    # Teams with no previous season fall back to the league average.
    candidates['Prev_Team_PA'] = candidates['Prev_Team_PA'].fillna(teams_prev['Prev_Team_PA'].mean())

    candidates = candidates[
        (candidates['GP'] > min_career_gp) &
        (candidates['Career_MPG'] > min_career_mpg)
    ].copy()

    # Built with .loc/.assign so nothing is written into a filtered slice;
    # drop_duplicates keeps repeated award rows from duplicating candidates.
    is_dpoy = (awards_df['year'] == target_year) & (awards_df['award'] == 'Defensive Player of the Year')
    awards_target = awards_df.loc[is_dpoy, ['playerID']].drop_duplicates().assign(Won_DPOY=1)

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_DPOY'] = final_df['Won_DPOY'].fillna(0)

    return final_df

years_train = [2, 3, 4, 5, 6, 7, 8, 9]
features_dpoy = ['Career_Stocks', 'Career_Stocks_Per_Min', 'Career_DRPG', 'Prev_Team_PA']

# One candidate table per training season, stacked into a single frame.
train_list = []
for y in years_train:
    df = prepare_dpoy_career_data(y, players_teams, teams, awards_players)
    train_list.append(df)

train_dpoy = pd.concat(train_list, ignore_index=True)

# Kept as the LAST expression of the cell: Jupyter only renders the final
# expression, so with the feature list assigned after it this preview of the
# past winners was never displayed.
train_dpoy[train_dpoy['Won_DPOY'] == 1].head(10)

In [20]:
# Standardise the features and fit a class-balanced logistic regression on seasons 2-9.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_dpoy[features_dpoy])
y_train = train_dpoy['Won_DPOY']

model_log_dpoy = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000).fit(X_train_scaled, y_train)

# Coefficients are on standardised features, so their magnitudes are comparable.
coefs = pd.DataFrame({'Feature': features_dpoy, 'Coeficiente': model_log_dpoy.coef_[0]})
print("--- Features's importance ---")
print(coefs.sort_values(by='Coeficiente', ascending=False))

# Score the season-10 candidates with the scaler fitted on the training seasons.
candidates_10_dpoy = prepare_dpoy_career_data(10, players_teams, teams, awards_players)
X_test_scaled = scaler.transform(candidates_10_dpoy[features_dpoy])
probs = model_log_dpoy.predict_proba(X_test_scaled)[:, 1]

# Normalise the raw probabilities so they sum to 1 across the candidates.
candidates_10_dpoy['Raw_Prob'] = probs
total_prob = candidates_10_dpoy['Raw_Prob'].sum()
candidates_10_dpoy['DPOY_Share'] = candidates_10_dpoy['Raw_Prob'] / total_prob

dpoy_display_cols = [
    'playerID', 'tmID', 'Career_SPG', 'Career_BPG', 'Career_Stocks',
    'Career_Stocks_Per_Min', 'Career_DRPG', 'Prev_Team_PA', 'DPOY_Share',
]
ranking_dpoy_log = (
    candidates_10_dpoy[dpoy_display_cols]
    .sort_values(by='DPOY_Share', ascending=False)
    .assign(Share_Pct=lambda ranked: ranked['DPOY_Share'].apply(lambda x: f"{x:.1%}"))
)

print("\n--- TOP Defensive Player of the Year candidates (Year 10) ---")
ranking_dpoy_log.head(10)

--- Features's importance ---
                 Feature  Coeficiente
0          Career_Stocks     2.534585
1  Career_Stocks_Per_Min     0.187387
2            Career_DRPG    -0.700504
3           Prev_Team_PA    -0.713271

--- TOP Defensive Player of the Year candidates (Year 10) ---


Unnamed: 0,playerID,tmID,Career_SPG,Career_BPG,Career_Stocks,Career_Stocks_Per_Min,Career_DRPG,Prev_Team_PA,DPOY_Share,Share_Pct
66,leslili01w,LAS,1.424603,2.464286,3.888889,0.11863,6.916667,74.205882,0.092269,9.2%
18,catchta01w,IND,2.509434,0.943396,3.45283,0.102478,5.448113,72.264706,0.08925,8.9%
1,anosini01w,MIN,2.205882,1.264706,3.470588,0.128261,4.5,80.029412,0.087453,8.7%
84,parkeca01w,LAS,1.272727,2.272727,3.545455,0.1055,6.939394,74.205882,0.083374,8.3%
51,jacksla01w,SEA,1.160338,2.046414,3.206751,0.097436,5.624473,70.764706,0.08217,8.2%
31,fowlesy01w,CHI,1.117647,2.117647,3.235294,0.127907,5.823529,73.823529,0.08033,8.0%
47,hornbal01w,DET,2.323529,0.294118,2.617647,0.119143,2.5,74.176471,0.071674,7.2%
34,griffyo01w,IND,1.634409,0.956989,2.591398,0.091081,4.315412,72.264706,0.048516,4.9%
104,suttota01w,IND,0.811538,1.507692,2.319231,0.088365,3.665385,72.264706,0.03335,3.3%
95,rileyru01w,SAS,0.687747,1.55336,2.241107,0.088139,3.438735,71.117647,0.033301,3.3%


The one who won in year 10 was catchta01w, who was predicted to finish second according to the model.

Predicting for year 11

In [21]:
years_train = [2, 3, 4, 5, 6, 7, 8, 9, 10]
features_dpoy = ['Career_Stocks', 'Career_Stocks_Per_Min', 'Career_DRPG', 'Prev_Team_PA']

# Rebuild the training set, this time including season 10.
train_list = []
for y in years_train:
    df = prepare_dpoy_career_data(y, players_teams, teams, awards_players)
    train_list.append(df)

train_dpoy = pd.concat(train_list, ignore_index=True)

# Kept as the LAST expression of the cell: Jupyter only renders the final
# expression, so with the feature list assigned after it this preview of the
# past winners was never displayed.
train_dpoy[train_dpoy['Won_DPOY'] == 1].head(10)

In [22]:
# Standardise the features and fit a class-balanced logistic regression on seasons 2-10.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_dpoy[features_dpoy])
y_train = train_dpoy['Won_DPOY']

model_log_dpoy = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000).fit(X_train_scaled, y_train)

# Year 11 candidates come from all_players_teams, the frame passed here in place of players_teams.
candidates_11_dpoy = prepare_dpoy_career_data(11, all_players_teams, teams, awards_players)
X_test_scaled = scaler.transform(candidates_11_dpoy[features_dpoy])
probs = model_log_dpoy.predict_proba(X_test_scaled)[:, 1]

# Normalise the raw probabilities so they sum to 1 across the candidates.
candidates_11_dpoy['Raw_Prob'] = probs
total_prob = candidates_11_dpoy['Raw_Prob'].sum()
candidates_11_dpoy['DPOY_Share'] = candidates_11_dpoy['Raw_Prob'] / total_prob

dpoy_display_cols = [
    'playerID', 'tmID', 'Career_SPG', 'Career_BPG', 'Career_Stocks',
    'Career_Stocks_Per_Min', 'Career_DRPG', 'Prev_Team_PA', 'DPOY_Share',
]
ranking_dpoy_log = (
    candidates_11_dpoy[dpoy_display_cols]
    .sort_values(by='DPOY_Share', ascending=False)
    .assign(Share_Pct=lambda ranked: ranked['DPOY_Share'].apply(lambda x: f"{x:.1%}"))
)

print("\n--- TOP Defensive Player of the Year candidates (Year 11) ---")
ranking_dpoy_log.head(10)


--- TOP Defensive Player of the Year candidates (Year 11) ---


Unnamed: 0,playerID,tmID,Career_SPG,Career_BPG,Career_Stocks,Career_Stocks_Per_Min,Career_DRPG,Prev_Team_PA,DPOY_Share,Share_Pct
1,anosini01w,WAS,2.4375,1.109375,3.546875,0.125,4.71875,77.117647,0.150598,15.1%
15,catchta01w,IND,2.565041,0.886179,3.45122,0.103197,5.341463,73.558824,0.148592,14.9%
34,jacksla01w,SEA,1.190114,2.015209,3.205323,0.097536,5.577947,72.823529,0.132335,13.2%
58,parkeca01w,LAS,0.982759,2.206897,3.189655,0.096154,7.12069,73.470588,0.099973,10.0%
76,swoopsh01w,TUL,2.158371,0.647059,2.80543,0.082163,3.909502,78.346154,0.083944,8.4%
49,mccouan01w,ATL,2.176471,0.352941,2.529412,0.117007,2.058824,82.264706,0.060062,6.0%
70,smithta01w,IND,1.136646,1.319876,2.456522,0.082888,3.996894,73.558824,0.056273,5.6%
75,suttota01w,IND,0.787456,1.505226,2.292683,0.087687,3.71777,73.558824,0.039968,4.0%
53,miltode01w,LAS,1.446667,0.81,2.256667,0.071106,4.083333,73.470588,0.030463,3.0%
23,fowlesy01w,CHI,0.97561,1.731707,2.707317,0.09893,5.731707,79.205882,0.02982,3.0%


WNBA Finals Most Valuable Player

In [23]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import average_precision_score

# Private copies for the Finals MVP section, so the shared frames stay untouched.
teams_10, pt_10, awards_10 = teams.copy(), players_teams.copy(), awards_players.copy()
teams_11_meta, pt_11_roster = teams_11.copy(), players_teams_11.copy()

# Strip stray whitespace from the column names of every copy.
for df in (teams_10, pt_10, awards_10, teams_11_meta, pt_11_roster):
    df.columns = list(map(str.strip, df.columns))


def playoff_score_prevyear(teams_df):
    """Score each team-season's playoff run and key it to the following year.

    Scores: 4 = won the finals, 3 = lost the finals, 2 = lost in the semis,
    1 = lost in the first round or merely made the playoffs, 0 = otherwise.

    Returns:
        DataFrame with columns ``year`` (original year + 1), ``tmID`` and
        ``Prev_Playoff_Score``, ready to be merged on ``['year', 'tmID']``.
    """
    rounds = teams_df[['year', 'tmID', 'playoff', 'firstRound', 'semis', 'finals']]

    # (condition, score) pairs in priority order: np.select keeps the first match.
    rules = [
        (rounds['finals'] == 'W', 4),
        (rounds['finals'] == 'L', 3),
        (rounds['semis'] == 'L', 2),
        (rounds['firstRound'] == 'L', 1),
        (rounds['playoff'] == 'Y', 1),
    ]
    masks = [mask for mask, _ in rules]
    scores = [score for _, score in rules]

    shifted = rounds[['year', 'tmID']].copy()
    shifted['year'] = shifted['year'] + 1
    shifted['Prev_Playoff_Score'] = np.select(masks, scores, default=0)
    return shifted


def team_prev_winpct(teams_df):
    """Compute each team-season's win percentage and key it to the following year.

    A team with no games (``won + lost == 0``) gets a neutral 0.5.

    Returns:
        DataFrame with columns ``year`` (original year + 1), ``tmID`` and
        ``Prev_Team_Win_Pct``.
    """
    games_played = (teams_df['won'] + teams_df['lost']).replace(0, np.nan)

    shifted = teams_df[['year', 'tmID']].copy()
    shifted['year'] = shifted['year'] + 1
    shifted['Prev_Team_Win_Pct'] = (teams_df['won'] / games_played).fillna(0.5)
    return shifted


def aggregate_player_year_stats(pt_df):
    """
    Collapse stints into one row per (playerID, year) and derive per-game stats.

    Counting stats are summed over the stints of a season. Ratios whose
    denominator is zero end up as 0 in the returned frame, as do the zero
    denominators themselves.
    """
    counting_cols = [
        "GP", "GS", "minutes",
        "points", "assists", "rebounds",
        "steals", "blocks", "turnovers",
        "fgMade", "fgAttempted",
        "threeMade", "threeAttempted",
    ]
    totals = pt_df.groupby(["playerID", "year"], as_index=False).agg(dict.fromkeys(counting_cols, "sum"))

    # Zero denominators become NaN so the ratios below never divide by zero.
    for denom in ("GP", "minutes", "fgAttempted", "threeAttempted"):
        totals[denom] = totals[denom].replace(0, np.nan)

    games = totals["GP"]
    per_game = (
        ("PPG", "points"), ("APG", "assists"), ("RPG", "rebounds"),
        ("SPG", "steals"), ("BPG", "blocks"), ("MPG", "minutes"),
    )
    for label, source in per_game:
        totals[label] = totals[source] / games

    totals["FG%"] = (totals["fgMade"] / totals["fgAttempted"]).fillna(0)
    totals["3P%"] = (totals["threeMade"] / totals["threeAttempted"]).fillna(0)
    totals["Start_Ratio"] = (totals["GS"] / games).fillna(0)

    # Efficiency per game: positive box-score contributions minus turnovers.
    box_score = (totals["points"] + totals["rebounds"] + totals["assists"]
                 + totals["steals"] + totals["blocks"] - totals["turnovers"])
    totals["EFF_G"] = box_score / games

    return totals.replace([np.inf, -np.inf], np.nan).fillna(0)


def fmvp_award_rows(awards_df):
    """Return the distinct (playerID, year) pairs of Finals MVP awards.

    An award row qualifies when its name contains both "Finals" and
    "Most Valuable Player", ignoring case.
    """
    award_names = awards_df["award"].astype(str)
    mentions_finals = award_names.str.contains("Finals", case=False, na=False)
    mentions_mvp = award_names.str.contains("Most Valuable Player", case=False, na=False)

    winners = awards_df.loc[mentions_finals & mentions_mvp, ["playerID", "year"]]
    return winners.drop_duplicates()


def prepare_fmvp_preseason_data(target_year, pt_hist, teams_hist, awards_hist, roster_df_for_year):
    """
    Build the pre-season Finals MVP table for ``target_year``.

    Each row is one player on the ``target_year`` roster. Features are the
    player's stats from the previous season plus the strength of the player's
    team in the previous season; the label ``FMVP`` is 1 when the player won
    the Finals MVP award in ``target_year``.

    Parameters:
        target_year: season being modelled.
        pt_hist: player-season stat rows, passed to aggregate_player_year_stats.
        teams_hist: team-season rows, passed to team_prev_winpct and
            playoff_score_prevyear.
        awards_hist: award rows, passed to fmvp_award_rows.
        roster_df_for_year: roster rows; reads ``playerID``, ``tmID``,
            ``stint`` and ``year``.

    Returns:
        DataFrame with one row per rostered player, the ``Prev_*`` features,
        ``Years_Exp``, ``Is_Rookie_or_NoPrev`` and ``FMVP``.
    """
    prev_year = target_year - 1

    # roster of the target season; a player with several stints keeps only the last one
    roster = roster_df_for_year[roster_df_for_year["year"] == target_year][["playerID","tmID","stint","year"]].copy()
    roster = roster.sort_values(["playerID","stint"]).drop_duplicates("playerID", keep="last")

    py = aggregate_player_year_stats(pt_hist)
    prev_stats = py[py["year"] == prev_year].copy()

    # Both frames carry "year": the roster's stays "year" (the target season),
    # the stats' becomes "year_prev".
    df = roster.merge(prev_stats, on="playerID", how="left", suffixes=("", "_prev"))

    # rookie, or no stats in the previous year (must be computed before the NaNs are filled)
    df["Is_Rookie_or_NoPrev"] = df["GP"].isna().astype(int)

    # fill NAs with the previous-season mean of the column (0 if that mean is itself NaN)
    num_cols = ["PPG","APG","RPG","SPG","BPG","MPG","FG%","3P%","EFF_G","Start_Ratio","GP"]
    for c in num_cols:
        if c in df.columns:
            df[c] = df[c].fillna(prev_stats[c].mean() if c in prev_stats.columns else 0).fillna(0)

    # Experience = seasons between the first season found in pt_hist and prev_year, never negative
    rookie_year = py.groupby("playerID")["year"].min().rename("rookie_year").reset_index()
    df = df.merge(rookie_year, on="playerID", how="left")
    df["Years_Exp"] = (prev_year - df["rookie_year"]).fillna(0).clip(lower=0)

    # previous-season team features (win% and playoff score); both helpers already
    # shift "year" by +1, so they join directly on the target season
    prev_win = team_prev_winpct(teams_hist)
    prev_play = playoff_score_prevyear(teams_hist)

    df = df.merge(prev_win, on=["year","tmID"], how="left")
    df = df.merge(prev_play, on=["year","tmID"], how="left")

    # teams with no previous season get neutral defaults
    df["Prev_Team_Win_Pct"] = df["Prev_Team_Win_Pct"].fillna(0.5)
    df["Prev_Playoff_Score"] = df["Prev_Playoff_Score"].fillna(0)

    # label: Finals MVP of target_year
    fmvp = fmvp_award_rows(awards_hist).copy()
    fmvp["FMVP"] = 1
    df = df.merge(fmvp, on=["playerID","year"], how="left")
    df["FMVP"] = df["FMVP"].fillna(0).astype(int)

    # prefix the previous-season stats with "Prev_"
    df = df.rename(columns={
        "PPG":"Prev_PPG","APG":"Prev_APG","RPG":"Prev_RPG","SPG":"Prev_SPG","BPG":"Prev_BPG",
        "MPG":"Prev_MPG","FG%":"Prev_FG%","3P%":"Prev_3P%","EFF_G":"Prev_EFF_G","Start_Ratio":"Prev_Start_Ratio"
    })

    return df

In [24]:
# Rosters of the historical seasons and of year 11 stacked together, so the
# same frame can be used for any target year.
roster_all = pd.concat([
    pt_10[["playerID","year","stint","tmID"]].copy(),
    pt_11_roster[["playerID","year","stint","tmID"]].copy()
], ignore_index=True)

# Training rows: seasons 2-9 (season 10 is held out for the check below).
train_rows = []
for y in range(2, 10):
    df_y = prepare_fmvp_preseason_data(
        target_year=y,
        pt_hist=pt_10,
        teams_hist=teams_10,
        awards_hist=awards_10,
        roster_df_for_year=roster_all
    )
    train_rows.append(df_y)

train_df = pd.concat(train_rows, ignore_index=True)

# Held-out season used to sanity-check the ranking.
pred_df_10 = prepare_fmvp_preseason_data(
    target_year=10,
    pt_hist=pt_10,
    teams_hist=teams_10,
    awards_hist=awards_10,
    roster_df_for_year=roster_all
)

FEATURES = [
    "Prev_PPG","Prev_APG","Prev_RPG","Prev_SPG","Prev_BPG",
    "Prev_MPG","Prev_FG%","Prev_3P%","Prev_EFF_G","Prev_Start_Ratio",
    "Prev_Team_Win_Pct","Prev_Playoff_Score",
    "Years_Exp","Is_Rookie_or_NoPrev"
]

# NOTE: `y` is rebound here from the loop counter above to the label vector.
X = train_df[FEATURES].copy()
y = train_df["FMVP"].astype(int).copy()
groups = train_df["year"].copy()

model = RandomForestClassifier(
    n_estimators=800,
    max_depth=6,
    min_samples_leaf=8,
    class_weight="balanced",
    random_state=42
)

# Grouping by season keeps all rows of a season in the same fold.
gkf = GroupKFold(n_splits=5)
pr_aucs, hit5, hit10 = [], [], []

for tr, te in gkf.split(X, y, groups):
    model.fit(X.iloc[tr], y.iloc[tr])
    prob = model.predict_proba(X.iloc[te])[:, 1]

    pr_aucs.append(average_precision_score(y.iloc[te], prob))

    # Hit@K per year: within each held-out season, is the actual winner among the K highest-ranked players?
    fold_df = train_df.iloc[te][["playerID","tmID","year","FMVP"]].copy()
    fold_df["prob"] = prob

    h5 = []
    h10 = []
    for yr, g in fold_df.groupby("year"):
        g = g.sort_values("prob", ascending=False)
        winner_ids = set(g[g["FMVP"] == 1]["playerID"].tolist())
        # seasons without a labelled winner in the fold are skipped
        if len(winner_ids) == 0:
            continue
        top5 = set(g.head(5)["playerID"].tolist())
        top10 = set(g.head(10)["playerID"].tolist())
        h5.append(1.0 if len(winner_ids & top5) > 0 else 0.0)
        h10.append(1.0 if len(winner_ids & top10) > 0 else 0.0)

    if len(h5) > 0:
        hit5.append(np.mean(h5))
        hit10.append(np.mean(h10))

print(f"FMVP (pre-season) CV PR-AUC: {np.mean(pr_aucs):.3f} ± {np.std(pr_aucs):.3f}")
print(f"FMVP Hit@5:  {np.mean(hit5) if len(hit5)>0 else np.nan:.3f}  |  Hit@10: {np.mean(hit10) if len(hit10)>0 else np.nan:.3f}")

# Refit on every training season before scoring the held-out season.
model.fit(X, y)

X_10 = pred_df_10[FEATURES].copy()
pred_df_10 = pred_df_10.copy()
pred_df_10["FMVP_Prob"] = model.predict_proba(X_10)[:, 1]

ranking_10 = pred_df_10.sort_values("FMVP_Prob", ascending=False)

print("\n--- TOP Finals MVP candidates (Year 10) ---")
ranking_10[[
    "playerID","tmID",
    "Prev_PPG","Prev_APG","Prev_RPG","Prev_EFF_G","Prev_MPG",
    "Prev_Team_Win_Pct","Prev_Playoff_Score",
    "Years_Exp", "FMVP_Prob"
]].head(10)

FMVP (pre-season) CV PR-AUC: 0.244 ± 0.382
FMVP Hit@5:  0.200  |  Hit@10: 0.400

--- TOP Finals MVP candidates (Year 10) ---


Unnamed: 0,playerID,tmID,Prev_PPG,Prev_APG,Prev_RPG,Prev_EFF_G,Prev_MPG,Prev_Team_Win_Pct,Prev_Playoff_Score,Years_Exp,FMVP_Prob
80,leslili01w,LAS,15.060606,2.424242,8.878788,27.181818,32.090909,0.588235,2,8,0.550342
62,jacksla01w,SEA,20.238095,1.238095,7.0,29.619048,33.047619,0.647059,1,7,0.491938
107,parkeca01w,LAS,18.484848,3.424242,9.484848,32.121212,33.606061,0.588235,2,0,0.400741
132,tauradi01w,PHO,24.117647,3.558824,5.058824,33.0,31.852941,0.470588,0,4,0.393275
156,youngso01w,SAS,17.454545,2.272727,5.636364,25.757576,31.939394,0.705882,3,2,0.391252
34,douglka01w,IND,15.636364,3.212121,4.090909,21.848485,34.363636,0.5,1,7,0.37802
48,hammobe01w,SAS,17.636364,4.878788,2.757576,23.636364,33.363636,0.705882,3,8,0.362236
3,augusse01w,MIN,19.064516,2.677419,3.870968,25.354839,33.612903,0.470588,0,2,0.34255
104,nolande01w,DET,15.823529,4.411765,3.911765,23.529412,33.647059,0.647059,4,7,0.34202
35,dupreca01w,CHI,16.323529,2.294118,7.941176,26.058824,32.911765,0.352941,0,2,0.289161


The one who won in year 10 was tauradi01w, who was predicted to finish fourth according to the model.

Predicting for year 11

In [25]:
# Training rows: seasons 2-10 (season 10 is now part of the training set).
train_rows = []
for y in range(2, 11):
    df_y = prepare_fmvp_preseason_data(
        target_year=y,
        pt_hist=pt_10,
        teams_hist=teams_10,
        awards_hist=awards_10,
        roster_df_for_year=roster_all
    )
    train_rows.append(df_y)

train_df = pd.concat(train_rows, ignore_index=True)

# Year 11 rows: the roster comes from roster_all; stats and team features from season 10.
pred_df_11 = prepare_fmvp_preseason_data(
    target_year=11,
    pt_hist=pt_10,
    teams_hist=teams_10,
    awards_hist=awards_10,
    roster_df_for_year=roster_all
)

FEATURES = [
    "Prev_PPG","Prev_APG","Prev_RPG","Prev_SPG","Prev_BPG",
    "Prev_MPG","Prev_FG%","Prev_3P%","Prev_EFF_G","Prev_Start_Ratio",
    "Prev_Team_Win_Pct","Prev_Playoff_Score",
    "Years_Exp","Is_Rookie_or_NoPrev"
]

# NOTE: `y` is rebound here from the loop counter above to the label vector.
X = train_df[FEATURES].copy()
y = train_df["FMVP"].astype(int).copy()
groups = train_df["year"].copy()

model = RandomForestClassifier(
    n_estimators=800,
    max_depth=6,
    min_samples_leaf=8,
    class_weight="balanced",
    random_state=42
)

# Grouping by season keeps all rows of a season in the same fold.
gkf = GroupKFold(n_splits=5)
pr_aucs, hit5, hit10 = [], [], []

for tr, te in gkf.split(X, y, groups):
    model.fit(X.iloc[tr], y.iloc[tr])
    prob = model.predict_proba(X.iloc[te])[:, 1]

    pr_aucs.append(average_precision_score(y.iloc[te], prob))

    # Hit@K per year: within each held-out season, is the actual winner among the K highest-ranked players?
    fold_df = train_df.iloc[te][["playerID","tmID","year","FMVP"]].copy()
    fold_df["prob"] = prob

    h5 = []
    h10 = []
    for yr, g in fold_df.groupby("year"):
        g = g.sort_values("prob", ascending=False)
        winner_ids = set(g[g["FMVP"] == 1]["playerID"].tolist())
        # seasons without a labelled winner in the fold are skipped
        if len(winner_ids) == 0:
            continue
        top5 = set(g.head(5)["playerID"].tolist())
        top10 = set(g.head(10)["playerID"].tolist())
        h5.append(1.0 if len(winner_ids & top5) > 0 else 0.0)
        h10.append(1.0 if len(winner_ids & top10) > 0 else 0.0)

    if len(h5) > 0:
        hit5.append(np.mean(h5))
        hit10.append(np.mean(h10))

print(f"FMVP (pre-season) CV PR-AUC: {np.mean(pr_aucs):.3f} ± {np.std(pr_aucs):.3f}")
print(f"FMVP Hit@5:  {np.mean(hit5) if len(hit5)>0 else np.nan:.3f}  |  Hit@10: {np.mean(hit10) if len(hit10)>0 else np.nan:.3f}")

# Refit on every training season before scoring year 11.
model.fit(X, y)

X_11 = pred_df_11[FEATURES].copy()
pred_df_11 = pred_df_11.copy()
pred_df_11["FMVP_Prob"] = model.predict_proba(X_11)[:, 1]

ranking_11 = pred_df_11.sort_values("FMVP_Prob", ascending=False)

print("\n--- TOP Finals MVP candidates (Year 11)---")
ranking_11[[
    "playerID","tmID",
    "Prev_PPG","Prev_APG","Prev_RPG","Prev_EFF_G","Prev_MPG",
    "Prev_Team_Win_Pct","Prev_Playoff_Score",
    "Years_Exp","FMVP_Prob"
]].head(10)

FMVP (pre-season) CV PR-AUC: 0.134 ± 0.147
FMVP Hit@5:  0.400  |  Hit@10: 0.400

--- TOP Finals MVP candidates (Year 11)---


Unnamed: 0,playerID,tmID,Prev_PPG,Prev_APG,Prev_RPG,Prev_EFF_G,Prev_MPG,Prev_Team_Win_Pct,Prev_Playoff_Score,Years_Exp,FMVP_Prob
21,catchta01w,IND,15.058824,3.147059,7.205882,26.264706,31.882353,0.647059,3.0,7.0,0.370851
121,tauradi01w,PHO,20.354839,3.516129,5.741935,29.516129,31.483871,0.676471,4.0,5.0,0.332369
31,douglka01w,IND,17.645161,2.741935,3.870968,23.870968,32.354839,0.647059,3.0,8.0,0.327975
54,jacksla01w,SEA,19.192308,0.846154,7.0,28.576923,32.423077,0.588235,1.0,8.0,0.301692
89,parkeca01w,LAS,13.12,2.56,9.76,25.96,32.6,0.529412,2.0,1.0,0.257387
138,youngso01w,SAS,18.181818,1.636364,6.515152,26.272727,33.727273,0.441176,1.0,3.0,0.255499
58,jonesas01w,CON,16.695652,2.434783,5.913043,24.347826,31.608696,0.470588,0.0,7.0,0.237401
41,hammobe01w,SAS,19.483871,5.032258,3.322581,26.225806,33.83871,0.441176,1.0,9.0,0.223649
33,dupreca01w,PHO,15.705882,2.205882,7.882353,25.647059,34.882353,0.676471,4.0,3.0,0.193518
99,powelni01w,NYL,16.676471,2.264706,5.882353,24.029412,30.441176,0.382353,0.0,5.0,0.193146


Coach of the Year

In [26]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Private copies for the Coach of the Year section, so the shared frames stay untouched.
teams_10 = teams.copy()
teams_post_10 = teams_post.copy()
pt_10 = players_teams.copy()
coaches_10 = coaches.copy()
awards_10 = awards_players.copy()

teams_11_meta = teams_11.copy()
pt_11_roster = players_teams_11.copy()

# Strip stray whitespace from the column names of every frame used below.
# teams_post_10 is now included too: it was the only copy left un-normalised.
# NOTE: coaches_11 is not a copy, so its column names are normalised in place.
for df in [teams_10, teams_post_10, pt_10, coaches_10, awards_10, teams_11_meta, coaches_11, pt_11_roster]:
    df.columns = [c.strip() for c in df.columns]


def playoff_score_prevyear(teams_df):
    """Previous-season playoff score, keyed to the following year.

    4 = won the finals, 3 = lost the finals, 2 = lost in the semis,
    1 = lost in the first round or merely qualified, 0 = otherwise.
    Returns columns ``year`` (original year + 1), ``tmID``, ``Prev_Playoff_Score``.
    """
    stage = teams_df[['year', 'tmID', 'playoff', 'firstRound', 'semis', 'finals']]

    # np.select keeps the first matching condition, so the deepest run comes first.
    score = np.select(
        [
            stage['finals'].eq('W'),
            stage['finals'].eq('L'),
            stage['semis'].eq('L'),
            stage['firstRound'].eq('L'),
            stage['playoff'].eq('Y'),
        ],
        [4, 3, 2, 1, 1],
        default=0,
    )

    return stage[['year', 'tmID']].assign(
        year=stage['year'] + 1,
        Prev_Playoff_Score=score,
    )

def team_prev_winpct(teams_df):
    """Previous-season win percentage, keyed to the following year.

    Teams with no games played get a neutral 0.5. Returns columns ``year``
    (original year + 1), ``tmID`` and ``Prev_Team_Win_Pct``.
    """
    record = teams_df[['year', 'tmID', 'won', 'lost']]
    games = record['won'].add(record['lost']).replace(0, np.nan)
    win_pct = record['won'].div(games).fillna(0.5)

    return record[['year', 'tmID']].assign(
        year=record['year'] + 1,
        Prev_Team_Win_Pct=win_pct,
    )

def win_pct_trend_feature(teams_df):
    """Win% momentum: (Win% one season ago) - (Win% two seasons ago).

    Rows are keyed to the season that follows the more recent of the two.
    The trend is 0 when the team has no season two years back. Returns
    columns ``year``, ``tmID`` and ``Win_Pct_Trend``.
    """
    games = (teams_df['won'] + teams_df['lost']).replace(0, np.nan)
    season_pct = teams_df[['year', 'tmID']].copy()
    season_pct['Win_Pct'] = (teams_df['won'] / games).fillna(0.5)

    def lagged(offset, label):
        # Re-key each season's Win% to `offset` seasons later under a new column name.
        shifted = season_pct.rename(columns={'Win_Pct': label})
        shifted['year'] = shifted['year'] + offset
        return shifted

    trend = lagged(1, 'Prev1_Win_Pct').merge(
        lagged(2, 'Prev2_Win_Pct'), on=['year', 'tmID'], how='left'
    )
    trend['Win_Pct_Trend'] = (trend['Prev1_Win_Pct'] - trend['Prev2_Win_Pct']).fillna(0)
    return trend[['year', 'tmID', 'Win_Pct_Trend']]


def coach_score_until_year(target_year, coaches_df, teams_df):
    """
    Score each team's final coach of `target_year` using only earlier seasons.

    Coach_Score = WinPct * 50 + Is_Champion * 5, rounded to 2 decimals, where
      - WinPct is summed `won` / (summed `won` + `lost`) over the coach's rows
        in `coaches_df` with year <= target_year - 1 (0 when no games), and
      - Is_Champion is 1 if the coach is listed in `coaches_df` for any
        team-season with finals == 'W' in that same window.
    Coaches without any earlier rows get 0.0, as does everyone when
    target_year - 1 < 1.

    Returns a DataFrame with columns year, tmID, coachID, Coach_Score: one row
    per team with coach rows in `target_year`, using the highest-stint coach.
    """
    last_hist_year = target_year - 1

    # The coach with the highest stint number is treated as the team's final coach.
    season_rows = coaches_df[coaches_df['year'] == target_year]
    final_coach = (
        season_rows[['tmID', 'coachID', 'stint']]
        .copy()
        .sort_values(['tmID', 'stint'])
        .groupby('tmID')
        .last()
        .reset_index()
    )

    if last_hist_year < 1:
        # No earlier season exists, so every coach starts from zero.
        blank = final_coach[['tmID', 'coachID']].copy()
        blank['year'] = target_year
        blank['Coach_Score'] = 0.0
        return blank[['year', 'tmID', 'coachID', 'Coach_Score']]

    # Career win/loss totals over the history window.
    past = coaches_df[coaches_df['year'] <= last_hist_year].copy()
    record = past.groupby('coachID')[['won', 'lost']].sum().reset_index()
    games_coached = (record['won'] + record['lost']).replace(0, np.nan)
    record['WinPct'] = (record['won'] / games_coached).fillna(0)

    # Coaches attached to a title-winning team-season inside the window.
    title_mask = (teams_df['finals'] == 'W') & (teams_df['year'] <= last_hist_year)
    title_seasons = teams_df[title_mask][['year', 'tmID']]
    title_coaches = title_seasons.merge(
        coaches_df[['year', 'tmID', 'coachID']],
        on=['year', 'tmID'],
        how='left'
    )
    champion_ids = set(title_coaches['coachID'].dropna())
    record['Is_Champion'] = record['coachID'].isin(champion_ids).astype(int)

    record['Coach_Score'] = (record['WinPct'] * 50 + record['Is_Champion'] * 5).round(2)

    # Attach the historical score to this season's final coaches; unknown coaches -> 0.
    scored = final_coach.merge(record[['coachID', 'Coach_Score']], on='coachID', how='left')
    scored['Coach_Score'] = scored['Coach_Score'].fillna(0.0)
    scored['year'] = target_year
    return scored[['year', 'tmID', 'coachID', 'Coach_Score']]

def build_roster_talent_mean(target_year, pt_hist, roster_df):
    """
    Average career Player_Score of each team's roster in `target_year`.

    - Career totals are summed per player over `pt_hist` rows with
      year <= target_year - 1, so no stats from the target season are used.
    - Each player is assigned to one team: the row with the highest stint in
      `roster_df` for `target_year`.
    - Players without history contribute a Player_Score of 0.0.

    Returns a DataFrame with columns year, tmID, Player_Score_Mean. When there
    is no history at all, every team on the target-year roster gets 0.0.
    """
    past = pt_hist[pt_hist['year'] <= target_year - 1].copy()

    if past.empty:
        season_teams = roster_df[roster_df['year'] == target_year]['tmID'].unique()
        return pd.DataFrame({'year': target_year, 'tmID': season_teams, 'Player_Score_Mean': 0.0})

    summed_cols = [
        'GP', 'points', 'assists', 'oRebounds', 'dRebounds',
        'steals', 'blocks', 'turnovers', 'PF',
        'fgAttempted', 'fgMade', 'ftAttempted', 'ftMade',
    ]
    totals = past.groupby('playerID').agg({col: 'sum' for col in summed_cols}).reset_index()

    # Same "Player_Score" formula used for first question.
    # Terms are accumulated in the original left-to-right order so floats match exactly.
    credit = (
        totals["points"]
        + 0.4 * totals["fgMade"]
        + 0.7 * totals["oRebounds"]
        + 0.3 * totals["dRebounds"]
        + totals["steals"]
        + 0.7 * totals["assists"]
        + 0.7 * totals["blocks"]
    )
    missed_free_throws = totals["ftAttempted"] - totals["ftMade"]
    totals['Player_Score'] = (
        credit
        - 0.7 * totals["fgAttempted"]
        - 0.4 * missed_free_throws
        - 0.4 * totals["PF"]
        - totals["turnovers"]
    )

    # One row per player: keep the last stint of the target season.
    in_season = roster_df['year'] == target_year
    season_roster = (
        roster_df[in_season][['playerID', 'tmID', 'stint', 'year']]
        .copy()
        .sort_values(['playerID', 'stint'])
        .drop_duplicates('playerID', keep='last')
    )

    scored = season_roster.merge(totals[['playerID', 'Player_Score']], on='playerID', how='left')
    scored['Player_Score'] = scored['Player_Score'].fillna(0.0)

    per_team = (
        scored.groupby('tmID')['Player_Score']
        .mean()
        .reset_index()
        .rename(columns={'Player_Score': 'Player_Score_Mean'})
    )
    per_team['year'] = target_year
    return per_team[['year', 'tmID', 'Player_Score_Mean']]

In [27]:
# COY base (one row per team-season with final coach).
# After the groupby, `coaches_all` holds only the highest-stint row of each (tmID, year).
coaches_all = pd.concat([coaches_10, coaches_11], ignore_index=True)
coaches_all = (
    coaches_all.sort_values(['tmID','year','stint'])
              .groupby(['tmID','year'])
              .last()
              .reset_index()
)

# COY labels (Season 1–10 only; Season 11 has no label).
# The awards table stores the coach's ID in its `playerID` column, hence the asymmetric join keys.
coy_labels = awards_10[awards_10["award"] == "Coach of the Year"][["playerID","year"]].drop_duplicates()
coy_labels["coy_label"] = 1

coy_feat = coaches_all.merge(
    coy_labels,
    left_on=["coachID","year"],
    right_on=["playerID","year"],
    how="left"
)
coy_feat["coy_label"] = coy_feat["coy_label"].fillna(0).astype(int)
# Re-bind instead of dropping in place so the cell is safe to re-run.
coy_feat = coy_feat.drop(columns=["playerID"], errors="ignore")

# Pre-season team features (from Season 1–10 history); each is already keyed to the *following* year.
prev_win   = team_prev_winpct(teams_10)
prev_play  = playoff_score_prevyear(teams_10)
trend_df   = win_pct_trend_feature(teams_10)

coy_feat = coy_feat.merge(prev_win,  on=['year','tmID'], how='left')
coy_feat = coy_feat.merge(prev_play, on=['year','tmID'], how='left')
coy_feat = coy_feat.merge(trend_df,  on=['year','tmID'], how='left')

# Team-seasons with no previous season get neutral defaults.
coy_feat['Prev_Team_Win_Pct']  = coy_feat['Prev_Team_Win_Pct'].fillna(0.5)
coy_feat['Prev_Playoff_Score'] = coy_feat['Prev_Playoff_Score'].fillna(0)
coy_feat['Win_Pct_Trend']      = coy_feat['Win_Pct_Trend'].fillna(0)

# Coach features: Is_New_Coach, Coach_Tenure, Coach_Score
coy_feat = coy_feat.sort_values(['tmID','year'])
coy_feat['Prev_CoachID'] = coy_feat.groupby('tmID')['coachID'].shift(1)
# A team's first recorded season has Prev_CoachID = NaN; `!=` evaluates to True there,
# so those rows are flagged as new coaches without any extra fill step.
coy_feat['Is_New_Coach'] = (coy_feat['coachID'] != coy_feat['Prev_CoachID']).astype(int)

# Number of seasons (including the current one) this coach has finished with this team so far.
coy_feat['Coach_Tenure'] = coy_feat.groupby(['tmID','coachID']).cumcount() + 1

# Seasons for which pre-season features are built (1 through 11).
coy_seasons = range(1, 12)

# NOTE(review): the history handed to coach_score_until_year is the collapsed `coaches_all`,
# so wins/losses from a coach's non-final stints are not counted — confirm this is intended.
coach_scores = pd.concat(
    [coach_score_until_year(season, coaches_all, teams_10) for season in coy_seasons],
    ignore_index=True
)

coy_feat = coy_feat.merge(coach_scores, on=['year','tmID','coachID'], how='left')
coy_feat['Coach_Score'] = coy_feat['Coach_Score'].fillna(0.0)

# Roster talent: Player_Score_Mean (uses stats up to year-1)
roster_all = pd.concat([
    pt_10[['playerID','year','stint','tmID']].copy(),
    pt_11_roster[['playerID','year','stint','tmID']].copy()
], ignore_index=True)

talent_rows = [build_roster_talent_mean(season, pt_10, roster_all) for season in coy_seasons]
talent_all = pd.concat(talent_rows, ignore_index=True)

coy_feat = coy_feat.merge(talent_all, on=['year','tmID'], how='left')
coy_feat['Player_Score_Mean'] = coy_feat['Player_Score_Mean'].fillna(0.0)

# Model inputs: everything here is known before the target season starts.
features_coy_pre = [
    "Prev_Team_Win_Pct",
    "Prev_Playoff_Score",
    "Win_Pct_Trend",
    "Coach_Score",
    "Is_New_Coach",
    "Coach_Tenure",
    "Player_Score_Mean"
]

In [28]:
# Back-test: fit on seasons before 10, then rank the season-10 coaches.
train_10 = coy_feat.loc[coy_feat["year"] < 10].copy()
test_10 = coy_feat.loc[coy_feat["year"] == 10].copy()

X_train_10, y_train_10 = train_10[features_coy_pre], train_10["coy_label"].astype(int)
X_test_10 = test_10[features_coy_pre]

# class_weight="balanced" re-weights the two classes; the seed keeps runs repeatable.
model_coy_y10 = RandomForestClassifier(
    n_estimators=600, max_depth=6, class_weight="balanced", random_state=42
)

# NOTE(review): folds are shuffled across seasons rather than split by time — confirm this is intended.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_auc_10 = cross_val_score(
    model_coy_y10, X_train_10, y_train_10, cv=cv, scoring="roc_auc"
)
print(f"COY (pre-season) Year10 setup — CV ROC-AUC: {cv_auc_10.mean():.3f} ± {cv_auc_10.std():.3f}")

# Fit on the full training window and attach the positive-class probability to each candidate.
model_coy_y10.fit(X_train_10, y_train_10)
test_10 = test_10.assign(pred_prob=model_coy_y10.predict_proba(X_test_10)[:, 1])
results_coy_10 = test_10.sort_values("pred_prob", ascending=False)

print("\n--- TOP Coach of the Year candidates (Year 10) ---")
results_coy_10[
    [
        "coachID", "tmID",
        "Prev_Team_Win_Pct", "Prev_Playoff_Score", "Win_Pct_Trend",
        "Coach_Score", "Is_New_Coach", "Coach_Tenure",
        "Player_Score_Mean", "pred_prob",
    ]
].head(10)

COY (pre-season) Year10 setup — CV ROC-AUC: 0.177 ± 0.104

--- TOP Coach of the Year candidates (Year 10) ---


Unnamed: 0,coachID,tmID,Prev_Team_Win_Pct,Prev_Playoff_Score,Win_Pct_Trend,Coach_Score,Is_New_Coach,Coach_Tenure,Player_Score_Mean,pred_prob
25,thibami99w,CON,0.617647,1.0,0.088235,36.13,0,7,487.864286,0.145712
152,plankju99w,WAS,0.294118,0.0,-0.176471,0.0,1,1,604.172727,0.114395
1,meadoma99w,ATL,0.117647,0.0,0.0,5.88,0,2,695.872727,0.107768
13,keyst99wc,CHI,0.352941,0.0,-0.058824,17.65,0,2,461.545455,0.097276
66,coopemi01w,LAS,0.588235,2.0,0.294118,39.09,0,7,1209.063636,0.092497
105,gaineco01w,PHO,0.470588,0.0,-0.205882,23.53,0,2,890.683333,0.088848
119,whisejo99w,SAC,0.529412,1.0,-0.029412,37.2,1,5,664.45,0.081869
137,aglerbr99w,SEA,0.647059,1.0,0.147059,25.0,0,2,943.353846,0.061866
80,gilloje01w,MIN,0.470588,0.0,0.176471,0.0,1,1,369.376923,0.056318
55,dunnli99wc,IND,0.5,1.0,-0.117647,19.23,0,2,1025.7,0.049675


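The actual Coach of the Year in year 10 was meadoma99w, whom the model ranked third by predicted probability. Note that the cross-validated ROC-AUC reported above (0.177) is well below the 0.5 chance level, so this ranking should be interpreted with caution.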

Predicting for year 11

In [29]:
# Forecast: fit on every labelled season (before 11), then rank the season-11 coaches.
train_11 = coy_feat.loc[coy_feat["year"] < 11].copy()
test_11 = coy_feat.loc[coy_feat["year"] == 11].copy()

X_train_11, y_train_11 = train_11[features_coy_pre], train_11["coy_label"].astype(int)
X_test_11 = test_11[features_coy_pre]

# Same hyper-parameters and seed as the year-10 back-test model.
model_coy_y11 = RandomForestClassifier(
    n_estimators=600, max_depth=6, class_weight="balanced", random_state=42
)

# Reuses the `cv` splitter created in the year-10 cell.
cv_auc_11 = cross_val_score(
    model_coy_y11, X_train_11, y_train_11, cv=cv, scoring="roc_auc"
)
print(f"\nCOY (pre-season) Year11 setup — CV ROC-AUC: {cv_auc_11.mean():.3f} ± {cv_auc_11.std():.3f}")

# Fit on the full training window and attach the positive-class probability to each candidate.
model_coy_y11.fit(X_train_11, y_train_11)
test_11 = test_11.assign(pred_prob=model_coy_y11.predict_proba(X_test_11)[:, 1])
results_coy_11 = test_11.sort_values("pred_prob", ascending=False)

print("\n--- TOP Coach of the Year candidates (Year 11) ---")
results_coy_11[
    [
        "coachID", "tmID",
        "Prev_Team_Win_Pct", "Prev_Playoff_Score", "Win_Pct_Trend",
        "Coach_Score", "Is_New_Coach", "Coach_Tenure",
        "Player_Score_Mean", "pred_prob",
    ]
].head(10)


COY (pre-season) Year11 setup — CV ROC-AUC: 0.293 ± 0.083

--- TOP Coach of the Year candidates (Year 11) ---


Unnamed: 0,coachID,tmID,Prev_Team_Win_Pct,Prev_Playoff_Score,Win_Pct_Trend,Coach_Score,Is_New_Coach,Coach_Tenure,Player_Score_Mean,pred_prob
26,thibami99w,CON,0.470588,0.0,-0.147059,35.04,0,8,428.61,0.344829
2,meadoma99w,ATL,0.529412,1.0,0.411765,16.18,0,3,508.858333,0.210481
127,hugheda99w,SAS,0.441176,1.0,-0.264706,24.17,0,6,890.03,0.130605
106,gaineco01w,PHO,0.676471,4.0,0.205882,33.68,0,3,945.483333,0.103071
92,whisejo99w,NYL,0.382353,0.0,-0.176471,35.58,1,1,447.981818,0.092986
139,richano99w,TUL,0.5,0.0,0.0,0.0,1,1,421.866667,0.084137
153,laceytr99w,WAS,0.470588,1.0,0.176471,25.0,1,1,422.438462,0.067983
56,dunnli99wc,IND,0.647059,3.0,0.147059,21.95,0,3,1109.536364,0.064068
138,aglerbr99w,SEA,0.588235,1.0,-0.058824,26.14,0,3,1106.961538,0.053036
81,reevech99w,MIN,0.411765,0.0,-0.058824,0.0,1,1,852.345455,0.037893
