In [165]:
import pandas as pd
import numpy as np

from create_test_and_training_set import create_test_set_data
from collections import defaultdict


def clean_players(players: pd.DataFrame):
    players = players.drop(['firstseason','lastseason'], axis='columns') # all players with first and last season 0
    players = players[players["bioID"].isin(players_teams["playerID"])]
    return players

def clean_players_teams(players_teams: pd.DataFrame):
    return players_teams.drop('lgID', axis='columns') # same lgID

def clean_awards_players(awards_players: pd.DataFrame):
    return awards_players.drop("lgID", axis="columns")

def clean_coaches(coaches: pd.DataFrame):
    return coaches.drop("lgID", axis="columns")

def clean_teams_post(teams_post : pd.DataFrame):
    return teams_post.drop("lgID", axis="columns")

def clean_series_post(series_post : pd.DataFrame):
    return series_post.drop(["lgIDLoser", "lgIDWinner"], axis="columns")

def clean_teams(teams : pd.DataFrame):
    return teams.drop(["lgID", "franchID", "divID", "arena", "attend", "min", "name"], axis="columns")

def parse_player_team_data(df):
    """
    Parses the player-team-season dataset into structured Python dictionaries
    for further analysis and championship prediction.
    """

    df.columns = [c.strip() for c in df.columns]
    df.fillna(0, inplace=True)

    players = defaultdict(list)
    teams_by_year = defaultdict(lambda: defaultdict(list))

    for _, row in df.iterrows():
        playerID = row["playerID"]
        teamID = row["tmID"]
        year = int(row["year"])

        # Regular season stats
        season_stats = {
            "GP": row["GP"],
            "GS": row["GS"],
            "minutes": row["minutes"],
            "points": row["points"],
            "oRebounds": row["oRebounds"],
            "dRebounds": row["dRebounds"],
            "rebounds": row["rebounds"],
            "assists": row["assists"],
            "steals": row["steals"],
            "blocks": row["blocks"],
            "turnovers": row["turnovers"],
            "PF": row["PF"],
            "fgAttempted": row["fgAttempted"],
            "fgMade": row["fgMade"],
            "ftAttempted": row["ftAttempted"],
            "ftMade": row["ftMade"],
            "threeAttempted": row["threeAttempted"],
            "threeMade": row["threeMade"],
            "dq": row["dq"],
        }

        # Postseason stats
        postseason_stats = {
            "PostGP": row["PostGP"],
            "PostGS": row["PostGS"],
            "PostMinutes": row["PostMinutes"],
            "PostPoints": row["PostPoints"],
            "PostoRebounds": row["PostoRebounds"],
            "PostdRebounds": row["PostdRebounds"],
            "PostRebounds": row["PostRebounds"],
            "PostAssists": row["PostAssists"],
            "PostSteals": row["PostSteals"],
            "PostBlocks": row["PostBlocks"],
            "PostTurnovers": row["PostTurnovers"],
            "PostPF": row["PostPF"],
            "PostfgAttempted": row["PostfgAttempted"],
            "PostfgMade": row["PostfgMade"],
            "PostftAttempted": row["PostftAttempted"],
            "PostftMade": row["PostftMade"],
            "PostthreeAttempted": row["PostthreeAttempted"],
            "PostthreeMade": row["PostthreeMade"],
            "PostDQ": row["PostDQ"],
        }

        # Combined player record
        record = {
            "year": year,
            "teamID": teamID,
            "stint": row["stint"],
            **season_stats,
            **postseason_stats
        }

        # Store in both structures
        players[playerID].append(record)
        teams_by_year[year][teamID].append(record)

    print(f"Parsed {len(players)} players across {len(teams_by_year)} seasons.")
    return {"players": players, "teams_by_year": teams_by_year}

awards_players = clean_awards_players(pd.read_csv("basketballPlayoffs/awards_players.csv"))
coaches = clean_coaches(pd.read_csv("basketballPlayoffs/coaches.csv"))
players_teams = clean_players_teams(pd.read_csv("basketballPlayoffs/players_teams.csv"))
players = clean_players(pd.read_csv("basketballPlayoffs/players.csv"))
series_post = clean_series_post(pd.read_csv("basketballPlayoffs/series_post.csv"))
teams_post = clean_teams_post(pd.read_csv("basketballPlayoffs/teams_post.csv"))
teams = clean_teams(pd.read_csv("basketballPlayoffs/teams.csv"))

# For year 11

coaches_11 = pd.read_csv("Season_11/coaches.csv")
players_teams_11 = pd.read_csv("Season_11/players_teams.csv")
teams_11 = pd.read_csv("Season_11/teams.csv")

In [166]:
teams["FG_Percentage"] = teams["o_fgm"] / teams["o_fga"]*100 # Field goal percentage
teams["FT_Percentage"] = teams["o_ftm"] / teams["o_fta"]*100 # Free-Throw percentage
teams["3P_Percentage"] = teams["o_3pm"] / teams["o_3pa"]*100 # 3 Point percentage
teams["O_OREBPG"] = teams["o_oreb"] / teams["GP"] # Ofensive rebounds per game
teams["O_DREBPG"] = teams["o_dreb"] / teams["GP"] # Defensive rebounds per game
teams["TOPG"] = teams["o_to"] / teams["GP"] # Turnovers per game
teams["PPG"] = teams["o_pts"] / teams["GP"] # Points per game
teams["STLPG"] = teams["o_stl"] / teams["GP"] # Steals per game
teams["BLKPG"] = teams["o_blk"] / teams["GP"] # Blocks per game
teams["PFPG"] = teams["o_pf"] / teams["GP"] # Personal fouls per game
teams["D_PPG"] = teams["d_pts"] / teams["GP"]
teams["APG"] = teams["o_asts"] / teams["GP"] # Assists per game
teams["Win%"] = teams["won"] / teams["GP"] *100

teams[["tmID", "FG_Percentage", "FT_Percentage", "3P_Percentage", "Win%"]].head()

Unnamed: 0,tmID,FG_Percentage,FT_Percentage,3P_Percentage,Win%
0,ATL,39.636847,74.758621,33.779264,11.764706
1,ATL,44.85173,75.364238,30.481283,52.941176
2,CHA,42.669469,74.696707,33.937824,25.0
3,CHA,41.910112,77.651515,35.747664,56.25
4,CHA,43.01676,73.906486,40.037951,56.25


Building Awards Weights

In [167]:
dict_award_weight = {
    "Most Valuable Player" : 10,
    "Coach of the Year" : 10,
    "WNBA All-Decade Team" : 6,
    "WNBA Finals Most Valuable Player" : 5,
    "WNBA All Decade Team Honorable Mention" : 4,
    "Defensive Player of the Year" : 4,
    "Most Improved Player" : 3,
    "Sixth Woman of the Year" : 2,
    "All-Star Game Most Valuable Player" : 2,
    "Rookie of the Year" : 1,
    "Kim Perrot Sportsmanship Award" : 0,
    "Kim Perrot Sportsmanship" : 0,
}

def compute_award_score(team_id, year, awards_players, players_teams,coaches, dict_award_weight):
  team_players = players_teams[
        (players_teams["tmID"] == team_id) &
        (players_teams["year"] == year)
    ]["playerID"].unique()

  player_awards = awards_players[
      (awards_players["playerID"].isin(team_players)) &
      (awards_players["year"] < year)
  ]

  #if team_id == "LAS":
  #  print(player_awards)

  player_award_score = player_awards["award"].map(dict_award_weight).sum()
  #if team_id == "LAS":
  #  print(player_award_score)

  coach_ = coaches[
      (coaches["tmID"] == team_id) &
      (coaches["year"] == year)
  ]

  #if team_id == "LAS":
   # print(coach_)

  if not coach_.empty:
        coach_id = coach_["coachID"].iloc[0]
        coach_awards = awards_players[
            (awards_players["playerID"] == coach_id) &
            (awards_players["award"] == "Coach of the Year") &
            (awards_players["year"] < year)
        ]
        coach_award_score = coach_awards["award"].map(dict_award_weight).sum()

  total_award_score = player_award_score + coach_award_score

  return total_award_score

Testing team's award score for a specific year(year 10)

In [168]:
award_scores_for_year_10 = []
year = 10
teams_in_year_10 = players_teams[players_teams["year"] == year]["tmID"].unique()
for team_id in teams_in_year_10:
    score = compute_award_score(team_id, year, awards_players, players_teams, coaches, dict_award_weight)
    award_scores_for_year_10.append({"year": year, "tmID": team_id, "Award_Score": score})

award_scores_10_df = pd.DataFrame(award_scores_for_year_10)

print(award_scores_10_df.sort_values("Award_Score", ascending=False))

    year tmID  Award_Score
7     10  LAS           93
2     10  SEA           36
3     10  DET           33
5     10  IND           31
12    10  SAS           25
6     10  CON           20
11    10  ATL           13
9     10  PHO           11
1     10  MIN            6
8     10  SAC            4
4     10  NYL            3
10    10  CHI            1
0     10  WAS            0


Player's score

In [169]:
def getPlayersStats(players_group):
  career_players = players_group.groupby("playerID").agg({
    "GP": "sum",
    "points": "sum",
    "assists": "sum",
    "oRebounds": "sum",
    "dRebounds": "sum",
    "rebounds": "sum",
    "steals": "sum",
    "blocks": "sum",
    "turnovers": "sum",
    "PF": "sum",
    "fgAttempted": "sum",
    "fgMade": "sum",
    "ftAttempted": "sum",
    "ftMade": "sum",
    "threeAttempted": "sum",
    "threeMade": "sum"
    }).reset_index()

  career_players["PPG"] = career_players["points"] / career_players["GP"]
  career_players["APG"] = career_players["assists"] / career_players["GP"]
  career_players["RPG"] = career_players["rebounds"] / career_players["GP"]
  career_players["SPG"] = career_players["steals"] / career_players["GP"]
  career_players["BPG"] = career_players["blocks"] / career_players["GP"]
  career_players["TOPG"] = career_players["turnovers"] / career_players["GP"]
  career_players["PFPG"] = career_players["PF"] / career_players["GP"]
  career_players["FG_Percentage"] = career_players["fgMade"] / career_players["fgAttempted"] * 100
  career_players["FT_Percentage"] = career_players["ftMade"] / career_players["ftAttempted"] * 100
  career_players["3P_Percentage"] = career_players["threeMade"] / career_players["threeAttempted"] * 100

  career_players = career_players.round(2)

  return career_players

def last_year_players_stats(year, players_teams):
  last_year = year - 1

  if last_year < 1:
    teams_in_year = players_teams[players_teams['year'] == year]['tmID'].unique()
    return pd.DataFrame({
        'year': year,
        'tmID': teams_in_year,
        'Player_Score': 0.0
    })

  players_in_year = players_teams[players_teams["year"] == last_year]
  players_stats = getPlayersStats(players_in_year)

  players_stats["Player_Score"] = (
  players_stats["points"] + 0.4 * players_stats["fgMade"] + 0.7 * players_stats["oRebounds"] +
  0.3 * players_stats["dRebounds"] + players_stats["steals"] + 0.7 * players_stats["assists"] +
  0.7 * players_stats["blocks"] - 0.7 * players_stats["fgAttempted"] - 0.4 * (players_stats["ftAttempted"] - players_stats["ftMade"]) - 0.4 * players_stats["PF"] - players_stats["turnovers"]
  )

  rosters_year = players_teams[players_teams["year"] == year][['playerID', 'tmID']]

  players_scores_year = pd.merge(
    rosters_year,
    players_stats,
    on='playerID',
    how='inner'
  )

  team_names_year = teams[teams["year"] == year][['tmID']]

  final_rosters_with_scores = pd.merge(
      players_scores_year,
      team_names_year,
      on='tmID'
  )

  final_rosters_with_scores['year'] = year

  return final_rosters_with_scores

Coach Score

In [170]:
def compute_coach_scores_for_year(target_year, all_coaches_df, all_teams_df):

    # 1. Definir o período de história (tudo ANTES do ano alvo)
    history_end_year = target_year - 1

    # Se for o ano 1, não há história, o score de todos é 0
    if history_end_year < 1:
        teams_in_year = all_coaches_df[all_coaches_df['year'] == target_year]['tmID'].unique()
        return pd.DataFrame({
            'year': target_year,
            'tmID': teams_in_year,
            'Coach_Score': 0
        })

    # --- Calcular Stats Históricos (Baseado nos anos <= history_end_year) ---

    # 2. Calcular Win% Histórico
    coaches_hist = all_coaches_df[all_coaches_df['year'] <= history_end_year]
    coaches_stats = coaches_hist.groupby('coachID')[['won', 'lost']].sum().reset_index()

    coaches_stats['Win Percentage'] = (
        coaches_stats['won'] / (coaches_stats['won'] + coaches_stats['lost'])
    ).fillna(0) # fillna(0) para treinadores com 0-0 (evita NaN)

    # 3. Encontrar Campeões Históricos
    winner_teams_hist = all_teams_df[
        (all_teams_df['finals'] == 'W') &
        (all_teams_df['year'] <= history_end_year)
    ][['year', 'tmID']]

    winner_coaches_hist = winner_teams_hist.merge(
        all_coaches_df[['year', 'tmID', 'coachID']],
        on=['year', 'tmID'],
        how='left'
    )
    historical_champions_set = set(winner_coaches_hist['coachID'].dropna())

    # 4. Calcular o Score Histórico de cada Treinador
    coaches_stats['Champion'] = coaches_stats['coachID'].apply(
        lambda cid: 'Yes' if cid in historical_champions_set else 'No'
    )

    coaches_stats['Coach_Score'] = (
        coaches_stats['Win Percentage'] * 50 +
        coaches_stats['Champion'].apply(lambda x: 5 if x == 'Yes' else 0)
    ).round(2)

    # --- Mapear Scores para as Equipas do 'target_year' ---

    # 5. Obter os treinadores do 'target_year'
    teams_and_coaches_target_year = all_coaches_df[
        all_coaches_df['year'] == target_year
    ][['tmID', 'coachID']]

    # 6. Juntar o score histórico
    teams_with_scores = teams_and_coaches_target_year.merge(
        coaches_stats[['coachID', 'Coach_Score']],
        on='coachID',
        how='left'
    )

    # Treinadores novos (sem história) recebem 0
    teams_with_scores['Coach_Score'] = teams_with_scores['Coach_Score'].fillna(0)

    # 7. Agregar por equipa (caso tenha tido >1 treinador, como DET no seu exemplo)
    # Usamos a média para resolver duplicados
    final_team_scores = teams_with_scores.groupby('tmID')['Coach_Score'].mean().reset_index()
    final_team_scores['year'] = target_year

    return final_team_scores[['year', 'tmID', 'Coach_Score']]

final = compute_coach_scores_for_year(10, coaches, teams)
print(final)

    year tmID  Coach_Score
0     10  ATL        5.880
1     10  CHI       17.650
2     10  CON       36.130
3     10  DET       17.545
4     10  IND       19.230
5     10  LAS       39.170
6     10  MIN        0.000
7     10  NYL       27.645
8     10  PHO       23.530
9     10  SAC       32.205
10    10  SAS       24.440
11    10  SEA       23.500
12    10  WAS        0.000


In [171]:
train_award_scores_list = []

for year in range(2, 10):
  teams_in_year = players_teams[players_teams["year"] == year]["tmID"].unique()

  for team_id in teams_in_year:
        score = compute_award_score(
            team_id,
            year,
            awards_players,
            players_teams,
            coaches,
            dict_award_weight
        )

        train_award_scores_list.append({
            "year": year,
            "tmID": team_id,
            "Award_Score": score
        })
        #print(train_award_scores_list)

  X_train_awards = pd.DataFrame(train_award_scores_list)

train_coach_scores_list = []
for year in range(2, 10):
  scores_for_this_year = compute_coach_scores_for_year(
      target_year=year,
      all_coaches_df=coaches,
      all_teams_df=teams
  )
  train_coach_scores_list.append(scores_for_this_year)
  #print(scores_for_this_year[scores_for_this_year['tmID'] == 'WAS'])

X_train_coaches = pd.concat(train_coach_scores_list, ignore_index=True)
train_final = pd.merge(
    X_train_awards,
    X_train_coaches,
    on=['year', 'tmID'],
    how='left'
)

train_final['Coach_Score'] = train_final['Coach_Score'].fillna(0)

#print(train_final[train_final['Coach_Score'] == 'WAS'])

X_train_awards = pd.DataFrame(train_award_scores_list)
print("--- Tabela de Scores (Anos 1-9) ---")
#print(train_final)

train_players_scores_list = []

for year in range(2,10):
  players_year = last_year_players_stats(year, players_teams)
  #teams_rosters_scores = players_year.groupby(["year", "tmID"])["Player_Score"].mean().reset_index()
  teams_rosters_scores = players_year.groupby(["year", "tmID"])["Player_Score"].agg(
      Player_Score_Mean='mean',
      Player_Score_Max='max',
      Player_Score_Std='std'
  ).reset_index()
  train_players_scores_list.append(teams_rosters_scores)

X_train_players = pd.concat(train_players_scores_list, ignore_index=True)
X_train_players = X_train_players.fillna(0)
#print(X_train_players)

#print(train_final)

X_train_scores = pd.merge(
    train_final,
    X_train_players,
    on=['year', 'tmID'],
    how='left'
)
print(X_train_scores)
teste = X_train_scores[X_train_scores['tmID'] == 'WAS']
#print(teste)

--- Tabela de Scores (Anos 1-9) ---
     year tmID  Award_Score  Coach_Score  Player_Score_Mean  Player_Score_Max  \
0       2  MIN            1        23.44         136.685714             427.2   
1       2  WAS            0         0.00         172.437500             375.9   
2       2  ORL            0        25.00         156.512500             372.5   
3       2  NYL            3        31.25         184.155556             295.0   
4       2  CHA            0        14.06         147.111111             406.3   
..    ...  ...          ...          ...                ...               ...   
108     9  SEA           97        19.88         166.970000             617.0   
109     9  LAS           65        40.05         123.280000             220.1   
110     9  DET           26        34.69         162.838462             398.4   
111     9  CHI            1         0.00         175.855556             367.8   
112     9  PHO           13         0.00         181.544444             4

In [172]:
teams_data = teams[['year', 'tmID', 'confID', 'won', 'lost', 'rank']].copy()
teams_data['total_games'] = teams_data['won'] + teams_data['lost']
teams_data['Win_Percentage'] = teams_data['won'] / teams_data['total_games']

conf_map = {'EA': 0, 'WE': 1}
teams_data['Conf_Code'] = teams_data['confID'].map(conf_map)

dataset_final = pd.merge(
    X_train_scores,
    teams_data[['year', 'tmID', 'Conf_Code', 'Win_Percentage', 'rank']],
    on=['year', 'tmID'],
    how='inner'
)

#print(dataset_final[dataset_final['year'] == 7])


teams_history = teams[['year', 'tmID', 'won', 'lost']].copy()
teams_history['total_games'] = teams_history['won'] + teams_history['lost']
teams_history['Win_Pct'] = teams_history['won'] / teams_history['total_games']


prev_performance = teams_history[['year', 'tmID', 'Win_Pct']].copy()
prev_performance['year'] = prev_performance['year'] + 1
prev_performance = prev_performance.rename(columns={'Win_Pct': 'Prev_Team_Win_Pct'})


dataset_final = pd.merge(
    dataset_final,
    prev_performance,
    on=['year', 'tmID'],
    how='left'
)
dataset_final['Prev_Team_Win_Pct'] = dataset_final['Prev_Team_Win_Pct'].fillna(0.5) # Preencher buracos com média

teams_diff = teams[['year', 'tmID', 'o_pts', 'd_pts']].copy()
teams_diff['Point_Diff'] = teams_diff['o_pts'] - teams_diff['d_pts']
prev_diff = teams_diff[['year', 'tmID', 'Point_Diff']].copy()
prev_diff['year'] = prev_diff['year'] + 1
prev_diff = prev_diff.rename(columns={'Point_Diff': 'Prev_Point_Diff'})

dataset_final = pd.merge(
    dataset_final,
    prev_diff,
    on=['year', 'tmID'],
    how='left'
)
dataset_final['Prev_Point_Diff'] = dataset_final['Prev_Point_Diff'].fillna(0)

print(dataset_final[dataset_final['year'] == 7])

    year tmID  Award_Score  Coach_Score  Player_Score_Mean  Player_Score_Max  \
72     7  MIN            0       12.255         124.914286             274.0   
73     7  CON            0       35.390         175.790000             391.2   
74     7  NYL            0       27.880          80.922222             358.6   
75     7  SAS           10       22.260         114.914286             279.2   
76     7  IND            5       26.470         183.175000             439.1   
77     7  DET           16       32.020         139.870000             344.5   
78     7  WAS            5       27.220         156.900000             270.1   
79     7  SEA           19       31.010         175.070000             508.5   
80     7  CHA            0       15.000         124.137500             239.4   
81     7  CHI            0        0.000          73.081818             151.2   
82     7  HOU           52       35.810         155.810000             485.6   
83     7  SAC           15       37.740 

In [173]:
from sklearn.preprocessing import StandardScaler

features = ['Award_Score', 'Coach_Score', 'Conf_Code', 'Prev_Team_Win_Pct', 'Prev_Point_Diff', 'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std']

scaler = StandardScaler()

award_list_10 = []
teams_in_10 = players_teams[players_teams["year"] == 10]["tmID"].unique()

for team_id in teams_in_10:
    score = compute_award_score(team_id, 10, awards_players, players_teams, coaches, dict_award_weight)
    award_list_10.append({"year": 10, "tmID": team_id, "Award_Score": score})
X_10_awards = pd.DataFrame(award_list_10)

X_10_coaches = compute_coach_scores_for_year(10, coaches, teams)

players_year_10 = last_year_players_stats(10, players_teams)
#X_10_players = players_year_10.groupby(["year", "tmID"])["Player_Score"].mean().reset_index()
X_10_players = players_year_10.groupby(["year", "tmID"])["Player_Score"].agg(
    Player_Score_Mean='mean',
    Player_Score_Max='max',
    Player_Score_Std='std'
).reset_index()
X_10_players = X_10_players.fillna(0)

X_test_10 = pd.merge(X_10_awards, X_10_coaches, on=['year', 'tmID'], how='left')
X_test_10 = pd.merge(X_test_10, X_10_players, on=['year', 'tmID'], how='left')

teams_10_info = teams[teams['year'] == 10][['tmID', 'confID']]
X_test_10 = pd.merge(
    X_test_10,
    teams_10_info,
    on='tmID',
    how='left'
)

conf_map = {'EA': 0, 'WE': 1}
X_test_10['Conf_Code'] = X_test_10['confID'].map(conf_map)

X_test_10 = pd.merge(
    X_test_10,
    prev_performance,
    on=['year', 'tmID'],
    how='left'
)

X_test_10['Prev_Team_Win_Pct'] = X_test_10['Prev_Team_Win_Pct'].fillna(0.5)

X_test_10 = pd.merge(
    X_test_10,
    prev_diff,
    on=['year', 'tmID'],
    how='left'
)
X_test_10['Prev_Point_Diff'] = X_test_10['Prev_Point_Diff'].fillna(0)

print(X_test_10)

#X_test_scaled = scaler.transform(X_test_10[features])

#print(X_test_scaled)

    year tmID  Award_Score  Coach_Score  Player_Score_Mean  Player_Score_Max  \
0     10  WAS            0        0.000         133.845455             296.2   
1     10  MIN            6        0.000         202.611111             407.1   
2     10  SEA           36       23.500         160.680000             334.2   
3     10  DET           33       17.545         180.164286             399.0   
4     10  NYL            3       27.645         168.150000             344.4   
5     10  IND           31       19.230         147.583333             321.6   
6     10  CON           20       36.130         192.930000             399.3   
7     10  LAS           93       39.170         255.011111             544.7   
8     10  SAC            4       32.205         159.663636             261.3   
9     10  PHO           11       23.530         188.050000             597.0   
10    10  CHI            1       17.650         168.300000             420.9   
11    10  ATL           13        5.880 

Training and testing the model for predicting the ranking of the regular season for each conference

In [174]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
RANDOM_SEED = 42

features = ['Award_Score', 'Coach_Score', 'Conf_Code', 'Prev_Team_Win_Pct', 'Prev_Point_Diff', 'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std']
X = dataset_final[features]
y = dataset_final['Win_Percentage']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model_rf = RandomForestRegressor(n_estimators=500, random_state=RANDOM_SEED)
model_rf.fit(X_scaled, y)

feature_importances = pd.DataFrame({
    'Feature': features,
    'Importância': model_rf.feature_importances_
})

print("--- Importância das Features (Random Forest) ---")
print(feature_importances.sort_values(by='Importância', ascending=False))

X_test_scaled = scaler.transform(X_test_10[features])

predictions_10_rf = model_rf.predict(X_test_scaled)
results_10_rf = X_test_10[['tmID', 'Conf_Code']].copy()
results_10_rf['Predicted_Win%'] = predictions_10_rf
results_10_rf = results_10_rf.sort_values(by='Predicted_Win%', ascending=False)

print("\n--- Results ---")

print(results_10_rf[results_10_rf['Conf_Code'] == 1])
print(results_10_rf[results_10_rf['Conf_Code'] == 0])

--- Importância das Features (Random Forest) ---
             Feature  Importância
5  Player_Score_Mean     0.313580
1        Coach_Score     0.205506
6   Player_Score_Max     0.122898
4    Prev_Point_Diff     0.114710
7   Player_Score_Std     0.087247
3  Prev_Team_Win_Pct     0.075210
0        Award_Score     0.072608
2          Conf_Code     0.008241

--- Results ---
   tmID  Conf_Code  Predicted_Win%
12  SAS          1        0.666926
7   LAS          1        0.636353
9   PHO          1        0.586276
2   SEA          1        0.542004
8   SAC          1        0.519879
1   MIN          1        0.511610
   tmID  Conf_Code  Predicted_Win%
3   DET          0        0.612338
6   CON          0        0.599614
4   NYL          0        0.580445
5   IND          0        0.546603
10  CHI          0        0.510176
0   WAS          0        0.424029
11  ATL          0        0.364941


Predicting for year 11

In [175]:
train_frames = []

for year in range(2, 11):

    players_year = last_year_players_stats(year, players_teams)
    

    p_scores = players_year.groupby(["year", "tmID"])["Player_Score"].agg(
        Player_Score_Mean='mean',
        Player_Score_Max='max',
        Player_Score_Std='std'
    ).reset_index()
    p_scores = p_scores.fillna(0)


    c_scores = compute_coach_scores_for_year(year, coaches, teams)
    

    a_scores_list = []
    teams_in_year = players_teams[players_teams["year"] == year]["tmID"].unique()
    for team_id in teams_in_year:
        score = compute_award_score(team_id, year, awards_players, players_teams, coaches, dict_award_weight)
        a_scores_list.append({"year": year, "tmID": team_id, "Award_Score": score})
    a_scores = pd.DataFrame(a_scores_list)


    df_year = pd.merge(a_scores, c_scores, on=['year', 'tmID'], how='left')
    df_year = pd.merge(df_year, p_scores, on=['year', 'tmID'], how='left')


    prev_year = year - 1
    teams_prev = teams[teams['year'] == prev_year][['tmID', 'won', 'lost', 'o_pts', 'd_pts']].copy()
    

    teams_prev['Prev_Team_Win_Pct'] = teams_prev['won'] / (teams_prev['won'] + teams_prev['lost'])
    teams_prev['Prev_Point_Diff'] = teams_prev['o_pts'] - teams_prev['d_pts']
    

    df_year = pd.merge(df_year, teams_prev[['tmID', 'Prev_Team_Win_Pct', 'Prev_Point_Diff']], on='tmID', how='left')
    

    df_year['Prev_Team_Win_Pct'] = df_year['Prev_Team_Win_Pct'].fillna(0.5)
    df_year['Prev_Point_Diff'] = df_year['Prev_Point_Diff'].fillna(0)


    teams_target = teams[teams['year'] == year][['tmID', 'won', 'lost', 'confID']].copy()
    teams_target['Win_Percentage'] = teams_target['won'] / (teams_target['won'] + teams_target['lost'])
    

    conf_map = {'EA': 0, 'WE': 1}
    teams_target['Conf_Code'] = teams_target['confID'].map(conf_map)

    final_df_year = pd.merge(df_year, teams_target[['tmID', 'Win_Percentage', 'Conf_Code']], on='tmID', how='inner')
    
    train_frames.append(final_df_year)


dataset_train_11 = pd.concat(train_frames, ignore_index=True)

features = [
    'Award_Score', 'Coach_Score', 'Conf_Code', 
    'Prev_Team_Win_Pct', 'Prev_Point_Diff',
    'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std'
]

X_train = dataset_train_11[features]
y_train = dataset_train_11['Win_Percentage']


scaler_11 = StandardScaler()
X_train_scaled = scaler_11.fit_transform(X_train)


model_rf_11 = RandomForestRegressor(n_estimators=500, random_state=42)
model_rf_11.fit(X_train_scaled, y_train)


import_df = pd.DataFrame({'Feature': features, 'Importance': model_rf_11.feature_importances_})
print(import_df.sort_values(by='Importance', ascending=False))

             Feature  Importance
5  Player_Score_Mean    0.291631
1        Coach_Score    0.200043
6   Player_Score_Max    0.125279
4    Prev_Point_Diff    0.109552
7   Player_Score_Std    0.094837
3  Prev_Team_Win_Pct    0.086400
0        Award_Score    0.081967
2          Conf_Code    0.010292


In [176]:
teams_in_11 = teams_11['tmID'].unique()

stats_year_10 = last_year_players_stats(10, players_teams)
player_scores_10 = stats_year_10[['playerID', 'Player_Score']].copy()

roster_11 = players_teams_11[['playerID', 'tmID', 'year']].copy()

roster_11_scored = pd.merge(roster_11, player_scores_10, on='playerID', how='left')
roster_11_scored['Player_Score'] = roster_11_scored['Player_Score'].fillna(0)

roster_11_scored.head(10)

p_scores_11 = roster_11_scored.groupby(["year", "tmID"])["Player_Score"].agg(
    Player_Score_Mean='mean',
    Player_Score_Max='max',
    Player_Score_Std='std'
).reset_index()

p_scores_11 = p_scores_11.fillna(0)

p_scores_11.head(10)

all_coaches = pd.concat([coaches, coaches_11], ignore_index=True, sort=False)
all_teams = pd.concat([teams, teams_11], ignore_index=True, sort=False)

all_coaches[all_coaches['year'] == 11]

c_scores_11 = compute_coach_scores_for_year(11, all_coaches, all_teams)

c_scores_11.head(10)

def compute_award_score_11(team_id, roster_df, coaches_df, awards_df, dict_weight):
    score = 0

    team_players = roster_df[roster_df['tmID'] == team_id]['playerID'].unique()

    p_awards = awards_df[awards_df['playerID'].isin(team_players)]
    score += p_awards['award'].map(dict_weight).fillna(0).sum()
    

    team_coach = coaches_df[coaches_df['tmID'] == team_id]
    if not team_coach.empty:
        c_id = team_coach.iloc[0]['coachID']
        c_awards = awards_df[(awards_df['playerID'] == c_id) & (awards_df['award'] == 'Coach of the Year')]
        score += c_awards['award'].map(dict_weight).fillna(0).sum()
        
    return score

a_scores_list_11 = []
for team_id in teams_in_11:
    score = compute_award_score_11(team_id, players_teams_11, coaches_11, awards_players, dict_award_weight)
    a_scores_list_11.append({"year": 11, "tmID": team_id, "Award_Score": score})

a_scores_11 = pd.DataFrame(a_scores_list_11)

#print(a_scores_11.head(10))

X_input_11 = pd.merge(a_scores_11, c_scores_11[['year', 'tmID', 'Coach_Score']], on=['year', 'tmID'], how='left')
X_input_11 = pd.merge(X_input_11, p_scores_11, on=['year', 'tmID'], how='left')

teams_prev_10 = teams[teams['year'] == 10][['tmID', 'won', 'lost', 'o_pts', 'd_pts']].copy()
teams_prev_10['Prev_Team_Win_Pct'] = teams_prev_10['won'] / (teams_prev_10['won'] + teams_prev_10['lost'])
teams_prev_10['Prev_Point_Diff'] = teams_prev_10['o_pts'] - teams_prev_10['d_pts']

X_input_11 = pd.merge(X_input_11, teams_prev_10, on='tmID', how='left')

conf_map = {'EA': 0, 'WE': 1}
teams_11_info = teams_11[['tmID', 'confID']].copy()
teams_11_info['Conf_Code'] = teams_11_info['confID'].map(conf_map)

X_input_11 = pd.merge(X_input_11, teams_11_info, on='tmID', how='left')

X_input_11 = X_input_11.fillna(0)

X_input_11.head(10)

Unnamed: 0,year,tmID,Award_Score,Coach_Score,Player_Score_Mean,Player_Score_Max,Player_Score_Std,won,lost,o_pts,d_pts,Prev_Team_Win_Pct,Prev_Point_Diff,confID,Conf_Code
0,11,ATL,15,16.18,85.692308,233.1,80.572974,18.0,16.0,2861.0,2797.0,0.529412,64.0,EA,0
1,11,CHI,3,0.0,85.653846,265.8,94.850748,16.0,18.0,2573.0,2693.0,0.470588,-120.0,EA,0
2,11,CON,20,35.04,69.158333,392.3,130.841052,16.0,18.0,2651.0,2654.0,0.470588,-3.0,EA,0
3,11,IND,21,21.95,113.627273,321.6,124.875891,22.0,12.0,2606.0,2501.0,0.647059,105.0,EA,0
4,11,LAS,26,20.59,173.290909,544.7,179.232059,18.0,16.0,2533.0,2498.0,0.529412,35.0,WE,1
5,11,MIN,3,0.0,191.654545,407.1,168.119293,14.0,20.0,2731.0,2827.0,0.411765,-96.0,WE,1
6,11,NYL,17,35.58,104.435714,419.6,132.527377,13.0,21.0,2512.0,2535.0,0.382353,-23.0,EA,0
7,11,PHO,23,33.68,131.338462,597.0,179.83223,23.0,11.0,3156.0,3031.0,0.676471,125.0,WE,1
8,11,SAS,25,24.17,157.96,440.7,189.14022,15.0,19.0,2615.0,2661.0,0.441176,-46.0,WE,1
9,11,SEA,49,24.83,140.546154,334.2,141.090158,20.0,14.0,2544.0,2476.0,0.588235,68.0,WE,1


In [177]:
features = [
    'Award_Score', 'Coach_Score', 'Conf_Code', 
    'Prev_Team_Win_Pct', 'Prev_Point_Diff',
    'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std'
]

X_test_scaled_11 = scaler_11.transform(X_input_11[features])

predictions_11 = model_rf_11.predict(X_test_scaled_11)
results_11 = X_input_11[['tmID', 'Conf_Code']].copy()
results_11['Predicted_Win_Pct'] = predictions_11
results_11 = results_11.sort_values(by='Predicted_Win_Pct', ascending=False)

print(results_11[results_11['Conf_Code'] == 0].head(10))
print(results_11[results_11['Conf_Code'] == 1].head(10))

   tmID  Conf_Code  Predicted_Win_Pct
3   IND          0           0.493103
6   NYL          0           0.477651
2   CON          0           0.399684
11  WAS          0           0.372820
0   ATL          0           0.369107
1   CHI          0           0.241063
   tmID  Conf_Code  Predicted_Win_Pct
4   LAS          1           0.594500
9   SEA          1           0.555401
8   SAS          1           0.554849
7   PHO          1           0.502882
5   MIN          1           0.422743
10  TUL          1           0.323110


Teams that will change coach

Individual awards

In [178]:
teams["team_success"] = teams["won"] / (teams["won"] + teams["lost"])

player_features = players_teams.merge(
    teams[["tmID", "year", "team_success"]],
    on=["tmID", "year"],
    how="left"
)

rookie_year = (
    player_features.groupby("playerID")["year"]
    .min()
    .reset_index()
    .rename(columns={"year": "rookie_year"})
)

player_features = player_features.merge(rookie_year, on="playerID", how="left")

player_features.head(10)

Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,team_success,rookie_year
0,abrossv01w,2,0,MIN,26,23,846,343,43,131,...,0,0,0,0,0,0,0,0,0.375,2
1,abrossv01w,3,0,MIN,27,27,805,314,45,101,...,0,0,0,0,0,0,0,0,0.3125,2
2,abrossv01w,4,0,MIN,30,25,792,318,44,97,...,8,22,6,8,8,7,3,0,0.529412,2
3,abrossv01w,5,0,MIN,22,11,462,146,17,57,...,7,23,8,4,2,8,2,0,0.529412,2
4,abrossv01w,6,0,MIN,31,31,777,304,29,78,...,0,0,0,0,0,0,0,0,0.411765,2
5,abrossv01w,7,0,MIN,34,2,724,263,44,62,...,0,0,0,0,0,0,0,0,0.294118,2
6,abrossv01w,8,0,MIN,34,29,843,345,53,97,...,0,0,0,0,0,0,0,0,0.294118,2
7,abrossv01w,9,0,CON,6,0,107,34,3,17,...,8,24,11,4,2,5,0,0,0.617647,2
8,adamsjo01w,4,0,MIN,10,0,96,33,10,13,...,0,0,0,0,0,0,0,0,0.529412,4
9,aguilel01w,3,0,UTA,28,0,141,43,0,11,...,1,0,0,0,0,0,0,0,0.625,3


Most Valuable Player

For this award, we considered each player’s career statistics (Points, Assists, and Rebounds per Game) as well as a Career_Efficiency metric based on the formula used to calculate individual player performance in the first problem

In [179]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# ==========================
# Most Valuable Player
# ==========================

def prepare_mvp_career_data(target_year, players_teams, teams_df, awards_df):

    history_end_year = target_year - 1

    if history_end_year < 1:
        return None

    players_history = players_teams[players_teams["year"].between(1, history_end_year)].copy()

    career_stats = players_history.groupby("playerID").agg({
        "GP": "sum",
        "points": "sum",
        "oRebounds": "sum",
        "dRebounds": "sum",
        "assists": "sum",
        "steals": "sum",
        "blocks": "sum",
        "turnovers": "sum",
        "fgMade": "sum",
        "fgAttempted": "sum",
        "ftMade": "sum",
        "ftAttempted": "sum",
        "PF": "sum"
    }).reset_index()

    career_stats["Raw_Score"] = (
        career_stats["points"] + 0.4 * career_stats["fgMade"] + 0.7 * career_stats["oRebounds"] +
        0.3 * career_stats["dRebounds"] + career_stats["steals"] + 0.7 * career_stats["assists"] +
        0.7 * career_stats["blocks"] - 0.7 * career_stats["fgAttempted"] -
        0.4 * (career_stats["ftAttempted"] - career_stats["ftMade"]) -
        0.4 * career_stats["PF"] - career_stats["turnovers"]
    )

    career_stats['Career_Efficiency'] = career_stats['Raw_Score'] / career_stats['GP']
    career_stats['Career_PPG'] = career_stats['points'] / career_stats['GP']
    career_stats['Career_RPG'] = (career_stats['oRebounds'] + career_stats['dRebounds']) / career_stats['GP']
    career_stats['Career_APG'] = career_stats['assists'] / career_stats['GP']

    roster_target_year = players_teams[players_teams["year"] == target_year][['playerID', 'tmID']].drop_duplicates()

    candidates = pd.merge(roster_target_year, career_stats, on='playerID', how='inner')

    prev_year = target_year - 1
    teams_prev = teams_df[teams_df['year'] == prev_year][['tmID', 'won', 'lost']].copy()
    teams_prev['Prev_Team_Win_Pct'] = teams_prev['won'] / (teams_prev['won'] + teams_prev['lost'])

    candidates = pd.merge(candidates, teams_prev[['tmID', 'Prev_Team_Win_Pct']], on='tmID', how='left')
    candidates['Prev_Team_Win_Pct'] = candidates['Prev_Team_Win_Pct'].fillna(0.5)

    awards_target = awards_df[
        (awards_df['year'] == target_year) &
        (awards_df['award'] == 'Most Valuable Player')
    ][['playerID']]
    awards_target['Won_MVP'] = 1

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_MVP'] = final_df['Won_MVP'].fillna(0)

    return final_df

years_train = [2, 3, 4, 5, 6, 7, 8, 9]
train_list = []

for y in years_train:
    df = prepare_mvp_career_data(y, players_teams, teams, awards_players)
    train_list.append(df)

train_mvp = pd.concat(train_list, ignore_index=True)

features_mvp = ['Career_Efficiency', 'Career_PPG', 'Career_RPG', 'Career_APG']


In [180]:
model_mvp = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model_mvp.fit(train_mvp[features_mvp], train_mvp['Won_MVP'])

importance = pd.DataFrame({'Feature': features_mvp, 'Value': model_mvp.feature_importances_})
print(importance.sort_values(by='Value', ascending=False))

candidates_10_mvp = prepare_mvp_career_data(10, players_teams, teams, awards_players)

probs = model_mvp.predict_proba(candidates_10_mvp[features_mvp])[:, 1]
candidates_10_mvp['MVP_Prob'] = probs

ranking_mvp = candidates_10_mvp[['playerID', 'tmID', 'Career_PPG', 'Career_APG', 'Career_RPG', 'Career_Efficiency', 'MVP_Prob']].sort_values(by='MVP_Prob', ascending=False)

ranking_mvp.head(10)

             Feature     Value
0  Career_Efficiency  0.518582
1         Career_PPG  0.361762
2         Career_RPG  0.097893
3         Career_APG  0.021763


Unnamed: 0,playerID,tmID,Career_PPG,Career_APG,Career_RPG,Career_Efficiency,MVP_Prob
71,leslili01w,LAS,17.519841,2.480159,9.353175,13.513889,0.42
20,catchta01w,IND,16.674528,3.726415,7.768868,14.523113,0.09
117,tauradi01w,PHO,20.347305,4.065868,4.287425,14.017964,0.05
137,youngso01w,SAS,15.37,1.76,6.36,11.627,0.04
3,augusse01w,MIN,21.252525,2.141414,3.878788,13.870707,0.03
91,parkeca01w,LAS,18.484848,3.424242,9.484848,16.506061,0.03
55,jacksla01w,SEA,19.417722,1.556962,7.962025,15.420675,0.02
122,thompti01w,LAS,17.636735,2.0,6.64898,11.189388,0.01
4,bakersh01w,DET,6.864865,1.716216,2.0,3.935135,0.0
0,ajavoma01w,WAS,8.029412,1.735294,1.764706,3.614706,0.0


The winner in year 10 was tauradi01w, who was predicted to finish third according to the model. However, the probability of tauradi01w winning the award is low compared to the top candidate, leslili01w

Most Valuable Player Prediction for year 11

In [181]:
all_players_teams = pd.concat([players_teams, players_teams_11], ignore_index=True)
all_teams = pd.concat([teams, teams_11], ignore_index=True)

years_train = [2, 3, 4, 5, 6, 7, 8, 9, 10]
train_list = []

for y in years_train:
    df = prepare_mvp_career_data(y, players_teams, teams, awards_players)
    train_list.append(df)

train_mvp = pd.concat(train_list, ignore_index=True)

features_mvp = ['Career_Efficiency', 'Career_PPG', 'Career_RPG', 'Career_APG']

model_mvp = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model_mvp.fit(train_mvp[features_mvp], train_mvp['Won_MVP'])

importance = pd.DataFrame({'Feature': features_mvp, 'Value': model_mvp.feature_importances_})
print(importance.sort_values(by='Value', ascending=False))

candidates_11_mvp = prepare_mvp_career_data(11, all_players_teams, all_teams, awards_players)

probs = model_mvp.predict_proba(candidates_11_mvp[features_mvp])[:, 1]
candidates_11_mvp['MVP_Prob'] = probs

ranking_mvp = candidates_11_mvp[['playerID', 'tmID', 'Career_PPG', 'Career_APG', 'Career_RPG', 'Career_Efficiency', 'MVP_Prob']].sort_values(by='MVP_Prob', ascending=False)

ranking_mvp.head(10)

             Feature     Value
0  Career_Efficiency  0.529997
1         Career_PPG  0.360126
2         Career_RPG  0.091793
3         Career_APG  0.018083


Unnamed: 0,playerID,tmID,Career_PPG,Career_APG,Career_RPG,Career_Efficiency,MVP_Prob
85,tauradi01w,PHO,20.348485,3.979798,4.515152,14.262626,0.55
99,youngso01w,SAS,16.067669,1.729323,6.398496,11.966917,0.18
2,augusse01w,MIN,21.238095,2.104762,3.895238,14.021905,0.06
61,parkeca01w,LAS,16.172414,3.051724,9.603448,14.593103,0.02
84,swoopsh01w,TUL,15.846154,3.461538,4.950226,12.50362,0.02
15,catchta01w,IND,16.45122,3.646341,7.691057,14.346341,0.02
36,jacksla01w,SEA,19.395437,1.486692,7.86692,15.331179,0.01
4,beviltu01w,SAS,4.742138,2.232704,1.981132,4.126415,0.0
8,braxtka01w,NYL,7.08805,0.779874,4.515723,4.154717,0.0
6,bobbish01w,IND,3.0,2.305085,1.644068,1.791525,0.0


Most Improved Player

For the Most Improved Player, we chose to evaluate players based on their potential in points, assists, and rebounds by assessing per-minute statistics from the previous season. In addition, we identified that potential candidates could not have many points per game (low initial expectations)

In [182]:
# ==========================
# Most Improved Player 
# ==========================


def prepare_mip_data(target_year, player_features, awards_df):
    prev_year = target_year - 1

    base_data = player_features[player_features['year'] == prev_year].copy()

    all_history = player_features.groupby('playerID')['year'].min().reset_index().rename(columns={'year': 'Rookie_Year'})
    base_data = pd.merge(base_data, all_history, on='playerID', how='left')
    base_data['Years_Exp'] = prev_year - base_data['Rookie_Year']

    base_data['PPG'] = base_data['points'] / base_data['GP']

    candidates = base_data[
        (base_data['PPG'] < 15.0) &
        (base_data['GP'] > 10)
    ].copy()

    candidates['PPM'] = candidates['points'] / candidates['minutes'] # Points per Minute
    candidates['MPG'] = candidates['minutes'] / candidates['GP']
    candidates['Start_Ratio'] = candidates['GS'] / candidates['GP']
    candidates['APM'] = candidates['assists'] / candidates['minutes']
    candidates['RPM'] = candidates['rebounds'] / candidates['minutes']

    awards_target = awards_df[
        (awards_df['year'] == target_year) &
        (awards_df['award'] == 'Most Improved Player')
    ][['playerID']]
    awards_target['Won_MIP'] = 1

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_MIP'] = final_df['Won_MIP'].fillna(0)

    return final_df


data_2 = prepare_mip_data(2, player_features, awards_players)
data_3 = prepare_mip_data(3, player_features, awards_players)
data_4 = prepare_mip_data(4, player_features, awards_players)
data_8 = prepare_mip_data(8, player_features, awards_players)
data_9 = prepare_mip_data(9, player_features, awards_players)
train_mip = pd.concat([data_2, data_3, data_4, data_8, data_9], ignore_index=True)

# Features about player's potential
features_mip = ['APM', 'RPM', 'PPM', 'MPG', 'Start_Ratio']

model_mip = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model_mip.fit(train_mip[features_mip], train_mip['Won_MIP'])

print("Features' importance for Most Improved Player:")
importance = pd.DataFrame({'Feature': features_mip, 'Value': model_mip.feature_importances_})
print(importance.sort_values(by='Value', ascending=False))

train_mip[train_mip['Won_MIP'] == 1].head()

Features' importance for Most Improved Player:
       Feature     Value
1          RPM  0.275425
0          APM  0.248069
2          PPM  0.240707
3          MPG  0.152874
4  Start_Ratio  0.082926


Unnamed: 0,playerID,year,stint,tmID,GP,GS,minutes,points,oRebounds,dRebounds,...,rookie_year,Rookie_Year,Years_Exp,PPG,PPM,MPG,Start_Ratio,APM,RPM,Won_MIP
3,arcaija01w,1,0,HOU,32,32,977,268,36,83,...,1,1,0,8.375,0.274309,30.53125,1.0,0.061412,0.121801,1.0
260,milleco01w,2,0,WAS,20,0,137,34,5,4,...,2,2,0,1.7,0.248175,6.85,0.0,0.058394,0.065693,1.0
462,snowmi01w,3,0,HOU,32,2,480,125,31,88,...,3,3,0,3.90625,0.260417,15.0,0.0625,0.027083,0.247917,1.0
570,mccarja01w,7,0,CHA,30,3,421,136,40,65,...,6,6,1,4.533333,0.32304,14.033333,0.1,0.059382,0.249406,1.0
686,hoffmeb01w,8,0,IND,34,10,582,144,40,97,...,5,5,3,4.235294,0.247423,17.117647,0.294118,0.044674,0.235395,1.0


In [183]:
candidates_10 = prepare_mip_data(10, player_features, awards_players)

#print(candidates_10[candidates_10['tmID'] == 'WAS'])

probs = model_mip.predict_proba(candidates_10[features_mip])[:, 1]
candidates_10['MIP_Prob'] = probs

ranking_mip = candidates_10[['playerID', 'tmID', 'PPG', 'PPM', 'APM', 'RPM', 'MPG', 'MIP_Prob']].sort_values(by='MIP_Prob', ascending=False)

print("\n--- Most Improved Player Candidates (Year 10) ---")

#ranking_mip[ranking_mip['tmID'] == 'WAS'].head(10)
ranking_mip.head(10)


--- Most Improved Player Candidates (Year 10) ---


Unnamed: 0,playerID,tmID,PPG,PPM,APM,RPM,MPG,MIP_Prob
33,farriba01w,PHO,3.470588,0.213382,0.014467,0.235081,16.264706,0.28
66,langhcr01w,WAS,4.823529,0.310019,0.028355,0.257089,15.558824,0.19
79,mazzake01w,PHO,5.794118,0.307332,0.059282,0.102964,18.852941,0.1
137,williad01w,SAC,6.088235,0.30131,0.029112,0.243086,20.205882,0.05
111,sanfona01w,WAS,6.735294,0.28805,0.050314,0.242767,23.382353,0.04
46,harpela01w,SAC,5.529412,0.335714,0.021429,0.244643,16.470588,0.03
19,careyja01w,CON,4.151515,0.274549,0.094188,0.08016,15.121212,0.03
90,mosbybe01w,WAS,2.0,0.262295,0.010929,0.289617,7.625,0.02
5,balesal01w,ATL,4.823529,0.210797,0.028278,0.262211,22.882353,0.02
94,nnamach01w,ATL,1.333333,0.190476,0.02381,0.142857,7.0,0.02


The one who won in year 10 was langhcr01w, who was predicted to finish second according to the model.

Sixth Woman of the Year

Here our goal is to predict who will win Sixth Player of the Year. We believe that evaluating each player’s potential in terms of scoring efficiency (points per minute), in addition to points, assists, and rebounds per game from the previous season, will be important to understand which players have the potential to be the best sixth player of the season. We also take into account the minutes played per game for each player to assess whether they are important or influential on their team despite coming off the bench.

In [184]:
# ==========================
# Sixth Woman of the Year 
# ==========================


def prepare_6th_data(target_year, player_features, awards_df):
    prev_year = target_year - 1

    current_roster = player_features[player_features['year'] == target_year][['playerID', 'tmID', 'stint', 'year']].copy()

    current_roster = current_roster.sort_values(by=['playerID', 'stint'])
    current_roster = current_roster.drop_duplicates(subset=['playerID'], keep='last') # Remove duplicates (players who changed teams mid-season), keeping their statistics for the year

    history_raw = player_features[player_features['year'] == prev_year].copy()

    history = history_raw.groupby('playerID').agg({
        'GP': 'sum',
        'GS': 'sum',
        'minutes': 'sum',
        'points': 'sum',
        'rebounds': 'sum',
        'assists': 'sum'
    }).reset_index()

    history['Prev_PPG'] = history['points'] / history['GP']
    history['Prev_PPM'] = history['points'] / history['minutes']
    history['Prev_MPG'] = history['minutes'] / history['GP']
    history['Prev_APG'] = history['assists'] / history['GP']
    history['Prev_RPG'] = history['rebounds'] / history['GP']
    history['Prev_Start_Ratio'] = history['GS'] / history['GP'] # Number of games a player played as a starter

    history_cols = ['playerID', 'Prev_PPG', 'Prev_PPM', 'Prev_MPG', 'Prev_APG', 'Prev_RPG', 'Prev_Start_Ratio']

    candidates = pd.merge(current_roster, history[history_cols], on='playerID', how='left')

    candidates['Is_Rookie'] = candidates['Prev_PPG'].isna().astype(int)

    avg_ppg = history['Prev_PPG'].mean()
    avg_ppm = history['Prev_PPM'].mean()
    avg_mpg = history['Prev_MPG'].mean()
    avg_apg = history['Prev_APG'].mean()
    avg_rpg = history['Prev_RPG'].mean()

    # Fill rookie's stats with the average of all candidates
    candidates['Prev_PPG'] = candidates['Prev_PPG'].fillna(avg_ppg)
    candidates['Prev_PPM'] = candidates['Prev_PPM'].fillna(avg_ppm)
    candidates['Prev_MPG'] = candidates['Prev_MPG'].fillna(15.0)
    candidates['Prev_APG'] = candidates['Prev_APG'].fillna(avg_apg)
    candidates['Prev_RPG'] = candidates['Prev_RPG'].fillna(avg_rpg)

    candidates['Prev_Start_Ratio'] = candidates['Prev_Start_Ratio'].fillna(0.0)

    candidates = candidates[candidates['Prev_Start_Ratio'] < 0.5].copy()

    awards_target = awards_df[
        (awards_df['year'] == target_year) &
        (awards_df['award'] == 'Sixth Woman of the Year')
    ][['playerID']]
    awards_target['Won_6th'] = 1

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_6th'] = final_df['Won_6th'].fillna(0)

    return final_df


data_8 = prepare_6th_data(8, player_features, awards_players)
data_9 = prepare_6th_data(9, player_features, awards_players)

train_6th = pd.concat([data_8, data_9], ignore_index=True)

features_6th = ['Prev_PPG', 'Prev_PPM', 'Prev_MPG', 'Prev_APG', 'Prev_RPG', 'Is_Rookie']

train_6th[train_6th['Won_6th'] == 1].head()

Unnamed: 0,playerID,tmID,stint,year,Prev_PPG,Prev_PPM,Prev_MPG,Prev_APG,Prev_RPG,Prev_Start_Ratio,Is_Rookie,Won_6th
67,pierspl01w,DET,0,8,6.470588,0.389381,16.617647,0.705882,3.882353,0.0,0,1.0
216,wiggica01w,MIN,0,9,6.731642,0.326749,15.0,1.495325,3.078135,0.0,1,1.0


In [185]:
model_6th = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model_6th.fit(train_6th[features_6th], train_6th['Won_6th'])

print("Features' importance (Sixth Woman):")
importance = pd.DataFrame({'Feature': features_6th, 'Value': model_6th.feature_importances_})
print(importance.sort_values(by='Value', ascending=False))

candidates_10_6th = prepare_6th_data(10, player_features, awards_players)

probs = model_6th.predict_proba(candidates_10_6th[features_6th])[:, 1]
candidates_10_6th['Win_Prob'] = probs

ranking_6th = candidates_10_6th[['playerID', 'tmID', 'Prev_PPG', 'Prev_PPM', 'Prev_APG', 'Prev_RPG', 'Prev_MPG', 'Win_Prob']].sort_values(by='Win_Prob', ascending=False)

ranking_6th.head(10)

Features' importance (Sixth Woman):
     Feature     Value
4   Prev_RPG  0.331263
0   Prev_PPG  0.241105
1   Prev_PPM  0.167225
3   Prev_APG  0.112355
2   Prev_MPG  0.106564
5  Is_Rookie  0.041488


Unnamed: 0,playerID,tmID,Prev_PPG,Prev_PPM,Prev_APG,Prev_RPG,Prev_MPG,Win_Prob
37,kellycr01w,DET,7.363636,0.441016,0.454545,3.30303,16.69697,0.13
22,grudasa01w,CON,6.225806,0.405462,0.903226,3.548387,15.354839,0.123073
61,pricear01w,ATL,6.911765,0.307995,1.735294,3.676471,22.441176,0.099143
24,harpela01w,SAC,5.529412,0.335714,0.352941,4.029412,16.470588,0.08
9,burseja01w,SEA,6.419933,0.326247,1.38058,2.981079,15.0,0.072642
10,cironkr01w,CON,6.419933,0.326247,1.38058,2.981079,15.0,0.072642
19,frazeme01w,SAS,6.419933,0.326247,1.38058,2.981079,15.0,0.072642
14,davista02w,DET,6.419933,0.326247,1.38058,2.981079,15.0,0.072642
11,colemma01w,WAS,6.419933,0.326247,1.38058,2.981079,15.0,0.072642
7,bonnede01w,PHO,6.419933,0.326247,1.38058,2.981079,15.0,0.072642


The one who won in year 10 was bonnede01w, who was predicted to finish 10º according to the model.

Rookie of the Year

Since there isn’t much relevant data available to help predict a possible winner for this award, we had to focus on the data available for each rookie before the season starts (height, weight, position — C, G, F). In addition, we created a college score associated with each player, which evaluates whether a college is known for producing many or few players for the league (college prestige). This way, the model takes into account whether a player comes from a strong, talent-developing college

In [186]:
from datetime import datetime

# ==========================
# Rookie of the Year
# ==========================

def calculate_college_prestige(players_df):
    college_counts = players_df['college'].value_counts() # Count how many unique players came from each college.
    return college_counts.to_dict()


def prepare_rookie_data(target_year, players_df, players_teams_df, awards_df, college_prestige_map):

    rookie_years = players_teams_df.groupby('playerID')['year'].min().reset_index()
    rookie_ids = rookie_years[rookie_years['year'] == target_year]['playerID']

    rookie_profiles = players_df[players_df['bioID'].isin(rookie_ids)].copy()

    if len(rookie_profiles) == 0:
        return None

    rookie_profiles['College_Score'] = rookie_profiles['college'].map(college_prestige_map).fillna(1)

    rookie_profiles['Is_Guard'] = rookie_profiles['pos'].str.contains('G').fillna(False).astype(int)
    rookie_profiles['Is_Forward'] = rookie_profiles['pos'].str.contains('F').fillna(False).astype(int)
    rookie_profiles['Is_Center'] = rookie_profiles['pos'].str.contains('C').fillna(False).astype(int)

    rookie_profiles['height'] = rookie_profiles['height'].fillna(rookie_profiles['height'].mean())
    rookie_profiles['weight'] = rookie_profiles['weight'].fillna(rookie_profiles['weight'].mean())

    awards_target = awards_df[
        (awards_df['year'] == target_year) &
        (awards_df['award'] == 'Rookie of the Year')
    ][['playerID']]
    awards_target['Won_ROY'] = 1

    final_df = pd.merge(rookie_profiles, awards_target, left_on='bioID', right_on='playerID', how='left')
    final_df['Won_ROY'] = final_df['Won_ROY'].fillna(0)

    features_cols = ['bioID', 'height', 'weight', 'College_Score', 'Is_Guard', 'Is_Forward', 'Is_Center', 'Won_ROY']

    return final_df[features_cols]

college_map = calculate_college_prestige(players)
train_list = []
years_with_data = [2, 3, 4, 5, 6, 7, 8, 9]

for y in years_with_data:
    df = prepare_rookie_data(y, players, players_teams, awards_players, college_map)
    if df is not None and df['Won_ROY'].sum() > 0:
        train_list.append(df)

features_roy = ['height', 'weight', 'College_Score', 'Is_Guard', 'Is_Forward', 'Is_Center']

train_roy = pd.concat(train_list, ignore_index=True)

train_roy[train_roy['Won_ROY'] == 1].head(10)

Unnamed: 0,bioID,height,weight,College_Score,Is_Guard,Is_Forward,Is_Center,Won_ROY
47,stileja01w,68.0,144,1.0,1,0,0,1.0
70,catchta01w,73.0,167,23.0,0,1,0,1.0
111,fordch01w,75.0,198,12.0,0,1,0,1.0
163,tauradi01w,72.0,172,20.0,1,1,0,1.0
183,johnste01w,63.0,132,13.0,1,0,0,1.0
210,augusse01w,72.0,179,13.0,0,1,0,1.0
268,pricear01w,69.0,133,2.0,1,1,0,1.0
309,parkeca01w,76.0,175,23.0,0,1,0,1.0


In [187]:
model_roy = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
model_roy.fit(df[features_roy], df['Won_ROY'])

importance = pd.DataFrame({'Feature': features_roy, 'Value': model_roy.feature_importances_})
print(importance.sort_values(by='Value', ascending=False))

candidates_10_roy = prepare_rookie_data(10, players, players_teams, awards_players, college_map)

probs = model_roy.predict_proba(candidates_10_roy[features_roy])[:, 1]
candidates_10_roy['ROY_Prob'] = probs

ranking_roy = candidates_10_roy[['bioID', 'College_Score', 'height', 'weight', 'ROY_Prob']].sort_values(by='ROY_Prob', ascending=False)

ranking_roy.head(10)

         Feature     Value
2  College_Score  0.356508
0         height  0.309672
1         weight  0.163679
4     Is_Forward  0.111400
5      Is_Center  0.033784
3       Is_Guard  0.024957


Unnamed: 0,bioID,College_Score,height,weight,ROY_Prob
6,ervinla01w,3.0,76.0,171,0.19
2,bonnede01w,8.0,76.0,136,0.1
14,millebr01w,5.0,76.0,0,0.03
18,parisco01w,4.0,76.0,250,0.015
20,vaughki01w,10.0,76.0,208,0.015
17,nanch01w,1.0,78.0,204,0.01
0,blackch01w,9.0,77.0,188,0.01
8,holliqu01w,2.0,77.0,201,0.01
1,boddiwh01w,8.0,69.0,149,0.0
7,frazeme01w,1.0,75.0,191,0.0


Here the results were not very good, possibly due to the lack of data that could help assess who might win the award. The actual winner of year 10 is not even in the top 10

Defensive Player of The Year

To predict this award, we considered individual defensive data such as stocks (the combination of blocks and steals) and defensive rebounds. Additionally, we evaluated the number of points allowed per game by the player’s team in the previous season

In [188]:
# ==========================
# Defensive Player of the Year
# ==========================

def prepare_dpoy_career_data(target_year, players_teams, teams_df, awards_df):

    history_end_year = target_year - 1

    if history_end_year < 1:
        return None

    players_history = players_teams[players_teams["year"].between(1, history_end_year)].copy()

    career_stats = players_history.groupby("playerID").agg({
        "GP": "sum",
        "minutes": "sum",
        "steals": "sum",
        "blocks": "sum",
        "dRebounds": "sum",
        "PF": "sum"
    }).reset_index()

    career_stats['GP'] = career_stats['GP'].replace(0, 1)
    career_stats['minutes'] = career_stats['minutes'].replace(0, 1)

    career_stats['Career_SPG'] = career_stats['steals'] / career_stats['GP']
    career_stats['Career_BPG'] = career_stats['blocks'] / career_stats['GP']
    career_stats['Career_DRPG'] = career_stats['dRebounds'] / career_stats['GP']
    career_stats['Career_MPG'] = career_stats['minutes'] / career_stats['GP']

    career_stats['Career_Stocks'] = career_stats['Career_SPG'] + career_stats['Career_BPG']

    career_stats['Career_Stocks_Per_Min'] = (career_stats['steals'] + career_stats['blocks']) / career_stats['minutes']


    roster_target_year = players_teams[players_teams["year"] == target_year][['playerID', 'tmID']].drop_duplicates()

    candidates = pd.merge(roster_target_year, career_stats, on='playerID', how='inner')

    prev_year = target_year - 1
    teams_prev = teams_df[teams_df['year'] == prev_year][['tmID', 'd_pts', 'GP']].copy()

    teams_prev['Prev_Team_PA'] = teams_prev['d_pts'] / teams_prev['GP']

    candidates = pd.merge(candidates, teams_prev[['tmID', 'Prev_Team_PA']], on='tmID', how='left')

    avg_pa = teams_prev['Prev_Team_PA'].mean()
    candidates['Prev_Team_PA'] = candidates['Prev_Team_PA'].fillna(avg_pa)

    candidates = candidates[
        (candidates['GP'] > 15) &
        (candidates['Career_MPG'] > 12.0)
    ].copy()

    awards_target = awards_df[
        (awards_df['year'] == target_year) &
        (awards_df['award'] == 'Defensive Player of the Year')
    ][['playerID']]
    awards_target['Won_DPOY'] = 1

    final_df = pd.merge(candidates, awards_target, on='playerID', how='left')
    final_df['Won_DPOY'] = final_df['Won_DPOY'].fillna(0)

    return final_df

years_train = [2, 3, 4, 5, 6, 7, 8, 9]
train_list = []

for y in years_train:
    df = prepare_dpoy_career_data(y, players_teams, teams, awards_players)
    train_list.append(df)

train_dpoy = pd.concat(train_list, ignore_index=True)

print(train_dpoy[train_dpoy['Won_DPOY'] == 1].head(10))

features_dpoy = ['Career_Stocks', 'Career_Stocks_Per_Min', 'Career_DRPG', 'Prev_Team_PA']

       playerID tmID   GP  minutes  steals  blocks  dRebounds   PF  \
7    blackde01w  MIA   32      820      58       1         69   86   
229  swoopsh01w  HOU   31     1090      87      33        155   67   
361  swoopsh01w  HOU   63     2244     175      56        283  117   
430  leslili01w  LAS  117     3913     142     298        840  482   
497  catchta01w  IND  100     3526     233     116        544  317   
612  catchta01w  IND  134     4700     323     132        739  413   
748  jacksla01w  SEA  185     6086     213     389       1005  581   
870  leslili01w  LAS  219     7202     310     524       1534  849   

     Career_SPG  Career_BPG  Career_DRPG  Career_MPG  Career_Stocks  \
7      1.812500    0.031250     2.156250   25.625000       1.843750   
229    2.806452    1.064516     5.000000   35.161290       3.870968   
361    2.777778    0.888889     4.492063   35.619048       3.666667   
430    1.213675    2.547009     7.179487   33.444444       3.760684   
497    2.33000

In [189]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_dpoy[features_dpoy])
y_train = train_dpoy['Won_DPOY']

model_log_dpoy = LogisticRegression(random_state=42, class_weight='balanced', max_iter=2000)
model_log_dpoy.fit(X_train_scaled, y_train)

coefs = pd.DataFrame({'Feature': features_dpoy, 'Coeficiente': model_log_dpoy.coef_[0]})
print("--- Features's importance ---")
print(coefs.sort_values(by='Coeficiente', ascending=False))


candidates_10_dpoy = prepare_dpoy_career_data(10, players_teams, teams, awards_players)

X_test_scaled = scaler.transform(candidates_10_dpoy[features_dpoy])

probs = model_log_dpoy.predict_proba(X_test_scaled)[:, 1]
candidates_10_dpoy['Raw_Prob'] = probs

total_prob = candidates_10_dpoy['Raw_Prob'].sum()
candidates_10_dpoy['DPOY_Share'] = candidates_10_dpoy['Raw_Prob'] / total_prob

ranking_dpoy_log = candidates_10_dpoy[[
    'playerID', 'tmID', 'Career_SPG', 'Career_BPG', 'Career_Stocks', 'Career_Stocks_Per_Min', 'Career_DRPG', 'Prev_Team_PA', 'DPOY_Share'
]].sort_values(by='DPOY_Share', ascending=False)

ranking_dpoy_log['Share_Pct'] = ranking_dpoy_log['DPOY_Share'].apply(lambda x: f"{x:.1%}")

print("\n--- DPOY Candidates (Logistic Regression) ---")
ranking_dpoy_log.head(10)

--- Features's importance ---
                 Feature  Coeficiente
0          Career_Stocks     2.534585
1  Career_Stocks_Per_Min     0.187387
2            Career_DRPG    -0.700504
3           Prev_Team_PA    -0.713271

--- DPOY Candidates (Logistic Regression) ---


Unnamed: 0,playerID,tmID,Career_SPG,Career_BPG,Career_Stocks,Career_Stocks_Per_Min,Career_DRPG,Prev_Team_PA,DPOY_Share,Share_Pct
66,leslili01w,LAS,1.424603,2.464286,3.888889,0.11863,6.916667,74.205882,0.092269,9.2%
18,catchta01w,IND,2.509434,0.943396,3.45283,0.102478,5.448113,72.264706,0.08925,8.9%
1,anosini01w,MIN,2.205882,1.264706,3.470588,0.128261,4.5,80.029412,0.087453,8.7%
84,parkeca01w,LAS,1.272727,2.272727,3.545455,0.1055,6.939394,74.205882,0.083374,8.3%
51,jacksla01w,SEA,1.160338,2.046414,3.206751,0.097436,5.624473,70.764706,0.08217,8.2%
31,fowlesy01w,CHI,1.117647,2.117647,3.235294,0.127907,5.823529,73.823529,0.08033,8.0%
47,hornbal01w,DET,2.323529,0.294118,2.617647,0.119143,2.5,74.176471,0.071674,7.2%
34,griffyo01w,IND,1.634409,0.956989,2.591398,0.091081,4.315412,72.264706,0.048516,4.9%
104,suttota01w,IND,0.811538,1.507692,2.319231,0.088365,3.665385,72.264706,0.03335,3.3%
95,rileyru01w,SAS,0.687747,1.55336,2.241107,0.088139,3.438735,71.117647,0.033301,3.3%


The one who won in year 10 was catchta01w, who was predicted to finish second according to the model.