In [36]:
import pandas as pd
import numpy as np

from collections import defaultdict


def clean_players(players: pd.DataFrame):
    players = players.drop(['firstseason','lastseason'], axis='columns') # all players with first and last season 0
    #players = players[players["bioID"].isin(players_teams["playerID"])]
    return players

def clean_players_teams(players_teams: pd.DataFrame):
    return players_teams.drop('lgID', axis='columns') # same lgID

def clean_awards_players(awards_players: pd.DataFrame):
    return awards_players.drop("lgID", axis="columns")

def clean_coaches(coaches: pd.DataFrame):
    return coaches.drop("lgID", axis="columns")

def clean_teams_post(teams_post : pd.DataFrame):
    return teams_post.drop("lgID", axis="columns")

def clean_series_post(series_post : pd.DataFrame):
    return series_post.drop(["lgIDLoser", "lgIDWinner"], axis="columns")

def clean_teams(teams : pd.DataFrame):
    return teams.drop(["lgID", "franchID", "divID", "arena", "attend", "min", "name"], axis="columns")

def parse_player_team_data(df):
    """
    Parses the player-team-season dataset into structured Python dictionaries
    for further analysis and championship prediction.
    """

    df.columns = [c.strip() for c in df.columns]
    df.fillna(0, inplace=True)

    players = defaultdict(list)
    teams_by_year = defaultdict(lambda: defaultdict(list))

    for _, row in df.iterrows():
        playerID = row["playerID"]
        teamID = row["tmID"]
        year = int(row["year"])

        # Regular season stats
        season_stats = {
            "GP": row["GP"],
            "GS": row["GS"],
            "minutes": row["minutes"],
            "points": row["points"],
            "oRebounds": row["oRebounds"],
            "dRebounds": row["dRebounds"],
            "rebounds": row["rebounds"],
            "assists": row["assists"],
            "steals": row["steals"],
            "blocks": row["blocks"],
            "turnovers": row["turnovers"],
            "PF": row["PF"],
            "fgAttempted": row["fgAttempted"],
            "fgMade": row["fgMade"],
            "ftAttempted": row["ftAttempted"],
            "ftMade": row["ftMade"],
            "threeAttempted": row["threeAttempted"],
            "threeMade": row["threeMade"],
            "dq": row["dq"],
        }

        # Postseason stats
        postseason_stats = {
            "PostGP": row["PostGP"],
            "PostGS": row["PostGS"],
            "PostMinutes": row["PostMinutes"],
            "PostPoints": row["PostPoints"],
            "PostoRebounds": row["PostoRebounds"],
            "PostdRebounds": row["PostdRebounds"],
            "PostRebounds": row["PostRebounds"],
            "PostAssists": row["PostAssists"],
            "PostSteals": row["PostSteals"],
            "PostBlocks": row["PostBlocks"],
            "PostTurnovers": row["PostTurnovers"],
            "PostPF": row["PostPF"],
            "PostfgAttempted": row["PostfgAttempted"],
            "PostfgMade": row["PostfgMade"],
            "PostftAttempted": row["PostftAttempted"],
            "PostftMade": row["PostftMade"],
            "PostthreeAttempted": row["PostthreeAttempted"],
            "PostthreeMade": row["PostthreeMade"],
            "PostDQ": row["PostDQ"],
        }

        # Combined player record
        record = {
            "year": year,
            "teamID": teamID,
            "stint": row["stint"],
            **season_stats,
            **postseason_stats
        }

        # Store in both structures
        players[playerID].append(record)
        teams_by_year[year][teamID].append(record)

    print(f"Parsed {len(players)} players across {len(teams_by_year)} seasons.")
    return {"players": players, "teams_by_year": teams_by_year}

awards_players = clean_awards_players(pd.read_csv("basketballPlayoffs/awards_players.csv"))
coaches = clean_coaches(pd.read_csv("basketballPlayoffs/coaches.csv"))
players_teams = clean_players_teams(pd.read_csv("basketballPlayoffs/players_teams.csv"))
players = clean_players(pd.read_csv("basketballPlayoffs/players.csv"))
series_post = clean_series_post(pd.read_csv("basketballPlayoffs/series_post.csv"))
teams_post = clean_teams_post(pd.read_csv("basketballPlayoffs/teams_post.csv"))
teams = clean_teams(pd.read_csv("basketballPlayoffs/teams.csv"))

# For year 11

coaches_11 = pd.read_csv("Season_11/coaches.csv")
players_teams_11 = pd.read_csv("Season_11/players_teams.csv")
teams_11 = pd.read_csv("Season_11/teams.csv")

In [37]:
teams["FG_Percentage"] = teams["o_fgm"] / teams["o_fga"]*100 # Field goal percentage
teams["FT_Percentage"] = teams["o_ftm"] / teams["o_fta"]*100 # Free-Throw percentage
teams["3P_Percentage"] = teams["o_3pm"] / teams["o_3pa"]*100 # 3 Point percentage
teams["O_OREBPG"] = teams["o_oreb"] / teams["GP"] # Ofensive rebounds per game
teams["O_DREBPG"] = teams["o_dreb"] / teams["GP"] # Defensive rebounds per game
teams["TOPG"] = teams["o_to"] / teams["GP"] # Turnovers per game
teams["PPG"] = teams["o_pts"] / teams["GP"] # Points per game
teams["STLPG"] = teams["o_stl"] / teams["GP"] # Steals per game
teams["BLKPG"] = teams["o_blk"] / teams["GP"] # Blocks per game
teams["PFPG"] = teams["o_pf"] / teams["GP"] # Personal fouls per game
teams["D_PPG"] = teams["d_pts"] / teams["GP"]
teams["APG"] = teams["o_asts"] / teams["GP"] # Assists per game
teams["Win%"] = teams["won"] / teams["GP"] *100

teams[["tmID", "FG_Percentage", "FT_Percentage", "3P_Percentage", "Win%"]].head()

Unnamed: 0,tmID,FG_Percentage,FT_Percentage,3P_Percentage,Win%
0,ATL,39.636847,74.758621,33.779264,11.764706
1,ATL,44.85173,75.364238,30.481283,52.941176
2,CHA,42.669469,74.696707,33.937824,25.0
3,CHA,41.910112,77.651515,35.747664,56.25
4,CHA,43.01676,73.906486,40.037951,56.25


Building Awards Weights

In [38]:
dict_award_weight = {
    "Most Valuable Player" : 10,
    "Coach of the Year" : 10,
    "WNBA All-Decade Team" : 6,
    "WNBA Finals Most Valuable Player" : 5,
    "WNBA All Decade Team Honorable Mention" : 4,
    "Defensive Player of the Year" : 4,
    "Most Improved Player" : 3,
    "Sixth Woman of the Year" : 2,
    "All-Star Game Most Valuable Player" : 2,
    "Rookie of the Year" : 1,
    "Kim Perrot Sportsmanship Award" : 0,
    "Kim Perrot Sportsmanship" : 0,
}

def compute_award_score(team_id, year, awards_players, players_teams,coaches, dict_award_weight):
  team_players = players_teams[
        (players_teams["tmID"] == team_id) &
        (players_teams["year"] == year)
    ]["playerID"].unique()

  player_awards = awards_players[
      (awards_players["playerID"].isin(team_players)) &
      (awards_players["year"] < year)
  ]

  player_award_score = player_awards["award"].map(dict_award_weight).sum()

  coach_ = coaches[
      (coaches["tmID"] == team_id) &
      (coaches["year"] == year)
  ]


  if not coach_.empty:
        coach_id = coach_["coachID"].iloc[0]
        coach_awards = awards_players[
            (awards_players["playerID"] == coach_id) &
            (awards_players["award"] == "Coach of the Year") &
            (awards_players["year"] < year)
        ]
        coach_award_score = coach_awards["award"].map(dict_award_weight).sum()

  total_award_score = player_award_score + coach_award_score

  return total_award_score

Testing team's award score for a specific year(year 10)

In [39]:
award_scores_for_year_10 = []
year = 10
teams_in_year_10 = players_teams[players_teams["year"] == year]["tmID"].unique()
for team_id in teams_in_year_10:
    score = compute_award_score(team_id, year, awards_players, players_teams, coaches, dict_award_weight)
    award_scores_for_year_10.append({"year": year, "tmID": team_id, "Award_Score": score})

award_scores_10_df = pd.DataFrame(award_scores_for_year_10)

print(award_scores_10_df.sort_values("Award_Score", ascending=False))

    year tmID  Award_Score
7     10  LAS           93
2     10  SEA           36
3     10  DET           33
5     10  IND           31
12    10  SAS           25
6     10  CON           20
11    10  ATL           13
9     10  PHO           11
1     10  MIN            6
8     10  SAC            4
4     10  NYL            3
10    10  CHI            1
0     10  WAS            0


Player's score

In [40]:
def getPlayersStats(players_group):
  career_players = players_group.groupby("playerID").agg({
    "GP": "sum",
    "points": "sum",
    "assists": "sum",
    "oRebounds": "sum",
    "dRebounds": "sum",
    "rebounds": "sum",
    "steals": "sum",
    "blocks": "sum",
    "turnovers": "sum",
    "PF": "sum",
    "fgAttempted": "sum",
    "fgMade": "sum",
    "ftAttempted": "sum",
    "ftMade": "sum",
    "threeAttempted": "sum",
    "threeMade": "sum"
    }).reset_index()

  career_players["PPG"] = career_players["points"] / career_players["GP"]
  career_players["APG"] = career_players["assists"] / career_players["GP"]
  career_players["RPG"] = career_players["rebounds"] / career_players["GP"]
  career_players["SPG"] = career_players["steals"] / career_players["GP"]
  career_players["BPG"] = career_players["blocks"] / career_players["GP"]
  career_players["TOPG"] = career_players["turnovers"] / career_players["GP"]
  career_players["PFPG"] = career_players["PF"] / career_players["GP"]
  career_players["FG_Percentage"] = career_players["fgMade"] / career_players["fgAttempted"] * 100
  career_players["FT_Percentage"] = career_players["ftMade"] / career_players["ftAttempted"] * 100
  career_players["3P_Percentage"] = career_players["threeMade"] / career_players["threeAttempted"] * 100

  career_players = career_players.round(2)

  return career_players

def last_year_players_stats(year, players_teams):
  last_year = year - 1

  if last_year < 1:
    teams_in_year = players_teams[players_teams['year'] == year]['tmID'].unique()
    return pd.DataFrame({
        'year': year,
        'tmID': teams_in_year,
        'Player_Score': 0.0
    })

  players_in_year = players_teams[players_teams["year"] == last_year]
  players_stats = getPlayersStats(players_in_year)

  players_stats["Player_Score"] = (
  players_stats["points"] + 0.4 * players_stats["fgMade"] + 0.7 * players_stats["oRebounds"] +
  0.3 * players_stats["dRebounds"] + players_stats["steals"] + 0.7 * players_stats["assists"] +
  0.7 * players_stats["blocks"] - 0.7 * players_stats["fgAttempted"] - 0.4 * (players_stats["ftAttempted"] - players_stats["ftMade"]) - 0.4 * players_stats["PF"] - players_stats["turnovers"]
  )

  rosters_year = players_teams[players_teams["year"] == year][['playerID', 'tmID']]

  players_scores_year = pd.merge(
    rosters_year,
    players_stats,
    on='playerID',
    how='inner'
  )

  team_names_year = teams[teams["year"] == year][['tmID']]

  final_rosters_with_scores = pd.merge(
      players_scores_year,
      team_names_year,
      on='tmID'
  )

  final_rosters_with_scores['year'] = year

  return final_rosters_with_scores

Coach Score

In [41]:
def compute_coach_scores_for_year(target_year, all_coaches_df, all_teams_df):

    # 1. Definir o período de história (tudo ANTES do ano alvo)
    history_end_year = target_year - 1

    # Se for o ano 1, não há história, o score de todos é 0
    if history_end_year < 1:
        teams_in_year = all_coaches_df[all_coaches_df['year'] == target_year]['tmID'].unique()
        return pd.DataFrame({
            'year': target_year,
            'tmID': teams_in_year,
            'Coach_Score': 0
        })

    # --- Calcular Stats Históricos (Baseado nos anos <= history_end_year) ---

    # 2. Calcular Win% Histórico
    coaches_hist = all_coaches_df[all_coaches_df['year'] <= history_end_year]
    coaches_stats = coaches_hist.groupby('coachID')[['won', 'lost']].sum().reset_index()

    coaches_stats['Win Percentage'] = (
        coaches_stats['won'] / (coaches_stats['won'] + coaches_stats['lost'])
    ).fillna(0) # fillna(0) para treinadores com 0-0 (evita NaN)

    # 3. Encontrar Campeões Históricos
    winner_teams_hist = all_teams_df[
        (all_teams_df['finals'] == 'W') &
        (all_teams_df['year'] <= history_end_year)
    ][['year', 'tmID']]

    winner_coaches_hist = winner_teams_hist.merge(
        all_coaches_df[['year', 'tmID', 'coachID']],
        on=['year', 'tmID'],
        how='left'
    )
    historical_champions_set = set(winner_coaches_hist['coachID'].dropna())

    # 4. Calcular o Score Histórico de cada Treinador
    coaches_stats['Champion'] = coaches_stats['coachID'].apply(
        lambda cid: 'Yes' if cid in historical_champions_set else 'No'
    )

    coaches_stats['Coach_Score'] = (
        coaches_stats['Win Percentage'] * 50 +
        coaches_stats['Champion'].apply(lambda x: 5 if x == 'Yes' else 0)
    ).round(2)

    # --- Mapear Scores para as Equipas do 'target_year' ---

    # 5. Obter os treinadores do 'target_year'
    teams_and_coaches_target_year = all_coaches_df[
        all_coaches_df['year'] == target_year
    ][['tmID', 'coachID']]

    # 6. Juntar o score histórico
    teams_with_scores = teams_and_coaches_target_year.merge(
        coaches_stats[['coachID', 'Coach_Score']],
        on='coachID',
        how='left'
    )

    # Treinadores novos (sem história) recebem 0
    teams_with_scores['Coach_Score'] = teams_with_scores['Coach_Score'].fillna(0)

    # 7. Agregar por equipa (caso tenha tido >1 treinador, como DET no seu exemplo)
    # Usamos a média para resolver duplicados
    final_team_scores = teams_with_scores.groupby('tmID')['Coach_Score'].mean().reset_index()
    final_team_scores['year'] = target_year

    return final_team_scores[['year', 'tmID', 'Coach_Score']]

final = compute_coach_scores_for_year(10, coaches, teams)
print(final)

    year tmID  Coach_Score
0     10  ATL        5.880
1     10  CHI       17.650
2     10  CON       36.130
3     10  DET       17.545
4     10  IND       19.230
5     10  LAS       39.170
6     10  MIN        0.000
7     10  NYL       27.645
8     10  PHO       23.530
9     10  SAC       32.205
10    10  SAS       24.440
11    10  SEA       23.500
12    10  WAS        0.000


In [42]:
train_award_scores_list = []

for year in range(2, 10):
  teams_in_year = players_teams[players_teams["year"] == year]["tmID"].unique()

  for team_id in teams_in_year:
        score = compute_award_score(
            team_id,
            year,
            awards_players,
            players_teams,
            coaches,
            dict_award_weight
        )

        train_award_scores_list.append({
            "year": year,
            "tmID": team_id,
            "Award_Score": score
        })
        #print(train_award_scores_list)

  X_train_awards = pd.DataFrame(train_award_scores_list)

train_coach_scores_list = []
for year in range(2, 10):
  scores_for_this_year = compute_coach_scores_for_year(
      target_year=year,
      all_coaches_df=coaches,
      all_teams_df=teams
  )
  train_coach_scores_list.append(scores_for_this_year)
  #print(scores_for_this_year[scores_for_this_year['tmID'] == 'WAS'])

X_train_coaches = pd.concat(train_coach_scores_list, ignore_index=True)
train_final = pd.merge(
    X_train_awards,
    X_train_coaches,
    on=['year', 'tmID'],
    how='left'
)

train_final['Coach_Score'] = train_final['Coach_Score'].fillna(0)

#print(train_final[train_final['Coach_Score'] == 'WAS'])

X_train_awards = pd.DataFrame(train_award_scores_list)
print("--- Tabela de Scores (Anos 1-9) ---")
#print(train_final)

train_players_scores_list = []

for year in range(2,10):
  players_year = last_year_players_stats(year, players_teams)
  #teams_rosters_scores = players_year.groupby(["year", "tmID"])["Player_Score"].mean().reset_index()
  teams_rosters_scores = players_year.groupby(["year", "tmID"])["Player_Score"].agg(
      Player_Score_Mean='mean',
      Player_Score_Max='max',
      Player_Score_Std='std'
  ).reset_index()
  train_players_scores_list.append(teams_rosters_scores)

X_train_players = pd.concat(train_players_scores_list, ignore_index=True)
X_train_players = X_train_players.fillna(0)
#print(X_train_players)

#print(train_final)

X_train_scores = pd.merge(
    train_final,
    X_train_players,
    on=['year', 'tmID'],
    how='left'
)
print(X_train_scores)
teste = X_train_scores[X_train_scores['tmID'] == 'WAS']
#print(teste)

--- Tabela de Scores (Anos 1-9) ---
     year tmID  Award_Score  Coach_Score  Player_Score_Mean  Player_Score_Max  \
0       2  MIN            1        23.44         136.685714             427.2   
1       2  WAS            0         0.00         172.437500             375.9   
2       2  ORL            0        25.00         156.512500             372.5   
3       2  NYL            3        31.25         184.155556             295.0   
4       2  CHA            0        14.06         147.111111             406.3   
..    ...  ...          ...          ...                ...               ...   
108     9  SEA           97        19.88         166.970000             617.0   
109     9  LAS           65        40.05         123.280000             220.1   
110     9  DET           26        34.69         162.838462             398.4   
111     9  CHI            1         0.00         175.855556             367.8   
112     9  PHO           13         0.00         181.544444             4

In [43]:
teams_data = teams[['year', 'tmID', 'confID', 'won', 'lost', 'rank']].copy()
teams_data['total_games'] = teams_data['won'] + teams_data['lost']
teams_data['Win_Percentage'] = teams_data['won'] / teams_data['total_games']

conf_map = {'EA': 0, 'WE': 1}
teams_data['Conf_Code'] = teams_data['confID'].map(conf_map)

dataset_final = pd.merge(
    X_train_scores,
    teams_data[['year', 'tmID', 'Conf_Code', 'Win_Percentage', 'rank']],
    on=['year', 'tmID'],
    how='inner'
)

#print(dataset_final[dataset_final['year'] == 7])


teams_history = teams[['year', 'tmID', 'won', 'lost']].copy()
teams_history['total_games'] = teams_history['won'] + teams_history['lost']
teams_history['Win_Pct'] = teams_history['won'] / teams_history['total_games']


prev_performance = teams_history[['year', 'tmID', 'Win_Pct']].copy()
prev_performance['year'] = prev_performance['year'] + 1
prev_performance = prev_performance.rename(columns={'Win_Pct': 'Prev_Team_Win_Pct'})


dataset_final = pd.merge(
    dataset_final,
    prev_performance,
    on=['year', 'tmID'],
    how='left'
)
dataset_final['Prev_Team_Win_Pct'] = dataset_final['Prev_Team_Win_Pct'].fillna(0.5) # Preencher buracos com média

teams_diff = teams[['year', 'tmID', 'o_pts', 'd_pts']].copy()
teams_diff['Point_Diff'] = teams_diff['o_pts'] - teams_diff['d_pts']
prev_diff = teams_diff[['year', 'tmID', 'Point_Diff']].copy()
prev_diff['year'] = prev_diff['year'] + 1
prev_diff = prev_diff.rename(columns={'Point_Diff': 'Prev_Point_Diff'})

dataset_final = pd.merge(
    dataset_final,
    prev_diff,
    on=['year', 'tmID'],
    how='left'
)
dataset_final['Prev_Point_Diff'] = dataset_final['Prev_Point_Diff'].fillna(0)

print(dataset_final[dataset_final['year'] == 7])

    year tmID  Award_Score  Coach_Score  Player_Score_Mean  Player_Score_Max  \
72     7  MIN            0       12.255         124.914286             274.0   
73     7  CON            0       35.390         175.790000             391.2   
74     7  NYL            0       27.880          80.922222             358.6   
75     7  SAS           10       22.260         114.914286             279.2   
76     7  IND            5       26.470         183.175000             439.1   
77     7  DET           16       32.020         139.870000             344.5   
78     7  WAS            5       27.220         156.900000             270.1   
79     7  SEA           19       31.010         175.070000             508.5   
80     7  CHA            0       15.000         124.137500             239.4   
81     7  CHI            0        0.000          73.081818             151.2   
82     7  HOU           52       35.810         155.810000             485.6   
83     7  SAC           15       37.740 

In [44]:
from sklearn.preprocessing import StandardScaler

features = ['Award_Score', 'Coach_Score', 'Conf_Code', 'Prev_Team_Win_Pct', 'Prev_Point_Diff', 'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std']

scaler = StandardScaler()

award_list_10 = []
teams_in_10 = players_teams[players_teams["year"] == 10]["tmID"].unique()

for team_id in teams_in_10:
    score = compute_award_score(team_id, 10, awards_players, players_teams, coaches, dict_award_weight)
    award_list_10.append({"year": 10, "tmID": team_id, "Award_Score": score})
X_10_awards = pd.DataFrame(award_list_10)

X_10_coaches = compute_coach_scores_for_year(10, coaches, teams)

players_year_10 = last_year_players_stats(10, players_teams)
#X_10_players = players_year_10.groupby(["year", "tmID"])["Player_Score"].mean().reset_index()
X_10_players = players_year_10.groupby(["year", "tmID"])["Player_Score"].agg(
    Player_Score_Mean='mean',
    Player_Score_Max='max',
    Player_Score_Std='std'
).reset_index()
X_10_players = X_10_players.fillna(0)

X_test_10 = pd.merge(X_10_awards, X_10_coaches, on=['year', 'tmID'], how='left')
X_test_10 = pd.merge(X_test_10, X_10_players, on=['year', 'tmID'], how='left')

teams_10_info = teams[teams['year'] == 10][['tmID', 'confID']]
X_test_10 = pd.merge(
    X_test_10,
    teams_10_info,
    on='tmID',
    how='left'
)

conf_map = {'EA': 0, 'WE': 1}
X_test_10['Conf_Code'] = X_test_10['confID'].map(conf_map)

X_test_10 = pd.merge(
    X_test_10,
    prev_performance,
    on=['year', 'tmID'],
    how='left'
)

X_test_10['Prev_Team_Win_Pct'] = X_test_10['Prev_Team_Win_Pct'].fillna(0.5)

X_test_10 = pd.merge(
    X_test_10,
    prev_diff,
    on=['year', 'tmID'],
    how='left'
)
X_test_10['Prev_Point_Diff'] = X_test_10['Prev_Point_Diff'].fillna(0)

print(X_test_10)

#X_test_scaled = scaler.transform(X_test_10[features])

#print(X_test_scaled)

    year tmID  Award_Score  Coach_Score  Player_Score_Mean  Player_Score_Max  \
0     10  WAS            0        0.000         133.845455             296.2   
1     10  MIN            6        0.000         202.611111             407.1   
2     10  SEA           36       23.500         160.680000             334.2   
3     10  DET           33       17.545         180.164286             399.0   
4     10  NYL            3       27.645         168.150000             344.4   
5     10  IND           31       19.230         147.583333             321.6   
6     10  CON           20       36.130         192.930000             399.3   
7     10  LAS           93       39.170         255.011111             544.7   
8     10  SAC            4       32.205         159.663636             261.3   
9     10  PHO           11       23.530         188.050000             597.0   
10    10  CHI            1       17.650         168.300000             420.9   
11    10  ATL           13        5.880 

Training and testing the model for predicting the ranking of the regular season for each conference

In [45]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
RANDOM_SEED = 42

features = ['Award_Score', 'Coach_Score', 'Conf_Code', 'Prev_Team_Win_Pct', 'Prev_Point_Diff', 'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std']
X = dataset_final[features]
y = dataset_final['Win_Percentage']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model_rf = RandomForestRegressor(n_estimators=500, random_state=RANDOM_SEED)
model_rf.fit(X_scaled, y)

feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': model_rf.feature_importances_
})

print(feature_importances.sort_values(by='Importance', ascending=False))

X_test_scaled = scaler.transform(X_test_10[features])

predictions_10_rf = model_rf.predict(X_test_scaled)
results_10_rf = X_test_10[['tmID', 'Conf_Code']].copy()
results_10_rf['Predicted_Win%'] = predictions_10_rf
results_10_rf = results_10_rf.sort_values(by='Predicted_Win%', ascending=False)

print("\n--- WE Conference ---")
print(results_10_rf[results_10_rf['Conf_Code'] == 1])
print("\n--- EA Conference ---")
print(results_10_rf[results_10_rf['Conf_Code'] == 0])

             Feature  Importance
5  Player_Score_Mean    0.313580
1        Coach_Score    0.205506
6   Player_Score_Max    0.122898
4    Prev_Point_Diff    0.114710
7   Player_Score_Std    0.087247
3  Prev_Team_Win_Pct    0.075210
0        Award_Score    0.072608
2          Conf_Code    0.008241

--- WE Conference ---
   tmID  Conf_Code  Predicted_Win%
12  SAS          1        0.666926
7   LAS          1        0.636353
9   PHO          1        0.586276
2   SEA          1        0.542004
8   SAC          1        0.519879
1   MIN          1        0.511610

--- EA Conference ---
   tmID  Conf_Code  Predicted_Win%
3   DET          0        0.612338
6   CON          0        0.599614
4   NYL          0        0.580445
5   IND          0        0.546603
10  CHI          0        0.510176
0   WAS          0        0.424029
11  ATL          0        0.364941


Evaluating the results for year 10

In [50]:
from sklearn.metrics import ndcg_score

def evaluate_ranking_ndcg(df_results, k=None):

    y_true = np.array([df_results['Actual_Win_Pct'].values])
    y_score = np.array([df_results['Predicted_Win%'].values])
    score = ndcg_score(y_true, y_score, k=k)

    return score


teams_10_real = teams[teams['year'] == 10][['tmID', 'won', 'lost']].copy()
teams_10_real['Actual_Win_Pct'] = teams_10_real['won'] / (teams_10_real['won'] + teams_10_real['lost'])

comparison = pd.merge(results_10_rf, teams_10_real[['tmID', 'Actual_Win_Pct']], on='tmID')

ndcg_total = evaluate_ranking_ndcg(comparison)
ndcg_top4 = evaluate_ranking_ndcg(comparison, k=4)

print(f"Ranking Quality (NDCG Global): {ndcg_total:.4f}")
print(f"Top 4 Quality (Playoffs):      {ndcg_top4:.4f}")

Ranking Quality (NDCG Global): 0.9234
Top 4 Quality (Playoffs):      0.7733


Predicting for year 11

In [47]:
train_frames = []

for year in range(2, 11):

    players_year = last_year_players_stats(year, players_teams)
    

    p_scores = players_year.groupby(["year", "tmID"])["Player_Score"].agg(
        Player_Score_Mean='mean',
        Player_Score_Max='max',
        Player_Score_Std='std'
    ).reset_index()
    p_scores = p_scores.fillna(0)


    c_scores = compute_coach_scores_for_year(year, coaches, teams)
    

    a_scores_list = []
    teams_in_year = players_teams[players_teams["year"] == year]["tmID"].unique()
    for team_id in teams_in_year:
        score = compute_award_score(team_id, year, awards_players, players_teams, coaches, dict_award_weight)
        a_scores_list.append({"year": year, "tmID": team_id, "Award_Score": score})
    a_scores = pd.DataFrame(a_scores_list)


    df_year = pd.merge(a_scores, c_scores, on=['year', 'tmID'], how='left')
    df_year = pd.merge(df_year, p_scores, on=['year', 'tmID'], how='left')


    prev_year = year - 1
    teams_prev = teams[teams['year'] == prev_year][['tmID', 'won', 'lost', 'o_pts', 'd_pts']].copy()
    

    teams_prev['Prev_Team_Win_Pct'] = teams_prev['won'] / (teams_prev['won'] + teams_prev['lost'])
    teams_prev['Prev_Point_Diff'] = teams_prev['o_pts'] - teams_prev['d_pts']
    

    df_year = pd.merge(df_year, teams_prev[['tmID', 'Prev_Team_Win_Pct', 'Prev_Point_Diff']], on='tmID', how='left')
    

    df_year['Prev_Team_Win_Pct'] = df_year['Prev_Team_Win_Pct'].fillna(0.5)
    df_year['Prev_Point_Diff'] = df_year['Prev_Point_Diff'].fillna(0)


    teams_target = teams[teams['year'] == year][['tmID', 'won', 'lost', 'confID']].copy()
    teams_target['Win_Percentage'] = teams_target['won'] / (teams_target['won'] + teams_target['lost'])
    

    conf_map = {'EA': 0, 'WE': 1}
    teams_target['Conf_Code'] = teams_target['confID'].map(conf_map)

    final_df_year = pd.merge(df_year, teams_target[['tmID', 'Win_Percentage', 'Conf_Code']], on='tmID', how='inner')
    
    train_frames.append(final_df_year)


dataset_train_11 = pd.concat(train_frames, ignore_index=True)

features = [
    'Award_Score', 'Coach_Score', 'Conf_Code', 
    'Prev_Team_Win_Pct', 'Prev_Point_Diff',
    'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std'
]

X_train = dataset_train_11[features]
y_train = dataset_train_11['Win_Percentage']


scaler_11 = StandardScaler()
X_train_scaled = scaler_11.fit_transform(X_train)


model_rf_11 = RandomForestRegressor(n_estimators=500, random_state=42)
model_rf_11.fit(X_train_scaled, y_train)


import_df = pd.DataFrame({'Feature': features, 'Importance': model_rf_11.feature_importances_})
print(import_df.sort_values(by='Importance', ascending=False))

             Feature  Importance
5  Player_Score_Mean    0.291631
1        Coach_Score    0.200043
6   Player_Score_Max    0.125279
4    Prev_Point_Diff    0.109552
7   Player_Score_Std    0.094837
3  Prev_Team_Win_Pct    0.086400
0        Award_Score    0.081967
2          Conf_Code    0.010292


In [48]:
teams_in_11 = teams_11['tmID'].unique()

stats_year_10 = last_year_players_stats(10, players_teams)
player_scores_10 = stats_year_10[['playerID', 'Player_Score']].copy()

roster_11 = players_teams_11[['playerID', 'tmID', 'year']].copy()

roster_11_scored = pd.merge(roster_11, player_scores_10, on='playerID', how='left')
roster_11_scored['Player_Score'] = roster_11_scored['Player_Score'].fillna(0)

roster_11_scored.head(10)

p_scores_11 = roster_11_scored.groupby(["year", "tmID"])["Player_Score"].agg(
    Player_Score_Mean='mean',
    Player_Score_Max='max',
    Player_Score_Std='std'
).reset_index()

p_scores_11 = p_scores_11.fillna(0)

p_scores_11.head(10)

all_coaches = pd.concat([coaches, coaches_11], ignore_index=True, sort=False)
all_teams = pd.concat([teams, teams_11], ignore_index=True, sort=False)

all_coaches[all_coaches['year'] == 11]

c_scores_11 = compute_coach_scores_for_year(11, all_coaches, all_teams)

c_scores_11.head(10)

def compute_award_score_11(team_id, roster_df, coaches_df, awards_df, dict_weight):
    score = 0

    team_players = roster_df[roster_df['tmID'] == team_id]['playerID'].unique()

    p_awards = awards_df[awards_df['playerID'].isin(team_players)]
    score += p_awards['award'].map(dict_weight).fillna(0).sum()
    

    team_coach = coaches_df[coaches_df['tmID'] == team_id]
    if not team_coach.empty:
        c_id = team_coach.iloc[0]['coachID']
        c_awards = awards_df[(awards_df['playerID'] == c_id) & (awards_df['award'] == 'Coach of the Year')]
        score += c_awards['award'].map(dict_weight).fillna(0).sum()
        
    return score

a_scores_list_11 = []
for team_id in teams_in_11:
    score = compute_award_score_11(team_id, players_teams_11, coaches_11, awards_players, dict_award_weight)
    a_scores_list_11.append({"year": 11, "tmID": team_id, "Award_Score": score})

a_scores_11 = pd.DataFrame(a_scores_list_11)

#print(a_scores_11.head(10))

X_input_11 = pd.merge(a_scores_11, c_scores_11[['year', 'tmID', 'Coach_Score']], on=['year', 'tmID'], how='left')
X_input_11 = pd.merge(X_input_11, p_scores_11, on=['year', 'tmID'], how='left')

teams_prev_10 = teams[teams['year'] == 10][['tmID', 'won', 'lost', 'o_pts', 'd_pts']].copy()
teams_prev_10['Prev_Team_Win_Pct'] = teams_prev_10['won'] / (teams_prev_10['won'] + teams_prev_10['lost'])
teams_prev_10['Prev_Point_Diff'] = teams_prev_10['o_pts'] - teams_prev_10['d_pts']

X_input_11 = pd.merge(X_input_11, teams_prev_10, on='tmID', how='left')

conf_map = {'EA': 0, 'WE': 1}
teams_11_info = teams_11[['tmID', 'confID']].copy()
teams_11_info['Conf_Code'] = teams_11_info['confID'].map(conf_map)

X_input_11 = pd.merge(X_input_11, teams_11_info, on='tmID', how='left')

X_input_11 = X_input_11.fillna(0)

X_input_11.head(10)

Unnamed: 0,year,tmID,Award_Score,Coach_Score,Player_Score_Mean,Player_Score_Max,Player_Score_Std,won,lost,o_pts,d_pts,Prev_Team_Win_Pct,Prev_Point_Diff,confID,Conf_Code
0,11,ATL,15,16.18,85.692308,233.1,80.572974,18.0,16.0,2861.0,2797.0,0.529412,64.0,EA,0
1,11,CHI,3,0.0,85.653846,265.8,94.850748,16.0,18.0,2573.0,2693.0,0.470588,-120.0,EA,0
2,11,CON,20,35.04,69.158333,392.3,130.841052,16.0,18.0,2651.0,2654.0,0.470588,-3.0,EA,0
3,11,IND,21,21.95,113.627273,321.6,124.875891,22.0,12.0,2606.0,2501.0,0.647059,105.0,EA,0
4,11,LAS,26,20.59,173.290909,544.7,179.232059,18.0,16.0,2533.0,2498.0,0.529412,35.0,WE,1
5,11,MIN,3,0.0,191.654545,407.1,168.119293,14.0,20.0,2731.0,2827.0,0.411765,-96.0,WE,1
6,11,NYL,17,35.58,104.435714,419.6,132.527377,13.0,21.0,2512.0,2535.0,0.382353,-23.0,EA,0
7,11,PHO,23,33.68,131.338462,597.0,179.83223,23.0,11.0,3156.0,3031.0,0.676471,125.0,WE,1
8,11,SAS,25,24.17,157.96,440.7,189.14022,15.0,19.0,2615.0,2661.0,0.441176,-46.0,WE,1
9,11,SEA,49,24.83,140.546154,334.2,141.090158,20.0,14.0,2544.0,2476.0,0.588235,68.0,WE,1


In [49]:
features = [
    'Award_Score', 'Coach_Score', 'Conf_Code', 
    'Prev_Team_Win_Pct', 'Prev_Point_Diff',
    'Player_Score_Mean', 'Player_Score_Max', 'Player_Score_Std'
]

X_test_scaled_11 = scaler_11.transform(X_input_11[features])

predictions_11 = model_rf_11.predict(X_test_scaled_11)
results_11 = X_input_11[['tmID', 'Conf_Code']].copy()
results_11['Predicted_Win_Pct'] = predictions_11
results_11 = results_11.sort_values(by='Predicted_Win_Pct', ascending=False)

print("\n--- EA Conference ---")
print(results_11[results_11['Conf_Code'] == 0].head(10))
print("\n--- WE Conference ---")
print(results_11[results_11['Conf_Code'] == 1].head(10))


--- EA Conference ---
   tmID  Conf_Code  Predicted_Win_Pct
3   IND          0           0.493103
6   NYL          0           0.477651
2   CON          0           0.399684
11  WAS          0           0.372820
0   ATL          0           0.369107
1   CHI          0           0.241063

--- WE Conference ---
   tmID  Conf_Code  Predicted_Win_Pct
4   LAS          1           0.594500
9   SEA          1           0.555401
8   SAS          1           0.554849
7   PHO          1           0.502882
5   MIN          1           0.422743
10  TUL          1           0.323110
