#### Final Season Rankings

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

In [11]:
# ================================
# LOAD DATA
# ================================
awards = pd.read_csv("../awards_players.csv")
coaches = pd.read_csv("../coaches.csv")
players_teams = pd.read_csv("../players_teams.csv")
players = pd.read_csv("../players.csv")
series_post = pd.read_csv("../series_post.csv")
teams_post = pd.read_csv("../teams_post.csv")
teams = pd.read_csv("../teams.csv")

In [12]:
# ================================
# GET PLAYERS FOR TEAM
# ================================
def get_players_for_team(players_teams_df, year, team_id):
    """Return all players who played for a specific team in a given year."""
    return players_teams_df[
        (players_teams_df["year"] == year) &
        (players_teams_df["tmID"] == team_id)
    ].copy()

In [13]:
# ================================
# PLAYER GAME SCORE
# ================================
def get_player_game_score(players_teams_df, player_id, year):
    """
    Compute a player's game score for a given year.
    Returns a clean Python dictionary.
    """
    df_player = players_teams_df[
        (players_teams_df["playerID"] == player_id) &
        (players_teams_df["year"] == year)
    ].copy()

    if df_player.empty:
        return {
            "playerID": player_id,
            "year": year,
            "tmID": None,
            "Game_Score_Total": None,
            "Game_Score_Per_Game": None,
            "Game_Score_Per_Minute": None
        }

    # Base game score formula
    base_score = (
        df_player["points"]
        + 0.4 * df_player["fgMade"]
        - 0.7 * df_player["fgAttempted"]
        - 0.4 * (df_player["ftAttempted"] - df_player["ftMade"])
        + 0.7 * df_player["oRebounds"]
        + 0.3 * df_player["dRebounds"]
        + df_player["steals"]
        + 0.7 * df_player["assists"]
        + 0.7 * df_player["blocks"]
        - 0.4 * df_player["PF"]
        - df_player["turnovers"]
    )

    df_player["Game_Score_Total"] = base_score
    df_player["Game_Score_Per_Game"] = base_score / df_player["GP"].replace(0, np.nan)
    df_player["Game_Score_Per_Minute"] = base_score / df_player["minutes"].replace(0, np.nan)

    return {
        "playerID": player_id,
        "year": int(year),
        "tmID": ",".join(df_player["tmID"].unique()),
        "Game_Score_Total": float(df_player["Game_Score_Total"].sum(skipna=True)),
        "Game_Score_Per_Game": float(df_player["Game_Score_Per_Game"].mean(skipna=True)),
        "Game_Score_Per_Minute": float(df_player["Game_Score_Per_Minute"].mean(skipna=True)),
    }



In [14]:
# ================================
# TEAM WEIGHTED GAME SCORE
# ================================
def get_team_weighted_avg_score(players_teams_df, year, team_id):
    """
    Weighted average = mean player GameScorePerMinute weighted by GameScoreTotal.
    Uses prior-year stats (year-1), which is correct for prediction models.
    """
    df_team = get_players_for_team(players_teams_df, year, team_id)

    if df_team.empty:
        return {
            "tmID": team_id,
            "year": year,
            "Team_Weighted_Avg_Per_Minute": None,
            "Num_Players": 0
        }

    player_scores = []
    for player_id in df_team["playerID"].unique():
        player_scores.append(get_player_game_score(players_teams_df, player_id, year - 1))

    df_scores = pd.DataFrame(player_scores)

    total_weight = df_scores["Game_Score_Total"].sum(skipna=True)
    if not total_weight or np.isnan(total_weight):
        weighted_avg = None
    else:
        weighted_avg = (
            (df_scores["Game_Score_Per_Minute"] * df_scores["Game_Score_Total"]).sum(skipna=True)
            / total_weight
        )

    return {
        "tmID": team_id,
        "year": year,
        "Team_Weighted_Avg_Per_Minute": float(weighted_avg) if weighted_avg is not None else None,
        "Num_Players": int(len(df_scores))
    }


In [15]:
# ================================
# CONFERENCE RANKINGS
# ================================
def get_conference_rankings(players_teams_df, teams_df, year):
    """
    Predict conference rankings using team weighted scores.
    Returns (Eastern_df, Western_df).
    """
    teams_year = teams_df[teams_df["year"] == year][[
        "tmID", "confID", "name", "rank", "won", "lost"
    ]].copy()

    if teams_year.empty:
        print(f"No team data found for year {year}.")
        return pd.DataFrame(), pd.DataFrame()

    teams_year.rename(columns={"rank": "Actual_Rank"}, inplace=True)

    # Compute weighted score for each team
    team_results = [
        get_team_weighted_avg_score(players_teams_df, year, tm)
        for tm in teams_year["tmID"].unique()
    ]

    df_scores = pd.DataFrame(team_results)
    df_scores = df_scores.merge(teams_year, on="tmID", how="left")

    # Rank within each conference
    df_scores["Predicted_Rank_Conf"] = (
        df_scores.groupby("confID")["Team_Weighted_Avg_Per_Minute"]
        .rank(ascending=False, method="first")
    )

    df_scores["Rank_Diff"] = (
        df_scores["Predicted_Rank_Conf"] - df_scores["Actual_Rank"]
    )

    # Separate conferences
    east_df = df_scores[df_scores["confID"] == "EA"].sort_values("Predicted_Rank_Conf").reset_index(drop=True)
    west_df = df_scores[df_scores["confID"] == "WE"].sort_values("Predicted_Rank_Conf").reset_index(drop=True)

    return east_df, west_df

In [16]:
# ================================
# EXAMPLE
# ================================
east_rankings, west_rankings = get_conference_rankings(players_teams, teams, year=10)

print("\nüèÄ EASTERN CONFERENCE")
print(east_rankings)

print("\nüèÄ WESTERN CONFERENCE")
print(west_rankings)


üèÄ EASTERN CONFERENCE
  tmID  year  Team_Weighted_Avg_Per_Minute  Num_Players confID  \
0  CON    10                      0.318042           14     EA   
1  DET    10                      0.301598           18     EA   
2  CHI    10                      0.291838           12     EA   
3  NYL    10                      0.288212           11     EA   
4  ATL    10                      0.281187           13     EA   
5  IND    10                      0.273435           14     EA   
6  WAS    10                      0.232280           12     EA   

                 name  Actual_Rank  won  lost  Predicted_Rank_Conf  Rank_Diff  
0     Connecticut Sun            6   16    18                  1.0       -5.0  
1       Detroit Shock            3   18    16                  2.0       -1.0  
2         Chicago Sky            5   16    18                  3.0       -2.0  
3    New York Liberty            7   13    21                  4.0       -3.0  
4       Atlanta Dream            2   18    16 

## Predicative Model PipeLine (NOT FINISHED)

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


# ======================================================
# HELPER FUNCTIONS
# ======================================================

def get_players_for_team(players_teams_df, year, team_id):
    return players_teams_df[
        (players_teams_df["year"] == year) &
        (players_teams_df["tmID"] == team_id)
    ].copy()


def calculate_player_score(df):
    """Apply your Game Score formula to a DataFrame of players."""
    base = (
        df["points"]
        + 0.4 * df["fgMade"]
        - 0.7 * df["fgAttempted"]
        - 0.4 * (df["ftAttempted"] - df["ftMade"])
        + 0.7 * df["oRebounds"]
        + 0.3 * df["dRebounds"]
        + df["steals"]
        + 0.7 * df["assists"]
        + 0.7 * df["blocks"]
        - 0.4 * df["PF"]
        - df["turnovers"]
    )

    df["Game_Score_Total"] = base
    df["Game_Score_Per_Minute"] = base / df["minutes"].replace(0, np.nan)
    return df


def get_team_weighted_score(players_teams_df, year, team_id):
    """Weighted avg player score using previous season."""
    df = get_players_for_team(players_teams_df, year - 1, team_id)
    if df.empty:
        return None, 0

    df = calculate_player_score(df)

    total_weight = df["Game_Score_Total"].sum(skipna=True)
    if total_weight == 0 or np.isnan(total_weight):
        return None, len(df)

    weighted_avg = (
        (df["Game_Score_Per_Minute"] * df["Game_Score_Total"]).sum()
        / total_weight
    )

    return float(weighted_avg), len(df)


def get_coach_features(coaches_df, tmID, year):
    """Return coach experience and whether a coach changed from prev season."""
    df = coaches_df[coaches_df["tmID"] == tmID]

    this_year = df[df["year"] == year]
    prev_year = df[df["year"] == year - 1]

    if this_year.empty:
        return 0, 0

    coach_id = this_year["coachID"].iloc[0]

    # Experience = total seasons coached before this year
    experience = len(df[df["coachID"] == coach_id][df["year"] < year])

    # Coach change?
    changed = int(prev_year.empty or prev_year["coachID"].iloc[0] != coach_id)

    return experience, changed


# ======================================================
# BUILD TRAINING DATASET
# ======================================================
rows = []

all_years = sorted(teams["year"].unique())

for year in all_years:
    teams_this_year = teams[teams["year"] == year]

    for _, row in teams_this_year.iterrows():
        tmID = row["tmID"]

        # Player-based weighted score (from year-1)
        wscore, num_players = get_team_weighted_score(players_teams, year, tmID)

        # Previous season team stats
        prev_team = teams[
            (teams["tmID"] == tmID) & (teams["year"] == year - 1)
        ]

        if prev_team.empty:
            continue  # skip teams without history

        prev = prev_team.iloc[0]

        winpct = prev["won"] / max(prev["GP"], 1)

        # Coach features
        coach_exp, coach_changed = get_coach_features(coaches, tmID, year)

        # Build ML row
        rows.append({
            "year": year,
            "tmID": tmID,
            "confID": row["confID"],
            "Actual_Rank": row["rank"],

            # ML Features
            "WeightedScore": wscore,
            "NumPlayers": num_players,
            "PrevWinPct": winpct,
            # "PrevPointsFor": prev["o_pts"],
            # "PrevPointsAgainst": prev["d_pts"],
            # "PrevRebounds": prev["tmTRB"],
            "CoachExperience": coach_exp,
            "CoachChanged": coach_changed,
        })


dataset = pd.DataFrame(rows)
dataset = dataset.dropna()
dataset.to_csv("training_data.csv", index=False)
print("Saved training_data.csv")


# ======================================================
# PREPARE ML INPUTS
# ======================================================
feature_cols = [
    "WeightedScore",
    "NumPlayers",
    "PrevWinPct",
    # "PrevPointsFor",
    # "PrevPointsAgainst",
    # "PrevRebounds",
    "CoachExperience",
    "CoachChanged",
]

X = dataset[feature_cols]
y = dataset["Actual_Rank"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)


# ======================================================
# TRAIN RANDOM FOREST MODEL
# ======================================================
model = RandomForestRegressor(
    n_estimators=500,
    max_depth=12,
    random_state=42
)

model.fit(X_train, y_train)

preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("Model Performance:")
print("MAE:", mae)
print("RMSE:", rmse)


# ======================================================
# FEATURE IMPORTANCE
# ======================================================
feat_imp = pd.DataFrame({
    "Feature": feature_cols,
    "Importance": model.feature_importances_
}).sort_values("Importance", ascending=False)

feat_imp.to_csv("feature_importances.csv", index=False)
print("Saved feature_importances.csv")
print(feat_imp)


# ======================================================
# PREDICT RANKINGS FOR A SPECIFIC YEAR
# ======================================================
def predict_rankings_for_year(year):
    df = dataset[dataset["year"] == year].copy()
    df["Predicted_Rank"] = model.predict(df[feature_cols])

    df = df.sort_values("Predicted_Rank")

    east = df[df["confID"] == "EA"]
    west = df[df["confID"] == "WE"]

    east.to_csv(f"predicted_rankings_{year}_EA.csv", index=False)
    west.to_csv(f"predicted_rankings_{year}_WE.csv", index=False)

    return east, west


# Example usage
east, west = predict_rankings_for_year(9)

print("\nPredicted EAST:")
print(east)

print("\nPredicted WEST:")
print(west)


  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"] == coach_id][df["year"] < year])
  experience = len(df[df["coachID"

Saved training_data.csv
Model Performance:
MAE: 1.8353142127638447
RMSE: 2.1138576510698828
Saved feature_importances.csv
           Feature  Importance
2       PrevWinPct    0.361335
0    WeightedScore    0.283907
3  CoachExperience    0.147596
1       NumPlayers    0.134028
4     CoachChanged    0.073134

Predicted EAST:
     year tmID confID  Actual_Rank  WeightedScore  NumPlayers  PrevWinPct  \
98      9  DET     EA            1       0.289347          14    0.705882   
103     9  NYL     EA            3       0.242149          14    0.470588   
97      9  CON     EA            2       0.312815          13    0.529412   
100     9  IND     EA            4       0.289161          12    0.617647   
96      9  CHI     EA            5       0.280849          14    0.411765   
108     9  WAS     EA            6       0.263903          15    0.470588   

     CoachExperience  CoachChanged  Predicted_Rank  
98                 6             0        2.429000  
103                4         