In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.inspection import permutation_importance

# Read data
matches = pd.read_csv("matches.csv", index_col=0)
players = pd.read_csv("players.csv", index_col=0)

# Feature engineering
matches["Date"] = pd.to_datetime(matches["Date"])
matches["venue_code"] = matches["Venue"].astype("category").cat.codes
matches["opp_code"] = matches["Opponent"].astype("category").cat.codes
matches["hour"] = matches["Start (ET)"].str.replace(":.+", "", regex=True).astype(int)
matches["day_code"] = matches["Date"].dt.dayofweek
matches["target"] = (matches["Result"] == "W").astype(int)
matches["point_diff"] = matches["Tm"] - matches["Opp"]
matches["win"] = (matches["Result"] == "W").astype(int)
matches["streak_value"] = matches["Streak"].str.replace("W ", "").str.replace("L ", "-").astype(int)
matches["streak_value"] = matches.groupby("Team")["streak_value"].shift(1)

# Clean height to inches
def height_to_inches(ht):
    if isinstance(ht, str) and '-' in ht:
        feet, inches = ht.split('-')
        return int(feet) * 12 + int(inches)
    return None
players["Ht"] = players["Ht"].apply(height_to_inches)
players["Exp"] = pd.to_numeric(players["Exp"], errors="coerce")
players["Season"] = players["Season"].astype(int)

def get_team_stats(team_name, season, players_df, top_n=8):
    team_players = players_df[(players_df["Team"] == team_name) & (players_df["Season"] == season)]
    top_players = team_players.sort_values("MP", ascending=False).head(top_n)

    numeric_cols = ["MP", "FG", "FGA", "3P", "3PA", "FT", "FTA", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]

    # Weight by minutes played
    weights = top_players["MP"] / top_players["MP"].sum()
    weighted_stats = (top_players[numeric_cols].T * weights).T.sum()
    
    return weighted_stats

def add_team_stats_to_matches(matches, players):
    features = []
    for _, row in matches.iterrows():
        season = row["Season"]
        home_stats = get_team_stats(row["Team"], season, players)
        away_stats = get_team_stats(row["Opponent"], season, players)
        
        # Rename for clarity
        home_stats = home_stats.add_prefix("home_")
        away_stats = away_stats.add_prefix("away_")

        combined = pd.concat([home_stats, away_stats])
        features.append(combined)

    feature_df = pd.DataFrame(features)
    return pd.concat([matches.reset_index(drop=True), feature_df], axis=1)

matches_with_player_stats = add_team_stats_to_matches(matches, players)
matches_with_player_stats["point_diff_player_avg"] = (
    matches_with_player_stats["home_PTS"] - matches_with_player_stats["away_PTS"]
)

# Rolling averages
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date").copy()
    rolling_stats = group[cols].rolling(4, closed="left").mean()
    for new_col, val in zip(new_cols, rolling_stats.values.T):
        group[new_col] = val
    return group.dropna(subset=new_cols)

cols = ["Tm", "Opp", "point_diff", "win"]
new_cols = [f"{c}_rolling" for c in cols]

matches_with_rolling = matches_with_player_stats.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_with_rolling = matches_with_rolling.droplevel("Team").reset_index(drop=True)

# Model training function
def make_predictions(data, predictors, model):
    train = data[data["Date"] < "2025-01-01"]
    test = data[data["Date"] > "2025-01-01"]
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

# Set up predictors
predictors = ["venue_code", "opp_code", "hour", "day_code", "streak_value"] + new_cols
player_cols = [c for c in matches_with_player_stats.columns if c.startswith("home_") or c.startswith("away_")]
predictors += player_cols + ["point_diff_player_avg"]

# Train Random Forest
rf = RandomForestClassifier(n_estimators=500, min_samples_split=10, random_state=1)

# Make predictions
combined, precision = make_predictions(matches_with_rolling, predictors, rf)
combined = combined.merge(matches_with_rolling[["Date", "Team", "Opponent", "Result"]], left_index=True, right_index=True)

# Mapping team abbreviations
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "CHA": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHX": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "WAS": "Washington Wizards"
}
mapping = MissingDict(**map_values)

# Map full team names
combined["new_team"] = combined["Team"].map(mapping)

# Merge to add opponent features
merged = combined.merge(
    combined,
    left_on=["Date", "Opponent"],
    right_on=["Date", "new_team"],
    suffixes=('', '_opponent'),
    how='inner'
)

# Analyze Results
print("Model Precision:", precision)
print(merged[(merged["prediction"] == 1) & (merged["prediction_opponent"] == 0)]["actual"].value_counts())

# === FEATURE IMPORTANCE ===
result = permutation_importance(rf, matches_with_rolling[predictors], matches_with_rolling["target"], n_repeats=5, random_state=1)
importance = pd.Series(result.importances_mean, index=predictors)
print("\nTop Predictive Features:\n", importance.sort_values(ascending=False).head(10))

  matches_with_rolling = matches_with_player_stats.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))


Model Precision: 0.5864661654135338
actual
1    239
0    121
Name: count, dtype: int64

Top Predictive Features:
 opp_code              0.048376
point_diff_rolling    0.024274
Tm_rolling            0.018376
venue_code            0.016838
Opp_rolling           0.014188
day_code              0.013162
streak_value          0.010342
hour                  0.008803
home_TOV              0.006410
win_rolling           0.006325
dtype: float64


In [64]:
262/(262+133)

0.6632911392405063

In [107]:
precision

0.5941676792223572

In [116]:
# Step 1: Prepare input for tomorrow

# Assume you have precomputed latest rolling stats manually or from your matches.csv
# Example dummy values (you should use real ones if you have them!)
venue_code = 1  # Home = 1, Away = 0 (you must check your data encoding)
opp_code = 8   # MIA code (you must check what code 'MIA' maps to)
hour = 7
day_code = 1  # If tomorrow is Tuesday, dayofweek=1
streak_value = 2 # Say Cavaliers are on 4 game win streak

# Rolling stats (fake example numbers — replace with real)
Tm_rolling = 107.25
Opp_rolling = 105.25
point_diff_rolling = 2
win_rolling = 0.75

# Build single prediction row
import pandas as pd

tomorrow_game = pd.DataFrame({
    "venue_code": [venue_code],
    "opp_code": [opp_code],
    "hour": [hour],
    "day_code": [day_code],
    "streak_value": [streak_value],
    "Tm_rolling": [Tm_rolling],
    "Opp_rolling": [Opp_rolling],
    "point_diff_rolling": [point_diff_rolling],
    "win_rolling": [win_rolling]
})

# Step 2: Predict
prediction = rf.predict(tomorrow_game)

# Step 3: Output result
print("Prediction for tomorrow's game:")
if prediction[0] == 1:
    print("🏀 Model predicts: WIN")
else:
    print("🏀 Model predicts: LOSS")

Prediction for tomorrow's game:
🏀 Model predicts: WIN
