In [108]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

# Read data
matches = pd.read_csv("matches.csv", index_col=0)
players = pd.read_csv("players.csv", index_col=0)

# Feature engineering
matches["Date"] = pd.to_datetime(matches["Date"])
matches["venue_code"] = matches["Venue"].astype("category").cat.codes
matches["opp_code"] = matches["Opponent"].astype("category").cat.codes
matches["hour"] = matches["Start (ET)"].str.replace(":.+", "", regex=True).astype(int)
matches["day_code"] = matches["Date"].dt.dayofweek
matches["target"] = (matches["Result"] == "W").astype(int)
matches["point_diff"] = matches["Tm"] - matches["Opp"]
matches["win"] = (matches["Result"] == "W").astype(int)
matches["streak_value"] = matches["Streak"].str.replace("W ", "").str.replace("L ", "-").astype(int)
matches["streak_value"] = matches.groupby("Team")["streak_value"].shift(1)

# Rolling averages
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("Date")
    rolling_stats = group[cols].rolling(4, closed="left").mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

cols = ["Tm", "Opp", "point_diff", "win"]
new_cols = [f"{c}_rolling" for c in cols]

matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel("Team")
matches_rolling.index = range(matches_rolling.shape[0])

# Model training function
def make_predictions(data, predictors, model):
    train = data[data["Date"] < "2025-01-01"]
    test = data[data["Date"] > "2025-01-01"]
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

# Set up predictors
predictors = ["venue_code", "opp_code", "hour", "day_code", "streak_value"] + new_cols

# Train Random Forest
rf = RandomForestClassifier(n_estimators=5000, min_samples_split=100, random_state=1)

# Make predictions
combined, precision = make_predictions(matches_rolling, predictors, rf)

# Merge back basic info
combined = combined.merge(matches_rolling[["Date", "Team", "Opponent", "Result"]], left_index=True, right_index=True)

# Mapping team abbreviations
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "CHA": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "MIN": "Minnesota Timberwolves",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHX": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "WAS": "Washington Wizards"
}
mapping = MissingDict(**map_values)

# Map full team names
combined["new_team"] = combined["Team"].map(mapping)

# Merge to add opponent features
merged = combined.merge(
    combined,
    left_on=["Date", "Opponent"],
    right_on=["Date", "new_team"],
    suffixes=('', '_opponent'),
    how='inner'
)

# Now you can analyze
print(merged[(merged["prediction"] == 1) & (merged["prediction_opponent"] == 0)]["actual"].value_counts())

  matches_rolling = matches.groupby("Team").apply(lambda x: rolling_averages(x, cols, new_cols))


actual
1    274
0    127
Name: count, dtype: int64


In [64]:
262/(262+133)

0.6632911392405063

In [107]:
precision

0.5941676792223572

In [116]:
# Step 1: Prepare input for tomorrow

# Assume you have precomputed latest rolling stats manually or from your matches.csv
# Example dummy values (you should use real ones if you have them!)
venue_code = 1  # Home = 1, Away = 0 (you must check your data encoding)
opp_code = 8   # MIA code (you must check what code 'MIA' maps to)
hour = 7
day_code = 1  # If tomorrow is Tuesday, dayofweek=1
streak_value = 2 # Say Cavaliers are on 4 game win streak

# Rolling stats (fake example numbers — replace with real)
Tm_rolling = 107.25
Opp_rolling = 105.25
point_diff_rolling = 2
win_rolling = 0.75

# Build single prediction row
import pandas as pd

tomorrow_game = pd.DataFrame({
    "venue_code": [venue_code],
    "opp_code": [opp_code],
    "hour": [hour],
    "day_code": [day_code],
    "streak_value": [streak_value],
    "Tm_rolling": [Tm_rolling],
    "Opp_rolling": [Opp_rolling],
    "point_diff_rolling": [point_diff_rolling],
    "win_rolling": [win_rolling]
})

# Step 2: Predict
prediction = rf.predict(tomorrow_game)

# Step 3: Output result
print("Prediction for tomorrow's game:")
if prediction[0] == 1:
    print("🏀 Model predicts: WIN")
else:
    print("🏀 Model predicts: LOSS")

Prediction for tomorrow's game:
🏀 Model predicts: WIN
