In [1]:
import pandas as pd
import sys
sys.path.append("../..")
from config.settings import  DIRECTORY_COMBINED_MATCHES_CLEAN, Club

df = pd.read_csv(DIRECTORY_COMBINED_MATCHES_CLEAN)
# Delete empty rows
df = df[df["Opponent"].notna()]

df.head(10)

Unnamed: 0.1,Unnamed: 0,SCA,GCA,GF,SH,SoT,npxG,Result,Venue,GA,Opponent,Team,Date,Season
0,0,30.0,0.0,0,15.0,5.0,1.3,D,Away,0,Columbus-Crew,Philadelphia-Union,2021-04-18,2021
1,1,24.0,2.0,1,13.0,3.0,0.9,L,Home,2,Inter-Miami,Philadelphia-Union,2021-04-24,2021
2,2,6.0,0.0,0,4.0,0.0,0.3,L,Home,2,New-York-City-FC,Philadelphia-Union,2021-05-01,2021
3,3,27.0,3.0,2,15.0,5.0,1.4,W,Away,0,Chicago-Fire,Philadelphia-Union,2021-05-08,2021
4,4,26.0,2.0,1,14.0,3.0,2.3,D,Home,1,New-England-Revolution,Philadelphia-Union,2021-05-12,2021
5,5,22.0,2.0,1,12.0,5.0,1.0,W,Home,0,New-York-Red-Bulls,Philadelphia-Union,2021-05-15,2021
6,6,13.0,2.0,1,7.0,2.0,1.2,W,Away,0,DC-United,Philadelphia-Union,2021-05-23,2021
7,7,27.0,4.0,3,16.0,5.0,2.8,W,Home,0,Portland-Timbers,Philadelphia-Union,2021-05-30,2021
8,8,28.0,3.0,2,16.0,5.0,1.6,D,Away,2,Atlanta-United,Philadelphia-Union,2021-06-20,2021
9,9,8.0,2.0,1,5.0,1.0,0.5,W,Home,0,Columbus-Crew,Philadelphia-Union,2021-06-23,2021


In [2]:
# Ensure Date column is datetime
df['Date'] = pd.to_datetime(df['Date'])

# Sort by team and date (most recent first)
df_sorted = df.sort_values(by=['Team', 'Date'], ascending=[True, False])

# Take last 10 games for each team
df_last_10 = df_sorted.groupby('Team').head(10).reset_index(drop=True)


In [3]:
team_stats = df_last_10.groupby(['Team', 'Venue']).agg(
    avg_GF=('GF', 'mean'),
    avg_GA=('GA', 'mean'),
    matches_played=('GF', 'count')
).reset_index()

In [4]:
pivot_stats = team_stats.pivot(index='Team', columns='Venue', values=['avg_GF', 'avg_GA'])
pivot_stats.columns = ['avg_GF_away', 'avg_GF_home', 'avg_GA_away', 'avg_GA_home']

pivot_stats


Unnamed: 0_level_0,avg_GF_away,avg_GF_home,avg_GA_away,avg_GA_home
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Atlanta-United,0.333333,2.0,2.5,1.5
Austin-FC,1.0,1.166667,1.25,1.166667
CF-Montreal,1.5,0.833333,1.5,2.333333
Charlotte-FC,1.714286,1.333333,2.571429,2.333333
Chicago-Fire,3.333333,1.0,2.333333,1.5
Colorado-Rapids,1.0,0.5,2.0,1.0
Columbus-Crew,1.5,1.833333,2.75,1.333333
DC-United,0.5,1.166667,1.25,2.333333
FC-Cincinnati,1.333333,2.0,1.333333,1.75
FC-Dallas,1.857143,0.333333,2.285714,1.0


In [5]:
league_avg_goals_home = df[df["Venue"] == "Home"]["GF"].mean()
league_avg_goals_away = df[df["Venue"] == "Away"]["GF"].mean()


In [6]:
philly_attack = pivot_stats.loc[Club.Philadelphia.value, "avg_GF_home"]
nycfc_defense = pivot_stats.loc[Club.NYCFC.value, "avg_GA_away"]

expected_goals_philly = philly_attack * nycfc_defense / league_avg_goals_home

nycfc_attack = pivot_stats.loc[Club.NYCFC.value, "avg_GF_away"]
philly_defense = pivot_stats.loc[Club.Philadelphia.value, "avg_GA_home"]

expected_goals_nycfc = nycfc_attack * philly_defense / league_avg_goals_away


In [7]:
from scipy.stats import poisson

# Example expected goals
lambda_home = 1.7
lambda_away = 1.2

# Probability of each goal count (0 to 6)
home_goal_probs = [poisson.pmf(k, expected_goals_philly) for k in range(7)]
away_goal_probs = [poisson.pmf(k, expected_goals_nycfc) for k in range(7)]


In [8]:
import numpy as np

score_matrix = np.outer(home_goal_probs, away_goal_probs)

home_win_prob = np.sum(np.tril(score_matrix, -1))
draw_prob = np.sum(np.diag(score_matrix))
away_win_prob = np.sum(np.triu(score_matrix, 1))



In [10]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import poisson

def dc_negloglik(params, df, teams, half_life=180):
    # param unpacking ----------
    n = len(teams)
    gamma, rho = params[:2]
    attack  = dict(zip(teams, params[2:2+n]))
    defense = dict(zip(teams, params[2+n:]))

    # time–decay weights -------
    today = df['Date'].max()
    w = 0.5 ** ((today - df['Date']).dt.days / half_life)

    # loop over matches --------
    ll = 0.0
    for wt, row in zip(w, df.itertuples()):
        i, j = row.Team, row.Opponent
        x, y = row.GF,  row.GA

        lam = np.exp(attack[i] - defense[j] + gamma)
        mu  = np.exp(attack[j] - defense[i])

        p = poisson.pmf(x, lam) * poisson.pmf(y, mu)

        # Dixon–Coles tweak
        if   x==0 and y==0:  p *= 1 - (lam+mu) * rho
        elif x==0 and y==1:  p *= 1 + lam * rho
        elif x==1 and y==0:  p *= 1 + mu  * rho
        elif x==1 and y==1:  p *= 1 - rho

        ll += wt * np.log(p + 1e-12)    # epsilon to avoid log(0)

    return -ll                           # negative because we minimise

# --- prepare data -----------------------------------------------------------
teams = sorted(set(df.Team))

# --- start values -----------------------------------------------------------
init = np.r_[0.25, -0.1,           # gamma, rho
             np.zeros(len(teams)*2)]  # attacks, defences

# optional: box constraints for rho, e.g. [-0.5, 0.0]
bounds = [(None, None), (-0.5, 0.0)] + [(None, None)]*len(init[2:])

res = minimize(dc_negloglik, init,
               args=(df, teams),
               method="L-BFGS-B",
               bounds=bounds)

gamma_hat   = res.x[0]
rho_hat     = res.x[1]
attack_hat  = dict(zip(teams, res.x[2               : 2+len(teams)]))
defense_hat = dict(zip(teams, res.x[2+len(teams): ]))

print("Fitted rho =", round(rho_hat, 3))



  ll += wt * np.log(p + 1e-12)    # epsilon to avoid log(0)


Fitted rho = -0.129


In [12]:
import pandas as pd
import numpy as np

def build_weighted_team_stats(
        df: pd.DataFrame,
        half_life: int = 180,
        today: pd.Timestamp | None = None,
        *,
        date_col: str = "Date",
        team_col: str = "Team",
        venue_col: str = "Venue",          # “Home” / “Away”
        gf_col: str = "GF",                # goals for
        ga_col: str = "GA",                # goals against
        xg_col: str = "npxG",       # optional – remove if you don’t track xG
) -> pd.DataFrame:
    """
    Returns a pivot indexed by team with exponentially-weighted averages:

        • avg_GF_home / avg_GF_away
        • avg_GA_home / avg_GA_away
        • (optionally) avg_xG_home / avg_xG_away

    Parameters
    ----------
    df : DataFrame
        Must contain at least the columns listed above.
        One row = one team’s stats for one match.
    half_life : int
        Half-life for exponential decay in **days**. 120-200
    today : Timestamp | None
        Reference “now”.  Defaults to the most recent `Date` in *df*.
    """

    data = df.copy()

    # --- 1. housekeeping ----------------------------------------------------
    data[date_col]  = pd.to_datetime(data[date_col])
    data[venue_col] = data[venue_col].str.lower().map(
        {"home": "home", "away": "away", "h": "home", "a": "away"}
    )

    if today is None:
        today = data[date_col].max()

    # weight = 0.5 ** (Δdays / half_life)
    data["days_since"] = (today - data[date_col]).dt.days
    data["weight"]     = 0.5 ** (data["days_since"] / half_life)

    # --- 2. weighted means ---------------------------------------------------
    def _ew_mean(col):
        return np.average(col, weights=data.loc[col.index, "weight"])

    agg_cols = {gf_col: "avg_GF", ga_col: "avg_GA"}
    if xg_col and xg_col in data.columns:
        agg_cols[xg_col] = "avg_xG"

    grouped = (
        data.groupby([team_col, venue_col])
            .apply(lambda g: pd.Series({
                new: _ew_mean(g[old]) for old, new in agg_cols.items()
            }))
            .reset_index()
    )

    # --- 3. pivot to wide ----------------------------------------------------
    pivot = (
        grouped
        .pivot(index=team_col, columns=venue_col)
        .sort_index()
    )

    # flatten (“avg_GF”, “home”) → “avg_GF_home”
    pivot.columns = [f"{stat}_{venue}" for stat, venue in pivot.columns]

    return pivot


In [13]:
def calculate_expected_goals(
        home_team: Club,
        away_team: Club,
        pivot_stats,
        league_avg_goals_home: float,
        league_avg_goals_away: float,
        home_edge: float = 0.25,
        decimals: int | None = None
        ):
    """
    Calculates expected goals for both home and away teams in a match using Poisson inputs.

    Args:
        home_team (str): Club value (e.g., Club.Philadelphia)
        away_team (str): Club value (e.g., Club.NYCFC)
        pivot_stats (pd.DataFrame): DataFrame indexed by team, with avg_GF/GA split by home/away
        league_avg_goals_home (float): Average league goals per home match
        league_avg_goals_away (float): Average league goals per away match

    Returns:
        tuple: (expected_goals_home, expected_goals_away)
    """
    home_attack = pivot_stats.loc[home_team.value, "avg_GF_home"]
    away_defense = pivot_stats.loc[away_team.value, "avg_GA_away"]
    expected_goals_home = (home_attack * away_defense / league_avg_goals_home) + home_edge

    away_attack = pivot_stats.loc[away_team.value, "avg_GF_away"]
    home_defense = pivot_stats.loc[home_team.value, "avg_GA_home"]
    expected_goals_away = away_attack * home_defense / league_avg_goals_away

    if decimals is not None:
        expected_goals_home = round(expected_goals_home, decimals)
        expected_goals_away = round(expected_goals_away, decimals)

    return expected_goals_home, expected_goals_away


In [14]:
from scipy.stats import poisson
import numpy as np

def predict_poisson_outcomes(lambda_home: float, lambda_away: float, max_goals: int = 6):
    """
    Calculates probabilities of match outcomes based on expected goals using Poisson distribution.

    Args:
        lambda_home (float): Expected goals for the home team
        lambda_away (float): Expected goals for the away team
        max_goals (int): Max number of goals to consider in the Poisson distribution

    Returns:
        dict: Dictionary with outcome probabilities and optional scoreline matrix
    """
    # Create matrix of score probabilities
    home_goals = np.arange(0, max_goals + 1)
    away_goals = np.arange(0, max_goals + 1)

    prob_matrix = np.outer(
        poisson.pmf(home_goals, lambda_home),
        poisson.pmf(away_goals, lambda_away)
    )

    home_win_prob = np.tril(prob_matrix, -1).sum()
    draw_prob = np.trace(prob_matrix)
    away_win_prob = np.triu(prob_matrix, 1).sum()

    most_likely_score = np.unravel_index(np.argmax(prob_matrix), prob_matrix.shape)

    return {
        "home_win_prob": round(home_win_prob, 4),
        "draw_prob": round(draw_prob, 4),
        "away_win_prob": round(away_win_prob, 4),
        "most_likely_score": f"{most_likely_score[0]}–{most_likely_score[1]}",
        "score_matrix": prob_matrix  # optional for deeper inspection
    }


In [15]:
def predict_poisson_outcomes_two(
        lambda_home: float,
        lambda_away: float,
        max_goals: int = 10,      # safer ceiling
        rho: float | None = None  # Dixon–Coles tweak; None => independent
):
    # score grid
    goals = np.arange(0, max_goals + 1)
    home_pmf = poisson.pmf(goals, lambda_home)
    away_pmf = poisson.pmf(goals, lambda_away)
    prob_matrix = np.outer(home_pmf, away_pmf)

    if rho is not None:                       # Dixon–Coles correlation fudge
        for i in (0, 1):
            for j in (0, 1):
                if i == j == 0:
                    k = -rho
                elif i == j == 1:
                    k = rho
                else:
                    k = 0.0
                prob_matrix[i, j] *= np.exp(k)

        prob_matrix /= prob_matrix.sum()      # renormalise

    # outcomes
    home_win = np.tril(prob_matrix, -1).sum()
    draw     = np.trace(prob_matrix)
    away_win = np.triu(prob_matrix,  1).sum()

    tail = 1 - (home_win + draw + away_win)   # probability ≥ max_goals+1
    # Optionally spread *tail* proportionally or keep as a remainder.

    ml_idx = np.unravel_index(np.argmax(prob_matrix), prob_matrix.shape)

    return dict(
        home_win_prob = round(home_win, 4),
        draw_prob     = round(draw, 4),
        away_win_prob = round(away_win, 4),
        remainder     = round(tail, 6),
        most_likely_score = {
            "score": f"{ml_idx[0]}–{ml_idx[1]}",
            "prob": round(prob_matrix[ml_idx], 4)
        },
        score_matrix = prob_matrix
    )


In [None]:
# weighted_frame = build_weighted_team_stats(df, 120, pd.Timestamp.now())

pivot_stats = build_weighted_team_stats(df, half_life=120, today=pd.Timestamp.now())

# league means you’ll feed into calculate_expected_goals
league_avg_home = pivot_stats["avg_GF_home"].mean()
league_avg_away = pivot_stats["avg_GF_away"].mean()

data = calculate_expected_goals(Club.Chicago, Club.Philadelphia, pivot_stats, league_avg_home, league_avg_away)
outcome = predict_poisson_outcomes_two(0.9491985237019736, 1.7045041487107573, 10, -0.129)

outcome

(np.float64(0.9491985237019736), np.float64(1.7045041487107573))
-0.12857923256192877


  .apply(lambda g: pd.Series({


{'home_win_prob': np.float64(0.2099),
 'draw_prob': np.float64(0.2364),
 'away_win_prob': np.float64(0.5537),
 'remainder': np.float64(0.0),
 'most_likely_score': {'score': '0–1', 'prob': np.float64(0.1205)},
 'score_matrix': array([[8.04112358e-02, 1.20473301e-01, 1.02673621e-01, 5.83358709e-02,
         2.48584335e-02, 8.47426060e-03, 2.40740206e-03, 5.86203828e-04,
         1.24898357e-04, 2.36544187e-05, 4.03190547e-06],
        [6.70887658e-02, 1.00513380e-01, 9.74576493e-02, 5.53723225e-02,
         2.35955884e-02, 8.04375565e-03, 2.28510248e-03, 5.56423808e-04,
         1.18553336e-04, 2.24527393e-05, 3.82707872e-06],
        [3.18402787e-02, 5.42718872e-02, 4.62533284e-02, 2.62796634e-02,
         1.11984488e-02, 3.81756050e-03, 1.08450795e-03, 2.64078329e-04,
         5.62653259e-05, 1.06560535e-05, 1.81632874e-06],
        [1.00742485e-02, 1.71715984e-02, 1.46345304e-02, 8.31487257e-03,
         3.54318370e-03, 1.20787426e-03, 3.43137782e-04, 8.35542533e-05,
         1.780232