In [77]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
from joblib import dump,load

mpl.rcParams['axes.grid'] = "True"
mpl.rcParams['axes.spines.top'] = "False"
mpl.rcParams['axes.spines.right'] = "False"
mpl.rcParams['xtick.labelsize'] = 20
mpl.rcParams['ytick.labelsize'] = 20

df_mls_matches = pd.read_csv("../../data/MLS_Matches_1823_clean.csv")

df_mls_matches_phl = df_mls_matches[(df_mls_matches["Team"] == 'philadelphia union') & (df_mls_matches['Season'] == 2023)]
# df_mls_matches_phl = df_mls_matches[(df_mls_matches["Team"] == 'philadelphia union')]
# df_mls_matches_phl = df_mls_matches[(df_mls_matches["Season"] == 2024)]
print(df_mls_matches_phl.shape)

df_mls_matches = df_mls_matches[df_mls_matches['Season'] == 2022]
df_mls_matches = df_mls_matches.sort_values(by="Date")


#     (df_mls_matches['Season'] == 2019) 
   
# ]
# print(df_mls_matches_phl)
all_teams = df_mls_matches["Team"].unique()

df_mls_odds = pd.read_csv("../../data/MLS_Matches_odds_clean.csv")
df_mls_odds = df_mls_odds[df_mls_odds['Season'] == 2022]

df_mls_odds = df_mls_odds.sort_values(by='Date')

(33, 13)


In [30]:
def calculate_winnings(odds, bet_amount=1):
    if np.isnan(odds):
        return 0
    elif (odds > 0):
        # Positive odds
        return (odds / 100) * bet_amount
    else:
        # Negative odds
        return (100 / abs(odds)) * bet_amount

In [31]:
# calculate_winnings(-114)

In [32]:
def split_data_odds(fixture):
    matches = fixture*14
    df1 = df_mls_odds.iloc[:matches, :]
    df2 = df_mls_odds.iloc[matches:matches+14, :]
    return df1, df2


In [33]:
def split_data_matches(fixture):
    matches = fixture*28
    df1 = df_mls_matches.iloc[:matches, :]
    df2 = df_mls_matches.iloc[matches:matches+28, :]
    return df1, df2



In [34]:
def calc_feats_match(df, team):
    team_stats = dict()
    df_team = df[(df["Team"]==team)]
    team_shots = df_team.apply(lambda x: x["Sh.2"], axis=1).sum()
    team_shots_ot = df_team.apply(lambda x: x["SoT"], axis=1).sum()
    team_shots_sca = df_team.apply(lambda x: x["SCA"], axis=1).sum()
    team_shots_gca = df_team.apply(lambda x: x["GCA"], axis=1).sum()
    team_shots_npxG = df_team.apply(lambda x: x["npxG"], axis=1).sum()
    team_shots_xA = df_team.apply(lambda x: x["xA"], axis=1).sum()
    team_points = df_team.apply(lambda x: 3 if (x["Result"].startswith("W")) else (1 if x["Result"].startswith("D") else 0), axis=1).sum()

    team_stats["team"] = team
    team_stats["shots"] = team_shots
    team_stats["shots_ot"] = team_shots_ot
    team_stats["sca"] = team_shots_sca
    team_stats["gca"] = team_shots_gca
    team_stats["npxG"] = team_shots_npxG
    team_stats["xA"] = team_shots_xA
    team_stats["points"] = team_points
    return team_stats



In [35]:
def calc_points(df, team, team_stats):
    df_team = df[(df["Home Team"]==team) | (df["Away Team"]==team)]
    team_points = df_team.apply(lambda x: 3 if ((x["Home Team"] == team and x["Full Time Result"].startswith("H")) or (x["Away Team"] == team and x["Full Time Result"].startswith("A"))) else (1 if x["Full Time Result"] == "D" else 0), axis=1).sum() 
    team_stats["points_2"] = team_points
    return team_stats

In [36]:
def get_match_day_results(match_day, ht, at, dt, hc, ac, dc):
    df_mls_fix , df_mls_match = split_data_matches(match_day)
    df_mls_odds_fix , df_mls_odds = split_data_odds(match_day)

    all_teams_stats = []

    for team in all_teams:
        team_stats = calc_feats_match(df_mls_fix, team)
        team_stats = calc_points(df_mls_odds_fix, team, team_stats)
        all_teams_stats.append(team_stats)

    df_all_stats = pd.DataFrame(all_teams_stats)

    from joblib import dump,load

    features = df_all_stats[["npxG","sca","gca","xA", "shots", "shots_ot"]]

    model = load('../../data/mls_model.joblib')

    model.predict(features)

    df_all_stats["exp_points"] = model.predict(features)
    df_all_stats["exp_points_diff"] = (df_all_stats["exp_points"] - df_all_stats["points"])

    df_mls_odds[["Home Team", "Away Team", "Industry Average Away Team Win Odds (American)", "Industry Average Draw Odds (American)", "Industry Average Home Team Win Odds (American)"]]

    predictions = []

    for ind, row in df_mls_odds.iterrows():
        home_team = row["Home Team"]
        away_team = row["Away Team"]
        home_exp_points = df_all_stats[ df_all_stats["team"]==home_team]["exp_points" ].values[0]
        away_exp_points = df_all_stats[ df_all_stats["team"]==away_team]["exp_points" ].values[0]

        exp_points_diff_teams = (home_exp_points - away_exp_points)
        if row["Full Time Result"].startswith("H"):
            ht.append(exp_points_diff_teams)
            hc.append(1)
        elif row["Full Time Result"].startswith("A"):
            at.append(exp_points_diff_teams)
            ac.append(1)
        else:
            dt.append(exp_points_diff_teams)
            dc.append(1)

        if exp_points_diff_teams > 5:
            result = "H"
            odds = row["Industry Average Home Team Win Odds (American)"]
        elif exp_points_diff_teams < -3:
            result = "A"
            odds = row["Industry Average Away Team Win Odds (American)"]
            # result = None
            # odds = None
        else:
            result = None
            odds = None
        # else:
        #     result = "D"
        #     odds = row["Industry Average Draw Odds (American)"]

        pred = {"Home Team": home_team, "Away Team": away_team, "Prediction": result, "Odds": odds, "EXP": exp_points_diff_teams}
        predictions.append(pred)

    df_slip = pd.DataFrame(predictions)

    df_slip = df_mls_odds.merge(df_slip, on=["Home Team", "Away Team"])[["Home Team", "Away Team", "Prediction", "Odds", "Full Time Result", "EXP"]]

    vectorized_calculate_winnings = np.vectorize(calculate_winnings)

    conditions = [
    (df_slip['Prediction'] == df_slip['Full Time Result']),  # Prediction matches result
    (df_slip['Prediction'].isna())  # No prediction
    ]

    # Choices corresponding to each condition
    choices = [
    vectorized_calculate_winnings(df_slip['Odds'], 1),  # Calculate winnings
    0  # Set to 0 for no prediction
    ]

    df_slip['Win'] = np.select(conditions, choices, default=-1)

    # df_slip["Win"] = np.where( df_slip["Prediction"]==df_slip["Full Time Result"], calculate_winnings(df_slip["Odds"], 1), -1)
    return {"sum": df_slip["Win"].sum(), "ht": ht, "hc": hc, "at": at, "ac": ac, "dt": dt, "dc": dc}



summ = 0
lower_bound = 10
upper_bound = 34

bound_dif = upper_bound - lower_bound

average = (bound_dif * 28) if bound_dif > 0 else 28

home_total = []
away_total = []
draw_total = []

home_count = []
away_count = []
draw_count = []

test = []
for x in range(lower_bound, upper_bound):
    result = get_match_day_results(x, home_total, away_total, draw_total, home_count, away_count, draw_count)
    summ += result["sum"]

print(np.median(home_total), np.median(away_total), np.median(draw_total))
print(summ, np.sum(home_total) / len(home_count), np.sum(away_total) / len(away_count), np.sum(draw_total) / len(draw_count))

0.19557139127550904 -2.2981512464604705 1.143753381465344
-205.0 0.636225255773203 -2.6499995511051715 1.2419008819659059


In [37]:
df_mls_fix4 , df_mls_match10 = split_data_matches(33)
df_mls_odds_fix4 , df_mls_odds_10 = split_data_odds(33)

print(df_mls_fix4)

all_teams_stats = []

for team in all_teams:
    team_stats = calc_feats_match(df_mls_fix4, team)
    team_stats = calc_points(df_mls_odds_fix4, team, team_stats)
    all_teams_stats.append(team_stats)

df_all_stats = pd.DataFrame(all_teams_stats)


      Unnamed: 0                Team              Opp  Season  npxG   SCA  \
2332        2332           austin fc    FC Cincinnati    2022   2.3  30.0   
2348        2348  philadelphia union    Minnesota Utd    2022   1.1  36.0   
2347        2347     colorado rapids   Los Angeles FC    2022   0.4  12.0   
2346        2346         d.c. united        Charlotte    2022   0.8  17.0   
2345        2345        ny red bulls         San Jose    2022   3.0  31.0   
...          ...                 ...              ...     ...   ...   ...   
3236        3236       fc cincinnati      D.C. United    2022   2.1  25.0   
3235        3235    seattle sounders         San Jose    2022   1.3  28.0   
3234        3234           austin fc  Colorado Rapids    2022   0.8  16.0   
3233        3233     colorado rapids           Austin    2022   1.2  30.0   
3232        3232        chicago fire      New England    2022   2.1  20.0   

      GCA  GF  Sh.2   xA  SoT Result        Date  
2332    5   5  19.0  2.6

In [38]:


features = df_all_stats[["npxG","sca","gca","xA", "shots", "shots_ot"]]
features
model = load('../../data/mls_model.joblib')

model.predict(features)

array([66.33662235, 59.30257774, 44.52242544, 42.27886866, 50.64091691,
       48.97962201, 56.75549277, 39.99883381, 45.74038438, 41.51623291,
       56.83005694, 42.32255835, 41.97502819, 43.19020217, 40.81253512,
       54.47476822, 56.94536668, 56.05487073, 47.84374498, 42.45835038,
       48.29649956, 42.0572002 , 59.56659911, 39.25468695, 36.54703712,
       38.24448169, 41.80683801, 46.31276603])

In [39]:
df_all_stats["exp_points"] = model.predict(features)
df_all_stats["exp_points_diff"] = (df_all_stats["exp_points"] - df_all_stats["points"])
df_all_stats

Unnamed: 0,team,shots,shots_ot,sca,gca,npxG,xA,points,points_2,exp_points,exp_points_diff
0,austin fc,446.0,141,764.0,108,49.9,41.8,56,48,66.336622,10.336622
1,philadelphia union,401.0,152,705.0,108,47.7,36.5,61,54,59.302578,-1.697422
2,colorado rapids,456.0,145,828.0,80,47.7,35.9,43,33,44.522425,1.522425
3,d.c. united,357.0,108,617.0,67,34.3,28.7,27,21,42.278869,15.278869
4,ny red bulls,440.0,141,731.0,78,37.1,26.6,50,42,50.640917,0.640917
5,new england revolution,431.0,145,791.0,87,40.5,36.7,42,30,48.979622,6.979622
6,los angeles fc,489.0,174,840.0,101,53.2,35.0,64,63,56.755493,-7.244507
7,charlotte,364.0,136,641.0,74,37.7,30.5,41,39,39.998834,-1.001166
8,toronto fc,364.0,133,651.0,82,36.0,32.4,34,27,45.740384,11.740384
9,fc dallas,360.0,119,631.0,71,36.5,28.4,44,39,41.516233,-2.483767


In [40]:
df_mls_odds_10[["Home Team", "Away Team", "Industry Average Away Team Win Odds (American)", "Industry Average Draw Odds (American)", "Industry Average Home Team Win Odds (American)"]]

predictions = []
for ind, row in df_mls_odds_10.iterrows():
    home_team = row["Home Team"]
    away_team = row["Away Team"]
    home_exp_points = df_all_stats[ df_all_stats["team"]==home_team]["exp_points" ].values[0]
    away_exp_points = df_all_stats[ df_all_stats["team"]==away_team]["exp_points" ].values[0]

    exp_points_diff_teams = (home_exp_points - away_exp_points)
    print(exp_points_diff_teams)

    if exp_points_diff_teams > 2:
        result = "H"
        odds = row["Industry Average Home Team Win Odds (American)"]
    elif abs(exp_points_diff_teams) < 2:
        result = "D"
        odds = row["Industry Average Draw Odds (American)"]
    else:
        result = "A"
        odds = row["Industry Average Away Team Win Odds (American)"]

    pred = {"Home Team": home_team, "Away Team": away_team, "Prediction": result, "Odds": odds}
    predictions.append(pred)

df_slip = pd.DataFrame(predictions)

df_slip

13.562193366074403
-3.854415645228329
4.9691957890187695
8.458993205946811
1.2151739759919025
-17.57536999846875
10.642083101610908
-17.244040761688957
-14.666498025826101
-8.167086892157393
21.814196903101376
-17.810389037082828
-12.667930214442563
5.786544778879886


Unnamed: 0,Home Team,Away Team,Prediction,Odds
0,philadelphia union,toronto fc,H,-294
1,orlando city,columbus crew,A,242
2,fc dallas,sporting kansas city,H,-112
3,los angeles fc,nashville sc,H,-147
4,minnesota united,vancouver whitecaps,D,315
5,real salt lake,portland timbers,A,358
6,ny red bulls,charlotte,H,-161
7,inter miami,cf montreal,A,156
8,d.c. united,fc cincinnati,A,-149
9,chicago fire,new england revolution,A,178


In [41]:
df_slip = df_mls_odds_10.merge(df_slip, on=["Home Team", "Away Team"])[["Home Team", "Away Team", "Prediction", "Odds", "Full Time Result"]]

df_slip

Unnamed: 0,Home Team,Away Team,Prediction,Odds,Full Time Result
0,philadelphia union,toronto fc,H,-294,Home Team Win
1,orlando city,columbus crew,A,242,Home Team Win
2,fc dallas,sporting kansas city,H,-112,Home Team Win
3,los angeles fc,nashville sc,H,-147,Away Team Win
4,minnesota united,vancouver whitecaps,D,315,Home Team Win
5,real salt lake,portland timbers,A,358,Home Team Win
6,ny red bulls,charlotte,H,-161,Home Team Win
7,inter miami,cf montreal,A,156,Away Team Win
8,d.c. united,fc cincinnati,A,-149,Away Team Win
9,chicago fire,new england revolution,A,178,Draw


In [43]:
df_slip["Win"] = df_slip.apply(
    lambda row: calculate_winnings(row["Odds"], 1) if row["Prediction"] == row["Full Time Result"] else -1,
    axis=1
)
df_slip

Unnamed: 0,Home Team,Away Team,Prediction,Odds,Full Time Result,Win
0,philadelphia union,toronto fc,H,-294,Home Team Win,-1
1,orlando city,columbus crew,A,242,Home Team Win,-1
2,fc dallas,sporting kansas city,H,-112,Home Team Win,-1
3,los angeles fc,nashville sc,H,-147,Away Team Win,-1
4,minnesota united,vancouver whitecaps,D,315,Home Team Win,-1
5,real salt lake,portland timbers,A,358,Home Team Win,-1
6,ny red bulls,charlotte,H,-161,Home Team Win,-1
7,inter miami,cf montreal,A,156,Away Team Win,-1
8,d.c. united,fc cincinnati,A,-149,Away Team Win,-1
9,chicago fire,new england revolution,A,178,Draw,-1


In [None]:
sumWinnings = df_slip["Win"].sum()

sumWinnings


-15.000000000000002