In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl
from joblib import dump,load
import sys
sys.path.append("../..")
from config.settings import DIRECTORY_COMBINED_MATCHES_CLEAN, Club, Features, get_features, extract_number

mpl.rcParams['axes.grid'] = "True"
mpl.rcParams['axes.spines.top'] = "False"
mpl.rcParams['axes.spines.right'] = "False"
mpl.rcParams['xtick.labelsize'] = 20
mpl.rcParams['ytick.labelsize'] = 20

df_mls_matches = pd.read_csv(DIRECTORY_COMBINED_MATCHES_CLEAN)
df_mls_matches = df_mls_matches[df_mls_matches['Season'] == 2022]
df_mls_matches = df_mls_matches.sort_values(by="Date")

all_teams = df_mls_matches["Team"].unique()

df_mls_odds = pd.read_csv("../../data/MLS_Matches_odds_clean.csv")
df_mls_odds = df_mls_odds[df_mls_odds['Season'] == 2022]

df_mls_odds = df_mls_odds.sort_values(by='Date')

      Unnamed: 0     SCA    GCA   GF     SH    SoT  npxG Result  \
0              0    30.0    0.0    0   15.0    5.0   1.3      D   
1              1    24.0    2.0    1   13.0    3.0   0.9      L   
2              2     6.0    0.0    0    4.0    0.0   0.3      L   
3              3    27.0    3.0    2   15.0    5.0   1.4      W   
4              4    26.0    2.0    1   14.0    3.0   2.3      D   
...          ...     ...    ...  ...    ...    ...   ...    ...   
4269        1092    22.0    6.0  4.0   13.0    9.0   1.7      W   
4270        1093    31.0   11.0  6.0   16.0    9.0   3.3      W   
4271        1094    23.0    2.0  1.0   13.0    5.0   0.8      W   
4272        1095    15.0    3.0  2.0   12.0    6.0   2.1      W   
4273        1096  1004.0  148.0  NaN  561.0  233.0   NaN     --   

                    Team        Date  Season  
0     Philadelphia-Union  2021-04-18    2021  
1     Philadelphia-Union  2021-04-24    2021  
2     Philadelphia-Union  2021-05-01    2021  
3     P

In [2]:
def calculate_winnings(odds, bet_amount=1):
    if np.isnan(odds):
        return 0
    elif (odds > 0):
        # Positive odds
        return (odds / 100) * bet_amount
    else:
        # Negative odds
        return (100 / abs(odds)) * bet_amount

In [3]:
def split_data_odds(fixture):
    matches = fixture*14
    df1 = df_mls_odds.iloc[:matches, :]
    df2 = df_mls_odds.iloc[matches:matches+14, :]
    return df1, df2


In [4]:
def split_data_matches(fixture):
    matches = fixture*28
    df1 = df_mls_matches.iloc[:matches, :]
    df2 = df_mls_matches.iloc[matches:matches+28, :]
    return df1, df2

In [None]:
def calc_feats_match(df, team):
    team_stats = dict()
    df_team = df[(df["Team"]==team)]
    team_shots = df_team.apply(lambda x: x["SH"], axis=1).sum()
    team_shots_ot = df_team.apply(lambda x: x["SoT"], axis=1).sum()
    team_shots_sca = df_team.apply(lambda x: x["SCA"], axis=1).sum()
    team_shots_gca = df_team.apply(lambda x: x["GCA"], axis=1).sum()
    team_shots_npxG = df_team.apply(lambda x: x["npxG"], axis=1).sum()
    team_shots_gf = df_team.apply(lambda x: extract_number(x["GF"]), axis=1).sum()
    team_points = df_team.apply(lambda x: 3 if (x["Result"].startswith("W")) else (1 if x["Result"].startswith("D") else 0), axis=1).sum()

    team_stats[Features.Team.value] = team
    team_stats[Features.Sh.value] = team_shots
    team_stats[Features.SoT.value] = team_shots_ot
    team_stats[Features.SCA.value] = team_shots_sca
    team_stats[Features.GCA.value] = team_shots_gca
    team_stats[Features.GF.value] = team_shots_gf
    team_stats[Features.npxG.value] = team_shots_npxG
    team_stats[Features.Pts.value] = team_points
    return team_stats



In [6]:
def calc_points(df, team, team_stats):
    df_team = df[(df["Home Team"]==team) | (df["Away Team"]==team)]
    team_points = df_team.apply(lambda x: 3 if ((x["Home Team"] == team and x["Full Time Result"].startswith("H")) or (x["Away Team"] == team and x["Full Time Result"].startswith("A"))) else (1 if x["Full Time Result"] == "D" else 0), axis=1).sum() 
    team_stats["points_2"] = team_points
    return team_stats

In [21]:
def get_match_day_results(match_day, ht, at, dt, hc, ac, dc):
    df_mls_fix , df_mls_match = split_data_matches(match_day)
    df_mls_odds_fix , df_mls_odds = split_data_odds(match_day)

    all_teams_stats = []

    for team in all_teams:
        team_stats = calc_feats_match(df_mls_fix, team)
        team_stats = calc_points(df_mls_odds_fix, team, team_stats)
        all_teams_stats.append(team_stats)

    df_all_stats = pd.DataFrame(all_teams_stats)

    from joblib import dump,load
    features = df_all_stats[get_features()]

    model = load('../../data/mls_model.joblib')

    model.predict(features)

    df_all_stats["exp_points"] = model.predict(features)
    df_all_stats["exp_points_diff"] = (df_all_stats["exp_points"] - df_all_stats["Pts"])

    df_mls_odds[["Home Team", "Away Team", "Industry Average Away Team Win Odds (American)", "Industry Average Draw Odds (American)", "Industry Average Home Team Win Odds (American)"]]

    predictions = []

    for ind, row in df_mls_odds.iterrows():
        home_team = row["Home Team"]
        away_team = row["Away Team"]
        home_exp_points = df_all_stats[ df_all_stats["Team"]==home_team]["exp_points" ].values[0]
        away_exp_points = df_all_stats[ df_all_stats["Team"]==away_team]["exp_points" ].values[0]

        exp_points_diff_teams = (home_exp_points - away_exp_points)
        if row["Full Time Result"].startswith("H"):
            ht.append(exp_points_diff_teams)
            hc.append(1)
        elif row["Full Time Result"].startswith("A"):
            at.append(exp_points_diff_teams)
            ac.append(1)
        else:
            dt.append(exp_points_diff_teams)
            dc.append(1)

        if exp_points_diff_teams > 5:
            result = "H"
            odds = row["Industry Average Home Team Win Odds (American)"]
        elif exp_points_diff_teams < -3:
            result = "A"
            odds = row["Industry Average Away Team Win Odds (American)"]
            # result = None
            # odds = None
        else:
            result = None
            odds = None
        # else:
        #     result = "D"
        #     odds = row["Industry Average Draw Odds (American)"]

        pred = {"Home Team": home_team, "Away Team": away_team, "Prediction": result, "Odds": odds, "EXP": exp_points_diff_teams}
        predictions.append(pred)

    df_slip = pd.DataFrame(predictions)

    df_slip = df_mls_odds.merge(df_slip, on=["Home Team", "Away Team"])[["Home Team", "Away Team", "Prediction", "Odds", "Full Time Result", "EXP"]]

    vectorized_calculate_winnings = np.vectorize(calculate_winnings)

    conditions = [
    (df_slip['Prediction'] == df_slip['Full Time Result']),  # Prediction matches result
    (df_slip['Prediction'].isna())  # No prediction
    ]

    # Choices corresponding to each condition
    choices = [
    vectorized_calculate_winnings(df_slip['Odds'], 1),  # Calculate winnings
    0  # Set to 0 for no prediction
    ]

    df_slip['Win'] = np.select(conditions, choices, default=-1)

    # df_slip["Win"] = np.where( df_slip["Prediction"]==df_slip["Full Time Result"], calculate_winnings(df_slip["Odds"], 1), -1)
    return {"sum": df_slip["Win"].sum(), "ht": ht, "hc": hc, "at": at, "ac": ac, "dt": dt, "dc": dc}



summ = 0
lower_bound = 10
upper_bound = 34

bound_dif = upper_bound - lower_bound

average = (bound_dif * 28) if bound_dif > 0 else 28

home_total = []
away_total = []
draw_total = []

home_count = []
away_count = []
draw_count = []

test = []
for x in range(lower_bound, upper_bound):
    result = get_match_day_results(x, home_total, away_total, draw_total, home_count, away_count, draw_count)
    summ += result["sum"]

print(np.median(home_total), np.median(away_total), np.median(draw_total))
print(summ, np.sum(home_total) / len(home_count), np.sum(away_total) / len(away_count), np.sum(draw_total) / len(draw_count))

0.3824193713292132 -1.2575734997388643 -0.18043056685065118
-203.0 0.30692185356596896 -1.8594755483803704 0.19144492939942137


In [None]:
df_mls_fix4 , df_mls_match10 = split_data_matches(33)
df_mls_odds_fix4 , df_mls_odds_10 = split_data_odds(33)

all_teams_stats = []

for team in all_teams:
    team_stats = calc_feats_match(df_mls_fix4, team)
    team_stats = calc_points(df_mls_odds_fix4, team, team_stats)
    all_teams_stats.append(team_stats)

df_all_stats = pd.DataFrame(all_teams_stats)


      Unnamed: 0   SCA  GCA   GF    SH   SoT  npxG Result  \
971            0  36.0  2.0    1  19.0   5.0   1.1      D   
1088         117  40.0  6.0  4.0  22.0   7.0   3.4      W   
1166         195  29.0  0.0    0  17.0   3.0   1.6      D   
1246         275  31.0  5.0  3.0  18.0   7.0   3.0      W   
1287         316  22.0  0.0  0.0  11.0   5.0   1.2      L   
...          ...   ...  ...  ...   ...   ...   ...    ...   
1574         603  19.0  5.0  3.0  11.0   6.0   1.6      W   
1952         981  29.0  0.0    0  17.0   6.0   1.2      L   
2031        1060  34.0  8.0  4.0  20.0  12.0   1.9      W   
1993        1022  27.0  6.0    4  14.0   8.0   1.6      W   
1953         982  33.0  2.0    1  17.0   7.0   0.9      L   

                        Team        Date  Season  
971       Philadelphia-Union  2022-02-26    2022  
1088           Columbus-Crew  2022-02-26    2022  
1166             Inter-Miami  2022-02-26    2022  
1246      New-York-Red-Bulls  2022-02-26    2022  
1287        

In [None]:
features = df_all_stats[get_features()]
model = load('../../data/mls_model.joblib')

model.predict(features)

array([64.68313119, 40.09092147, 40.03009102, 51.30368684, 38.51619422,
       36.55849334, 45.32096862, 34.99893834, 35.79386756, 47.35830062,
       45.10797285, 48.76464329, 63.86270232, 60.85710123, 46.64511807,
       45.63718124, 46.66545092, 53.77218013, 56.06151333, 51.28822781,
       52.79523294, 44.99277851, 54.57058317, 53.36130816, 44.73233367,
       41.37509261, 44.93681017, 38.83272099])

In [14]:
df_all_stats["exp_points"] = model.predict(features)
df_all_stats["exp_points_diff"] = (df_all_stats["exp_points"] - df_all_stats["Pts"])
df_all_stats

Unnamed: 0,Team,SH,SoT,SCA,GCA,GF,npxG,Pts,points_2,exp_points,exp_points_diff
0,Philadelphia-Union,416.0,163.0,713.0,114.0,69,49.6,63,54,64.683131,1.683131
1,Columbus-Crew,396.0,121.0,688.0,70.0,41,34.7,41,30,40.090921,-0.909079
2,Inter-Miami,365.0,133.0,619.0,69.0,43,33.9,46,42,40.030091,-5.969909
3,New-York-Red-Bulls,496.0,159.0,690.0,76.0,59,35.1,62,42,51.303687,-10.696313
4,Charlotte-FC,362.0,135.0,586.0,59.0,43,31.2,41,39,38.516194,-2.483806
5,Chicago-Fire,403.0,122.0,678.0,61.0,35,37.9,36,30,36.558493,0.558493
6,New-England-Revolution,414.0,145.0,725.0,81.0,49,36.3,41,30,45.320969,4.320969
7,DC-United,343.0,102.0,556.0,57.0,35,31.8,30,21,34.998938,4.998938
8,Vancouver-Whitecaps-FC,352.0,101.0,624.0,59.0,36,30.9,37,36,35.793868,-1.206132
9,Minnesota-United,419.0,140.0,654.0,76.0,50,39.6,51,39,47.358301,-3.641699


In [17]:
df_mls_odds_10[["Home Team", "Away Team", "Industry Average Away Team Win Odds (American)", "Industry Average Draw Odds (American)", "Industry Average Home Team Win Odds (American)"]]

predictions = []
for ind, row in df_mls_odds_10.iterrows():
    home_team = row["Home Team"]
    away_team = row["Away Team"]
    home_exp_points = df_all_stats[ df_all_stats["Team"]==home_team]["exp_points" ].values[0]
    away_exp_points = df_all_stats[ df_all_stats["Team"]==away_team]["exp_points" ].values[0]

    exp_points_diff_teams = (home_exp_points - away_exp_points)
    print(exp_points_diff_teams)

    if exp_points_diff_teams > 2:
        result = "H"
        odds = row["Industry Average Home Team Win Odds (American)"]
    elif abs(exp_points_diff_teams) < 2:
        result = "D"
        odds = row["Industry Average Draw Odds (American)"]
    else:
        result = "A"
        odds = row["Industry Average Away Team Win Odds (American)"]

    pred = {"Home Team": home_team, "Away Team": away_team, "Prediction": result, "Odds": odds}
    predictions.append(pred)

df_slip = pd.DataFrame(predictions)

df_slip

19.575158346453755
4.641412200747077
1.7286407473547314
12.57447450938318
11.564433065433263
-9.931922304154384
12.78749261819192
-16.03142231141546
-18.77324179402909
-8.762475288257512
15.219919990494326
-13.195490562951072
-1.6523395622303454
-0.5660752156764275


Unnamed: 0,Home Team,Away Team,Prediction,Odds
0,Philadelphia-Union,Toronto-FC,H,-294
1,Orlando-City,Columbus-Crew,H,111
2,FC-Dallas,Sporting-Kansas-City,D,277
3,Los-Angeles-FC,Nashville-SC,H,-147
4,Minnesota-United,Vancouver-Whitecaps-FC,H,-135
5,Real-Salt-Lake,Portland-Timbers,A,358
6,New-York-Red-Bulls,Charlotte-FC,H,-161
7,Inter-Miami,CF-Montreal,A,156
8,DC-United,FC-Cincinnati,A,-149
9,Chicago-Fire,New-England-Revolution,A,178


In [18]:
df_slip = df_mls_odds_10.merge(df_slip, on=["Home Team", "Away Team"])[["Home Team", "Away Team", "Prediction", "Odds", "Full Time Result"]]

df_slip

Unnamed: 0,Home Team,Away Team,Prediction,Odds,Full Time Result
0,Philadelphia-Union,Toronto-FC,H,-294,Home Team Win
1,Orlando-City,Columbus-Crew,H,111,Home Team Win
2,FC-Dallas,Sporting-Kansas-City,D,277,Home Team Win
3,Los-Angeles-FC,Nashville-SC,H,-147,Away Team Win
4,Minnesota-United,Vancouver-Whitecaps-FC,H,-135,Home Team Win
5,Real-Salt-Lake,Portland-Timbers,A,358,Home Team Win
6,New-York-Red-Bulls,Charlotte-FC,H,-161,Home Team Win
7,Inter-Miami,CF-Montreal,A,156,Away Team Win
8,DC-United,FC-Cincinnati,A,-149,Away Team Win
9,Chicago-Fire,New-England-Revolution,A,178,Draw


In [19]:
df_slip["Win"] = df_slip.apply(
    lambda row: calculate_winnings(row["Odds"], 1) if row["Prediction"] == row["Full Time Result"] else -1,
    axis=1
)
df_slip

Unnamed: 0,Home Team,Away Team,Prediction,Odds,Full Time Result,Win
0,Philadelphia-Union,Toronto-FC,H,-294,Home Team Win,-1
1,Orlando-City,Columbus-Crew,H,111,Home Team Win,-1
2,FC-Dallas,Sporting-Kansas-City,D,277,Home Team Win,-1
3,Los-Angeles-FC,Nashville-SC,H,-147,Away Team Win,-1
4,Minnesota-United,Vancouver-Whitecaps-FC,H,-135,Home Team Win,-1
5,Real-Salt-Lake,Portland-Timbers,A,358,Home Team Win,-1
6,New-York-Red-Bulls,Charlotte-FC,H,-161,Home Team Win,-1
7,Inter-Miami,CF-Montreal,A,156,Away Team Win,-1
8,DC-United,FC-Cincinnati,A,-149,Away Team Win,-1
9,Chicago-Fire,New-England-Revolution,A,178,Draw,-1


In [20]:
sumWinnings = df_slip["Win"].sum()

sumWinnings


-14