In [51]:
import pandas as pd
import scipy
import glob
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import os 

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.set_option('max_colwidth', 100)

# Data

In [52]:
data_code_dict = {
    '9':'Premier League',
    '10':'Championship',
    '11':'Serie A',
    '12':'La Liga',
    '13':'Ligue 1',
    '20':'Bundesliga',
    '32':'Primeira Liga',
    '23':'Eredivisie'
}

def read_data(fileDir, league_name, season):
    data = pd.read_csv(fileDir)
    data["league"] = league_name
    data["season"] = season
    return data

base_dir = "C:/MyDevelopment/Goalscorers/goal_expectancies/odds_data/"
df = []

for file in os.listdir(base_dir):
    if "odds" in file:
        fh = file.split("-")
        season = "-".join(fh[0:2])
        league = data_code_dict[fh[3]]
        season_data = pd.read_csv(base_dir+file)
        season_data["league_name"] = league
        season_data["season"] = season
        df.append(season_data)
        
df = pd.concat(df, axis=0)

In [53]:
df.Date

0        04/08/17
1        04/08/17
2        05/08/17
3        05/08/17
4        05/08/17
          ...    
375    28/05/2023
376    28/05/2023
377    28/05/2023
378    28/05/2023
379    28/05/2023
Name: Date, Length: 17765, dtype: object

In [54]:
av_odds = df.groupby(['league_name', 'season'])['Avg>2.5'].agg(['count', lambda x: x.isnull().sum()]).reset_index()
av_odds

Unnamed: 0,league_name,season,count,<lambda_0>
0,Bundesliga,2017-2018,0,306
1,Bundesliga,2018-2019,0,306
2,Bundesliga,2019-2020,306,0
3,Bundesliga,2020-2021,306,0
4,Bundesliga,2021-2022,306,0
5,Bundesliga,2022-2023,306,0
6,Championship,2017-2018,0,552
7,Championship,2018-2019,0,552
8,Championship,2019-2020,552,0
9,Championship,2020-2021,552,0


# Maths

In [55]:
def odds_to_probs(odds: np.array) -> np.array:
    probs = 1.0 / odds
    probs = probs / np.sum(probs)  # remove overround
    return probs

In [56]:
def calc_score_matrix(home_exp: float, away_exp: float, max_goals=10) -> np.array:
    home_team_goals = scipy.stats.poisson.pmf(np.arange(0, max_goals+1), home_exp)
    away_team_goals = scipy.stats.poisson.pmf(np.arange(0, max_goals+1), away_exp)
    score_matrix = np.outer(home_team_goals, away_team_goals)
    return score_matrix

In [57]:
def wdw_probabilities(score_matrix: np.array) -> np.array:  # [home_win_prob, draw_prob, away_win_prob]
    home_win_prob = np.tril(score_matrix, -1).sum()
    draw_prob = np.diag(score_matrix).sum()
    away_win_prob = np.triu(score_matrix, 1).sum()
    return np.array([home_win_prob, draw_prob, away_win_prob])

In [58]:
def ou_probabilities(score_matrix: np.array, line=2.5) -> np.array:  # [over_prob, under_prob]
    under_prob = 0
    for home_goals in range(int(np.floor(line)) + 1):
        for away_goals in range(int(np.floor(line)) + 1 - home_goals):
            under_prob += score_matrix[home_goals][away_goals]
    over_prob = 1 - under_prob    
    return np.array([over_prob, under_prob])

In [59]:
def params_to_vars(params: np.array) -> tuple:
    home_exp, away_exp = params[0], params[1]
    return home_exp, away_exp

In [60]:
def _mse(params: np.array, wdw_obs: np.array, ou_obs: np.array) -> float:
    exp_params = np.exp(params)
    home_exp, away_exp = params_to_vars(exp_params)
    
    score_matrix = calc_score_matrix(home_exp, away_exp)
    
    wdw_probs = wdw_probabilities(score_matrix)
    ou_probs = ou_probabilities(score_matrix)    
    
    pred = np.concatenate([wdw_probs, ou_probs])
    obs = np.concatenate([wdw_obs, ou_obs])
    
    mse = np.sum((pred - obs)**2)
    
    return mse

In [61]:
def goal_expectation(home_odds: float, draw_odds: float, away_odds: float, over_odds: float, under_odds: float) -> dict:
    wdw_odds = np.array([home_odds, draw_odds, away_odds])
    ou_odds = np.array([over_odds, under_odds])    
    
    wdw_obs = odds_to_probs(wdw_odds)
    ou_obs = odds_to_probs(ou_odds)
    
    options = {
        "maxiter": 1000,
        "disp": False,
    }    
    
    res = scipy.optimize.minimize(
        fun=_mse,
        x0=[1.5, 0.75],
        args=(wdw_obs, ou_obs),
        options=options
    )
    
    output = {
        "home_exp": res["x"][0],
        "away_exp": res["x"][1],
        "error": res["fun"],
        "success": res["success"],
    }      

    return output

In [62]:
output = list()
for i, row in df.iterrows():
    #pick over under odds
    if np.isnan(row["P>2.5"]):
        if np.isnan(row["BbAv>2.5"]):
            over = row["Avg>2.5"]
            under = row["Avg<2.5"]
        else:
            over = row["BbAv>2.5"]
            under = row["BbAv<2.5"]
    else:
        over = row["P>2.5"]
        under = row["P<2.5"]
        
    #pick wdw odds
    if np.isnan(row["PSH"]):
        if np.isnan(row["B365H"]):
            home = row["AvgH"]
            draw = row["AvgD"]
            away = row["AvgA"]
        else:
            home = row["B365H"]
            draw = row["B365D"]
            away = row["B365A"]
    else:
        home = row["PSH"]
        draw = row["PSD"]
        away = row["PSA"]
        
        
    res = goal_expectation(home, draw, away, over, under)
    
    
    tmp = {
        "league_name": row["league_name"],
        "season":row["season"],
        "date": row["Date"],
        "home_team": row["HomeTeam"],
        "away_team": row["AwayTeam"],
        "home_exp": np.exp(res["home_exp"]),
        "away_exp": np.exp(res["away_exp"]),       
        "success": res["success"],
        "error": res["error"]
    }

    output.append(tmp)

output = pd.DataFrame(output)  

In [63]:
# Convert column to datetime
for index, row in output.iterrows():
    try:
        output.at[index, 'date'] = pd.to_datetime(row['date'], format='%d/%m/%Y')
    except ValueError:
        output.at[index, 'date'] = pd.to_datetime(row['date'], format='%d/%m/%y')

In [64]:
output['date'] = pd.to_datetime(output['date']).dt.strftime('%Y-%m-%d')

In [12]:
#output['date'] = pd.to_datetime(output['date']).dt.strftime('%Y-%m-%d')

  output['date'] = pd.to_datetime(output['date']).dt.strftime('%Y-%m-%d')


In [65]:
output[["league_name", "season","success"]].value_counts().sort_index()

league_name     season     success
Bundesliga      2017-2018  True       306
                2018-2019  True       306
                2019-2020  True       306
                2020-2021  True       306
                2021-2022  True       306
                2022-2023  True       306
Championship    2017-2018  True       552
                2018-2019  True       552
                2019-2020  True       552
                2020-2021  True       552
                2021-2022  True       552
                2022-2023  True       552
Eredivisie      2017-2018  True       306
                2018-2019  True       306
                2019-2020  True       232
                2020-2021  True       306
                2021-2022  True       306
                2022-2023  True       306
La Liga         2017-2018  True       380
                2018-2019  True       380
                2019-2020  True       380
                2020-2021  True       380
                2021-2022  True       380

In [66]:
output[output.success == False]

Unnamed: 0,league_name,season,date,home_team,away_team,home_exp,away_exp,success,error
9384,Serie A,2020-2021,2020-10-19,Verona,Genoa,4.481689,2.117,False,
12540,Serie A,2021-2022,2022-01-10,Torino,Fiorentina,4.481689,2.117,False,


In [67]:
output.query("home_team == 'Barcelona' & season == '2017-2018'")

Unnamed: 0,league_name,season,date,home_team,away_team,home_exp,away_exp,success,error
938,La Liga,2017-2018,2017-08-20,Barcelona,Betis,2.765642,0.698084,True,5.344035e-05
953,La Liga,2017-2018,2017-09-09,Barcelona,Espanol,2.968079,0.671206,True,5.434302e-07
972,La Liga,2017-2018,2017-09-19,Barcelona,Eibar,3.343778,0.651951,True,1.41129e-05
997,La Liga,2017-2018,2017-10-01,Barcelona,Las Palmas,3.801048,0.711705,True,3.19382e-06
1012,La Liga,2017-2018,2017-10-21,Barcelona,Malaga,3.689497,0.583533,True,0.0002165514
1034,La Liga,2017-2018,2017-11-04,Barcelona,Sevilla,2.9858,0.811698,True,3.898006e-08
1065,La Liga,2017-2018,2017-12-02,Barcelona,Celta,3.324934,0.728675,True,2.387901e-05
1086,La Liga,2017-2018,2017-12-17,Barcelona,La Coruna,3.560462,0.620429,True,6.533799e-05
1106,La Liga,2017-2018,2018-01-07,Barcelona,Levante,3.887656,0.561855,True,0.0001585758
1137,La Liga,2017-2018,2018-01-28,Barcelona,Alaves,3.377783,0.574813,True,8.878442e-05


In [69]:
output.to_csv("clean_data/match_expectancies.csv", index=False)

In [68]:
output.query("league_name == 'La Liga'")

Unnamed: 0,league_name,season,date,home_team,away_team,home_exp,away_exp,success,error
932,La Liga,2017-2018,2017-08-18,Leganes,Alaves,1.305313,0.790534,True,0.000354
933,La Liga,2017-2018,2017-08-18,Valencia,Las Palmas,1.897996,1.104446,True,0.000782
934,La Liga,2017-2018,2017-08-19,Celta,Sociedad,1.386451,1.196485,True,0.000959
935,La Liga,2017-2018,2017-08-19,Girona,Ath Madrid,0.643340,1.835199,True,0.000107
936,La Liga,2017-2018,2017-08-19,Sevilla,Espanol,1.929472,0.916639,True,0.000612
...,...,...,...,...,...,...,...,...,...
16082,La Liga,2022-2023,2023-06-04,Betis,Valencia,1.394369,1.511490,True,0.000795
16083,La Liga,2022-2023,2023-06-04,Celta,Barcelona,1.421056,1.433893,True,0.000004
16084,La Liga,2022-2023,2023-06-04,Elche,Cadiz,1.320692,1.317974,True,0.001231
16085,La Liga,2022-2023,2023-06-04,Espanol,Almeria,1.518784,1.632685,True,0.001099


In [70]:
df.query("season == '2017-2018' & league_name == 'La Liga'")

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,VCH,VCD,VCA,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA,league_name,season,Time,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,B365CH,B365CD,B365CA,BWCH,BWCD,BWCA,IWCH,IWCD,IWCA,WHCH,WHCD,WHCA,VCCH,VCCD,VCCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,SP1,18/08/17,Leganes,Alaves,1,0,H,1.0,0.0,H,,16.0,6.0,9.0,3.0,14.0,18.0,4.0,2.0,0.0,1.0,0.0,0.0,2.05,3.2,4.1,2.05,3.1,4.1,2.1,3.4,3.5,2.05,3.0,4.2,2.03,3.25,4.52,2.05,3.1,4.0,2.05,3.2,4.4,35.0,2.12,2.03,3.4,3.15,4.52,4.17,31.0,2.84,2.68,1.53,1.46,18.0,-0.5,2.07,2.03,1.9,1.86,1.98,3.35,4.63,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,SP1,18/08/17,Valencia,Las Palmas,1,0,H,1.0,0.0,H,,22.0,5.0,6.0,4.0,25.0,13.0,5.0,2.0,3.0,3.0,0.0,1.0,1.75,3.8,4.5,1.75,3.9,4.6,1.75,3.6,4.8,1.75,3.8,4.33,1.78,4.01,4.83,1.8,3.75,4.2,1.8,4.0,4.6,35.0,1.83,1.77,4.04,3.86,4.83,4.46,33.0,1.69,1.64,2.4,2.27,16.0,-0.75,2.05,1.97,1.96,1.91,1.78,4.24,4.43,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,SP1,19/08/17,Celta,Sociedad,2,3,A,1.0,1.0,D,,16.0,13.0,5.0,6.0,12.0,11.0,5.0,4.0,3.0,1.0,0.0,0.0,2.38,3.25,3.2,2.4,3.3,3.0,2.5,3.3,2.85,2.35,3.25,3.0,2.44,3.4,3.16,2.4,3.4,2.9,2.4,3.4,3.13,35.0,2.5,2.39,3.5,3.32,3.2,3.01,34.0,2.03,1.98,1.9,1.84,18.0,-0.25,2.08,2.05,1.87,1.83,2.12,3.53,3.74,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,SP1,19/08/17,Girona,Ath Madrid,2,2,D,2.0,0.0,H,,13.0,9.0,6.0,3.0,15.0,15.0,6.0,0.0,2.0,4.0,0.0,1.0,8.0,4.33,1.45,7.5,4.33,1.45,7.2,4.4,1.45,7.5,4.0,1.5,8.36,4.38,1.49,8.0,4.2,1.44,7.5,4.3,1.5,35.0,8.36,7.53,4.4,4.17,1.51,1.48,34.0,2.2,2.11,1.8,1.74,16.0,1.25,1.77,1.75,2.25,2.16,6.93,3.83,1.63,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,SP1,19/08/17,Sevilla,Espanol,1,1,D,1.0,1.0,D,,9.0,9.0,4.0,6.0,14.0,12.0,7.0,3.0,2.0,4.0,1.0,0.0,1.62,4.0,5.5,1.62,3.9,5.75,1.55,4.0,6.2,1.6,3.9,5.5,1.62,4.17,6.18,1.67,3.6,5.5,1.65,4.0,5.75,35.0,1.69,1.63,4.17,3.93,6.2,5.58,33.0,1.81,1.75,2.14,2.09,16.0,-1.0,2.12,2.06,1.86,1.82,1.64,4.18,5.82,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,SP1,20/08/17,Ath Bilbao,Getafe,0,0,D,0.0,0.0,D,,12.0,8.0,2.0,2.0,16.0,15.0,7.0,6.0,1.0,3.0,0.0,1.0,1.5,4.0,7.5,1.48,4.25,7.0,1.5,4.2,6.5,1.5,4.0,7.0,1.53,4.37,7.31,1.5,4.0,7.0,1.5,4.2,7.0,34.0,1.53,1.5,4.4,4.17,7.5,6.94,32.0,2.01,1.94,1.96,1.87,17.0,-1.0,1.9,1.86,2.05,2.01,1.53,4.48,6.91,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,SP1,20/08/17,Barcelona,Betis,2,0,H,2.0,0.0,H,,15.0,3.0,2.0,0.0,16.0,15.0,8.0,0.0,2.0,1.0,0.0,0.0,1.17,8.0,15.0,1.18,7.5,14.5,1.17,7.5,15.0,1.2,6.5,15.0,1.22,7.35,15.5,1.22,6.0,13.0,1.2,7.0,13.0,35.0,1.22,1.19,8.0,7.11,17.0,13.85,27.0,1.44,1.4,3.1,2.88,17.0,-2.0,2.05,2.0,1.91,1.86,1.2,8.25,15.2,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,SP1,20/08/17,La Coruna,Real Madrid,0,3,A,0.0,2.0,A,,12.0,16.0,6.0,8.0,16.0,12.0,4.0,4.0,5.0,1.0,0.0,1.0,9.5,5.75,1.3,9.25,5.75,1.3,7.5,5.5,1.35,9.5,5.25,1.3,10.26,5.79,1.33,11.0,4.5,1.33,9.5,5.75,1.3,35.0,11.45,9.68,5.86,5.44,1.35,1.31,27.0,1.5,1.46,2.95,2.64,16.0,1.5,2.03,1.98,1.95,1.89,12.4,7.0,1.26,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,SP1,21/08/17,Levante,Villarreal,1,0,H,0.0,0.0,D,,14.0,9.0,3.0,1.0,18.0,14.0,11.0,6.0,1.0,3.0,0.0,0.0,3.25,3.25,2.3,3.25,3.2,2.3,3.3,3.35,2.2,3.25,3.1,2.3,3.36,3.24,2.36,3.1,3.1,2.4,3.25,3.25,2.3,34.0,3.5,3.26,3.35,3.17,2.4,2.31,32.0,2.42,2.36,1.63,1.58,15.0,0.25,1.93,1.89,2.03,1.98,3.31,3.32,2.4,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,SP1,21/08/17,Malaga,Eibar,0,1,A,0.0,0.0,D,,10.0,13.0,4.0,6.0,16.0,15.0,3.0,7.0,2.0,3.0,0.0,0.0,2.1,3.3,3.7,2.15,3.3,3.5,2.1,3.4,3.5,2.1,3.1,3.4,2.24,3.36,3.49,2.2,3.3,3.3,2.15,3.3,3.5,34.0,2.28,2.18,3.4,3.26,3.7,3.43,32.0,2.25,2.14,1.76,1.7,17.0,-0.25,1.92,1.88,2.04,1.99,2.2,3.27,3.85,La Liga,2017-2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
