In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Notebook-wide matplotlib defaults: grid on, top/right spines off, larger tick labels.
mpl.rcParams.update({
    'axes.grid': "True",
    'axes.spines.top': "False",
    'axes.spines.right': "False",
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
})

# Match data is fetched over HTTP from football-data.co.uk on every run.
EPL_1920_URL = "https://www.football-data.co.uk/mmz4281/1920/E0.csv"
df_epl_1920 = pd.read_csv(EPL_1920_URL)

df_epl_1920.head(5)

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,09/08/2019,20:00,Liverpool,Norwich,4,1,H,4,0,...,3.43,-2.25,1.91,1.99,1.94,1.98,1.99,2.07,1.9,1.99
1,E0,10/08/2019,12:30,West Ham,Man City,0,5,A,0,1,...,2.91,1.75,1.95,1.95,1.96,1.97,2.07,1.98,1.97,1.92
2,E0,10/08/2019,15:00,Bournemouth,Sheffield United,1,1,D,0,0,...,1.92,-0.5,1.95,1.95,1.98,1.95,2.0,1.96,1.96,1.92
3,E0,10/08/2019,15:00,Burnley,Southampton,3,0,H,0,0,...,1.71,0.0,1.87,2.03,1.89,2.03,1.9,2.07,1.86,2.02
4,E0,10/08/2019,15:00,Crystal Palace,Everton,0,0,D,0,0,...,1.71,0.25,1.82,2.08,1.97,1.96,2.03,2.08,1.96,1.93


In [2]:
# Columns kept for the analysis: team names, full-time goals and result,
# the shot columns (HS/AS/HST/AST) and the PS* odds columns.
columns_kept = [
    "HomeTeam", "AwayTeam",
    "FTHG", "FTAG", "FTR",
    "HS", "AS", "HST", "AST",
    "PSH", "PSD", "PSA",
]
df_epl_1920_clean = df_epl_1920[columns_kept]

# Positional split: the first 20 rows versus everything after them.
df1 = df_epl_1920_clean.iloc[:20]
df2 = df_epl_1920_clean.iloc[20:]

In [3]:
def split_data(fixture, df=None, matches_per_fixture=10):
    """Split match rows into the rounds already played and the next round.

    The split is purely positional, so it presumes ``df`` is ordered by round
    with exactly ``matches_per_fixture`` rows per round -- TODO confirm this
    holds when matches are postponed or rearranged.

    Parameters
    ----------
    fixture : int
        Number of completed rounds; the first ``fixture * matches_per_fixture``
        rows are treated as played.
    df : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``df_epl_1920_clean``
        so existing ``split_data(n)`` calls keep working.
    matches_per_fixture : int, default 10
        Number of rows that make up one round.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(played, upcoming)``: the rows before the cut-off and the
        ``matches_per_fixture`` rows that follow it.
    """
    if df is None:
        # Fall back to the notebook global only when no frame is supplied.
        df = df_epl_1920_clean
    matches = fixture * matches_per_fixture
    played = df.iloc[:matches]
    upcoming = df.iloc[matches:matches + matches_per_fixture]
    return played, upcoming


In [5]:
# Rows for the first 9 rounds, plus the 10 rows that follow them (the round to predict).
df_epl_fix9, df_epl_match10 = split_data(fixture=9)


In [26]:
def calc_feats(df, team):
    """Aggregate one team's shot counts and league points over the matches in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match rows with columns ``HomeTeam``, ``AwayTeam``, ``FTR``, ``HS``,
        ``AS``, ``HST`` and ``AST``.
    team : str
        Team name as it appears in ``HomeTeam`` / ``AwayTeam``.

    Returns
    -------
    dict
        Keys ``team``, ``shots``, ``shots_ot``, ``shots_vs``, ``shots_ot_vs``
        and ``points``. All totals are 0 when the team has no rows in ``df``.
    """
    # Work on the frame that was passed in, not on a notebook-level global,
    # so the function gives the right answer for any slice of the season.
    home_mask = df["HomeTeam"] == team
    # A row counts as "away" only if it was not already counted as "home".
    away_mask = (df["AwayTeam"] == team) & ~home_mask
    home = df[home_mask]
    away = df[away_mask]

    # 3 points for a win (FTR "H" at home, "A" away), 1 for a draw, 0 otherwise.
    wins = (home["FTR"] == "H").sum() + (away["FTR"] == "A").sum()
    draws = (home["FTR"] == "D").sum() + (away["FTR"] == "D").sum()

    return {
        "team": team,
        # Shots / shots on target taken by the team.
        "shots": home["HS"].sum() + away["AS"].sum(),
        "shots_ot": home["HST"].sum() + away["AST"].sum(),
        # Shots / shots on target conceded to the opponent.
        "shots_vs": home["AS"].sum() + away["HS"].sum(),
        "shots_ot_vs": home["AST"].sum() + away["HST"].sum(),
        "points": 3 * wins + draws,
    }


In [29]:
# Collect every club that appears in the played matches, home or away.
# Home names come first, so the order is unchanged whenever every club has
# already hosted a match; clubs seen only as visitors are appended after them
# instead of being silently left out.
all_teams = pd.concat([df_epl_fix9["HomeTeam"], df_epl_fix9["AwayTeam"]]).unique()
print(all_teams)

# One stats dict per team, turned into a frame in a single step.
all_teams_stats = [calc_feats(df_epl_fix9, team) for team in all_teams]

df_all_stats = pd.DataFrame(all_teams_stats)
df_all_stats

['Liverpool' 'West Ham' 'Bournemouth' 'Burnley' 'Crystal Palace' 'Watford'
 'Tottenham' 'Leicester' 'Newcastle' 'Man United' 'Arsenal' 'Aston Villa'
 'Brighton' 'Everton' 'Norwich' 'Southampton' 'Man City'
 'Sheffield United' 'Chelsea' 'Wolves']


Unnamed: 0,team,shots,shots_ot,shots_vs,shots_ot_vs,points
0,Liverpool,141,53,84,21,25
1,West Ham,102,45,119,46,12
2,Bournemouth,96,34,140,40,12
3,Burnley,100,32,130,36,12
4,Crystal Palace,92,33,124,37,14
5,Watford,120,31,111,42,4
6,Tottenham,117,38,136,52,12
7,Leicester,109,30,93,33,17
8,Newcastle,79,20,131,43,8
9,Man United,117,35,88,34,10


In [30]:
from joblib import dump,load

# Model inputs, in the column order handed to the model.
feature_columns = ["shots", "shots_ot", "shots_vs", "shots_ot_vs"]
features = df_all_stats[feature_columns]

# NOTE(review): joblib artefacts are pickle-based, so only load files from a
# trusted source. The path is relative to the notebook's working directory.
model = load('../data/epl_model.joblib')

model.predict(features)

array([21.47836095, 13.49184314, 11.86020353, 11.51712885, 11.27850211,
        8.35963602, 10.05190496,  9.2668875 ,  4.71338591, 10.76180823,
       14.08470516, 21.8882428 ,  9.30899224, 12.12796296,  8.41141844,
       11.94865893, 23.9678279 ,  6.89859056, 21.76948235,  8.22103986])

In [33]:
# Model output per team, stored next to the points actually earned.
predicted_points = model.predict(features)
df_all_stats["exp_points"] = predicted_points

# Positive values mean the model expected more points than the team has.
df_all_stats["exp_points_diff"] = df_all_stats["exp_points"].sub(df_all_stats["points"])
df_all_stats

Unnamed: 0,team,shots,shots_ot,shots_vs,shots_ot_vs,points,exp_points,exp_points_diff
0,Liverpool,141,53,84,21,25,21.478361,-3.521639
1,West Ham,102,45,119,46,12,13.491843,1.491843
2,Bournemouth,96,34,140,40,12,11.860204,-0.139796
3,Burnley,100,32,130,36,12,11.517129,-0.482871
4,Crystal Palace,92,33,124,37,14,11.278502,-2.721498
5,Watford,120,31,111,42,4,8.359636,4.359636
6,Tottenham,117,38,136,52,12,10.051905,-1.948095
7,Leicester,109,30,93,33,17,9.266888,-7.733112
8,Newcastle,79,20,131,43,8,4.713386,-3.286614
9,Man United,117,35,88,34,10,10.761808,0.761808


In [39]:
# Gap in expected points (home minus away) that must be exceeded before a
# side is picked to win; anything inside the band is called a draw.
EXP_POINTS_MARGIN = 4

predictions = []
for ind, row in df_epl_match10.iterrows():
    home_team = row["HomeTeam"]
    away_team = row["AwayTeam"]
    home_exp_points = df_all_stats[df_all_stats["team"] == home_team]["exp_points"].values[0]
    away_exp_points = df_all_stats[df_all_stats["team"] == away_team]["exp_points"].values[0]

    exp_points_diff_teams = home_exp_points - away_exp_points

    # Explicit symmetric bounds: a gap of exactly +/-EXP_POINTS_MARGIN (or a
    # NaN gap) falls through to the draw branch instead of defaulting to an
    # away win, which is what a bare `else` did for a gap of exactly +4.
    if exp_points_diff_teams > EXP_POINTS_MARGIN:
        result = "H"
        odds = row["PSH"]
    elif exp_points_diff_teams < -EXP_POINTS_MARGIN:
        result = "A"
        odds = row["PSA"]
    else:
        result = "D"
        odds = row["PSD"]

    # Record the pick together with the odds column that matches it.
    pred = {"HomeTeam": home_team, "AwayTeam": away_team, "Prediction": result, "Odds": odds}
    predictions.append(pred)

df_slip = pd.DataFrame(predictions)

df_slip


Unnamed: 0,HomeTeam,AwayTeam,Prediction,Odds
0,Southampton,Leicester,D,3.34
1,Man City,Aston Villa,D,12.5
2,Brighton,Everton,D,3.37
3,Watford,Bournemouth,D,3.52
4,West Ham,Sheffield United,H,2.15
5,Burnley,Chelsea,A,1.75
6,Newcastle,Wolves,D,3.11
7,Arsenal,Crystal Palace,D,4.66
8,Liverpool,Tottenham,H,1.55
9,Norwich,Man United,D,3.96


In [None]:
# Attach the actual full-time result (FTR) to each prediction.
# Both sides are cut down to the columns they contribute before merging, so
# re-running this cell works: otherwise an FTR column already present on
# df_slip would collide with the one from df_epl_match10, be suffixed by the
# merge, and make the final "FTR" selection fail.
slip_columns = ["HomeTeam", "AwayTeam", "Prediction", "Odds"]
result_columns = ["HomeTeam", "AwayTeam", "FTR"]

df_slip = (
    df_epl_match10[result_columns]
    .merge(df_slip[slip_columns], on=["HomeTeam", "AwayTeam"])
    .loc[:, ["HomeTeam", "AwayTeam", "Prediction", "Odds", "FTR"]]
)

df_slip

Unnamed: 0,HomeTeam,AwayTeam,Prediction,Odds,FTR
0,Southampton,Leicester,D,3.34,A
1,Man City,Aston Villa,D,12.5,H
2,Brighton,Everton,D,3.37,H
3,Watford,Bournemouth,D,3.52,D
4,West Ham,Sheffield United,H,2.15,D
5,Burnley,Chelsea,A,1.75,A
6,Newcastle,Wolves,D,3.11,D
7,Arsenal,Crystal Palace,D,4.66,D
8,Liverpool,Tottenham,H,1.55,H
9,Norwich,Man United,D,3.96,A


In [42]:
# Outcome of each pick: Odds - 1 when the prediction matches the actual
# result, otherwise -1 (presumably the net return on a one-unit stake).
is_correct = df_slip["Prediction"] == df_slip["FTR"]
df_slip["Win"] = (df_slip["Odds"] - 1).where(is_correct, -1)

df_slip

Unnamed: 0,HomeTeam,AwayTeam,Prediction,Odds,FTR,Win
0,Southampton,Leicester,D,3.34,A,-1.0
1,Man City,Aston Villa,D,12.5,H,-1.0
2,Brighton,Everton,D,3.37,H,-1.0
3,Watford,Bournemouth,D,3.52,D,2.52
4,West Ham,Sheffield United,H,2.15,D,-1.0
5,Burnley,Chelsea,A,1.75,A,0.75
6,Newcastle,Wolves,D,3.11,D,2.11
7,Arsenal,Crystal Palace,D,4.66,D,3.66
8,Liverpool,Tottenham,H,1.55,H,0.55
9,Norwich,Man United,D,3.96,A,-1.0
