In [41]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import seaborn as sns

## Data Cleaning and restructuring

### Data import

In [67]:
# Load the EPL 2022-2023 match data from CSV and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV — confirm, and consider index_col=0
# if it is not needed downstream.
DF = pd.read_csv(filepath_or_buffer='EPL_2022-2023_match_data.csv')
DF.head(n=5)

Unnamed: 0,match_id,teams_home,teams_away,goals_home,goals_away,xg_home,xg_away
0,18202,Crystal Palace,Arsenal,0,2,1.21,1.44
1,18203,Fulham,Liverpool,2,2,1.27,2.34
2,18204,Bournemouth,Aston Villa,2,0,0.59,0.49
3,18205,Leeds,Wolverhampton Wanderers,2,1,0.89,1.1
4,18206,Newcastle United,Nottingham Forest,2,0,1.86,0.24


### Adding Boolean columns for actual results, xG-implied results, and whether they match

In [72]:
# Derive Boolean outcome flags (home win / draw / away win) from the goals scored.
# Work on a deep copy so the loaded frame DF is left untouched.
DF_result = DF.copy(deep=True).assign(
    home_win=lambda matches: matches['goals_home'] > matches['goals_away'],
    draw=lambda matches: matches['goals_home'] == matches['goals_away'],
    away_win=lambda matches: matches['goals_away'] > matches['goals_home'],
)
DF_result.head()

Unnamed: 0,match_id,teams_home,teams_away,goals_home,goals_away,xg_home,xg_away,home_win,draw,away_win
0,18202,Crystal Palace,Arsenal,0,2,1.21,1.44,False,False,True
1,18203,Fulham,Liverpool,2,2,1.27,2.34,False,True,False
2,18204,Bournemouth,Aston Villa,2,0,0.59,0.49,True,False,False
3,18205,Leeds,Wolverhampton Wanderers,2,1,0.89,1.1,True,False,False
4,18206,Newcastle United,Nottingham Forest,2,0,1.86,0.24,True,False,False


In [73]:
# Derive the same three Boolean outcome flags from expected goals (xG):
# the side with the higher xG is treated as the xG "winner".
# xg_draw uses exact equality, so it is True only when both xG values are identical.
DF_result['xg_home_win'] = DF_result['xg_home'].gt(DF_result['xg_away'])
DF_result['xg_draw'] = DF_result['xg_home'].eq(DF_result['xg_away'])
DF_result['xg_away_win'] = DF_result['xg_home'].lt(DF_result['xg_away'])
DF_result.head()

Unnamed: 0,match_id,teams_home,teams_away,goals_home,goals_away,xg_home,xg_away,home_win,draw,away_win,xg_home_win,xg_draw,xg_away_win
0,18202,Crystal Palace,Arsenal,0,2,1.21,1.44,False,False,True,False,False,True
1,18203,Fulham,Liverpool,2,2,1.27,2.34,False,True,False,False,False,True
2,18204,Bournemouth,Aston Villa,2,0,0.59,0.49,True,False,False,True,False,False
3,18205,Leeds,Wolverhampton Wanderers,2,1,0.89,1.1,True,False,False,False,False,True
4,18206,Newcastle United,Nottingham Forest,2,0,1.86,0.24,True,False,False,True,False,False


In [74]:
# Check whether the xG-implied outcome agrees with the actual outcome of a match
def compare_results(row):
    """Return whether the actual and xG-implied outcome flags agree.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the six flag names ``home_win``, ``draw``,
        ``away_win``, ``xg_home_win``, ``xg_draw`` and ``xg_away_win``: a
        single row (as passed by ``DataFrame.apply(..., axis=1)``) or a whole
        DataFrame.

    Returns
    -------
    bool or pandas.Series of bool
        True where every actual flag equals its xG counterpart. A single row
        yields one Boolean; a DataFrame yields one Boolean per row.
    """
    # `&` instead of `and`: the result is the same for scalar Booleans, but it
    # also works element-wise on whole columns, so the function can be called
    # on the full frame without a row-by-row apply.
    return (
        (row['home_win'] == row['xg_home_win'])
        & (row['draw'] == row['xg_draw'])
        & (row['away_win'] == row['xg_away_win'])
    )
# Evaluate the comparison for every match (axis=1 passes one row at a time)
# and store the verdict as a new column.
xg_agreement = DF_result.apply(compare_results, axis=1)
DF_result['xg_matches_real'] = xg_agreement
DF_result.head()

Unnamed: 0,match_id,teams_home,teams_away,goals_home,goals_away,xg_home,xg_away,home_win,draw,away_win,xg_home_win,xg_draw,xg_away_win,xg_matches_real
0,18202,Crystal Palace,Arsenal,0,2,1.21,1.44,False,False,True,False,False,True,True
1,18203,Fulham,Liverpool,2,2,1.27,2.34,False,True,False,False,False,True,False
2,18204,Bournemouth,Aston Villa,2,0,0.59,0.49,True,False,False,True,False,False,True
3,18205,Leeds,Wolverhampton Wanderers,2,1,0.89,1.1,True,False,False,False,False,True,False
4,18206,Newcastle United,Nottingham Forest,2,0,1.86,0.24,True,False,False,True,False,False,True


## Analysis

In [76]:
# Number of matches where the xG-implied outcome differs from the actual result.
# Negate the Boolean column and sum it (True counts as 1) instead of comparing
# to False and building a filtered copy of the whole frame just to read its length.
# Assumes 'xg_matches_real' is a bool-dtype column; int() keeps a plain Python
# int so the displayed value is unchanged.
discrepancy_count = int((~DF_result['xg_matches_real']).sum())
discrepancy_count

147