# Structure
* Perform basic analysis: counts, means, std, max, etc.
* Basic EDA to understand data dynamics: distribution of the model, independence
* Statistical tests to confirm
  * Goals are Poisson distributed
  * In/dependence between home/away goals
  * Significant difference between home/away goals
Assumptions for model by Dixon are satisfied and we can proceed to implement the model to estimate probabilities of team encounters

In [2]:
from football_odds.utils.connectors import QuestDB
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, poisson_means_test, poisson, chisquare, chi2_contingency

ModuleNotFoundError: No module named 'football_odds'

In [None]:
# SQL for the analysis data set: one row per fixture with league name, season,
# fixture date, home/away team names, the home-winner flag and both scores.
# Fixtures are joined to their league (by league id and season) and to the
# teams table twice (once for the home side, once for the away side), then
# restricted to 'Premier League', seasons >= 2021, and rows with both scores set.
# NOTE(review): there is a trailing comma after `dff.goals_away` right before
# `from` -- confirm the database's SQL dialect accepts it.
Q = r'''
select
  dlf.league_name, 
  dlf.season,
  dff.fixture_date,
  dtfh.team_name as home_team_name, 
  dtfa.team_name as away_team_name, 
  dff.teams_home_winner, 
  dff.goals_home, 
  dff.goals_away,
from dim_fixtures_fa dff
inner join dim_leagues_fa dlf
  on dlf.league_id = dff.league_id and dff.league_season = dlf.season
inner join dim_teams_fa dtfh
  on dtfh.team_id = dff.teams_home_id
inner join dim_teams_fa dtfa
  on dtfa.team_id = dff.teams_away_id
where 1=1
  and dlf.league_name IN ('Premier League')
  and goals_home is not null and goals_away is not null
  and season >= 2021
;
'''

# Run the fixtures query and keep the result as the notebook-wide `df`.
conn = QuestDB()
df = conn.execute_query(Q)

# Derived per-match features: absolute score margin and combined goals.
df['goal_difference'] = df['goals_home'].sub(df['goals_away']).abs()
df['total_goals'] = df['goals_home'].add(df['goals_away'])

In [None]:
# Display pandas' default descriptive statistics for `df`; the figures quoted
# in the "Summary Statistics" notes are read off this table.
df.describe()

## Summary Statistics

* We have a sample size of 860 games
* An average of 2.9 goals are scored per match
* The average number of home goals is almost 1.6 while away goals are 1.3. Is this statistically significant?
* The variation of home goals is slightly larger than that of away goals. How can this impact the results?
* The Liverpool-Bournemouth 2022-08-27 match ended 9-0, the largest number of home goals scored
* The Sheffield-Newcastle 2023-09-23 match ended 0-8, the largest number of away goals scored
* Should we truncate the high goals scored to something more reasonable to not skew the results? How to test this?

In [None]:
# The eight fixtures with the most home goals, highest first.
df.sort_values(by='goals_home', ascending=False).head(n=8)

In [None]:
# The eight fixtures with the most away goals, highest first.
df.sort_values(by='goals_away', ascending=False).head(n=8)

In [None]:
# The eight fixtures with the most away goals, highest first.
# NOTE(review): this cell repeats the preceding away-goals cell verbatim --
# possibly a different column (e.g. total_goals) was intended; confirm.
df.sort_values(by='goals_away', ascending=False).head(n=8)

In [None]:
# Two stacked bar charts: mean goals scored per team (top) and mean goals
# conceded per team (bottom), each sorted ascending.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))

# --- Scored: every fixture contributes one row per side, keyed by that side's team.
df_home = df[['home_team_name', 'goals_home']].rename(
    columns={'home_team_name': 'team', 'goals_home': 'goals'}
)
df_away = df[['away_team_name', 'goals_away']].rename(
    columns={'away_team_name': 'team', 'goals_away': 'goals'}
)
df_team_goals = pd.concat([df_home, df_away])
(
    df_team_goals
    .groupby('team')
    .mean()
    .sort_values(by='goals')
    .plot.bar(ax=ax[0])
)
ax[0].set_title('Average number of goals per team')
ax[0].tick_params(axis='x', labelrotation=45)
ax[0].grid()

# --- Conceded: same layout, but each team is paired with the opponent's goals.
df_home = df[['home_team_name', 'goals_away']].rename(
    columns={'home_team_name': 'team', 'goals_away': 'goals'}
)
df_away = df[['away_team_name', 'goals_home']].rename(
    columns={'away_team_name': 'team', 'goals_home': 'goals'}
)
df_team_conceded = pd.concat([df_home, df_away])
(
    df_team_conceded
    .groupby('team')
    .mean()
    .sort_values(by='goals')
    .plot.bar(ax=ax[1])
)
ax[1].set_title('Average number of conceded goals per team')
ax[1].tick_params(axis='x', labelrotation=45)
ax[1].grid()

In [None]:
# home/away avg goals
# Mean goals per match for each side; both values are reused by later cells.
avg_home_goals = df['goals_home'].mean()
avg_away_goals = df['goals_away'].mean()

# Side-by-side bars comparing the two averages.
plt.bar(
    x=['home', 'away'],
    height=[avg_home_goals, avg_away_goals],
    color=['blue', 'green'],
)
plt.xticks(rotation=45)
plt.grid()
plt.title('Average number of home and away goals per match')

In [None]:
# Overlaid histograms of home and away goals per match, with each side's mean
# marked by a vertical line.
# Goals are integer counts, so use unit-width bins centred on 0, 1, 2, ...
# shared by both series. The default (10 equal-width bins over each series'
# own range) puts bin edges at non-integer positions, can leave artificial
# gaps, and makes the two overlaid histograms use different bins.
goal_bins = np.arange(max(df.goals_home.max(), df.goals_away.max()) + 2) - 0.5
df.goals_home.hist(bins=goal_bins, alpha=0.4, label='home', color='blue')
df.goals_away.hist(bins=goal_bins, alpha=0.4, label='away', color='orange')
plt.title('Home/Away goals scored histogram')
plt.axvline(x=avg_home_goals, color='blue', alpha=0.4)
plt.axvline(x=avg_away_goals, color='orange', alpha=0.4)
plt.legend()

## Poisson Test
We confirm that the Poisson distribution is a valid distribution for the number of home/away goals

In [None]:
# Pearson chi-square goodness-of-fit test of home and away goals against a
# Poisson distribution whose rate is the sample mean (the Poisson MLE).
#
# Differences from a naive observed-vs-pmf comparison, and why:
# * Observed counts are reindexed onto 0..max so that a goal count which never
#   occurs (e.g. no match with 8 goals when a 9 exists) becomes an explicit 0
#   instead of silently shifting/shortening f_obs relative to f_exp.
# * The sparse upper tail is pooled into a single ">= cut" cell so that its
#   expected count is at least MIN_EXPECTED_COUNT; otherwise a single
#   high-scoring match with a near-zero expected count dominates the statistic.
# * ddof=1 because one parameter (the rate) is estimated from the same data.
MIN_EXPECTED_COUNT = 5

for side, goals in (('HOME', df.goals_home), ('AWAY', df.goals_away)):
    MLE = goals.mean()
    n_games = len(goals)
    support = np.arange(int(goals.max()) + 1)

    f_obs = goals.value_counts().reindex(support, fill_value=0).to_numpy()
    f_exp = poisson.pmf(support, MLE) * n_games

    # Expected number of matches with >= k goals, for each k in `support`.
    # It is non-increasing in k, so the cells meeting the threshold form a
    # prefix; `cut` is the last such k (kept >= 1 so at least two cells remain).
    tail_exp = poisson.sf(support - 1, MLE) * n_games
    cut = max(int(np.count_nonzero(tail_exp >= MIN_EXPECTED_COUNT)) - 1, 1)
    f_obs = np.append(f_obs[:cut], f_obs[cut:].sum())
    f_exp = np.append(f_exp[:cut], poisson.sf(cut - 1, MLE) * n_games)

    # chisquare requires both vectors to have the same total; rescale to guard
    # against rounding (and against rows excluded from the value counts).
    f_exp = f_obs.sum() / f_exp.sum() * f_exp

    stat, pval = chisquare(f_obs=f_obs, f_exp=f_exp, ddof=1)
    # NOTE(review): the "> 0.05" suffix is printed unconditionally; check the
    # printed p-value before reading it as a conclusion.
    print(f'{side}: Pearson Chi-square p-value={pval} > 0.05')

## Test for Independence
Chi-Square test assumptions:
* The two samples are independent
* No expected cell count is = 0
* No more than 20% of the cells have an expected cell count < 5


### Hypothesis
H0: Variables Independent
H1: Variables Dependent

After considering only matches with 4 or fewer goals for each team (95% of all matches) so as not to violate point 3, we use the Chi-square test and get a p-value of 0.13 > 0.05. Therefore we cannot reject H0, i.e. that the variables are independent.

For details see https://www.pythonfordatascience.org/chi-square-test-of-independence-python/ 

In [None]:
# Share of matches in which BOTH sides scored fewer than `n` goals.
# `n` is also the size to which the home x away contingency table is cut for
# the independence test (goal counts 0..n-1), so the comparison must be strict:
# with `<=` the reported share would include matches with exactly n goals,
# which are not part of the tested table, and would contradict the
# "less than {n}" wording of the message.
n = 5
both_below_n = (df.goals_away < n) & (df.goals_home < n)
num_lt_n_goals = both_below_n.sum()
num_games = len(df)
print(f'{num_lt_n_goals} outof {num_games} games ({int(num_lt_n_goals/num_games*100)}%) contain less than {n} goals for both teams')

In [None]:
# Contingency table of match counts: rows are home goals, columns are away goals.
crosstab = pd.crosstab(index=df['goals_home'], columns=df['goals_away'])
crosstab

In [None]:
# Keep only the cells where both sides scored fewer than `n` goals, so the
# sparse high-score cells do not violate the chi-square expected-count rule.
# Slice by goal-count LABEL (inclusive upper bound n - 1) rather than by
# position: a positional slice silently picks the wrong rows/columns if some
# goal count below n never occurs in the data.
crosstab = crosstab.loc[:n - 1, :n - 1]
crosstab

In [None]:
# Chi-square test of independence between home and away goals on the
# contingency table built above.
# NOTE(review): the "> 0.05" suffix is printed regardless of the actual value.
res = chi2_contingency(observed=crosstab)
print(f'Chi-square test p-value = {res.pvalue:.2f} > 0.05')

## Home Advantage significance
* 24% More home goals scored than away goals
* Mann-Whitney-U: Since p-value < 0.05, we reject null hypothesis, i.e. the home goals scored are statistically greater than the away goals
* Further confirmed by E-test, assuming goals are Poisson distributed: p-value < 0.05

In [None]:
# Home advantage as the ratio of mean home goals to mean away goals.
print('Home Advantage: {:.2f}'.format(avg_home_goals / avg_away_goals))

# One-sided Mann-Whitney U test: alternative is that home goals tend to be
# greater than away goals.
res = mannwhitneyu(
    x=df['goals_home'],
    y=df['goals_away'],
    alternative='greater',
)
print(f'MWU-test p-value = {res.pvalue}')

In [None]:
# E-test comparing the two Poisson rates: total goals (k) over number of
# matches (n) for the home side versus the away side. Uses SciPy's default
# alternative, since none is passed.
res = poisson_means_test(
    k1=df['goals_home'].sum(),
    n1=df['goals_home'].count(),
    k2=df['goals_away'].sum(),
    n2=df['goals_away'].count(),
)
print(f'E-test p-value = {res.pvalue}')