In [None]:
# Data source (soccerdata) and the paired t-test used in the last cell
import soccerdata as sd
from scipy.stats import ttest_rel
# Notebook display settings: inline matplotlib figures, 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Data handling and plotting libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px

# Apply seaborn's default theme to the figures drawn below
sns.set_theme()

In [None]:
# Read the Premier League match history for the requested season via soccerdata.
prem_hist = sd.MatchHistory('ENG-Premier League', seasons=2023)
games = prem_hist.read_games()
# Preview five random games; the fixed random_state keeps the displayed rows
# identical across kernel restarts so the notebook output is reproducible.
games.sample(5, random_state=0)

In [None]:
def home_away_results(games: pd.DataFrame) -> pd.DataFrame:
    """Returns aggregated home/away results per team, including average points per game.

    Parameters
    ----------
    games : pd.DataFrame
        One row per game. After ``reset_index()`` it must expose the columns
        'date', 'home_team', 'away_team' and 'FTR', where 'FTR' is 'H' (home win),
        'A' (away win) or 'D' (draw). Rows whose 'FTR' is missing are ignored.

    Returns
    -------
    pd.DataFrame
        Indexed by ('team', 'is_home') with 'is_home' being 'Home' or 'Away', and
        with the columns 'total_points' and 'avg_points_per_game'.
    """
    # A game without a full-time result must not count as a 0-point game,
    # otherwise it would silently lower the per-game average.
    played = games.reset_index().dropna(subset=['FTR'])

    # Melt the data to have one row per team per game
    res = pd.melt(
        played,
        id_vars=['date', 'FTR'],
        value_name='team', var_name='is_home',
        value_vars=['home_team', 'away_team'],
    )
    res['is_home'] = res['is_home'].map({'home_team': 'Home', 'away_team': 'Away'})

    # 3 points for the winning side, 1 point each for a draw, 0 otherwise
    won = (
        ((res['is_home'] == 'Home') & (res['FTR'] == 'H'))
        | ((res['is_home'] == 'Away') & (res['FTR'] == 'A'))
    )
    drawn = res['FTR'] == 'D'
    res['points'] = won.astype(int) * 3 + drawn.astype(int)

    # Group by team and home/away to calculate total and average points
    return res.groupby(['team', 'is_home'])['points'].agg(
        total_points='sum',
        avg_points_per_game='mean',
    )

In [None]:
# Aggregate the loaded games into per-team Home/Away point totals and averages.
results = home_away_results(games)
# Preview the first six rows of the (team, is_home)-indexed result.
results.head(6)

In [None]:
# Hex colour per team, passed to Plotly as `color_discrete_map` with `color='team'`,
# so every key has to match a value of the 'team' column.
team_colors = {
    'Arsenal': '#EF0107',  # red
    'Aston Villa': '#95BFE5',  # light blue
    'Bournemouth': '#DA291C',  # red
    'Brentford': '#E30613',  # red
    'Brighton': '#0057B8',  # blue
    'Burnley': '#6C1D45',  # claret
    'Chelsea': '#034694',  # blue
    'Crystal Palace': '#1B458F',  # blue
    'Everton': '#003399',  # blue
    'Fulham': '#000000',  # black
    'Liverpool': '#C8102E',  # red
    'Luton': '#FB4F14',  # orange
    'Man City': '#6CABDD',  # sky blue
    'Man United': '#DA291C',  # red
    'Newcastle': '#241F20',  # black
    "Nott'm Forest": '#DD0000',  # red
    'Sheffield United': '#EE2737',  # red
    'Tottenham': '#132257',  # dark blue
    'West Ham': '#7A263A',  # claret
    'Wolves': '#FFA500',  # orange
}

In [None]:
# Interactive Plotly line chart: one coloured line per team joining its
# Away and Home average points per game.
fig = px.line(
    results.reset_index(),
    x='is_home',
    y='avg_points_per_game',
    color='team',
    line_group='team',
    markers=True,
    color_discrete_map=team_colors,
    hover_data={'team': True, 'avg_points_per_game': ':.2f'},
    labels={'avg_points_per_game': 'Average Points per Game'},
    title='Average Points per Game: Home vs Away',
)

# Set one explicit hover template on every trace.
fig.update_traces(hovertemplate='Average Points per Game: %{y:.2f}')

# Square, legend-free figure with a centred title and a transparent plot area.
fig.update_layout(
    title_x=0.5,
    width=600,
    height=600,
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_title="",
    yaxis_title="Avg Points per Game",
    xaxis=dict(tickmode='array', tickvals=['Home', 'Away']),
)

# Render in the notebook, then export a standalone HTML copy.
fig.show()
fig.write_html("interactive_plot.html")

In [None]:
# Small multiples: one panel per team, five panels per row, Away drawn before Home.
g = sns.FacetGrid(data=results.reset_index(), col='team', col_wrap=5)
g.map(sns.pointplot, 'is_home', 'avg_points_per_game', order=["Away", "Home"])
# Title each panel with the team name only and label the shared y axis.
g.set_titles("{col_name}")
g.set_axis_labels('', 'Avg Points per Game')
plt.savefig('home_away_points.png')

In [None]:
# t-test
def paired_t_test_on_points(results: pd.DataFrame):
    """Run a paired t-test on home versus away average points per game.

    Parameters
    ----------
    results : pd.DataFrame
        Frame with an 'avg_points_per_game' column whose innermost index level
        carries the labels 'Home' and 'Away' (the shape returned by
        ``home_away_results``).

    Returns
    -------
    tuple
        ``(t_stat, p_value)`` taken from ``scipy.stats.ttest_rel``, with the
        home values as the first sample and the away values as the second.
    """
    # Spread the innermost index level into columns so each remaining row
    # holds one paired observation: its 'Home' and its 'Away' average.
    avg_by_venue = results['avg_points_per_game'].unstack()

    outcome = ttest_rel(avg_by_venue['Home'], avg_by_venue['Away'])
    return outcome.statistic, outcome.pvalue

# Calculate the t-statistic and p-value for home vs. away average points per game
t_stat, p_value = paired_t_test_on_points(results)

# Report both values formatted to four decimal places
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")