# Libraries

In [94]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from scipy.stats import ttest_ind

In [95]:
# Look up the login name of the account running the notebook; it is
# interpolated into the per-user C:\Users\... data paths in the loading cell.
user = getuser()


# Load Data

In [96]:
# All inputs live under one per-user checkout of the repository. The root is
# defined once here so that relocating the repository means editing a single
# line instead of ten path literals.
REPO_DIR = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc'
WIKI_DIR = rf'{REPO_DIR}\data\out\wiki\men'  # goals_* / mbm_* workbooks
ELO_DIR = rf'{REPO_DIR}\data\in'             # elo_* workbooks

# ---- EURO datasets ----
data_path = rf'{WIKI_DIR}\uefa\eu\goals_eu_uefa.xlsx'
goals_eu_uefa = pd.read_excel(data_path)

data_path = rf'{WIKI_DIR}\fifa\eu\goals_eu_fifa.xlsx'
goals_eu_fifa = pd.read_excel(data_path)

data_path = rf'{WIKI_DIR}\uefa\eu\mbm_eu_uefa.xlsx'
mbm_eu_uefa = pd.read_excel(data_path)

data_path = rf'{WIKI_DIR}\fifa\eu\mbm_eu_fifa.xlsx'
mbm_eu_fifa = pd.read_excel(data_path)

# ---- WORLD CUP datasets ----
data_path = rf'{WIKI_DIR}\uefa\wc\goals_wc_uefa.xlsx'
goals_wc_uefa = pd.read_excel(data_path)

data_path = rf'{WIKI_DIR}\fifa\wc\goals_wc_fifa.xlsx'
goals_wc_fifa = pd.read_excel(data_path)

data_path = rf'{WIKI_DIR}\uefa\wc\mbm_wc_uefa.xlsx'
mbm_wc_uefa = pd.read_excel(data_path)

data_path = rf'{WIKI_DIR}\fifa\wc\mbm_wc_fifa.xlsx'
mbm_wc_fifa = pd.read_excel(data_path)

# ---- Elo files ----
data_path = rf'{ELO_DIR}\elo_eu.xlsx'
elo_eu = pd.read_excel(data_path)

data_path = rf'{ELO_DIR}\elo_wc.xlsx'
elo_wc = pd.read_excel(data_path)


In [97]:
# Team-name substitutions applied to the home/away columns below
replacements = {
    "Republic of Ireland": "Ireland",
    "CIS": "Commonwealth of Independent States",
    "FR Yugoslavia": "Yugoslavia"
}

# Every goals_* / mbm_* frame loaded above (EURO first, then World Cup)
datasets = [
    goals_eu_uefa, goals_eu_fifa, mbm_eu_uefa, mbm_eu_fifa,
    goals_wc_uefa, goals_wc_fifa, mbm_wc_uefa, mbm_wc_fifa
]

# Rewrite both team columns of each frame; the frames are updated in place
for df in datasets:
    for team_col in ('home_team', 'away_team'):
        df[team_col] = df[team_col].replace(replacements)

# Optional: show one updated DataFrame
display(goals_eu_uefa)



Unnamed: 0,year,stage,date,time,goal_minute,half_time,home_team,away_team,scorer_team,new_top_teams,...,third_qualify,qualified_teams,qual_changed,qual_count,pts_diff,gls_diff,elo_home,elo_away,h2h,suspense
0,1984,Group 1,1984-06-19,20:30,0,,,,,"['Denmark', 'France']",...,0,"['France', 'Denmark']",0,0,0,-7,,,1,1
1,1984,Group 1,1984-06-19,20:30,26,1.0,Denmark,Belgium,Belgium,"['France', 'Belgium']",...,0,"['France', 'Belgium']",1,1,-2,5,1809.0,1898.0,1,1
2,1984,Group 1,1984-06-19,20:30,32,1.0,France,Yugoslavia,Yugoslavia,"['France', 'Belgium']",...,0,"['France', 'Belgium']",0,1,-2,5,1960.0,1890.0,1,1
3,1984,Group 1,1984-06-19,20:30,39,1.0,Denmark,Belgium,Belgium,"['France', 'Belgium']",...,0,"['France', 'Belgium']",0,1,-2,3,1809.0,1898.0,1,0
4,1984,Group 1,1984-06-19,20:30,41,1.0,Denmark,Belgium,Denmark,"['France', 'Belgium']",...,0,"['France', 'Belgium']",0,1,-2,5,1809.0,1898.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,2024,Group F,2024-06-26,21:00,2,1.0,Georgia,Portugal,Georgia,"['Portugal', 'Turkey', 'Georgia']",...,1,"['Portugal', 'Turkey', 'Georgia']",1,1,-2,0,1666.0,2002.0,0,1
266,2024,Group F,2024-06-26,21:00,51,2.0,Czech Republic,Turkey,Turkey,"['Portugal', 'Turkey', 'Georgia']",...,1,"['Portugal', 'Turkey', 'Georgia']",0,1,-3,-1,1777.0,1749.0,0,0
267,2024,Group F,2024-06-26,21:00,57,2.0,Georgia,Portugal,Georgia,"['Portugal', 'Turkey', 'Georgia']",...,1,"['Portugal', 'Turkey', 'Georgia']",0,1,-3,-2,1666.0,2002.0,0,0
268,2024,Group F,2024-06-26,21:00,66,2.0,Czech Republic,Turkey,Czech Republic,"['Portugal', 'Turkey', 'Georgia']",...,1,"['Portugal', 'Turkey', 'Georgia']",0,1,-2,-1,1777.0,1749.0,0,1


# Aggregate measures for Elo, suspense, and qualification changes

In [98]:
def compute_aggregated_metrics(goals_df, elo_df):
    """
    Attach Elo ratings to each row of match data and summarise per (year, stage).

    Each row of ``goals_df`` is left-joined to ``elo_df`` twice (once on the home
    team, once on the away team, always together with 'year'). The row's
    ``elo_avg`` is the mean of the two ratings, ignoring a missing side. Rows
    are then grouped by year and stage and the three metrics are averaged.

    Parameters:
    - goals_df: DataFrame with 'home_team', 'away_team', 'year', 'stage',
      'qual_count' and 'suspense' columns. It is not modified.
    - elo_df: DataFrame with 'team', 'year' and 'elo_rating' columns.

    Returns:
    - DataFrame with columns 'year', 'group' (the renamed 'stage'), 'elo_avg',
      'avg_qual_count' and 'avg_suspense', one row per (year, stage).
    """
    ratings = elo_df[['year', 'team', 'elo_rating']]

    # Left-join the ratings once per side: home first, then away
    with_elo = goals_df
    for side in ('home', 'away'):
        team_col = f'{side}_team'
        side_ratings = ratings.rename(
            columns={'team': team_col, 'elo_rating': f'{side}_elo'}
        )
        with_elo = with_elo.merge(side_ratings, on=['year', team_col], how='left')

    # Row-wise mean of the two sides' ratings
    with_elo['elo_avg'] = with_elo[['home_elo', 'away_elo']].mean(axis=1)

    # Average each metric within a (year, stage) cell, then expose stage as 'group'
    summary = (
        with_elo
        .groupby(['year', 'stage'])
        .agg(
            elo_avg=('elo_avg', 'mean'),
            avg_qual_count=('qual_count', 'mean'),
            avg_suspense=('suspense', 'mean'),
        )
        .reset_index()
        .rename(columns={'stage': 'group'})
    )

    output_columns = ['year', 'group', 'elo_avg', 'avg_qual_count', 'avg_suspense']
    return summary[output_columns]


In [99]:
# Aggregate all four EURO frames against the EURO Elo table
agg_goals_eu_uefa, agg_goals_eu_fifa, agg_mbm_eu_uefa, agg_mbm_eu_fifa = (
    compute_aggregated_metrics(frame, elo_eu)
    for frame in (goals_eu_uefa, goals_eu_fifa, mbm_eu_uefa, mbm_eu_fifa)
)


In [100]:
# ---- Compute aggregated metrics for World Cup datasets ----
agg_goals_wc_uefa, agg_goals_wc_fifa, agg_mbm_wc_uefa, agg_mbm_wc_fifa = (
    compute_aggregated_metrics(frame, elo_wc)
    for frame in (goals_wc_uefa, goals_wc_fifa, mbm_wc_uefa, mbm_wc_fifa)
)


## Manually add elo_avg for 2024 Group C, where no goal was scored

In [101]:
# Keep only the aggregated rows that ended up without an elo_avg
missing_elo = agg_mbm_eu_uefa.loc[lambda frame: frame['elo_avg'].isna()]

# Print them as plain text
print(missing_elo)


    year    group  elo_avg  avg_qual_count  avg_suspense
40  2024  Group C      NaN             0.0           1.0


In [102]:
def patch_group_c_elo(agg_df, elo_df, year=2024, group='Group C', teams=None):
    """
    Overwrite ``elo_avg`` for one (year, group) row with the mean Elo of its teams.

    With the default arguments this patches EURO 2024 Group C using England,
    Denmark, Slovenia and Serbia, exactly as the original hard-coded version
    did. The extra keyword arguments allow the same patch for any other group.

    Parameters:
    - agg_df: aggregated DataFrame with 'year', 'group' and 'elo_avg' columns.
      It is modified in place.
    - elo_df: DataFrame with 'year', 'team' and 'elo_rating' columns.
    - year: tournament year to patch (default 2024).
    - group: value of the 'group' column to patch (default 'Group C').
    - teams: team names whose ratings are averaged; None selects the four
      default Group C teams.

    Returns:
    - agg_df itself (the same object that was passed in), after the update.

    Note: the matching rows are overwritten unconditionally, whether or not
    their current elo_avg is NaN. If no team/year rows are found in elo_df,
    the written value is NaN.
    """
    if teams is None:
        teams = ['England', 'Denmark', 'Slovenia', 'Serbia']

    # Mean rating of the selected teams in the selected year
    team_rows = (elo_df['year'] == year) & (elo_df['team'].isin(teams))
    group_elo = elo_df.loc[team_rows, 'elo_rating'].mean()

    # Write the value into the matching aggregated row(s)
    target_rows = (agg_df['year'] == year) & (agg_df['group'] == group)
    agg_df.loc[target_rows, 'elo_avg'] = group_elo

    return agg_df


In [103]:
# Fill the 2024 Group C elo_avg in each of the four EURO aggregates
agg_goals_eu_uefa, agg_goals_eu_fifa, agg_mbm_eu_uefa, agg_mbm_eu_fifa = (
    patch_group_c_elo(frame, elo_eu)
    for frame in (agg_goals_eu_uefa, agg_goals_eu_fifa, agg_mbm_eu_uefa, agg_mbm_eu_fifa)
)


In [104]:
# Keep only the aggregated World Cup rows that ended up without an elo_avg
missing_elo = agg_mbm_wc_fifa.loc[lambda frame: frame['elo_avg'].isna()]

# Print them as plain text
print(missing_elo)


Empty DataFrame
Columns: [year, group, elo_avg, avg_qual_count, avg_suspense]
Index: []


In [106]:
def show_correlations(df, dataset_name):
    """
    Print the pairwise correlation matrix of the three aggregated metrics.

    Parameters:
    - df: DataFrame containing 'elo_avg', 'avg_qual_count' and 'avg_suspense'.
    - dataset_name: label inserted into the printed heading.

    Returns:
    - None; the heading and the matrix are written to standard output.
    """
    metric_columns = ['elo_avg', 'avg_qual_count', 'avg_suspense']
    correlations = df[metric_columns].corr()

    heading = f"\nCorrelation matrix for {dataset_name}:\n"
    print(heading)
    print(correlations)


In [107]:
# Label -> aggregated frame, in print order: EURO datasets first, then World Cup
correlation_inputs = {
    # ---- EURO datasets ----
    'agg_goals_eu_uefa': agg_goals_eu_uefa,
    'agg_goals_eu_fifa': agg_goals_eu_fifa,
    'agg_mbm_eu_uefa': agg_mbm_eu_uefa,
    'agg_mbm_eu_fifa': agg_mbm_eu_fifa,
    # ---- WORLD CUP datasets ----
    'agg_goals_wc_uefa': agg_goals_wc_uefa,
    'agg_goals_wc_fifa': agg_goals_wc_fifa,
    'agg_mbm_wc_uefa': agg_mbm_wc_uefa,
    'agg_mbm_wc_fifa': agg_mbm_wc_fifa,
}

for dataset_label, agg_frame in correlation_inputs.items():
    show_correlations(agg_frame, dataset_label)


Correlation matrix for agg_goals_eu_uefa:

                 elo_avg  avg_qual_count  avg_suspense
elo_avg         1.000000        0.066299     -0.042183
avg_qual_count  0.066299        1.000000      0.252357
avg_suspense   -0.042183        0.252357      1.000000

Correlation matrix for agg_goals_eu_fifa:

                 elo_avg  avg_qual_count  avg_suspense
elo_avg         1.000000        0.177690      0.112943
avg_qual_count  0.177690        1.000000      0.444604
avg_suspense    0.112943        0.444604      1.000000

Correlation matrix for agg_mbm_eu_uefa:

                 elo_avg  avg_qual_count  avg_suspense
elo_avg         1.000000        0.032814     -0.084731
avg_qual_count  0.032814        1.000000      0.218186
avg_suspense   -0.084731        0.218186      1.000000

Correlation matrix for agg_mbm_eu_fifa:

                 elo_avg  avg_qual_count  avg_suspense
elo_avg         1.000000        0.123578      0.041145
avg_qual_count  0.123578        1.000000      0.352235
avg