# Libraries

In [13]:
from collections import defaultdict
from getpass import getuser
from pathlib import Path

import pandas as pd
from scipy.stats import ttest_ind

# Load European Championship dataset

In [14]:
# OS login name; kept defined so that any other cell that still references
# `user` keeps working.
user = getuser()

# Locate the European Championship workbook under the current user's home
# directory rather than a hard-coded C:\Users\<login> prefix, so the cell does
# not depend on the drive letter or on the profile folder matching the login.
data_path = (
    Path.home() / 'Documents' / 'GitHub' / 'tiebreak_wc'
    / 'data' / 'out' / 'wiki' / 'men' / 'uefa' / 'eu' / 'tb_eu_uefa_men.xlsx'
)

# Read the workbook with pandas' default Excel settings (no encoding option is used)
df_eu = pd.read_excel(data_path)


In [15]:
# One row per (year, stage): total of `changes` and that total divided by the
# number of rows in the group.
aggregated_df_eu = (
    df_eu
    .groupby(['year', 'stage'])
    .agg(
        sum_changes=pd.NamedAgg(column='changes', aggfunc='sum'),
        avg_change_per_team=pd.NamedAgg(
            column='changes',
            # sum / row count, written out explicitly (rows are counted even if a value is missing)
            aggfunc=lambda group_changes: group_changes.sum() / len(group_changes),
        ),
    )
    .reset_index()
)

aggregated_df_eu

Unnamed: 0,year,stage,sum_changes,avg_change_per_team
0,1988,Group 1,4,1.0
1,1988,Group 2,8,2.0
2,1992,Group 1,18,4.5
3,1992,Group 2,8,2.0
4,1996,Group A,6,1.5
5,1996,Group B,14,3.5
6,1996,Group C,12,3.0
7,1996,Group D,6,1.5
8,2000,Group A,11,2.75
9,2000,Group B,6,1.5


# Load World Cup dataset

In [16]:
# Locate the World Cup workbook under the current user's home directory rather
# than a hard-coded C:\Users\<login> prefix, so the cell does not depend on the
# drive letter or on the profile folder matching the login.
data_path = (
    Path.home() / 'Documents' / 'GitHub' / 'tiebreak_wc'
    / 'data' / 'out' / 'kaggle' / 'fifa' / 'tb_wc_fifa_men.xlsx'
)

# Read the workbook with pandas' default Excel settings (no encoding option is
# used) and rename `group_name` to `stage` in the same expression, so that
# re-running the cell always starts again from the file contents.
df_wc = pd.read_excel(data_path).rename(columns={'group_name': 'stage'})
display(df_wc)

Unnamed: 0,year,stage,team,1st,2nd,3rd,4th,changes
0,1986,Group A,Argentina,1,0,0,0,1
1,1986,Group A,Italy,0,1,1,0,2
2,1986,Group A,Bulgaria,0,1,1,0,2
3,1986,Group A,South Korea,0,0,0,2,2
4,1986,Group B,Mexico,2,1,0,0,3
...,...,...,...,...,...,...,...,...
291,2022,Group G,Serbia,0,1,0,2,3
292,2022,Group H,Portugal,1,0,0,0,1
293,2022,Group H,South Korea,0,1,1,1,3
294,2022,Group H,Uruguay,0,1,2,1,4


In [17]:
# One row per (year, stage): total of `changes` and that total divided by the
# number of rows in the group.
aggregated_df_wc = (
    df_wc
    .groupby(['year', 'stage'])
    .agg(
        sum_changes=pd.NamedAgg(column='changes', aggfunc='sum'),
        avg_change_per_team=pd.NamedAgg(
            column='changes',
            # sum / row count, written out explicitly (rows are counted even if a value is missing)
            aggfunc=lambda group_changes: group_changes.sum() / len(group_changes),
        ),
    )
    .reset_index()
)

aggregated_df_wc

Unnamed: 0,year,stage,sum_changes,avg_change_per_team
0,1986,Group A,7,1.75
1,1986,Group B,15,3.75
2,1986,Group C,10,2.50
3,1986,Group D,8,2.00
4,1986,Group E,4,1.00
...,...,...,...,...
69,2022,Group D,9,2.25
70,2022,Group E,19,4.75
71,2022,Group F,8,2.00
72,2022,Group G,10,2.50


In [18]:
# Stack the two per-group tables into one frame with a fresh 0..n-1 index;
# the `wc` column records which table each row came from.
combined_df = pd.concat(
    [
        aggregated_df_wc.assign(wc=1),  # World Cup rows
        aggregated_df_eu.assign(wc=0),  # European Championship rows
    ],
    ignore_index=True,
)
combined_df

Unnamed: 0,year,stage,sum_changes,avg_change_per_team,wc
0,1986,Group A,7,1.75,1
1,1986,Group B,15,3.75,1
2,1986,Group C,10,2.50,1
3,1986,Group D,8,2.00,1
4,1986,Group E,4,1.00,1
...,...,...,...,...,...
111,2024,Group B,10,2.50,0
112,2024,Group C,8,2.00,0
113,2024,Group D,18,4.50,0
114,2024,Group E,11,2.75,0


In [19]:
# Samples: the per-group average change for each tournament, taken from the
# two separate aggregated tables.
wc_data = aggregated_df_wc['avg_change_per_team']
eu_data = aggregated_df_eu['avg_change_per_team']

# Two-sample t-test; equal_var=False requests Welch's variant, which does not
# assume the two samples share a variance.
t_stat, p_value = ttest_ind(wc_data, eu_data, equal_var=False)

# Collect both numbers under readable labels for display
t_test_results = dict(zip(("t-statistic", "p-value"), (t_stat, p_value)))

t_test_results


{'t-statistic': np.float64(0.6546466633008601),
 'p-value': np.float64(0.5146045555344064)}

# Suspense

## European Championship

In [20]:
# Locate the European Championship suspense workbook under the current user's
# home directory rather than a hard-coded C:\Users\<login> prefix, so the cell
# does not depend on the drive letter or on the profile folder matching the login.
data_path = (
    Path.home() / 'Documents' / 'GitHub' / 'tiebreak_wc'
    / 'data' / 'out' / 'wiki' / 'men' / 'uefa' / 'eu' / 'active_suspense_eu.xlsx'
)

# Read the workbook with pandas' default Excel settings (no encoding option is used)
suspense_eu = pd.read_excel(data_path)

display(suspense_eu)

Unnamed: 0,year,team,stage,aggregate_active_suspense
0,1988,Netherlands,Group 2,4
1,1988,Republic of Ireland,Group 2,1
2,1992,CIS,Group 2,3
3,1992,Denmark,Group 1,2
4,1992,England,Group 1,2
5,1992,France,Group 1,2
6,1992,Scotland,Group 2,4
7,1996,Bulgaria,Group B,3
8,1996,Czech Republic,Group C,3
9,1996,France,Group B,1


In [21]:
# One row per (year, stage) holding the total of `aggregate_active_suspense`.
# NOTE(review): only (year, stage) pairs that have at least one row in
# `suspense_eu` appear in the result - confirm whether groups missing from the
# sheet should be counted as zero suspense.
agg_suspense_eu = (
    suspense_eu
    .groupby(['year', 'stage'])
    .agg(sum_suspense=pd.NamedAgg(column='aggregate_active_suspense', aggfunc='sum'))
    .reset_index()
)

agg_suspense_eu

Unnamed: 0,year,stage,sum_suspense
0,1988,Group 2,5
1,1992,Group 1,6
2,1992,Group 2,7
3,1996,Group A,5
4,1996,Group B,7
5,1996,Group C,3
6,2000,Group A,3
7,2000,Group B,4
8,2000,Group C,7
9,2004,Group A,4


## World Cup

In [22]:
# Locate the World Cup suspense workbook under the current user's home
# directory rather than a hard-coded C:\Users\<login> prefix, so the cell does
# not depend on the drive letter or on the profile folder matching the login.
# NOTE(review): the file name is spelled "suspence" here, unlike the
# "active_suspense_eu.xlsx" file used for the European Championship - confirm
# it matches the file on disk.
data_path = (
    Path.home() / 'Documents' / 'GitHub' / 'tiebreak_wc'
    / 'data' / 'out' / 'kaggle' / 'fifa' / 'active_suspence_wc.xlsx'
)

# Read the workbook with pandas' default Excel settings (no encoding option is
# used) and rename `group_name` to `stage` in the same expression, so that
# re-running the cell always starts again from the file contents.
suspense_wc = pd.read_excel(data_path).rename(columns={'group_name': 'stage'})

display(suspense_wc)

Unnamed: 0,year,team,stage,aggregate_active_suspense
0,1986,Algeria,Group D,5
1,1986,Morocco,Group F,1
2,1986,Northern Ireland,Group D,1
3,1986,Poland,Group F,1
4,1986,Portugal,Group F,5
...,...,...,...,...
73,2022,Mexico,Group C,4
74,2022,Saudi Arabia,Group C,1
75,2022,Senegal,Group A,2
76,2022,Switzerland,Group G,1


In [23]:
# One row per (year, stage) holding the total of `aggregate_active_suspense`.
# NOTE(review): only (year, stage) pairs that have at least one row in
# `suspense_wc` appear in the result - confirm whether groups missing from the
# sheet should be counted as zero suspense.
agg_suspense_wc = (
    suspense_wc
    .groupby(['year', 'stage'])
    .agg(sum_suspense=pd.NamedAgg(column='aggregate_active_suspense', aggfunc='sum'))
    .reset_index()
)

agg_suspense_wc

Unnamed: 0,year,stage,sum_suspense
0,1986,Group A,7
1,1986,Group D,6
2,1986,Group F,7
3,1990,Group A,1
4,1990,Group B,6
5,1990,Group C,2
6,1990,Group E,3
7,1990,Group F,3
8,1994,Group A,2
9,1994,Group B,9


In [24]:
# Samples: the per-group suspense totals for each tournament.
# NOTE(review): these totals only cover groups that have rows in the suspense
# sheets, so the comparison is between groups with recorded suspense - confirm
# that leaving out the remaining groups is intended.
wc_data = agg_suspense_wc['sum_suspense']
eu_data = agg_suspense_eu['sum_suspense']

# Two-sample t-test; equal_var=False requests Welch's variant, which does not
# assume the two samples share a variance.
t_stat, p_value = ttest_ind(wc_data, eu_data, equal_var=False)

# Collect both numbers under readable labels for display
t_test_results = dict(zip(("t-statistic", "p-value"), (t_stat, p_value)))

t_test_results


{'t-statistic': np.float64(0.5059861166577166),
 'p-value': np.float64(0.6147433449270425)}