# Libraries

In [149]:
import pandas as pd
from getpass import getuser
from collections import defaultdict
from scipy.stats import ttest_ind

In [150]:
# Get the current user's login name; it is interpolated into the absolute
# C:\Users\<name>\... data paths built in the loading cells of this notebook.
user = getuser()


# Load European Championship dataset

In [151]:

 # Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\tb_eu_uefa_men.xlsx'

# Read the Excel workbook into a DataFrame (pandas defaults; no encoding option is passed)
df_eu = pd.read_excel(data_path)


In [152]:


# Alternative spellings -> the team label this notebook standardises on
replacements = dict([
    ("Republic of Ireland", "Ireland"),
    ("CIS", "Commonwealth of Independent States"),
    ("FR Yugoslavia", "Yugoslavia"),
])

# Rewrite the 'team' column with the harmonised labels
harmonised_teams = df_eu['team'].replace(replacements)
df_eu['team'] = harmonised_teams

# Show the DataFrame after renaming
display(df_eu)


Unnamed: 0,year,stage,team,1st,2nd,3rd,4th,changes
0,1988,Group 1,West Germany,1,0,0,0,1
1,1988,Group 1,Italy,0,1,0,0,1
2,1988,Group 1,Spain,0,0,1,0,1
3,1988,Group 1,Denmark,0,0,0,1,1
4,1988,Group 2,Soviet Union,1,1,0,0,2
...,...,...,...,...,...,...,...,...
163,2024,Group E,Ukraine,0,1,1,1,3
164,2024,Group F,Portugal,1,0,0,0,1
165,2024,Group F,Turkey,0,1,0,0,1
166,2024,Group F,Georgia,0,0,1,1,2


# Load ELO ratings European Championship

In [153]:

# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_eu.xlsx'

# Read the ELO ratings workbook into a DataFrame (pandas defaults; no encoding option is passed)
elo_eu = pd.read_excel(data_path)
display(elo_eu)

Unnamed: 0,year,month,day,team,elo_rating
0,1988,6,8,England,1998
1,1988,6,8,Soviet Union,1976
2,1988,6,8,Sweden,1972
3,1988,6,8,West Germany,1967
4,1988,6,8,Italy,1937
...,...,...,...,...,...
494,2024,6,12,Faroe Islands,1246
495,2024,6,12,Andorra,1110
496,2024,6,12,Gibraltar,1058
497,2024,6,12,Liechtenstein,968


# Merge ELO ratings and team changes

In [154]:
# Attach each team's ELO rating to df_eu, matching on (year, team).
# Only 'year', 'team' and 'elo_rating' are taken from elo_eu.
df_eu = (
    df_eu
    # Make the cell safe to re-run: without this, a second execution would
    # produce 'elo_rating_x' / 'elo_rating_y' and the aggregation on
    # 'elo_rating' in the next cell would fail.
    .drop(columns=['elo_rating'], errors='ignore')
    .merge(
        elo_eu[['year', 'team', 'elo_rating']],
        on=['year', 'team'],
        how='left',              # keep every df_eu row; unmatched teams get NaN
        validate='many_to_one',  # raise if elo_eu repeats a (year, team) key, which
                                 # would duplicate rows and inflate the summed changes
    )
)



In [155]:
# One row per (year, stage): total of 'changes' and mean of 'elo_rating' over the group's rows
eu_group_totals = df_eu.groupby(['year', 'stage']).agg(
    sum_changes=pd.NamedAgg(column='changes', aggfunc='sum'),
    avg_elo_rating=pd.NamedAgg(column='elo_rating', aggfunc='mean'),
)
# Turn the (year, stage) index back into ordinary columns
aggregated_df_eu = eu_group_totals.reset_index()
aggregated_df_eu.head()

Unnamed: 0,year,stage,sum_changes,avg_elo_rating
0,1988,Group 1,4,1924.5
1,1988,Group 2,8,1935.5
2,1992,Group 1,18,1920.75
3,1992,Group 2,8,1934.0
4,1996,Group A,6,1843.25


# Load standings changes European Championship

In [156]:

 # Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\standings_eu_uefa_men.xlsx'

# Read the standings workbook into a DataFrame (pandas defaults; no encoding option is passed)
df_standings_eu = pd.read_excel(data_path)


In [157]:
# Columns of the standings table in which team names are replaced
columns_to_replace = ['home_team', 'away_team', 'new_top_teams', '1st', '2nd', '3rd']

# Alternative spellings -> the team label this notebook standardises on
replacements = {
    "Republic of Ireland": "Ireland",
    "CIS": "Commonwealth of Independent States",
    "FR Yugoslavia": "Yugoslavia"
}


def _rename_listed_teams(teams):
    """Apply `replacements` to every element when `teams` is a list; otherwise return it unchanged."""
    if not isinstance(teams, list):
        return teams
    return [replacements.get(team, team) for team in teams]


# Whole-cell replacement in each of the listed columns
for team_column in columns_to_replace:
    df_standings_eu[team_column] = df_standings_eu[team_column].replace(replacements)

# Element-wise replacement for any 'new_top_teams' cells that hold Python lists
df_standings_eu['new_top_teams'] = df_standings_eu['new_top_teams'].apply(_rename_listed_teams)

# For every (year, stage) keep the row with the largest 'change_num',
# then renumber the resulting rows from 0
rows_with_max_change = df_standings_eu.groupby(['year', 'stage'])['change_num'].idxmax()
df_standings_eu_filtered = df_standings_eu.loc[rows_with_max_change].reset_index(drop=True)



In [158]:
# Keep only the group keys and the change count from the filtered standings.
# Columns are selected by name rather than by position (.iloc[:, :3]) so the
# result does not depend on the column order of the source workbook.
df_standings_eu_filtered = df_standings_eu_filtered[['year', 'stage', 'change_num']]

# 'sum_changes' is not wanted in the merged table; only avg_elo_rating is attached
aggregated_df_eu_cleaned = aggregated_df_eu.drop(columns=['sum_changes'])

# Inner join on (year, stage): groups missing from either table are dropped
eu_standings = pd.merge(
    df_standings_eu_filtered,
    aggregated_df_eu_cleaned,
    on=['year', 'stage'],
    how='inner',
)
display(eu_standings)


Unnamed: 0,year,stage,change_num,avg_elo_rating
0,1988,Group 1,0,1924.5
1,1988,Group 2,1,1935.5
2,1992,Group 1,3,1920.75
3,1992,Group 2,0,1934.0
4,1996,Group A,2,1843.25
5,1996,Group B,4,1927.75
6,1996,Group C,2,1966.0
7,1996,Group D,0,1873.25
8,2000,Group A,3,1891.5
9,2000,Group B,1,1846.75


# Load World Cup dataset

In [159]:
# Location of the World Cup workbook
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\kaggle\fifa\tb_wc_fifa_men.xlsx'

# Load the workbook and expose its 'group_name' column under the name 'stage'
df_wc = pd.read_excel(data_path).rename(columns={'group_name': 'stage'})
display(df_wc)

Unnamed: 0,year,stage,team,1st,2nd,3rd,4th,changes
0,1986,Group A,Argentina,1,0,0,0,1
1,1986,Group A,Italy,0,1,1,0,2
2,1986,Group A,Bulgaria,0,1,1,0,2
3,1986,Group A,South Korea,0,0,0,2,2
4,1986,Group B,Mexico,2,1,0,0,3
...,...,...,...,...,...,...,...,...
291,2022,Group G,Serbia,0,1,0,2,3
292,2022,Group H,Portugal,1,0,0,0,1
293,2022,Group H,South Korea,0,1,1,1,3
294,2022,Group H,Uruguay,0,1,2,1,4


In [160]:
# Harmonise the Irish team label in the 'team' column
ireland_alias = {"Republic of Ireland": "Ireland"}
df_wc['team'] = df_wc['team'].replace(ireland_alias)

# Load ELO ratings World Cup

In [161]:

# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\in\elo_wc.xlsx'

# Read the ELO ratings workbook into a DataFrame (pandas defaults; no encoding option is passed)
elo_wc = pd.read_excel(data_path)

# Merge team changes and ELO ratings

In [162]:
# Attach each team's ELO rating to df_wc, matching on (year, team).
# Only 'year', 'team' and 'elo_rating' are taken from elo_wc.
df_wc = (
    df_wc
    # Make the cell safe to re-run: without this, a second execution would
    # produce 'elo_rating_x' / 'elo_rating_y' and the aggregation on
    # 'elo_rating' in the next cell would fail.
    .drop(columns=['elo_rating'], errors='ignore')
    .merge(
        elo_wc[['year', 'team', 'elo_rating']],
        on=['year', 'team'],
        how='left',              # keep every df_wc row; unmatched teams get NaN
        validate='many_to_one',  # raise if elo_wc repeats a (year, team) key, which
                                 # would duplicate rows and inflate the summed changes
    )
)


In [163]:
# One row per (year, stage): total of 'changes' and mean of 'elo_rating' over the group's rows
wc_group_totals = df_wc.groupby(['year', 'stage']).agg(
    sum_changes=pd.NamedAgg(column='changes', aggfunc='sum'),
    avg_elo_rating=pd.NamedAgg(column='elo_rating', aggfunc='mean'),
)
# Turn the (year, stage) index back into ordinary columns
aggregated_df_wc = wc_group_totals.reset_index()

In [164]:
# Stack both aggregated tables; the 'wc' column is 1 for World Cup rows and 0 for European Championship rows
wc_part = aggregated_df_wc.assign(wc=1)
eu_part = aggregated_df_eu.assign(wc=0)
combined_df = pd.concat([wc_part, eu_part], ignore_index=True)

# Load World Cup standings changes

In [165]:

 # Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\kaggle\fifa\standings_wc_fifa_men.xlsx'

# Load the standings workbook and expose its 'group_name' column under the name 'stage'
df_standings_wc = pd.read_excel(data_path).rename(columns={'group_name': 'stage'})


In [166]:
# Columns of the standings table in which team names are replaced
columns_to_replace = ['home_team', 'away_team', 'new_top_teams', '1st', '2nd', '3rd']

# Alternative spelling -> the team label this notebook standardises on
replacements = {
    "Republic of Ireland": "Ireland"
}


def _rename_listed_teams(teams):
    """Apply `replacements` to every element when `teams` is a list; otherwise return it unchanged."""
    if not isinstance(teams, list):
        return teams
    return [replacements.get(team, team) for team in teams]


# Whole-cell replacement in each of the listed columns
for team_column in columns_to_replace:
    df_standings_wc[team_column] = df_standings_wc[team_column].replace(replacements)

# Element-wise replacement for any 'new_top_teams' cells that hold Python lists
df_standings_wc['new_top_teams'] = df_standings_wc['new_top_teams'].apply(_rename_listed_teams)

# For every (year, stage) keep the row with the largest 'change_num',
# then renumber the resulting rows from 0
rows_with_max_change = df_standings_wc.groupby(['year', 'stage'])['change_num'].idxmax()
df_standings_wc_filtered = df_standings_wc.loc[rows_with_max_change].reset_index(drop=True)


In [167]:
# Keep only the group keys and the change count from the filtered World Cup standings.
# Columns are selected by name rather than by position (.iloc[:, :3]) so the
# result does not depend on the column order of the source workbook.
df_standings_wc_filtered = df_standings_wc_filtered[['year', 'stage', 'change_num']]

# 'sum_changes' is not wanted in the merged table; only avg_elo_rating is attached
aggregated_df_wc_cleaned = aggregated_df_wc.drop(columns=['sum_changes'])

# Inner join on (year, stage): groups missing from either table are dropped
wc_standings = pd.merge(
    df_standings_wc_filtered,
    aggregated_df_wc_cleaned,
    on=['year', 'stage'],
    how='inner',
)
display(wc_standings)


Unnamed: 0,year,stage,change_num,avg_elo_rating
0,1986,Group A,0,1805.50
1,1986,Group B,0,1774.00
2,1986,Group C,0,1870.75
3,1986,Group D,2,1821.25
4,1986,Group E,0,1884.50
...,...,...,...,...
69,2022,Group D,2,1850.50
70,2022,Group E,4,1885.50
71,2022,Group F,0,1869.00
72,2022,Group G,2,1895.00


In [168]:
# Stack both standings tables; the 'wc' column is 1 for World Cup rows and 0 for European Championship rows
wc_rows = wc_standings.assign(wc=1)
eu_rows = eu_standings.assign(wc=0)
combined_df_standings = pd.concat([wc_rows, eu_rows], ignore_index=True)
combined_df_standings.head()

Unnamed: 0,year,stage,change_num,avg_elo_rating,wc
0,1986,Group A,0,1805.5,1
1,1986,Group B,0,1774.0,1
2,1986,Group C,0,1870.75,1
3,1986,Group D,2,1821.25,1
4,1986,Group E,0,1884.5,1


In [169]:
# Save the combined standings to an Excel workbook
file_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\aggregate_standings.xlsx'
# index=False leaves the DataFrame's row index out of the written sheet
combined_df_standings.to_excel(file_path, index=False)

# T-tests

In [170]:
# Compare the per-group 'change_num' of the two tournaments
eu_data = eu_standings['change_num']
wc_data = wc_standings['change_num']

# equal_var=False selects Welch's t-test (no equal-variance assumption).
# The World Cup sample is passed first, so a positive statistic means a larger World Cup mean.
welch_result = ttest_ind(wc_data, eu_data, equal_var=False)
t_stat, p_value = welch_result

t_test_results = {"t-statistic": t_stat, "p-value": p_value}

t_test_results


{'t-statistic': -0.7936050016453972, 'p-value': 0.4297905143403734}

In [171]:
# Compare the per-group 'sum_changes' of the two tournaments
eu_data = aggregated_df_eu['sum_changes']
wc_data = aggregated_df_wc['sum_changes']

# equal_var=False selects Welch's t-test (no equal-variance assumption).
# The World Cup sample is passed first, so a positive statistic means a larger World Cup mean.
welch_result = ttest_ind(wc_data, eu_data, equal_var=False)
t_stat, p_value = welch_result

t_test_results = {"t-statistic": t_stat, "p-value": p_value}

t_test_results


{'t-statistic': 0.6546466633008601, 'p-value': 0.5146045555344064}

# Suspense

## European Championship

In [172]:
# Path to the dataset
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\wiki\men\uefa\eu\active_suspense_eu.xlsx'

# Read the suspense workbook into a DataFrame (pandas defaults; no encoding option is passed)
suspense_eu = pd.read_excel(data_path)

display(suspense_eu)

Unnamed: 0,year,team,stage,aggregate_active_suspense
0,1988,Netherlands,Group 2,4
1,1988,Republic of Ireland,Group 2,1
2,1992,CIS,Group 2,3
3,1992,Denmark,Group 1,2
4,1992,England,Group 1,2
5,1992,France,Group 1,2
6,1992,Scotland,Group 2,4
7,1996,Bulgaria,Group B,3
8,1996,Czech Republic,Group C,3
9,1996,France,Group B,1


In [173]:
# One row per (year, stage): total of 'aggregate_active_suspense' over the group's rows
eu_suspense_totals = suspense_eu.groupby(['year', 'stage']).agg(
    sum_suspense=pd.NamedAgg(column='aggregate_active_suspense', aggfunc='sum'),
)
# Turn the (year, stage) index back into ordinary columns
agg_suspense_eu = eu_suspense_totals.reset_index()

agg_suspense_eu

Unnamed: 0,year,stage,sum_suspense
0,1988,Group 2,5
1,1992,Group 1,6
2,1992,Group 2,7
3,1996,Group A,5
4,1996,Group B,7
5,1996,Group C,3
6,2000,Group A,3
7,2000,Group B,4
8,2000,Group C,7
9,2004,Group A,4


## World Cup

In [174]:
# Location of the World Cup suspense workbook
# NOTE(review): the file name is spelled 'suspence' here but 'suspense' for the EU file —
# presumably this matches the file on disk; confirm before renaming anything.
data_path = rf'C:\Users\{user}\Documents\GitHub\tiebreak_wc\data\out\kaggle\fifa\active_suspence_wc.xlsx'

# Load the workbook and expose its 'group_name' column under the name 'stage'
suspense_wc = pd.read_excel(data_path).rename(columns={'group_name': 'stage'})

display(suspense_wc)

Unnamed: 0,year,team,stage,aggregate_active_suspense
0,1986,Algeria,Group D,5
1,1986,Morocco,Group F,1
2,1986,Northern Ireland,Group D,1
3,1986,Poland,Group F,1
4,1986,Portugal,Group F,5
...,...,...,...,...
73,2022,Mexico,Group C,4
74,2022,Saudi Arabia,Group C,1
75,2022,Senegal,Group A,2
76,2022,Switzerland,Group G,1


In [175]:
# One row per (year, stage): total of 'aggregate_active_suspense' over the group's rows
wc_suspense_totals = suspense_wc.groupby(['year', 'stage']).agg(
    sum_suspense=pd.NamedAgg(column='aggregate_active_suspense', aggfunc='sum'),
)
# Turn the (year, stage) index back into ordinary columns
agg_suspense_wc = wc_suspense_totals.reset_index()

agg_suspense_wc

Unnamed: 0,year,stage,sum_suspense
0,1986,Group A,7
1,1986,Group D,6
2,1986,Group F,7
3,1990,Group A,1
4,1990,Group B,6
5,1990,Group C,2
6,1990,Group E,3
7,1990,Group F,3
8,1994,Group A,2
9,1994,Group B,9


In [176]:
# Compare the per-group 'sum_suspense' of the two tournaments
# NOTE(review): agg_suspense_eu / agg_suspense_wc only contain the (year, stage) groups that
# appear in the suspense workbooks; groups with no rows there are not counted as zeros in
# this test — confirm that this is intended.
eu_data = agg_suspense_eu['sum_suspense']
wc_data = agg_suspense_wc['sum_suspense']

# equal_var=False selects Welch's t-test (no equal-variance assumption).
# The World Cup sample is passed first, so a positive statistic means a larger World Cup mean.
welch_result = ttest_ind(wc_data, eu_data, equal_var=False)
t_stat, p_value = welch_result

t_test_results = {"t-statistic": t_stat, "p-value": p_value}

t_test_results


{'t-statistic': 0.5059861166577166, 'p-value': 0.6147433449270425}