In [1]:
import pandas as pd
import pickle
from scipy.stats import poisson

In [2]:
table_dict = pickle.load(open('table_dict', 'rb'))
historical_data_df = pd.read_csv('clean_fifa_worldcup_historical_data.csv')
fixture_df = pd.read_csv('clean_fifa_worldcup_fixture.csv')

In [3]:
historical_data_df

Unnamed: 0,Home_Team,Away_Team,Year,Home_Goals,Away_Goals,Total_Goals
0,France,Mexico,1930,4,1,5
1,Uruguay,Argentina,1930,4,2,6
2,Uruguay,Yugoslavia,1930,6,1,7
3,Argentina,United States,1930,6,1,7
4,Paraguay,Belgium,1930,1,0,1
...,...,...,...,...,...,...
895,Serbia,Brazil,2018,0,2,2
896,Serbia,Switzerland,2018,1,2,3
897,Brazil,Costa Rica,2018,2,0,2
898,Costa Rica,Serbia,2018,0,1,1


In [4]:
# Split dataframe into home and away 

home_df = historical_data_df[['Home_Team', 'Home_Goals', 'Away_Goals']]
away_df = historical_data_df[['Away_Team', 'Home_Goals', 'Away_Goals']]

In [6]:
# rename the columns

home_df = home_df.rename(columns={'Home_Team': 'Team', 'Home_Goals': 'Goals_Scored', 'Away_Goals': 'Goals_Conceded'})
away_df = away_df.rename(columns={'Away_Team': 'Team', 'Home_Goals': 'Goals_Conceded', 'Away_Goals': 'Goals_Scored'})

In [9]:
# Concatenate the two dataframes, group by team and calculate the mean

team_strength_df = pd.concat([home_df, away_df], ignore_index=True).groupby('Team').mean()
team_strength_df


Unnamed: 0_level_0,Goals_Scored,Goals_Conceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,1.000000,1.461538
Angola,0.333333,0.666667
Argentina,1.691358,1.148148
Australia,0.812500,1.937500
Austria,1.482759,1.620690
...,...,...
Uruguay,1.553571,1.321429
Wales,0.800000,0.800000
West Germany,2.112903,1.241935
Yugoslavia,1.666667,1.272727


In [10]:
'''
I Chose the Poisson distribution to predict how many points the teams will get in the group because each team will play three 90 minute games and the Poisson distribution is used to model the number of events in a fixed interval of time or space. 
It is assumed that the number of events (goals) in non-overlapping intervals are independent. The Poisson distribution is the probability distribution of independent event occurrences in an interval. Also the number of goals in a game can be counted as an integer.
Furthermore, the probability of scoring a goal is constant in a game and does not change over time and a goal cannot be scored at the same time as another goal.
lambda is the median goals scored in 90 minutes and x is the number of goals that could be scored by team A or team B.
'''

def predict_points(home, away):
    if home in team_strength_df.index and away in team_strength_df.index:
        # If the team scores more goals, then their lamba will be higher. If the team concedes more goals, then the other team's lambda will be higher
        # goals_scored * goals_conceded
        lambda_home = team_strength_df.at[home, 'Goals_Scored'] * team_strength_df.at[away, 'Goals_Conceded'] # average goals scored by home team times average goals conceded by away team
        lambda_away = team_strength_df.at[away, 'Goals_Scored'] * team_strength_df.at[home, 'Goals_Conceded'] # average goals scored by away team times average goals conceded by home team
        home_prob, away_prob, draw_prob = 0, 0, 0 # initialize the probabilities
        for i in range(0, 11): # loop through the number of goals that could be scored by the home team
            for j in range(0, 11): # loop through the number of goals that could be scored by the away team
                p = poisson.pmf(i, lambda_home) * poisson.pmf(j, lambda_away) # calculate the probability of the home team scoring i goals and the away team scoring j goals
                # if the home team has scored the same number of goals as the away team, then it is a draw and the draw probability is increased
                if i == j: 
                    draw_prob += p
                # if the home team has scored more goals than the away team, then the home probability is increased
                elif i > j:
                    home_prob += p
                # if the away team has scored more goals than the home team, then the away probability is increased
                else:
                    away_prob += p

        home_points = 3 * home_prob + draw_prob
        away_points = 3 * away_prob + draw_prob
        return (home_points, away_points)
    else:
        return (0, 0)

In [16]:
# spliting the fixture dataframe into group stage, round of 16, quarter finals, semi finals, and final

fixture_group_48_df = fixture_df[:48].copy()
fixture_round_16_df = fixture_df[48:56].copy()
fixture_quarter_finals_df = fixture_df[56:60].copy()
fixture_semi_finals_df = fixture_df[60:62].copy()
fixture_final_df = fixture_df[62:].copy()


### Group Stage simulation

In [23]:
for group in table_dict:
    teams_in_group = table_dict[group]['Team'].values
    fixture_group_6_df = fixture_group_48_df[fixture_group_48_df['home'].isin(teams_in_group)] 
    for index, row in fixture_group_6_df.iterrows():
        home, away = row['home'], row['away']
        home_points, away_points = predict_points(home, away)
        table_dict[group].loc[table_dict[group]['Team'] == home, 'Pts'] += home_points
        table_dict[group].loc[table_dict[group]['Team'] == away, 'Pts'] += away_points

    table_dict[group] = table_dict[group].sort_values('Pts', ascending=False).reset_index()
    table_dict[group] = table_dict[group][['Team', 'Pts']]
    table_dict[group] = table_dict[group].round(0)

In [24]:
table_dict['Group H']

Unnamed: 0,Team,Pts
0,Portugal,12.0
1,Uruguay,10.0
2,Ghana,8.0
3,South Korea,4.0


### Knockout simulation

In [25]:
fixture_round_16_df

Unnamed: 0,home,score,away,year
48,Winners Group A,Match 49,Runners-up Group B,2022
49,Winners Group C,Match 50,Runners-up Group D,2022
50,Winners Group D,Match 52,Runners-up Group C,2022
51,Winners Group B,Match 51,Runners-up Group A,2022
52,Winners Group E,Match 53,Runners-up Group F,2022
53,Winners Group G,Match 54,Runners-up Group H,2022
54,Winners Group F,Match 55,Runners-up Group E,2022
55,Winners Group H,Match 56,Runners-up Group G,2022


In [27]:
for group in table_dict:
    group_winner = table_dict[group].loc[0, 'Team']
    group_runner_up = table_dict[group].loc[1, 'Team']

    fixture_round_16_df.replace({f'Winners {group}': group_winner, f'Runners-up {group}' : group_runner_up}, inplace=True)

fixture_round_16_df['winner'] = '?'
fixture_round_16_df

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,?
49,Argentina,Match 50,Denmark,2022,?
50,France,Match 52,Poland,2022,?
51,England,Match 51,Senegal,2022,?
52,Germany,Match 53,Belgium,2022,?
53,Brazil,Match 54,Uruguay,2022,?
54,Croatia,Match 55,Spain,2022,?
55,Portugal,Match 56,Switzerland,2022,?


In [28]:
def get_winner(updated_fixture_df):
    for index, row in updated_fixture_df.iterrows():
        home, away = row['home'], row['away']
        home_points, away_points = predict_points(home, away)
        if home_points > away_points:
            winner = home
        else:
            winner = away
        updated_fixture_df.loc[index, 'winner'] = winner
    return updated_fixture_df

get_winner(fixture_round_16_df)

Unnamed: 0,home,score,away,year,winner
48,Netherlands,Match 49,Wales,2022,Netherlands
49,Argentina,Match 50,Denmark,2022,Argentina
50,France,Match 52,Poland,2022,France
51,England,Match 51,Senegal,2022,England
52,Germany,Match 53,Belgium,2022,Germany
53,Brazil,Match 54,Uruguay,2022,Brazil
54,Croatia,Match 55,Spain,2022,Spain
55,Portugal,Match 56,Switzerland,2022,Portugal


### Quarter final simulation

In [31]:
# update the new round with the winners of the the previous round
def update_table(fixture_round_1_df, fixture_round_2_df):
    for index, row in fixture_round_1_df.iterrows():
        winner = fixture_round_1_df.loc[index, 'winner']
        match = fixture_round_1_df.loc[index, 'score']
        fixture_round_2_df.replace({f'Winners {match}': winner}, inplace=True)
    fixture_round_2_df['winner'] = '?'
    return fixture_round_2_df

In [32]:
update_table(fixture_round_16_df, fixture_quarter_finals_df)

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,?
57,Netherlands,Match 57,Argentina,2022,?
58,Spain,Match 60,Portugal,2022,?
59,England,Match 59,France,2022,?


In [33]:
get_winner(fixture_quarter_finals_df)

Unnamed: 0,home,score,away,year,winner
56,Germany,Match 58,Brazil,2022,Brazil
57,Netherlands,Match 57,Argentina,2022,Netherlands
58,Spain,Match 60,Portugal,2022,Portugal
59,England,Match 59,France,2022,France


### Semi Final simulation

In [34]:
update_table(fixture_quarter_finals_df, fixture_semi_finals_df)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Brazil,2022,?
61,France,Match 62,Portugal,2022,?


In [35]:
get_winner(fixture_semi_finals_df)

Unnamed: 0,home,score,away,year,winner
60,Netherlands,Match 61,Brazil,2022,Brazil
61,France,Match 62,Portugal,2022,France


### Final simulation

In [36]:
update_table(fixture_semi_finals_df, fixture_final_df)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,?
63,Brazil,Match 64,France,2022,?


In [37]:

get_winner(fixture_final_df)

Unnamed: 0,home,score,away,year,winner
62,Losers Match 61,Match 63,Losers Match 62,2022,Losers Match 62
63,Brazil,Match 64,France,2022,Brazil


### Brazil Won!!!