<h1>Data Cleaning</h1>

In [201]:
# Imports
import numpy as np
import pandas as pd
from scipy.stats import poisson

# Read results data and clean up dates
worldCupData = pd.read_csv("results.csv")
worldCupData["date"] = pd.to_datetime(worldCupData["date"])
worldCupData.sort_values("date").tail()
worldCupData = worldCupData[(worldCupData['date'] >= "2018-8-1") & (worldCupData['date'] <= "2022-11-19")].reset_index(drop=True)

# Read ranking data and clean up dates
rank = pd.read_csv("fifa_ranking-2022-12-22.csv")
rank["rank_date"] = pd.to_datetime(rank["rank_date"])
rank = rank[(rank['rank_date'] >= "2018-8-1") & (rank['rank_date'] <= "2022-11-19")].reset_index(drop=True)

# Make sure data is concordant and fill NaN values with previous rank
rank["country_full"] = rank["country_full"].str.replace("IR Iran", "Iran").str.replace("Korea Republic", "South Korea").str.replace("USA", "United States")
rank = rank.set_index(['rank_date']).groupby(['country_full'], group_keys=False).resample('D').first().fillna(method='ffill').reset_index()

# Merge dataframes
df_wc_ranked = worldCupData.merge(rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]], left_on=["date", "home_team"], right_on=["rank_date", "country_full"]).drop(["rank_date", "country_full"], axis=1)
worldCupDataR = df_wc_ranked.merge(rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]], left_on=["date", "away_team"], right_on=["rank_date", "country_full"], suffixes=("_home", "_away")).drop(["rank_date", "country_full"], axis=1)

display (worldCupDataR)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,total_points_home,previous_points_home,rank_home,rank_change_home,total_points_away,previous_points_away,rank_away,rank_change_away
0,2018-08-18,Andorra,United Arab Emirates,0,0,Friendly,Grödig,Austria,True,1120.00,1120.00,130.0,0.0,1312.00,1312.00,77.0,0.0
1,2018-08-18,Grenada,Jamaica,1,5,Friendly,St. George's,Grenada,False,980.00,980.00,168.0,0.0,1400.00,1400.00,54.0,0.0
2,2018-08-18,Guatemala,Cuba,1,0,Friendly,Quetzaltenango,Guatemala,False,1064.00,1064.00,146.0,0.0,940.00,940.00,181.0,0.0
3,2018-08-20,Barbados,Jamaica,2,2,Friendly,Bridgetown,Barbados,False,1005.00,1008.00,160.0,0.0,1400.00,1400.00,54.0,0.0
4,2018-08-26,Barbados,Cuba,0,0,Friendly,Bridgetown,Barbados,False,1005.00,1008.00,160.0,0.0,940.00,940.00,181.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3332,2022-09-27,Norway,Serbia,0,2,UEFA Nations League,Oslo,Norway,False,1488.57,1488.57,36.0,0.0,1549.53,1549.53,25.0,0.0
3333,2022-09-27,Sweden,Slovenia,1,1,UEFA Nations League,Stockholm,Sweden,False,1563.44,1563.44,20.0,0.0,1372.48,1372.48,65.0,0.0
3334,2022-09-27,Kosovo,Cyprus,5,1,UEFA Nations League,Pristina,Kosovo,False,1183.90,1183.90,106.0,0.0,1180.52,1180.52,108.0,1.0
3335,2022-09-27,Greece,Northern Ireland,3,1,UEFA Nations League,Athens,Greece,False,1441.45,1441.45,49.0,1.0,1399.10,1399.10,58.0,0.0


<h1>Averaging and Merging</h1>

In [156]:
# worldCupDataR["date_diff"] = ((worldCupDataR["date"] - pd.to_datetime("2010-08-17")).dt.days).div(365*5)

# worldCupDataR['home_score'] = worldCupDataR['home_score'].mul(abs(worldCupDataR['rank_home'].sub(worldCupDataR['rank_away'].mean())).div(abs(worldCupDataR['rank_away'].sub(worldCupDataR['rank_home'].mean()))))
# worldCupDataR['away_score'] = worldCupDataR['away_score'].mul(abs(worldCupDataR['rank_away'].sub(worldCupDataR['rank_home'].mean())).div(abs(worldCupDataR['rank_home'].sub(worldCupDataR['rank_away'].mean()))))

# Get average goals scored and conceded for teams (both home and away)
home = worldCupDataR.groupby('home_team', as_index=False)[['home_score','away_score']].mean()
away = worldCupDataR.groupby('away_team', as_index=False)[['away_score','home_score']].mean()

# Rename columns
home.rename(columns = {'home_team': 'Team', 'home_score':'Scored', 'away_score':'Conceded'}, inplace = True)
away.rename(columns = {'away_team': 'Team', 'away_score':'Scored', 'home_score':'Conceded'}, inplace = True)

# Merge datasets index with team names
complete_df = home.merge(away, how='left')
complete_df.set_index('Team', inplace=True)

display (complete_df)

Unnamed: 0_level_0,Scored,Conceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.833333,1.333333
Albania,1.227273,1.045455
Algeria,2.538462,0.538462
American Samoa,2.666667,5.000000
Andorra,0.521739,1.695652
...,...,...
Vietnam,1.473684,0.473684
Wales,1.259259,0.851852
Yemen,0.500000,1.750000
Zambia,1.857143,1.428571


<h1>Predicting points using Poisson Distribution</h1>

In [186]:
def predict_points(home, away):
    if home in complete_df.index and away in complete_df.index:    
        
        # Calculate lamba for home and away
        lamb_home = complete_df.at[home,'Scored'] * complete_df.at[away,'Conceded']
        lamb_away = complete_df.at[away,'Scored'] * complete_df.at[home,'Conceded']
        
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,11): #number of goals home team
            for y in range(0, 11): #number of goals away team
                
                # Poisson distribution 
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                
                # Draw
                if x == y:
                    prob_draw += p
                    
                # Home win
                elif x > y:
                    prob_home += p
                    
                # Away win
                else:
                    prob_away += p
        
        # Points
        points_home = 3 * prob_home 
        points_away = 3 * prob_away
        return (points_home, points_away)
    else:
        return (0, 0)

predict_points("Scotland", "England")

(0.4032103887086164, 2.1535299683798352)

<h1>Scraping table data from Wikipedia</h1>

In [198]:
from operator import itemgetter
from collections.abc import Iterable

# Get page
dfs = pd.read_html(r"https://en.wikipedia.org/wiki/2022_FIFA_World_Cup#Teams")

for i in range(len(dfs)):
    df = dfs[i]
    cols = list(df.columns.values)
    
    #Look for start of data and end of data
    if isinstance(cols[0], Iterable):
        if any("Tie-breaking criteria" in c for c in cols):
            start_pos = i+1

        if any("Match 46" in c for c in cols):
            end_pos = i+1
    
    # Mathces and groups
    matches = []
    groups = ["A", "B", "C", "D", "E", "F", "G", "H"]
    group_count = 0 
    
    # Create group tables from start position
    table = {}
    table[groups[group_count]] = [[a.split(" ")[0], 0] for a in list(dfs[start_pos].iloc[:, 1].values)]

    # Loop through each and create matches
    for i in range(start_pos+1, end_pos, 1):
        if len(dfs[i].columns) == 3:
            team_1 = dfs[i].columns.values[0]
            team_2 = dfs[i].columns.values[-1]

            matches.append((groups[group_count], team_1, team_2))
        else:
            group_count+=1
            table[groups[group_count]] = [[a, 0] for a in list(dfs[i].iloc[:, 1].values)]

display (table)

{'A': [['Netherlands', 0], ['Senegal', 0], ['Ecuador', 0], ['Qatar', 0]],
 'B': [['England', 0], ['United States', 0], ['Iran', 0], ['Wales', 0]],
 'C': [['Argentina', 0], ['Poland', 0], ['Mexico', 0], ['Saudi Arabia', 0]],
 'D': [['France', 0], ['Australia', 0], ['Tunisia', 0], ['Denmark', 0]],
 'E': [['Japan', 0], ['Spain', 0], ['Germany', 0], ['Costa Rica', 0]],
 'F': [['Morocco', 0], ['Croatia', 0], ['Belgium', 0], ['Canada', 0]],
 'G': [['Brazil', 0], ['Switzerland', 0], ['Cameroon', 0], ['Serbia', 0]],
 'H': [['Portugal', 0], ['South Korea', 0], ['Uruguay', 0], ['Ghana', 0]]}

<h1>Use function to predict points in the group</h1>

In [199]:
advanced_group = []

# Reset data
for k in table.keys():
    for t in table[k]:
        t[1] = 0
        
for teams in matches:
    # Get teams and predict points
    team_1 = teams[1]
    team_2 = teams[2]
    points_1, points_2 = predict_points(teams[1], teams[2])
    
    # If the predicted points are within 0.3 of each other, make it a draw
    if  (points_1 - 0.3) <= points_2 <= (points_1 + 0.3) or (points_2 - 0.3) <= points_1 <= (points_2 + 0.3):
        
        # Add points
        for i in table[teams[0]]:
            if i[0] == teams[1] or i[0] == teams[2]:
                i[1] += 1
    
    # Team 1 wins
    elif points_1 > points_2:
        
        # Add points
        for i in table[teams[0]]:
            if i[0] == teams[1]:
                i[1] += 3
    
    # Team 2 wins           
    elif points_2 > points_1: 
        
        # Add points
        for i in table[teams[0]]:
            if i[0] == teams[2]:
                i[1] += 3

# Sort teams and pick teams that advance
for group in table:
    table[group].sort(key=lambda x: x[1], reverse=True)
    advanced_group.append([table[group][0][0], table[group][1][0]])

display (table)
display (advanced_group)

{'A': [['Netherlands', 7], ['Senegal', 7], ['Ecuador', 1], ['Qatar', 1]],
 'B': [['Iran', 9], ['United States', 6], ['England', 3], ['Wales', 0]],
 'C': [['Argentina', 9], ['Poland', 2], ['Mexico', 2], ['Saudi Arabia', 2]],
 'D': [['Denmark', 9], ['Australia', 6], ['France', 1], ['Tunisia', 1]],
 'E': [['Spain', 9], ['Japan', 4], ['Germany', 4], ['Costa Rica', 0]],
 'F': [['Canada', 9], ['Morocco', 4], ['Belgium', 4], ['Croatia', 0]],
 'G': [['Brazil', 9], ['Switzerland', 6], ['Cameroon', 1], ['Serbia', 1]],
 'H': [['Portugal', 9], ['South Korea', 4], ['Uruguay', 2], ['Ghana', 1]]}

[['Netherlands', 'Senegal'],
 ['Iran', 'United States'],
 ['Argentina', 'Poland'],
 ['Denmark', 'Australia'],
 ['Spain', 'Japan'],
 ['Canada', 'Morocco'],
 ['Brazil', 'Switzerland'],
 ['Portugal', 'South Korea']]

<h1>Playoff Simulation</h1>

In [219]:
# Playoff rounds
playoffs = {"Round of 16": [], "Quarter-Final": [], "Semi-Final": [], "Final": []}

# Reset data
for p in playoffs.keys():
    playoffs[p] = []

next_rounds = []

# Loop through each round
for p in playoffs.keys():
    
    # If this is the first round to be played
    if p == "Round of 16":
        print (p)
        
        # Create the brackets
        control = []
        for a in range(0, len(advanced_group*2), 1):
            if a < len(advanced_group):
                if a % 2 == 0:
                    control.append((advanced_group*2)[a][0])
                else:
                    control.append((advanced_group*2)[a][1])
            else:
                if a % 2 == 0:
                    control.append((advanced_group*2)[a][1])
                else:
                    control.append((advanced_group*2)[a][0])
        
        # Add to playoffs the matches to be played
        playoffs[p] = [[control[c], control[c+1]] for c in range(0, len(control)-1, 1) if c%2 == 0]
    
    # Get teams from previous round
    else:
        print (p)
        playoffs[p] = [[next_rounds[c], next_rounds[c+1]] for c in range(0, len(next_rounds)-1, 1) if c%2 == 0]
        next_rounds = []
    
    # Loop through each match and predict the winner
    for i in range(0, len(playoffs[p])):
        game = playoffs[p][i]
        home = game[0]
        away = game[1]
            
        points_1, points_2 = predict_points(home, away)
            
        if points_1 > points_2:
            next_rounds.append(home)
            print (home + " v " + away + ". " + home + " wins.")
        else:
            next_rounds.append(away)
            print (home + " v " + away + ". " + away + " wins.")

        game.append([points_1, points_2])
        playoffs[p][i] = game
    
    print ("\n")
        

Round of 16
Netherlands v United States. United States wins.
Argentina v Australia. Argentina wins.
Spain v Morocco. Spain wins.
Brazil v South Korea. Brazil wins.
Senegal v Iran. Iran wins.
Poland v Denmark. Denmark wins.
Japan v Canada. Canada wins.
Switzerland v Portugal. Portugal wins.


Quarter-Final
United States v Argentina. Argentina wins.
Spain v Brazil. Brazil wins.
Iran v Denmark. Denmark wins.
Canada v Portugal. Canada wins.


Semi-Final
Argentina v Brazil. Brazil wins.
Denmark v Canada. Canada wins.


Final
Brazil v Canada. Canada wins.


