In [1]:
import pandas as pd
from scipy.stats import poisson
history = pd.read_csv('Data/FIFA_World_Cup_History.csv', encoding='latin-1', sep=';', engine='python')
matches=pd.read_csv('Data/matches.csv')
teams=pd.read_csv('Data/teams.csv')
stages=pd.read_csv('Data/tournament_stages.csv')
cities=pd.read_csv('Data/host_cities.csv')

matches = matches.merge(
    teams[["id", "team_name", "group_letter"]],
    left_on="home_team_id",
    right_on="id",
    suffixes=('', '_home')
).rename(columns={"team_name": "home_team"}).drop(columns=["id_home"])

matches = matches.merge(
    teams[["id", "team_name"]],
    left_on="away_team_id",
    right_on="id",
    suffixes=('', '_away')
).rename(columns={"team_name": "away_team"}).drop(columns=["id_away"])

matches = matches.merge(
    stages[["id", "stage_name", "stage_order"]],
    left_on="stage_id",
    right_on="id",
    suffixes=('', '_stage')
).drop(columns=["id_stage"])

matches = matches.merge(
    cities[["id", "region_cluster"]],
    left_on="city_id",
    right_on="id",
    suffixes=('', '_city')
).drop(columns=["id_city"])

# home_team | away_team | stage_name | region_cluster | kickoff_at
selected_columns = matches[[
    "home_team",
    "away_team",
    "stage_name",
    "region_cluster",
    "kickoff_at"
]]
matches.head()


Unnamed: 0,id,match_number,home_team_id,away_team_id,city_id,stage_id,kickoff_at,match_label,home_team,group_letter,away_team,stage_name,stage_order,region_cluster
0,1,1,1.0,2.0,15,1,2026-06-11 15:00:00-06,Group A,Mexico,A,South Africa,Group Stage,1,Central
1,2,2,3.0,4.0,14,1,2026-06-11 22:00:00-06,Group A,South Korea,A,Winner UEFA Playoff D,Group Stage,1,Central
2,3,3,5.0,6.0,12,1,2026-06-12 15:00:00-04,Group B,Canada,B,Winner UEFA Playoff A,Group Stage,1,East
3,4,4,13.0,14.0,6,1,2026-06-12 21:00:00-07,Group D,USA,D,Paraguay,Group Stage,1,West
4,5,5,7.0,8.0,10,1,2026-06-13 15:00:00-07,Group B,Qatar,B,Switzerland,Group Stage,1,West


In [2]:
history.head()
teams[["team_name", "group_letter"]].head()

Unnamed: 0,team_name,group_letter
0,Mexico,A
1,South Africa,A
2,South Korea,A
3,Winner UEFA Playoff D,A
4,Canada,B


# calculate team strength

In [3]:
#split team into df_home and df_away
# Load results.csv - it has match data with home/away teams and scores
results = pd.read_csv('Data/results.csv')

df_home = results[['home_team','home_score', 'away_score']]
df_away = results[['away_team','away_score', 'home_score']]

#rename cloumns to be the same
df_home = df_home.rename(columns={'home_team': 'Team', 'home_score': 'GoalsScored', 'away_score': 'Goals Against'})
df_away = df_away.rename(columns={'away_team': 'Team', 'away_score': 'GoalsScored', 'home_score': 'Goals Against'})


In [4]:
strength=pd.concat([df_home, df_away], ignore_index=True).groupby('Team').mean()
strength

Unnamed: 0_level_0,GoalsScored,Goals Against
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Abkhazia,1.593750,0.812500
Afghanistan,0.979167,2.006944
Albania,0.954315,1.482234
Alderney,0.540741,4.592593
Algeria,1.530579,1.006612
...,...,...
Yugoslavia,1.944330,1.567010
Zambia,1.562897,1.069886
Zanzibar,0.913876,2.086124
Zimbabwe,1.334702,1.190965


In [5]:
def predict_points(home, away):
    if home in strength.index and away in strength.index:
        # goals_scored * goals_conceded
        lamb_home = strength.at[home,'GoalsScored'] * strength.at[away,'Goals Against']
        lamb_away = strength.at[away,'GoalsScored'] * strength.at[home,'Goals Against']
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,11): #number of goals home team
            for y in range(0, 11): #number of goals away team
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p
        
        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return (points_home, points_away)
    else:
        return (0, 0)

# Testing function

In [6]:
predict_points("Mexico","South Africa")
predict_points("Germany","Argentina")
predict_points("Brazil","Algeria")

(np.float64(1.8799138095689922), np.float64(0.9161408393790633))

# Group Stage Matches

In [7]:
matches_cleaned = matches[['home_team', 'away_team', 'stage_name', 'match_label', 'region_cluster', 'kickoff_at']]

df_group=matches_cleaned[:48].copy()
df_group

Unnamed: 0,home_team,away_team,stage_name,match_label,region_cluster,kickoff_at
0,Mexico,South Africa,Group Stage,Group A,Central,2026-06-11 15:00:00-06
1,South Korea,Winner UEFA Playoff D,Group Stage,Group A,Central,2026-06-11 22:00:00-06
2,Canada,Winner UEFA Playoff A,Group Stage,Group B,East,2026-06-12 15:00:00-04
3,USA,Paraguay,Group Stage,Group D,West,2026-06-12 21:00:00-07
4,Qatar,Switzerland,Group Stage,Group B,West,2026-06-13 15:00:00-07
5,Brazil,Morocco,Group Stage,Group C,East,2026-06-13 18:00:00-04
6,Haiti,Scotland,Group Stage,Group C,East,2026-06-13 21:00:00-04
7,Australia,Winner UEFA Playoff C,Group Stage,Group D,West,2026-06-14 00:00:00-07
8,Germany,Curaçao,Group Stage,Group E,Central,2026-06-14 13:00:00-05
9,Netherlands,Japan,Group Stage,Group F,Central,2026-06-14 16:00:00-05


# create tables

In [8]:
def init_group_table(group_letter):
    group_teams = teams[teams["group_letter"] == group_letter]["team_name"]

    table = pd.DataFrame({
        "Team": group_teams,
        "MP": 0,
        "W": 0,
        "D": 0,
        "L": 0,
        "GF": 0,
        "GA": 0,
        "GD": 0,
        "Pts": 0.0
    })

    return table


In [9]:
groups = {}

for g in ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L"]:
    groups[g] = init_group_table(g)
groups['A']

Unnamed: 0,Team,MP,W,D,L,GF,GA,GD,Pts
0,Mexico,0,0,0,0,0,0,0,0.0
1,South Africa,0,0,0,0,0,0,0,0.0
2,South Korea,0,0,0,0,0,0,0,0.0
3,Winner UEFA Playoff D,0,0,0,0,0,0,0,0.0


In [10]:
# Recreate group tables cleanly so they have GF/GA/GD columns
groups = {g: init_group_table(g) for g in sorted(teams["group_letter"].dropna().unique())}

for group in groups:
    teams_in_group = groups[group]['Team'].values
    df_fixture_group_6 = df_group[df_group['home_team'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home_team'], row['away_team']
        points_home, points_away = predict_points(home, away)
        groups[group].loc[groups[group]['Team'] == home, 'Pts'] += points_home
        groups[group].loc[groups[group]['Team'] == away, 'Pts'] += points_away
        # simple score assumption so we can update GF/GA/GD
        if points_home > points_away:
            hg, ag = 1, 0
        elif points_away > points_home:
            hg, ag = 0, 1
        else:
            hg, ag = 1, 1
        # update goals for/against
        groups[group].loc[groups[group]["Team"] == home, "GF"] += hg
        groups[group].loc[groups[group]["Team"] == home, "GA"] += ag
        groups[group].loc[groups[group]["Team"] == away, "GF"] += ag
        groups[group].loc[groups[group]["Team"] == away, "GA"] += hg
    groups[group]["GD"] = groups[group]["GF"] - groups[group]["GA"]

    # Sort by points but keep all columns
    groups[group] = groups[group].sort_values(["Pts", "GD", "GF"], ascending=False).reset_index(drop=True)
    groups[group] = groups[group][["Team", "Pts", "GA", "GD"]].round(0)
    



In [11]:
groups['K']

Unnamed: 0,Team,Pts,GA,GD
0,Uzbekistan,3.0,1,0
1,Portugal,1.0,1,1
2,Colombia,1.0,2,-1
3,Winner FIFA Playoff 1,0.0,2,0


In [12]:
# 1) Rebuild group tables CLEAN (so GF/GA/GD exist)
group_letters = sorted(teams["group_letter"].dropna().unique())
groups = {g: init_group_table(g) for g in group_letters}
qualified = []

for group in groups:
    table = groups[group].reset_index(drop=True)

    # Winner
    qualified.append({
        "Group": group,
        "Pos": 1,
        "Team": table.loc[0, "Team"]
    })

    # Runner-up
    qualified.append({
        "Group": group,
        "Pos": 2,
        "Team": table.loc[1, "Team"]
    })

    # Third place (needed for best-8 selection)
    qualified.append({
        "Group": group,
        "Pos": 3,
        "Team": table.loc[2, "Team"],
        "Pts": table.loc[2, "Pts"],
        "GD":  table.loc[2, "GD"],
        "GF":  table.loc[2, "GF"]
    })

qualified_df = pd.DataFrame(qualified)
third_place = qualified_df[qualified_df["Pos"] == 3]

best_8_third = (
    third_place
    .sort_values(["Pts", "GD", "GF"], ascending=False)
    .head(8)
)
best_8_third


Unnamed: 0,Group,Pos,Team,Pts,GD,GF
2,A,3,South Korea,0.0,0.0,0.0
5,B,3,Qatar,0.0,0.0,0.0
8,C,3,Haiti,0.0,0.0,0.0
11,D,3,Australia,0.0,0.0,0.0
14,E,3,Côte d'Ivoire,0.0,0.0,0.0
17,F,3,Winner UEFA Playoff B,0.0,0.0,0.0
20,G,3,IR Iran,0.0,0.0,0.0
23,H,3,Saudi Arabia,0.0,0.0,0.0


# Knock out 

In [13]:
top2 = qualified_df[qualified_df["Pos"].isin([1, 2])][["Group", "Team"]]

knockout = pd.concat(
    [top2, best_8_third[["Group", "Team"]]],
    ignore_index=True
)

print("Total teams qualified:", len(knockout))  # ✅ should be 32
knockout


Total teams qualified: 32


Unnamed: 0,Group,Team
0,A,Mexico
1,A,South Africa
2,B,Canada
3,B,Winner UEFA Playoff A
4,C,Brazil
5,C,Morocco
6,D,USA
7,D,Paraguay
8,E,Germany
9,E,Curaçao


In [14]:
winners = qualified_df[qualified_df["Pos"] == 1].set_index("Group")["Team"].to_dict()
runners = qualified_df[qualified_df["Pos"] == 2].set_index("Group")["Team"].to_dict()
best3_groups = best_8_third["Group"].tolist()
best3_teams  = best_8_third["Team"].tolist()
import pandas as pd

group_letters = sorted(winners.keys())  # should be A..L

fixtures = []

# pair winners/runners in adjacent groups
for i in range(0, len(group_letters), 2):
    g1 = group_letters[i]
    g2 = group_letters[i+1]
    fixtures.append((winners[g1], runners[g2]))  # g1 winner vs g2 runner-up
    fixtures.append((winners[g2], runners[g1]))  # g2 winner vs g1 runner-up
# Pair third-place teams among themselves (simple)
for i in range(0, len(best3_teams), 2):
    fixtures.append((best3_teams[i], best3_teams[i+1]))
round_of_32 = pd.DataFrame(fixtures, columns=["home_team", "away_team"])
print("R32 matches:", len(round_of_32))  # should be 16
round_of_32


R32 matches: 16


Unnamed: 0,home_team,away_team
0,Mexico,Winner UEFA Playoff A
1,Canada,South Africa
2,Brazil,Paraguay
3,USA,Morocco
4,Germany,Japan
5,Netherlands,Curaçao
6,Belgium,Cabo Verde
7,Spain,Egypt
8,France,Algeria
9,Argentina,Senegal


# Round 16

In [15]:
import random

winners = []

for index, row in round_of_32.iterrows():
    home = row["home_team"]
    away = row["away_team"]

    points_home, points_away = predict_points(home, away)

    # Decide winner (no draws in knockout)
    if points_home > points_away:
        winners.append(home)
    elif points_away > points_home:
        winners.append(away)
    else:
        winners.append(random.choice([home, away]))  # tiebreak
winners_table=pd.DataFrame(
   {"Qaulified Team": winners}
)
winners_table



Unnamed: 0,Qaulified Team
0,Mexico
1,South Africa
2,Brazil
3,Morocco
4,Germany
5,Netherlands
6,Cabo Verde
7,Spain
8,Algeria
9,Argentina


In [16]:
round_of_16=pd.DataFrame(
    [(winners[i],winners[i+1]) for i in range (0,len(winners),2)],
    columns=["home_team", "away_team"]
)
round_of_16

Unnamed: 0,home_team,away_team
0,Mexico,South Africa
1,Brazil,Morocco
2,Germany,Netherlands
3,Cabo Verde,Spain
4,Algeria,Argentina
5,Croatia,Winner FIFA Playoff 1
6,South Korea,Australia
7,Côte d'Ivoire,IR Iran


# Round 8

In [17]:
round_of_16 = pd.DataFrame(
    [(winners[i], winners[i+1]) for i in range(0, len(winners), 2)],
    columns=["home_team", "away_team"]
)
winners_r16 = []
for index, row in round_of_16.iterrows():
    home = row["home_team"]
    away = row["away_team"]

    points_home, points_away = predict_points(home, away)

    # Decide winner (no draws in knockout)
    if points_home > points_away:
        winners_r16.append(home)
    elif points_away > points_home:
        winners_r16.append(away)
    else:
        winners_r16.append(random.choice([home, away]))  # tiebreak
winners_table=pd.DataFrame(
   {"Qaulified Team": winners_r16}
)
winners_table


Unnamed: 0,Qaulified Team
0,Mexico
1,Brazil
2,Germany
3,Cabo Verde
4,Argentina
5,Croatia
6,South Korea
7,IR Iran


# Quarter finals

In [18]:
round_of_16=pd.DataFrame(
    [(winners_r16[i],winners_r16[i+1]) for i in range (0,len(winners_r16),2)],
    columns=["home_team", "away_team"]
)
round_of_16

Unnamed: 0,home_team,away_team
0,Mexico,Brazil
1,Germany,Cabo Verde
2,Argentina,Croatia
3,South Korea,IR Iran


# Semi Finals

In [19]:
round_of_4 = pd.DataFrame(
    [(winners_r16[i], winners_r16[i+1]) for i in range(0, len(winners_r16), 2)],
    columns=["home_team", "away_team"]
)
winners_r4 = []
for index, row in round_of_16.iterrows():
    home = row["home_team"]
    away = row["away_team"]

    points_home, points_away = predict_points(home, away)

    # Decide winner (no draws in knockout)
    if points_home > points_away:
        winners_r4.append(home)
    elif points_away > points_home:
        winners_r4.append(away)
    else:
        winners_r4.append(random.choice([home, away]))  # tiebreak
winners_table=pd.DataFrame(
   {"Qaulified Team": winners_r4}
)
winners_table

Unnamed: 0,Qaulified Team
0,Brazil
1,Germany
2,Argentina
3,IR Iran


In [20]:
round_of_2=pd.DataFrame(
    [(winners_r4[i],winners_r4[i+1]) for i in range (0,len(winners_r4),2)],
    columns=["home_team", "away_team"]
)
round_of_2

Unnamed: 0,home_team,away_team
0,Brazil,Germany
1,Argentina,IR Iran


# Final 


In [21]:
final = pd.DataFrame(
    [(winners_r4[i], winners_r4[i+1]) for i in range(0, len(winners_r4), 2)],
    columns=["home_team", "away_team"]
)
winners_r2 = []
for index, row in round_of_2.iterrows():
    home = row["home_team"]
    away = row["away_team"]

    points_home, points_away = predict_points(home, away)

    # Decide winner (no draws in knockout)
    if points_home > points_away:
        winners_r2.append(home)
    elif points_away > points_home:
        winners_r2.append(away)
    else:
        winners_r2.append(random.choice([home, away]))  # tiebreak
winners_table=pd.DataFrame(
   {"Qaulified Team": winners_r2}
)
winners_table

Unnamed: 0,Qaulified Team
0,Brazil
1,IR Iran


In [22]:
Final1=pd.DataFrame(
    [(winners_r2[i], winners_r2[i+1]) for i in range(0, len(winners_r2), 2)],
    columns=["home_team", "away_team"]
)
Final1

Unnamed: 0,home_team,away_team
0,Brazil,IR Iran


In [None]:
final2 = pd.DataFrame(
    [(winners_r2[i], winners_r2[i+1]) for i in range(0, len(winners_r2), 2)],
    columns=["home_team", "away_team"]
)
winners_r1 = []
for index, row in Final1.iterrows():
    home = row["home_team"]
    away = row["away_team"]

    points_home, points_away = predict_points(home, away)

    # Decide winner (no draws in knockout)
    if points_home > points_away:
        winners_r1.append(home)
    elif points_away > points_home:
        winners_r1.append(away)
    else:
        winners_r1.append(random.choice([home, away]))  # tiebreak
winners_table=pd.DataFrame(
   {"The winner": winners_r1}
)
winners_table

Unnamed: 0,The winnwer
0,Brazil
