In [18]:
import pandas as pd
import numpy as np
import math

seed = 1234

In [7]:
wc_data = pd.read_csv("wc_dataset/WorldCupMatches.csv").dropna()
wc_data.sample(3, random_state=seed)

Unnamed: 0,Year,Datetime,Stage,Stadium,City,Home Team Name,Home Team Goals,Away Team Goals,Away Team Name,Win conditions,Attendance,Half-time Home Goals,Half-time Away Goals,Referee,Assistant 1,Assistant 2,RoundID,MatchID,Home Team Initials,Away Team Initials
104,1958.0,08 Jun 1958 - 19:00,Group 3,Jarnvallen,Sandviken,Hungary,1.0,1.0,Wales,,15343.0,1.0,1.0,CODESAL Jose Maria (URU),LEMESIC Leo (YUG),VAN NUFFEL Lucien (BEL),220.0,1407.0,HUN,WAL
767,2010.0,03 Jul 2010 - 20:30,Quarter-finals,Ellis Park Stadium,Johannesburg,Paraguay,0.0,1.0,Spain,,55359.0,0.0,0.0,BATRES Carlos (GUA),LEAL Leonel (CRC),PASTRANA Carlos (HON),249718.0,300061506.0,PAR,ESP
588,2002.0,03 June 2002 - 18:00,Group C,Munsu Football Stadium,Ulsan,Brazil,2.0,1.0,Turkey,,33842.0,0.0,1.0,KIM Young Joo (KOR),KRISHNAN Visva (SIN),FERNANDEZ Vladimir (SLV),43950100.0,43950010.0,BRA,TUR


In [44]:
team_matchups = {}

for row in wc_data.itertuples():
    t1 = row._19
    t2 = row._20
    order = list(sorted([t1,t2]))
    tup = (order[0], order[1])
    
    if tup in team_matchups:
        team_matchups[tup] += 1
    else:
        team_matchups[tup] = 1

print(team_matchups)
print(len(list(filter(lambda x: x > 1, team_matchups.values()))), len(team_matchups.values()))

{('FRA', 'MEX'): 4, ('BEL', 'USA'): 3, ('BRA', 'YUG'): 4, ('PER', 'ROU'): 1, ('ARG', 'FRA'): 2, ('CHI', 'MEX'): 1, ('BOL', 'YUG'): 1, ('PAR', 'USA'): 1, ('PER', 'URU'): 1, ('CHI', 'FRA'): 1, ('ARG', 'MEX'): 3, ('BOL', 'BRA'): 1, ('BEL', 'PAR'): 2, ('ROU', 'URU'): 1, ('ARG', 'CHI'): 1, ('ARG', 'USA'): 1, ('URU', 'YUG'): 2, ('ARG', 'URU'): 2, ('AUT', 'FRA'): 2, ('EGY', 'HUN'): 1, ('NED', 'SUI'): 1, ('ARG', 'SWE'): 2, ('BEL', 'GER'): 2, ('BRA', 'ESP'): 5, ('ITA', 'USA'): 3, ('ROU', 'TCH'): 2, ('SUI', 'TCH'): 1, ('GER', 'SWE'): 2, ('ESP', 'ITA'): 3, ('AUT', 'HUN'): 1, ('AUT', 'ITA'): 4, ('GER', 'TCH'): 1, ('AUT', 'GER'): 1, ('ITA', 'TCH'): 2, ('GER', 'SUI'): 2, ('HUN', 'INH'): 1, ('BEL', 'FRA'): 2, ('CUB', 'ROU'): 2, ('ITA', 'NOR'): 3, ('BRA', 'POL'): 4, ('NED', 'TCH'): 1, ('BRA', 'TCH'): 5, ('HUN', 'SUI'): 1, ('CUB', 'SWE'): 1, ('FRA', 'ITA'): 5, ('HUN', 'SWE'): 2, ('BRA', 'ITA'): 5, ('BRA', 'SWE'): 7, ('HUN', 'ITA'): 2, ('BRA', 'MEX'): 4, ('CHI', 'ENG'): 1, ('ESP', 'USA'): 1, ('ITA', 'SW

In [104]:
def calculate_lambda(t1, t2):
    """
    Lambda = w1 * (avg goals for t1) + w2 * (avg goals for t1 against t2)
    w1 + w2 = 1
    
    returns lambda_t1
    """
    w1 = 0.8
    w2 = 0.2
    t1_df_home = wc_data[(wc_data["Home Team Initials"] == t1)]
    t1_df_away = wc_data[(wc_data["Away Team Initials"] == t1)]
    avg_goals_t1 = (lazy_mean(list(t1_df_home[t1_df_home["Away Team Initials"] != t2]["Home Team Goals"])) + lazy_mean(list(t1_df_away[t1_df_away["Home Team Initials"] != t2]["Away Team Goals"]))) / 2
    avg_goals_vs_t2 = (lazy_mean(list(t1_df_home[t1_df_home["Away Team Initials"] == t2]["Home Team Goals"])) + lazy_mean(list(t1_df_away[t1_df_away["Home Team Initials"] == t2]["Away Team Goals"]))) / 2
    
    return (w1 * avg_goals_t1) + (w2 * avg_goals_vs_t2)

def lazy_mean(arr):
    if len(arr) == 0:
        return 0
    
    return sum(arr) / len(arr)

def predict_proba(k, lbd):
    return ((lbd ** k) * (math.e ** (-lbd))) / math.factorial(k)
    

In [105]:
team_pairs = [("BRA", "SUI"), ("POR", "URU")]

for t1, t2 in team_pairs:
    max_p = -1
    max_g1 = -1
    max_g2 = -1
    
    for g1 in range(0, 11):
        for g2 in range(0, 11):
            lbd_t1 = calculate_lambda(t1, t2)
            lbd_t2 = calculate_lambda(t2, t1)
            p_t1 = predict_proba(g1, lbd_t1)
            p_t2 = predict_proba(g2, lbd_t2)
            p_combo = p_t1 * p_t2
            
            if p_combo > max_p:
                max_p = p_combo
                max_g1 = g1
                max_g2 = g2
    print(f"Team {t1} v {t2}: {max_g1} - {max_g2}, prob: {max_p}")

Team BRA v SUI: 1 - 1, prob: 0.10590145934453563
Team POR v URU: 1 - 1, prob: 0.13135463428853397


In [106]:
for t1, t2 in team_pairs:
    lbd_t1 = calculate_lambda(t1, t2)
    lbd_t2 = calculate_lambda(t2, t1)
    t1_goals = round(lbd_t1)
    t2_goals = round(lbd_t2)
    
    print(f"Team {t1} v {t2}: {t1_goals} - {t2_goals}, {lbd_t1} - {lbd_t2}")

Team BRA v SUI: 2 - 1, 1.7713200379867047 - 1.3333333333333333
Team POR v URU: 1 - 1, 1.1800000000000002 - 1.1857142857142857
