# Predicting the UEFA European Football Championship

Tutorial by Aabid Roshan, Sid Joshi, Pranav Sivaraman

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the match results and convert the 'date' column to datetimes so that
# later cells can filter on the year through the .dt accessor.
df = pd.read_csv("match_results.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

---
# Creating the Ranking System

### This is the list of teams we are going to look at. It consists of the teams that qualified for the upcoming 2020 UEFA European Football Championship. The list can be changed if need be.

In [4]:
# Teams to rate. Edit this list to analyse a different set of teams; the order
# below is kept because it is the order in which ratings are initialised.
teams = [
    "Turkey", "Italy", "Wales", "Switzerland",
    "Denmark", "Finland", "Belgium", "Russia",
    "Netherlands", "Ukraine", "Austria", "North Macedonia",
    "England", "Croatia", "Scotland", "Czech Republic",
    "Spain", "Sweden", "Poland", "Slovakia",
    "Hungary", "Portugal", "France", "Germany",
]

### The functions below calculate the Elo rating of each team, following the Elo-based approach used by FIFA.
For more information on the calculations, see: https://en.wikipedia.org/wiki/World_Football_Elo_Ratings

In [21]:
def initialize_elo_system(teams, initial_rating):
    """Create the rating history for every team.

    Args:
        teams: Iterable of team names.
        initial_rating: Rating each team starts with.

    Returns:
        dict mapping each team name to its own one-element list
        ``[initial_rating]``; later ratings are appended to that list.
    """
    return {name: [initial_rating] for name in teams}

def calculate_probabilities(home_rating, away_rating, scale_factor=400):
    """Return the Elo expected score of the home side and of the away side.

    Args:
        home_rating: Current rating of the home team.
        away_rating: Current rating of the away team.
        scale_factor: Rating lead that corresponds to 10:1 odds in favour of
            the higher-rated side.

    Returns:
        Tuple ``(p_home, p_away)``.
    """
    def expected_score(own, opponent):
        # Logistic curve in base 10 on the rating difference.
        return 1 / (1 + 10**((opponent - own) / scale_factor))

    return expected_score(home_rating, away_rating), expected_score(away_rating, home_rating)

def calculate_rankings(matches, ratings, k_factor=22.2):
    """Replay ``matches`` in row order and extend each team's rating history.

    Args:
        matches: DataFrame with ``home_team``, ``away_team``, ``home_score``
            and ``away_score`` columns; rows are processed top to bottom.
        ratings: dict mapping team name to a list of ratings (latest last).
            It is updated in place: every match with a recorded score appends
            one new rating for each of the two teams involved.
        k_factor: Largest number of points a single match can move a rating.

    Returns:
        The same ``ratings`` dict that was passed in.

    Raises:
        KeyError: If a scored match involves a team missing from ``ratings``.
    """
    fixtures = zip(
        matches['home_team'].values, matches['away_team'].values,
        matches['home_score'].values, matches['away_score'].values,
    )

    for home_team, away_team, home_score, away_score in fixtures:
        # A match without a recorded score has no result. Every comparison
        # with NaN is False, so without this guard none of the branches below
        # would fire and the previous match's result would be reused.
        if pd.isna(home_score) or pd.isna(away_score):
            continue

        # Read both ratings before appending so a result never feeds into
        # the other side's update for the same match.
        home_rating = ratings[home_team][-1]
        away_rating = ratings[away_team][-1]
        p_home, p_away = calculate_probabilities(home_rating, away_rating)

        # Actual result: 1 for a win, 0.5 for a draw, 0 for a defeat.
        if home_score > away_score:
            match_result_home, match_result_away = 1, 0
        elif home_score < away_score:
            match_result_home, match_result_away = 0, 1
        else:
            match_result_home, match_result_away = 0.5, 0.5

        # Move each rating by k_factor times (actual result - expected result).
        ratings[home_team].append(home_rating + k_factor * (match_result_home - p_home))
        ratings[away_team].append(away_rating + k_factor * (match_result_away - p_away))

    return ratings

def calculate_elo(teams, year_range, initial_rating=1200, matches=None):
    """Rate ``teams`` from the matches they played against each other.

    Only matches in which both sides belong to ``teams`` and whose year falls
    inside ``year_range`` (both ends inclusive) are used.

    Args:
        teams: List of team names to rate.
        year_range: Two-item sequence ``[first_year, last_year]``.
        initial_rating: Rating every team starts from.
        matches: DataFrame of match results with a datetime ``date`` column
            and ``home_team`` / ``away_team`` columns, plus the columns that
            ``calculate_rankings`` reads. Defaults to the notebook-level ``df``.

    Returns:
        dict mapping team name to its final rating, ordered from the
        highest-rated team to the lowest.
    """
    all_matches = df if matches is None else matches
    ratings = initialize_elo_system(teams, initial_rating)

    both_selected = all_matches['home_team'].isin(teams) & all_matches['away_team'].isin(teams)
    in_window = all_matches['date'].dt.year.between(year_range[0], year_range[1])

    # Elo updates are order-dependent, so replay the matches chronologically
    # instead of trusting the row order of the input. mergesort is stable, so
    # matches played on the same date keep their original relative order.
    selected = all_matches[both_selected & in_window].sort_values('date', kind='mergesort')

    new_ratings = calculate_rankings(selected, ratings)
    ordered = sorted(new_ratings.items(), key=lambda item: item[1][-1], reverse=True)

    return {team: history[-1] for team, history in ordered}

In [22]:
# Rate the teams on the matches played from 2006 through 2010, then show the
# resulting ranking as the cell output.
rankings = calculate_elo(teams, year_range=[2006, 2010])
rankings

{'Spain': 1364.5183009647797,
 'Netherlands': 1313.7134131866942,
 'Germany': 1311.2241288528364,
 'Croatia': 1240.5535333845992,
 'Italy': 1239.1587366951428,
 'France': 1232.2957884337645,
 'England': 1226.9820451253613,
 'Portugal': 1226.670041047823,
 'Switzerland': 1223.7896858908427,
 'Russia': 1213.045771738059,
 'Turkey': 1205.615986060427,
 'Czech Republic': 1193.2703435177937,
 'Sweden': 1188.5366080369754,
 'Slovakia': 1187.7533660997435,
 'Ukraine': 1183.0565508409186,
 'Denmark': 1165.6846236970146,
 'Finland': 1165.6292344395317,
 'Poland': 1159.2568534609081,
 'Scotland': 1148.6082421430551,
 'North Macedonia': 1138.9454850945222,
 'Hungary': 1138.0076886006218,
 'Belgium': 1118.97073932807,
 'Wales': 1109.6238695003447,
 'Austria': 1105.088963860171}

---
# Seeing if being at home gives teams an advantage

In [27]:
def calculate_home_percentages(year_range, matches=None):
    """Measure how often the home side wins, or at least avoids defeat.

    Only matches flagged as non-neutral, dated inside ``year_range`` (both
    ends inclusive) and with both scores recorded are counted.

    Args:
        year_range: Two-item sequence ``[first_year, last_year]``.
        matches: DataFrame with a datetime ``date`` column and ``neutral``,
            ``home_score`` and ``away_score`` columns. Defaults to the
            notebook-level ``df``.

    Returns:
        Tuple ``(home_win_pct, home_win_or_draw_pct)`` of floats between 0 and
        100. Both values are ``nan`` when no match passes the filters.
    """
    all_matches = df if matches is None else matches

    at_home = all_matches['neutral'].eq(False)
    in_window = all_matches['date'].dt.year.between(year_range[0], year_range[1])
    # A match without a recorded score has no result; leaving it in the total
    # would silently count it as a home defeat.
    played = all_matches[at_home & in_window].dropna(subset=['home_score', 'away_score'])

    total = len(played)
    if total == 0:
        # Nothing to average over; report "undefined" rather than dividing by zero.
        return (float('nan'), float('nan'))

    # Plain ints keep the arithmetic (and the displayed floats) the same as a
    # row-by-row count would give.
    home_win = int((played['home_score'] > played['away_score']).sum())
    draw = int((played['home_score'] == played['away_score']).sum())

    return (home_win / total * 100, (home_win + draw) / total * 100)


In [28]:
# Home-win and home-unbeaten percentages for matches played from 2010 through 2014.
calculate_home_percentages(year_range=[2010, 2014])

(50.224215246636774, 73.42301943198805)

In [17]:
# Show every match played at a non-neutral venue. `nonneutral` is defined in
# this cell so that it also runs on a fresh kernel; no earlier cell creates the
# name at notebook level.
nonneutral = df[df['neutral'] == False]
nonneutral

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4,2,Friendly,London,England,False
2,1874-03-07,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2,2,Friendly,London,England,False
4,1876-03-04,Scotland,England,3,0,Friendly,Glasgow,Scotland,False
...,...,...,...,...,...,...,...,...,...
42079,2021-03-31,Andorra,Hungary,1,4,FIFA World Cup qualification,Andorra la Vella,Andorra,False
42080,2021-03-31,San Marino,Albania,0,2,FIFA World Cup qualification,Serravalle,San Marino,False
42081,2021-03-31,Armenia,Romania,3,2,FIFA World Cup qualification,Yerevan,Armenia,False
42082,2021-03-31,Germany,North Macedonia,1,2,FIFA World Cup qualification,Duisburg,Germany,False
