In [1]:
import numpy as np  # Resolver operaciones de algebra lineal
import pandas as pd  # Procesamiento de datos, archivos CSV
from matplotlib import pyplot as plt

%matplotlib inline

# Directory that holds every input CSV. Kept in one place so the notebook can
# be pointed at another copy of the data by editing a single line.
DATA_DIR = '/Users/Administrador/Desktop/EuroData'

# Squad data (semicolon-separated); later cells read its 'squad' and
# 'overall' columns.
teams = pd.read_csv(DATA_DIR + '/jugadores.csv', sep=";")

# FIFA rankings: align one country name with the other tables and keep only
# the columns used further down.
rankings = pd.read_csv(DATA_DIR + '/fifa_ranking.csv')
rankings = rankings.replace({'FYR Macedonia': 'North Macedonia'})
rankings = rankings.loc[:, ['rank', 'country_full', 'country_abrv', 'cur_year_avg_weighted', 'rank_date',
                           'two_year_ago_weighted', 'three_year_ago_weighted']]

# Combined score: sum of the current, two-year-ago and three-year-ago
# weighted averages.
rankings['weighted_points'] = (rankings['cur_year_avg_weighted']
                               + rankings['two_year_ago_weighted']
                               + rankings['three_year_ago_weighted'])
rankings['rank_date'] = pd.to_datetime(rankings['rank_date'])

# Match results; 'German DR' is relabelled as 'Germany' wherever it appears.
matches = pd.read_csv(DATA_DIR + '/results.csv')
matches = matches.replace({'German DR': 'Germany'})
matches['date'] = pd.to_datetime(matches['date'])

# Euro group table (semicolon-separated); rows that are empty in every kept
# column are discarded.
euro_cup = pd.read_csv(DATA_DIR + '/grupos_euro.csv', sep=";")
euro_cup = euro_cup[['Team', 'Group', 'First match against', 'Second match against', 'Third match against']]
euro_cup = euro_cup.dropna(how='all')

In [2]:
# Expand the rankings to one row per country per calendar day, carrying the
# last known values forward so any match date can be joined to a ranking.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
rankings = (
    rankings.set_index(['rank_date'])
    .groupby(['country_full'], group_keys=False)
    .resample('D').first()
    .ffill()
    .reset_index()
)

# Attach the ranking of the home team and of the away team on match day.
# Both merges are inner joins (the pandas default), so matches without a
# ranking row for either side are dropped.
matches = matches.merge(rankings,
                        left_on=['date', 'home_team'],
                        right_on=['rank_date', 'country_full'])
matches = matches.merge(rankings,
                        left_on=['date', 'away_team'],
                        right_on=['rank_date', 'country_full'],
                        suffixes=('_home', '_away'))

# Mean 'overall' value per squad. The column is selected *before* averaging:
# averaging the whole frame first fails on pandas >= 2.0 as soon as `teams`
# contains any non-numeric column.
teamValue = teams.groupby('squad')['overall'].mean()

# Add the squad value of both sides to the match table (inner joins again:
# matches involving a team without squad data are dropped).
matches = matches.merge(teamValue,
                        left_on=['home_team'],
                        right_on=['squad'])
matches = matches.merge(teamValue,
                        left_on=['away_team'],
                        right_on=['squad'],
                        suffixes=('_home', '_away'))

# Difference in squad value, home minus away.
matches['teamValue_difference'] = matches['overall_home'] - matches['overall_away']

# Add the squad value to the tournament table and index it by team name.
euro_cup = euro_cup.merge(teamValue,
                          left_on=['Team'],
                          right_on=['squad'])
euro_cup = euro_cup.set_index('Team')

In [3]:
# Build the model features and the target from the merged match table.
# Every difference is taken as home minus away.
matches['rank_difference'] = matches['rank_home'].sub(matches['rank_away'])
matches['average_rank'] = matches['rank_home'].add(matches['rank_away']).div(2)
matches['point_difference'] = matches['weighted_points_home'].sub(matches['weighted_points_away'])
matches['score_difference'] = matches['home_score'].sub(matches['away_score'])

# Target: True only when the home side scored more goals, so a draw is
# counted as a loss.
matches['is_won'] = matches['score_difference'].gt(0)

# Anything that is not a friendly is treated as a match with something at stake.
matches['is_stake'] = matches['tournament'].ne('Friendly')

In [4]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Predictors and target for the home-win classifier.
X = matches.loc[:, ['average_rank', 'rank_difference', 'teamValue_difference', 'is_stake']]
y = matches['is_won']

# Hold out 20% of the matches for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Fit the random forest on the training split.
model = RandomForestClassifier(n_estimators=100, random_state=30)
model.fit(X_train, y_train)

# Predictions on the training split and on the held-out split.
prediction_test_train = model.predict(X_train)
prediction_test = model.predict(X_test)

# Accuracy is reported for the held-out split only.
test_accuracy = metrics.accuracy_score(y_test, prediction_test)
print("Accuracy = ", test_accuracy)

# Feature importances, most important first.
feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_, index=feature_list)
feature_imp = feature_imp.sort_values(ascending=False)
print(feature_imp)

Accuracy =  0.6202090592334495
teamValue_difference    0.374805
rank_difference         0.312720
average_rank            0.268122
is_stake                0.044353
dtype: float64


In [5]:
# Preview the first five rows of the merged match table with its features.
matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,rank_date_home,...,weighted_points_away,overall_home,overall_away,teamValue_difference,rank_difference,average_rank,point_difference,score_difference,is_won,is_stake
0,1993-08-11,Sweden,Switzerland,1,2,Friendly,Borås,Sweden,False,1993-08-11,...,0.0,74.153846,75.538462,-1.384615,1.0,3.5,0.0,-1,False,False
1,1995-09-06,Sweden,Switzerland,0,0,UEFA Euro qualification,Gothenburg,Sweden,False,1995-09-06,...,0.0,74.153846,75.538462,-1.384615,4.0,13.0,0.0,0,False,True
2,2002-03-27,Sweden,Switzerland,1,1,Friendly,Malmö,Sweden,False,2002-03-27,...,0.0,74.153846,75.538462,-1.384615,-46.0,39.0,0.0,0,False,False
3,1996-03-27,Austria,Switzerland,1,0,Friendly,Vienna,Austria,False,1996-03-27,...,0.0,76.153846,75.538462,0.615385,21.0,31.5,0.0,1,True,False
4,2001-08-15,Austria,Switzerland,1,2,Friendly,Vienna,Austria,False,2001-08-15,...,0.0,76.153846,75.538462,0.615385,-19.0,52.5,0.0,-1,False,False


In [6]:
# Half-width of the band around 0.5 inside which a predicted match is
# scored as a draw.
margin = 0.05

# Rankings on the most recent date in the table, restricted to the teams in
# the tournament table and indexed by country name.
latest_rank_date = rankings['rank_date'].max()
is_latest = rankings['rank_date'] == latest_rank_date
is_euro_team = rankings['country_full'].isin(euro_cup.index.unique())
euro_cup_rankings = rankings[is_latest & is_euro_team].set_index(['country_full'])

In [7]:
from itertools import combinations

opponents = ['First match against', 'Second match against', 'Third match against']

# Reset the group-stage tallies. `total_prob` starts as a float because
# probabilities are accumulated into it; an int column would need an implicit
# dtype change on the first `+=`.
euro_cup['points'] = 0
euro_cup['total_prob'] = 0.0

# Iterate the groups in sorted order: iterating a `set` of strings gives an
# order that can change from one kernel session to the next.
for group in sorted(euro_cup['Group'].dropna().unique()):
    print('___Starting group {}:___'.format(group))
    group_teams = euro_cup.loc[euro_cup['Group'] == group].index

    # Every pair of teams in the group plays once; the first team of the pair
    # takes the "home" role in the model features.
    for home, away in combinations(group_teams, 2):
        print("{} vs. {}: ".format(home, away), end='')

        home_rank = euro_cup_rankings.loc[home, 'rank']
        opp_rank = euro_cup_rankings.loc[away, 'rank']
        teamValue_home = euro_cup.loc[home, 'overall']
        teamValue_away = euro_cup.loc[away, 'overall']

        # One-row feature frame in the column order the model was trained on.
        # Tournament matches are always flagged as having something at stake.
        features = {
            'average_rank': (home_rank + opp_rank) / 2,
            'rank_difference': home_rank - opp_rank,
            'teamValue_difference': teamValue_home - teamValue_away,
            'is_stake': True,
        }
        row = pd.DataFrame([features], columns=X_test.columns)

        # Column 1 of predict_proba is read as the probability of a home win.
        home_win_prob = model.predict_proba(row)[:, 1][0]
        euro_cup.loc[home, 'total_prob'] += home_win_prob
        euro_cup.loc[away, 'total_prob'] += 1 - home_win_prob

        # Probabilities strictly inside (0.5 - margin, 0.5 + margin) are
        # scored as a draw; otherwise the favoured side takes three points.
        if home_win_prob <= 0.5 - margin:
            print("{} wins with {:.2f}".format(away, 1 - home_win_prob))
            euro_cup.loc[away, 'points'] += 3
        elif home_win_prob >= 0.5 + margin:
            euro_cup.loc[home, 'points'] += 3
            print("{} wins with {:.2f}".format(home, home_win_prob))
        else:
            print("Draw")
            euro_cup.loc[home, 'points'] += 1
            euro_cup.loc[away, 'points'] += 1

___Starting group A:___
Turkey vs. Italy: Italy wins with 0.94
Turkey vs. Wales: Wales wins with 0.67
Turkey vs. Switzerland: Switzerland wins with 0.73
Italy vs. Wales: Italy wins with 0.79
Italy vs. Switzerland: Italy wins with 0.66
Wales vs. Switzerland: Switzerland wins with 0.65
___Starting group B:___
Denmark vs. Finland: Denmark wins with 0.91
Denmark vs. Belgium: Belgium wins with 0.88
Denmark vs. Russia: Denmark wins with 0.65
Finland vs. Belgium: Belgium wins with 0.90
Finland vs. Russia: Russia wins with 0.70
Belgium vs. Russia: Belgium wins with 0.94
___Starting group C:___
Austria vs. North Macedonia: Austria wins with 0.69
Austria vs. Netherlands: Netherlands wins with 0.85
Austria vs. Ukraine: Ukraine wins with 0.84
North Macedonia vs. Netherlands: Netherlands wins with 0.90
North Macedonia vs. Ukraine: Ukraine wins with 0.77
Netherlands vs. Ukraine: Netherlands wins with 0.60
___Starting group E:___
Poland vs. Slovakia: Draw
Poland vs. Spain: Spain wins with 0.74
Poland

In [8]:
# Order the table by group, points and accumulated win probability (all
# descending) and turn the 'Team' index back into a regular column.
euro_cup = euro_cup.sort_values(by=['Group', 'points', 'total_prob'], ascending=False).reset_index()

# Keep the three best teams of every group. `head(3)` behaves the same on all
# pandas versions, whereas `nth([0, 1, 2])` changed its return shape in
# pandas 2.0 and would leave a stray 'index' column after `reset_index()`.
next_round_wc = euro_cup.groupby('Group').head(3)

# Show 'Group' and 'Team' first, then every other column in its existing order.
leading_cols = ['Group', 'Team']
other_cols = [col for col in next_round_wc.columns if col not in leading_cols]
next_round_wc = next_round_wc[leading_cols + other_cols]

# Groups ascending; within a group, best team first. The fresh 0..n-1 index
# is what the knockout pairing uses to pick teams by position.
next_round_wc = next_round_wc.sort_values(by=['Group', 'points', 'total_prob'], ascending=[True, False, False])
next_round_wc = next_round_wc.reset_index(drop=True)
next_round_wc

Unnamed: 0,Group,Team,First match against,Second match against,Third match against,overall,points,total_prob
0,A,Italy,Turkey,Switzerland,Wales,80.153846,9,2.39
1,A,Switzerland,Wales,Italy,Turkey,75.538462,6,1.72
2,A,Wales,Switzerland,Turkey,Italy,70.384615,3,1.23
3,B,Belgium,Russia,Denmark,Finland,80.538462,9,2.72
4,B,Denmark,Finland,Belgium,Russia,76.423077,6,1.68
5,B,Russia,Belgium,Finland,Denmark,73.653846,3,1.11
6,C,Netherlands,Ukraine,Austria,North Macedonia,79.153846,9,2.35
7,C,Ukraine,Netherlands,North Macedonia,Austria,72.615385,6,2.01
8,C,Austria,North Macedonia,Netherlands,Ukraine,76.153846,3,1.0
9,D,Croatia,England,Czech Republic,Scotland,76.576923,9,2.33


In [9]:
# Row positions in `next_round_wc` of the 16 teams that enter the knockout
# bracket, listed pairwise: entries 0 and 1 play each other, 2 and 3, and so on.
emparejamiento = [3, 17, 0, 10, 15, 2, 8, 13, 12, 11, 6, 16, 9, 7, 1, 4]
next_round_wc = next_round_wc.loc[emparejamiento]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']
euro_cup = euro_cup.set_index(['Team'])
labels = list()  # one "HOME(odds) vs. AWAY(odds)" label per simulated match
odds = list()    # one [home_win_prob, away_win_prob] pair per simulated match

for f in finals:
    print("___Starting of the {}___".format(f))
    iterations = int(len(next_round_wc) / 2)
    winners = []

    for i in range(iterations):
        # Consecutive rows of the bracket play each other.
        home = next_round_wc.index[i * 2]
        away = next_round_wc.index[i * 2 + 1]
        print("{} vs. {}: ".format(home,
                                   away),
              end='')

        home_rank = euro_cup_rankings.loc[home, 'rank']
        opp_rank = euro_cup_rankings.loc[away, 'rank']
        teamValue_home = euro_cup.loc[home, 'overall']
        teamValue_away = euro_cup.loc[away, 'overall']

        # One-row feature frame in the column order the model was trained on.
        # Tournament matches are always flagged as having something at stake.
        features = {
            'average_rank': (home_rank + opp_rank) / 2,
            'rank_difference': home_rank - opp_rank,
            'teamValue_difference': teamValue_home - teamValue_away,
            'is_stake': True,
        }
        row = pd.DataFrame([features], columns=X_test.columns)

        # Predict once and reuse the scalar for the decision, the printout and
        # the odds (the model used to be queried twice per match).
        home_win_prob = model.predict_proba(row)[:, 1][0]

        # No draws in the knockout rounds: exactly 0.5 goes to the away side.
        if home_win_prob <= 0.5:
            print("{0} wins with probability {1:.2f}".format(away, 1 - home_win_prob))
            winners.append(away)
        else:
            print("{0} wins with probability {1:.2f}".format(home, home_win_prob))
            winners.append(home)

        # The label shows the reciprocal of each side's win probability.
        labels.append("{}({:.2f}) vs. {}({:.2f})".format(euro_cup_rankings.loc[home, 'country_abrv'],
                                                         1 / home_win_prob,
                                                         euro_cup_rankings.loc[away, 'country_abrv'],
                                                         1 / (1 - home_win_prob)))
        odds.append([home_win_prob, 1 - home_win_prob])

    # Winners, in match order, form the bracket of the next round.
    next_round_wc = next_round_wc.loc[winners]
    print("\n")
    print("\n")

___Starting of the round_of_16___
Belgium vs. Portugal: Portugal wins with probability 0.53
Italy vs. England: Italy wins with probability 0.55
Germany vs. Wales: Germany wins with probability 0.71
Austria vs. Spain: Spain wins with probability 0.97
Sweden vs. Scotland: Sweden wins with probability 0.53
Netherlands vs. France: France wins with probability 0.92
Croatia vs. Ukraine: Croatia wins with probability 0.84
Switzerland vs. Denmark: Denmark wins with probability 0.71


___Starting of the quarterfinal___
Portugal vs. Italy: Italy wins with probability 0.55
Germany vs. Spain: Spain wins with probability 0.56
Sweden vs. France: France wins with probability 0.89
Croatia vs. Denmark: Denmark wins with probability 0.55


___Starting of the semifinal___
Italy vs. Spain: Italy wins with probability 0.52
France vs. Denmark: France wins with probability 0.73


___Starting of the final___
Italy vs. France: France wins with probability 0.83


