In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
%matplotlib inline

In [3]:
# Load international match results and parse the match date as tz-aware UTC.
results = pd.read_csv('data/fifa/international_results.csv')
results['date'] = pd.to_datetime(results['date'], utc=True)

# restrict dates (both bounds are exclusive)
earliest_date = '2010-01-01'
latest_date = '2018-06-14'
in_window = (results['date'] > earliest_date) & (results['date'] < latest_date)
# .copy() yields an independent frame, so columns added to `results` later are
# not written into a slice of the unfiltered data (SettingWithCopyWarning).
results = results[in_window].copy()
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
31700,2010-01-02 00:00:00+00:00,Iran,Korea DPR,1,0,Friendly,Doha,Qatar,True
31701,2010-01-02 00:00:00+00:00,Qatar,Mali,0,0,Friendly,Doha,Qatar,False
31702,2010-01-02 00:00:00+00:00,Syria,Zimbabwe,6,0,Friendly,Kuala Lumpur,Malaysia,True
31703,2010-01-02 00:00:00+00:00,Yemen,Tajikistan,0,1,Friendly,Sana'a,Yemen,False
31704,2010-01-03 00:00:00+00:00,Angola,Gambia,1,1,Friendly,Vila Real de Santo António,Portugal,True


In [4]:
# Load the team ratings and parse the snapshot date as tz-aware UTC.
ratings = pd.read_csv('data/team_stats.csv')
ratings['date'] = pd.to_datetime(ratings['date'], utc=True)

# restrict dates (both bounds are exclusive)
in_window = (ratings['date'] > earliest_date) & (ratings['date'] < latest_date)
# .copy() yields an independent frame, so later edits to `ratings` do not act
# on a slice of the unfiltered data (SettingWithCopyWarning).
ratings = ratings[in_window].copy()
ratings.head()

Unnamed: 0,team,date,attack,defence,full_age,midfield,overall,prestige,start_age
1364,Brazil,2018-05-28 00:00:00+00:00,86.0,85.0,27.09,83.0,85.0,10.0,26.73
1365,England,2018-05-28 00:00:00+00:00,84.0,81.0,24.65,81.0,82.0,8.0,25.0
1366,Italy,2018-05-28 00:00:00+00:00,85.0,85.0,26.04,81.0,83.0,9.0,28.45
1367,Spain,2018-05-28 00:00:00+00:00,84.0,86.0,27.04,86.0,86.0,9.0,28.18
1368,France,2018-05-28 00:00:00+00:00,83.0,82.0,25.65,85.0,84.0,9.0,25.18


Next, for each match we need to find the FIFA team-ratings date that is closest to the match date.

In [5]:
# Distinct rating dates and distinct rated teams.
dates = ratings['date'].unique()
teams = ratings['team'].unique()

In [60]:
# check that we found the corresponding team from team ratings in results
# An explicit membership test replaces `assert` inside a bare `except:`:
# asserts are stripped under `python -O`, and a bare except would also hide
# unrelated errors (e.g. a misspelled column name).
# Home sides are checked first, then away sides, so the output order is unchanged.
for side in ('home_team', 'away_team'):
    known_names = set(results[side].unique())
    for t in teams:
        if t not in known_names:
            print('Cannot Find {}'.format(t))

Cannot Find Republic of Ireland
Cannot Find United States
Cannot Find China PR
Cannot Find Côte d'Ivoire
Cannot Find Republic of Ireland
Cannot Find United States
Cannot Find China PR
Cannot Find Côte d'Ivoire


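Before we proceed with anything else, let's check that country names are spelled the same way in both datasets. The check above shows that 'Republic of Ireland', 'United States', 'China PR', and 'Côte d'Ivoire' from the ratings data are not found in the results data, presumably because they are spelled differently there, so we rename them to match.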

In [63]:
# Rename the rated teams that were not found in the results data.
team_name_fixes = {
    'Republic of Ireland': 'Ireland',
    'United States': 'USA',
    'China PR': 'China',
    "Côte d'Ivoire": 'Ivory Coast',
}
# Only the team column is rewritten, in a single pass, and a new frame is
# bound to `ratings` instead of mutating the filtered frame in place.
ratings = ratings.assign(team=ratings['team'].replace(team_name_fixes))

Now team names are spelled the same way in both datasets.

In [65]:
# find closest date that we have data on
def _nearest_rating_date(match_row):
    """Return the entry of `dates` with the smallest absolute gap to the match date."""
    return min(dates, key=lambda rating_date: abs(rating_date - match_row.date))


results['closest_date'] = results.apply(_nearest_rating_date, axis=1)
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,closest_date
31700,2010-01-02 00:00:00+00:00,Iran,Korea DPR,1,0,Friendly,Doha,Qatar,True,2010-02-22 00:00:00+00:00
31701,2010-01-02 00:00:00+00:00,Qatar,Mali,0,0,Friendly,Doha,Qatar,False,2010-02-22 00:00:00+00:00
31702,2010-01-02 00:00:00+00:00,Syria,Zimbabwe,6,0,Friendly,Kuala Lumpur,Malaysia,True,2010-02-22 00:00:00+00:00
31703,2010-01-02 00:00:00+00:00,Yemen,Tajikistan,0,1,Friendly,Sana'a,Yemen,False,2010-02-22 00:00:00+00:00
31704,2010-01-03 00:00:00+00:00,Angola,Gambia,1,1,Friendly,Vila Real de Santo António,Portugal,True,2010-02-22 00:00:00+00:00


Now that each match has the closest ratings date, we can merge the two datasets on team and date. Note that we have to merge twice in order to account for both teams in the results table.

In [66]:
# Ratings are attached in two passes, each an inner join keyed on the closest
# rating date plus one side's team name.
rating_keys = ['date', 'team']

# pass 1: home team with their closest ratings
results_ratings1 = pd.merge(
    results, ratings,
    how='inner',
    left_on=['closest_date', 'home_team'],
    right_on=rating_keys,
)

# pass 2: away team with their closest ratings
results_ratings2 = pd.merge(
    results_ratings1, ratings,
    how='inner',
    left_on=['closest_date', 'away_team'],
    right_on=rating_keys,
)

In [67]:
# (rows, columns) of the frame produced by the two inner merges
results_ratings2.shape

(1881, 28)

We find that we have 1881 observations that can still be used after matching team ratings with the results dataframe with an inner merge. We now clean up the merged dataframe slightly.

In [68]:
# preview the merged frame
results_ratings2.head()

Unnamed: 0,date_x,home_team,away_team,home_score,away_score,tournament,city,country,neutral,closest_date,...,start_age_x,team_y,date,attack_y,defence_y,full_age_y,midfield_y,overall_y,prestige_y,start_age_y
0,2010-01-12 00:00:00+00:00,Egypt,Nigeria,3,1,African Cup of Nations,Benguela,Angola,True,2010-02-22 00:00:00+00:00,...,28.64,Nigeria,2010-02-22 00:00:00+00:00,75.0,73.0,24.53,75.0,75.0,4.0,23.55
1,2010-01-25 00:00:00+00:00,Egypt,Cameroon,3,1,African Cup of Nations,Benguela,Angola,True,2010-02-22 00:00:00+00:00,...,28.64,Cameroon,2010-02-22 00:00:00+00:00,73.0,74.0,26.0,73.0,76.0,13.0,26.0
2,2010-03-03 00:00:00+00:00,Italy,Cameroon,0,0,Friendly,Monaco,Monaco,True,2010-02-22 00:00:00+00:00,...,28.91,Cameroon,2010-02-22 00:00:00+00:00,73.0,74.0,26.0,73.0,76.0,13.0,26.0
3,2010-02-10 00:00:00+00:00,China,Korea Republic,3,0,EAFF Championship,Tokyo,Japan,True,2010-02-22 00:00:00+00:00,...,26.55,Korea Republic,2010-02-22 00:00:00+00:00,67.0,65.0,25.57,69.0,70.0,10.0,26.55
4,2010-03-03 00:00:00+00:00,Ivory Coast,Korea Republic,0,2,Friendly,London,England,True,2010-02-22 00:00:00+00:00,...,27.82,Korea Republic,2010-02-22 00:00:00+00:00,67.0,65.0,25.57,69.0,70.0,10.0,26.55


In [69]:
# Keep the match date, both scores, and each rating field for both sides
# (_x = home team, _y = away team).
rating_fields = ['attack', 'defence', 'midfield', 'overall',
                 'prestige', 'start_age', 'full_age']
columns = ['date_x', 'home_score', 'away_score']
for field in rating_fields:
    columns.extend([field + '_x', field + '_y'])

results_ratings2 = results_ratings2.loc[:, columns]
results_ratings2.columns

Index(['date_x', 'home_score', 'away_score', 'attack_x', 'attack_y',
       'defence_x', 'defence_y', 'midfield_x', 'midfield_y', 'overall_x',
       'overall_y', 'prestige_x', 'prestige_y', 'start_age_x', 'start_age_y',
       'full_age_x', 'full_age_y'],
      dtype='object')

Our final training features are the differences between the two teams' ratings (home team minus away team). Now let's clean up the dataframe for our training.

In [70]:
# preview the frame after column selection
results_ratings2.head()

Unnamed: 0,date_x,home_score,away_score,attack_x,attack_y,defence_x,defence_y,midfield_x,midfield_y,overall_x,overall_y,prestige_x,prestige_y,start_age_x,start_age_y,full_age_x,full_age_y
0,2010-01-12 00:00:00+00:00,3,1,70.0,75.0,70.0,73.0,75.0,75.0,73.0,75.0,5.0,4.0,28.64,23.55,27.48,24.53
1,2010-01-25 00:00:00+00:00,3,1,70.0,73.0,70.0,74.0,75.0,73.0,73.0,76.0,5.0,13.0,28.64,26.0,27.48,26.0
2,2010-03-03 00:00:00+00:00,0,0,83.0,73.0,80.0,74.0,81.0,73.0,83.0,76.0,19.0,13.0,28.91,26.0,27.17,26.0
3,2010-02-10 00:00:00+00:00,3,0,63.0,67.0,64.0,65.0,64.0,69.0,66.0,70.0,4.0,10.0,26.55,26.55,26.93,25.57
4,2010-03-03 00:00:00+00:00,0,2,76.0,67.0,76.0,65.0,75.0,69.0,75.0,70.0,6.0,10.0,27.82,26.55,27.39,25.57


In [72]:
# difference data
def _home_minus_away(stat):
    """Return the home-side (_x) column minus the away-side (_y) column for `stat`."""
    return results_ratings2[stat + '_x'] - results_ratings2[stat + '_y']


score_diff = results_ratings2['home_score'] - results_ratings2['away_score']
attack_diff = _home_minus_away('attack')
defence_diff = _home_minus_away('defence')
midfield_diff = _home_minus_away('midfield')
overall_diff = _home_minus_away('overall')
prestige_diff = _home_minus_away('prestige')
start_age_diff = _home_minus_away('start_age')
full_age_diff = _home_minus_away('full_age')

# convert score diff to categorical: 0 = home loss, 1 = home win, 2 = draw
home_win = np.select([score_diff < 0, score_diff > 0], [0, 1], default=2).tolist()

In [73]:
# create new dataframe: one row per match, rating differences plus outcome label
train_columns = dict(
    attack_diff=attack_diff,
    defence_diff=defence_diff,
    midfield_diff=midfield_diff,
    overall_diff=overall_diff,
    prestige_diff=prestige_diff,
    start_age_diff=start_age_diff,
    full_age_diff=full_age_diff,
    home_win=home_win,
)
train_team = pd.DataFrame(train_columns)
train_team.head()

Unnamed: 0,attack_diff,defence_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,full_age_diff,home_win
0,-5.0,-3.0,0.0,-2.0,1.0,5.09,2.95,1
1,-3.0,-4.0,2.0,-3.0,-8.0,2.64,1.48,1
2,10.0,6.0,8.0,7.0,6.0,2.91,1.17,2
3,-4.0,-1.0,-5.0,-4.0,-6.0,0.0,1.36,1
4,9.0,11.0,6.0,5.0,-4.0,1.27,1.82,0


In [74]:
# save to csv (the row index is not written)
train_csv_path = 'data/train_team.csv'
train_team.to_csv(train_csv_path, index=False)

We now create our test set from the actual 2018 World Cup data.

In [99]:
# Reload the ratings file and parse the snapshot date as tz-aware UTC.
ratings_wc = pd.read_csv('data/team_stats.csv')
ratings_wc['date'] = pd.to_datetime(ratings_wc['date'], utc=True)

# restrict dates (both bounds are inclusive here)
latest_date = '2018-06-14'
wc_start = '2018-06-16' # first WC rating
wc_end = '2018-07-15'
during_wc = (ratings_wc['date'] >= wc_start) & (ratings_wc['date'] <= wc_end)
# .copy() yields an independent frame, so later edits to `ratings_wc` do not
# act on a slice of the unfiltered data (SettingWithCopyWarning).
ratings_wc = ratings_wc[during_wc].copy()
ratings_wc.head()

Unnamed: 0,team,date,attack,defence,full_age,midfield,overall,prestige,start_age
0,Brazil,2018-07-15 00:00:00+00:00,87.0,84.0,27.43,86.0,86.0,10.0,27.55
1,England,2018-07-15 00:00:00+00:00,83.0,80.0,25.88,81.0,81.0,8.0,24.64
2,Italy,2018-07-15 00:00:00+00:00,81.0,82.0,25.96,81.0,82.0,9.0,27.36
3,Spain,2018-07-15 00:00:00+00:00,84.0,85.0,27.18,86.0,85.0,9.0,27.27
4,France,2018-07-15 00:00:00+00:00,86.0,81.0,25.05,84.0,85.0,9.0,24.82


In [85]:
# Distinct rating dates and distinct rated teams within the World Cup window.
dates_wc = ratings_wc['date'].unique()
teams_wc = ratings_wc['team'].unique()

In [96]:
# Source: https://gitlab.com/djh_or/2018-world-cup-stats/blob/master/world_cup_2018_stats.csv
wc_stats_path = "data/world_cup_2018_stats.csv"
results_wc = pd.read_csv(wc_stats_path)
results_wc.head()

Unnamed: 0,Game,Group,Team,Opponent,Home/Away,Score,WDL,Pens?,Goals For,Goals Against,...,Passes Completed,Distance Covered km,Balls recovered,Tackles,Blocks,Clearances,Yellow cards,Red Cards,Second Yellow Card leading to Red Card,Fouls Committed
0,1,A,Russia,Saudi Arabia,home,5-0,W,,5,0,...,240,118,53,9,3,19,1,0,0,22
1,1,A,Saudi Arabia,Russia,away,5-0,L,,0,5,...,442,105,48,16,3,31,1,0,0,10
2,2,A,Egypt,Uruguay,home,0-1,L,,0,1,...,308,112,57,12,4,32,2,0,0,12
3,2,A,Uruguay,Egypt,away,0-1,W,,1,0,...,508,111,54,8,2,22,0,0,0,6
4,3,B,Morocco,IR Iran,home,0-1,L,,0,1,...,371,101,38,9,1,16,1,0,0,22


Before we proceed, let's ensure that country names match between the two datasets.

In [112]:
# check that we found the corresponding team from team ratings in results
# but now we reverse it such that we try to find a rating for each wc team
# An explicit membership test replaces `assert` inside a bare `except:`:
# asserts are stripped under `python -O`, and a bare except would also hide
# unrelated errors.
rated_teams_wc = set(teams_wc)
for t in results_wc.Team.unique():
    if t not in rated_teams_wc:
        print('Cannot Find {}'.format(t))


Cannot Find IR Iran


In [113]:
# Use the World Cup results spelling for Iran. Only the team column is
# rewritten, and a new frame is bound instead of mutating in place.
ratings_wc = ratings_wc.assign(team=ratings_wc['team'].replace({'Iran': 'IR Iran'}))

In [114]:
# Keep the two team columns and encode the W/D/L letter as an integer label:
# 'L' -> 0, 'W' -> 1, anything else -> 2. The WDL column itself is then dropped.
outcome_codes = {'L': 0, 'W': 1}
results_wc = (
    results_wc.loc[:, ['Team', 'Opponent', 'WDL']]
    .assign(home_win=lambda frame: frame['WDL'].map(outcome_codes).fillna(2).astype('int64'))
    .drop(columns='WDL')
)
results_wc.head()

Unnamed: 0,Team,Opponent,home_win
0,Russia,Saudi Arabia,1
1,Saudi Arabia,Russia,0
2,Egypt,Uruguay,0
3,Uruguay,Egypt,1
4,Morocco,IR Iran,0


Now we can match the ratings data to our test dataset as well. We note that there might be some value in using the updated FIFA ratings at each time of the match. However, for simplicity, let's just assume that teams have the same rating throughout the tournament. We will take the rating at the start of the tournament.

In [115]:
# Ratings as of the first rating date of the tournament.
ratings_wc_start = ratings_wc.loc[ratings_wc['date'] == wc_start]

# merge ratings with results table: first for Team, then for Opponent.
# Left joins keep every results row.
results_rating_wc1 = pd.merge(
    results_wc, ratings_wc_start,
    how='left', left_on='Team', right_on='team',
)
results_rating_wc2 = pd.merge(
    results_rating_wc1, ratings_wc_start,
    how='left', left_on='Opponent', right_on='team',
)



In [116]:
# check that every team has a rating
# The left merges keep every results row even when no rating matched, so the
# row count alone only guards against rows being duplicated by the merge.
assert results_rating_wc2.shape[0] == results_wc.shape[0], \
    'merging ratings changed the number of result rows'

# A team without a rating shows up as a missing team_x / team_y after the merge.
missing_teams = sorted(
    set(results_rating_wc2.loc[results_rating_wc2['team_x'].isna(), 'Team'])
    | set(results_rating_wc2.loc[results_rating_wc2['team_y'].isna(), 'Opponent'])
)
assert not missing_teams, 'No rating found for: {}'.format(missing_teams)

In [117]:
# list the merged column names (_x = Team side, _y = Opponent side)
results_rating_wc2.columns

Index(['Team', 'Opponent', 'home_win', 'team_x', 'date_x', 'attack_x',
       'defence_x', 'full_age_x', 'midfield_x', 'overall_x', 'prestige_x',
       'start_age_x', 'team_y', 'date_y', 'attack_y', 'defence_y',
       'full_age_y', 'midfield_y', 'overall_y', 'prestige_y', 'start_age_y'],
      dtype='object')

In [119]:
# create new dataframe: Team-minus-Opponent rating differences plus outcome label
wc_rating_fields = ['attack', 'defence', 'midfield', 'overall',
                    'prestige', 'start_age', 'full_age']
test_columns = {}
for field in wc_rating_fields:
    test_columns[field + '_diff'] = (
        results_rating_wc2[field + '_x'] - results_rating_wc2[field + '_y']
    )
test_columns['home_win'] = results_rating_wc2['home_win']

test_team = pd.DataFrame(test_columns)
test_team.head()

Unnamed: 0,attack_diff,defence_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,full_age_diff,home_win
0,9.0,6.0,6.0,7.0,2.0,-0.18,-0.13,1
1,-9.0,-6.0,-6.0,-7.0,-2.0,0.18,0.13,0
2,-14.0,-5.0,0.0,-4.0,-2.0,1.55,0.3,0
3,14.0,5.0,0.0,4.0,2.0,-1.55,-0.3,1
4,-7.0,7.0,6.0,2.0,0.0,0.28,-0.3,0


In [120]:
# save to csv (the row index is not written)
test_csv_path = 'data/test_team.csv'
test_team.to_csv(test_csv_path, index=False)