In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
%matplotlib inline

In [40]:
results = pd.read_csv('data/fifa/international_results.csv')
results['date'] = pd.to_datetime(results['date'], utc=True)

# restrict dates
earliest_date = '2010-01-01'
latest_date = '2018-06-14'
results = results[(results['date'] > earliest_date) & (results['date'] < latest_date)]
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
31700,2010-01-02 00:00:00+00:00,Iran,Korea DPR,1,0,Friendly,Doha,Qatar,True
31701,2010-01-02 00:00:00+00:00,Qatar,Mali,0,0,Friendly,Doha,Qatar,False
31702,2010-01-02 00:00:00+00:00,Syria,Zimbabwe,6,0,Friendly,Kuala Lumpur,Malaysia,True
31703,2010-01-02 00:00:00+00:00,Yemen,Tajikistan,0,1,Friendly,Sana'a,Yemen,False
31704,2010-01-03 00:00:00+00:00,Angola,Gambia,1,1,Friendly,Vila Real de Santo António,Portugal,True


In [41]:
ratings = pd.read_csv('data/sofifa_final.csv')
ratings['date'] = pd.to_datetime(ratings['date'])

# restrict dates
# note that we use a different earliest data here to ensure 
# that our earliest match data has a matching ratings data
ratings = ratings[(ratings['date'] > '2009-08-29') & (ratings['date'] < latest_date)]
ratings.head()

Unnamed: 0,team,date,overall,attack,midfield,defence,prestige,start_age,full_age,bup_speed,...,cc_passing,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width,wage_euros_thousands,value_euros_millions,growth,goalkeeeper_overall
48,Brazil,2018-06-11,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0
49,Brazil,2018-06-07,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0
50,Brazil,2018-06-04,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0
51,Brazil,2018-05-31,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0
52,Brazil,2018-05-28,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0


We now have to find the closest match in date between our FIFA team ratings and the match data that we have.

In [42]:
dates = pd.Series(ratings.date.unique())
teams = ratings.team.unique()

In [43]:
# check that we found the corresponding team from team ratings in results
for t in teams:
    try:
        assert np.sum(results.home_team.unique() == t) == 1
    except:
        print('Cannot Find {}'.format(t))

for t in teams:
    try:
        assert np.sum(results.away_team.unique() == t) == 1
    except:
        print('Cannot Find {}'.format(t))

Cannot Find Republic of Ireland
Cannot Find United States
Cannot Find China PR
Cannot Find Côte d'Ivoire
Cannot Find Republic of Ireland
Cannot Find United States
Cannot Find China PR
Cannot Find Côte d'Ivoire


Before we proceed with anything else, let's check that country names are spelled similarly. We find that 'Republic of Ireland', 'United States', 'China PR', and 'Côte d'Ivoire' might be spelled differently.

In [44]:
ratings.replace('Republic of Ireland', 'Ireland', inplace=True)
ratings.replace('United States', 'USA', inplace=True)
ratings.replace('China PR', 'China', inplace=True)
ratings.replace("Côte d'Ivoire", 'Ivory Coast', inplace=True)

Now we finally have two datasets of teams spelled in the same way.

In [45]:
# find closest date that we have data on
def get_latest_date(match_date, dates):
    match_date = match_date.to_datetime64()
    return dates[match_date >  dates].max()

# results['closest_date'] = results.apply(lambda i: min(dates, key=lambda d: abs(d-i.date)), axis=1)
results['closest_date'] = results['date'].apply(get_latest_date, dates = dates)
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,closest_date
31700,2010-01-02 00:00:00+00:00,Iran,Korea DPR,1,0,Friendly,Doha,Qatar,True,2009-08-30
31701,2010-01-02 00:00:00+00:00,Qatar,Mali,0,0,Friendly,Doha,Qatar,False,2009-08-30
31702,2010-01-02 00:00:00+00:00,Syria,Zimbabwe,6,0,Friendly,Kuala Lumpur,Malaysia,True,2009-08-30
31703,2010-01-02 00:00:00+00:00,Yemen,Tajikistan,0,1,Friendly,Sana'a,Yemen,False,2009-08-30
31704,2010-01-03 00:00:00+00:00,Angola,Gambia,1,1,Friendly,Vila Real de Santo António,Portugal,True,2009-08-30


Now that we have the closest matching data for both of our datasets, we can merge on team and date. Note that we have merge twice in order to account for both teams in the results table.

In [46]:
# merge home team with their closest ratings
results_ratings1 = results.merge(ratings, how='inner', 
                                left_on=['closest_date', 'home_team'],
                                right_on=['date', 'team'])

results_ratings2 = results_ratings1.merge(ratings, how='inner',
                                       left_on=['closest_date', 'away_team'],
                                       right_on=['date', 'team'])

In [47]:
results_ratings2.shape

(1171, 54)

We find that we have 1171 observations that can still be used after matching team ratings with the results dataframe with an inner merge. We now clean up the merged dataframe slightly.

In [48]:
# response variable
score_diff = results_ratings2['home_score'] - results_ratings2['away_score']
results_ratings2['home_win'] = [0 if score < 0 else 1 if score > 0 else 2 for score in score_diff]

# drop useless columns
results_ratings2.drop(['home_team', 'away_team', 'tournament', 
                       'city', 'country', 'neutral', 'closest_date',
                      'team_x', 'date_y', 'team_y', 'date',
                      'home_score', 'away_score', 'date_x'], axis=1, inplace=True)

# reorder columns
results_ratings2.sort_index(axis=1, inplace=True)
results_ratings2.head()

Unnamed: 0,attack_x,attack_y,bup_dribbling_x,bup_dribbling_y,bup_passing_x,bup_passing_y,bup_speed_x,bup_speed_y,cc_crossing_x,cc_crossing_y,...,overall_x,overall_y,prestige_x,prestige_y,start_age_x,start_age_y,value_euros_millions_x,value_euros_millions_y,wage_euros_thousands_x,wage_euros_thousands_y
0,66.0,68.0,0.0,0.0,70.0,60.0,70.0,70.0,40.0,65.0,...,66.0,70.0,4.0,10.0,26.27,26.64,0.0,0.0,0.0,0.0
1,68.0,68.0,0.0,0.0,55.0,60.0,70.0,70.0,55.0,65.0,...,72.0,70.0,6.0,10.0,31.18,26.64,0.0,0.0,0.0,0.0
2,73.0,60.0,0.0,0.0,45.0,70.0,65.0,70.0,55.0,70.0,...,77.0,62.0,15.0,6.0,27.09,29.18,0.0,0.0,0.0,0.0
3,71.0,60.0,0.0,0.0,70.0,70.0,70.0,70.0,60.0,70.0,...,75.0,62.0,11.0,6.0,30.36,29.18,0.0,0.0,0.0,0.0
4,83.0,60.0,0.0,0.0,30.0,70.0,67.0,70.0,60.0,70.0,...,83.0,62.0,19.0,6.0,28.91,29.18,0.0,0.0,0.0,0.0


Our final training data is basically a difference in the various team ratings. Now let's clean up the dataframe for our training.

In [49]:
ratings_base = results_ratings2.loc[:,['home_win']]
ratings_base.head()

Unnamed: 0,home_win
0,1
1,0
2,1
3,1
4,2


In [50]:
diff_ratings = results_ratings2.drop('home_win', axis=1)
diff_ratings = diff_ratings.diff(axis=1)
ncol = diff_ratings.shape[1]

# we only want every alternate column
diff_ratings = diff_ratings.iloc[:,list(np.arange(1,ncol, 2))]

# we want our statistics to be from perspective of home team
diff_ratings = diff_ratings*-1
diff_ratings.head()

Unnamed: 0,attack_y,bup_dribbling_y,bup_passing_y,bup_speed_y,cc_crossing_y,cc_passing_y,cc_shooting_y,d_aggresion_y,d_pressure_y,d_width_y,defence_y,full_age_y,goalkeeeper_overall_y,growth_y,midfield_y,overall_y,prestige_y,start_age_y,value_euros_millions_y,wage_euros_thousands_y
0,-2.0,-0.0,10.0,-0.0,-25.0,-20.0,-0.0,10.0,5.0,-5.0,-2.0,1.4,-2.0,-0.046377,-3.0,-4.0,-6.0,-0.37,-0.0,-0.0
1,-0.0,-0.0,-5.0,-0.0,-10.0,-15.0,-10.0,10.0,-20.0,-0.0,4.0,2.86,7.0,1.22029,-1.0,2.0,-4.0,4.54,-0.0,-0.0
2,13.0,-0.0,-25.0,-5.0,-15.0,-5.0,-5.0,-10.0,-5.0,25.0,16.0,0.23,19.0,-1.666667,17.0,15.0,9.0,-2.09,-0.0,-0.0
3,11.0,-0.0,-0.0,-0.0,-10.0,-0.0,-0.0,-0.0,-20.0,15.0,10.0,1.53,19.0,-1.466667,14.0,13.0,5.0,1.18,-0.0,-0.0
4,23.0,-0.0,-40.0,-3.0,-10.0,-15.0,-30.0,-40.0,-40.0,15.0,22.0,1.2,28.0,-3.433333,24.0,21.0,13.0,-0.27,-0.0,-0.0


In [51]:
columns = ['attack_diff', 'bup_dribbling_diff', 'bup_passing_diff', 'bup_speed_diff',
          'cc_crossing_diff', 'cc_passing_diff', 'cc_shooting_diff', 'd_aggresion_diff',
          'd_pressure_diff', 'd_width_diff', 'defence_diff', 'full_age_diff', 
          'goalkeeper_overall_diff', 'growth_diff', 'midfield_diff', 'overall_diff',
          'prestige_diff', 'start_age_diff', 'avg_value_euros_millions_diff',
          'avg_wage_euros_thousands_diff']

diff_ratings.columns = columns

# compile datframe
train = pd.concat([ratings_base, diff_ratings], axis=1)
train.head()

Unnamed: 0,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,...,defence_diff,full_age_diff,goalkeeper_overall_diff,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,avg_value_euros_millions_diff,avg_wage_euros_thousands_diff
0,1,-2.0,-0.0,10.0,-0.0,-25.0,-20.0,-0.0,10.0,5.0,...,-2.0,1.4,-2.0,-0.046377,-3.0,-4.0,-6.0,-0.37,-0.0,-0.0
1,0,-0.0,-0.0,-5.0,-0.0,-10.0,-15.0,-10.0,10.0,-20.0,...,4.0,2.86,7.0,1.22029,-1.0,2.0,-4.0,4.54,-0.0,-0.0
2,1,13.0,-0.0,-25.0,-5.0,-15.0,-5.0,-5.0,-10.0,-5.0,...,16.0,0.23,19.0,-1.666667,17.0,15.0,9.0,-2.09,-0.0,-0.0
3,1,11.0,-0.0,-0.0,-0.0,-10.0,-0.0,-0.0,-0.0,-20.0,...,10.0,1.53,19.0,-1.466667,14.0,13.0,5.0,1.18,-0.0,-0.0
4,2,23.0,-0.0,-40.0,-3.0,-10.0,-15.0,-30.0,-40.0,-40.0,...,22.0,1.2,28.0,-3.433333,24.0,21.0,13.0,-0.27,-0.0,-0.0


In [52]:
print(train[train.avg_wage_euros_thousands_diff == 0].shape)
print(train[train.avg_value_euros_millions_diff == 0].shape)

(558, 21)
(558, 21)


Importantly, we note that 558 out of our 1171 observations have missing wage and value data.

In [53]:
# save to csv
train.to_csv('data/train_team.csv', index = False)

We now create our test set with actual world cup data.

In [59]:
ratings_wc = pd.read_csv('data/sofifa_final.csv')
ratings_wc['date'] = pd.to_datetime(ratings_wc['date'], utc=True)

# restrict dates
latest_date = '2018-06-14'
wc_pre = '2018-06-14' # pre WC rating for values and wages
wc_start = '2018-06-16' # first WC rating
wc_end = '2018-07-15'

# get values and wages data
wage = ratings_wc.loc[ratings_wc['date'] == wc_pre, 'wage_euros_thousands']
value = ratings_wc.loc[ratings_wc['date'] == wc_pre, 'value_euros_millions']

# restrict dates
ratings_wc = ratings_wc[(ratings_wc['date'] >= wc_start) & (ratings_wc['date'] <= wc_end)]

# replace values and wage data
ratings_wc['wage_euros_thousands'] = wage.reset_index(drop=True)
ratings_wc['value_euros_millions'] = value.reset_index(drop=True)

ratings_wc.head()

Unnamed: 0,team,date,overall,attack,midfield,defence,prestige,start_age,full_age,bup_speed,...,cc_passing,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width,wage_euros_thousands,value_euros_millions,growth,goalkeeeper_overall
25,Brazil,2018-07-15 00:00:00+00:00,86.0,87.0,86.0,84.0,10.0,27.55,27.43,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,7.130435,2.252826,1.625,84.0
26,Brazil,2018-07-12 00:00:00+00:00,86.0,87.0,86.0,84.0,10.0,27.55,27.43,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,36.695652,5.15,1.625,84.0
27,Brazil,2018-07-11 00:00:00+00:00,86.0,87.0,86.0,84.0,10.0,27.55,27.43,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,6.391304,1.922826,1.625,84.0
28,Brazil,2018-07-08 00:00:00+00:00,86.0,87.0,86.0,84.0,10.0,27.55,27.43,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,38.652174,9.293478,1.625,84.0
29,Brazil,2018-07-07 00:00:00+00:00,86.0,87.0,86.0,84.0,10.0,27.55,27.43,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,50.478261,7.380652,1.625,84.0


In [61]:
dates_wc = ratings_wc.date.unique()
teams_wc = ratings_wc.team.unique()

In [63]:
#https://gitlab.com/djh_or/2018-world-cup-stats/blob/master/world_cup_2018_stats.csv
results_wc = pd.read_csv("data/world_cup_2018_stats.csv")
results_wc.head()

Unnamed: 0,Game,Group,Team,Opponent,Home/Away,Score,WDL,Pens?,Goals For,Goals Against,...,Passes Completed,Distance Covered km,Balls recovered,Tackles,Blocks,Clearances,Yellow cards,Red Cards,Second Yellow Card leading to Red Card,Fouls Committed
0,1,A,Russia,Saudi Arabia,home,5-0,W,,5,0,...,240,118,53,9,3,19,1,0,0,22
1,1,A,Saudi Arabia,Russia,away,5-0,L,,0,5,...,442,105,48,16,3,31,1,0,0,10
2,2,A,Egypt,Uruguay,home,0-1,L,,0,1,...,308,112,57,12,4,32,2,0,0,12
3,2,A,Uruguay,Egypt,away,0-1,W,,1,0,...,508,111,54,8,2,22,0,0,0,6
4,3,B,Morocco,IR Iran,home,0-1,L,,0,1,...,371,101,38,9,1,16,1,0,0,22


Before we proceed, let's ensure that our country names match up similarly.

In [64]:
# check that we found the corresponding team from team ratings in results
# but now we reverse it such that we try to find a rating for each wc team
for t in results_wc.Team.unique():
    try:
        assert np.sum(teams_wc == t) == 1
    except:
        print('Cannot Find {}'.format(t))


Cannot Find IR Iran


In [65]:
ratings_wc.replace('Iran', 'IR Iran', inplace=True)

In [66]:
results_wc = results_wc.loc[:,['Team', 'Opponent', 'WDL']]
results_wc['home_win'] = [0 if score == 'L' else 1 if score == 'W' else 2 for score in results_wc.WDL]
results_wc.drop('WDL', axis=1, inplace=True)

# we only want every alternate column
results_wc = results_wc.iloc[list(np.arange(0, results_wc.shape[0], 2)),:]

results_wc.head()

Unnamed: 0,Team,Opponent,home_win
0,Russia,Saudi Arabia,1
2,Egypt,Uruguay,0
4,Morocco,IR Iran,0
6,Portugal,Spain,2
8,France,Australia,1


Now we can match the ratings data to our test dataset as well. We note that there might be some value in using the updated FIFA ratings at each time of the match. However, for simplicity, let's just assume that teams have the same rating throughout the tournament. We will take the rating at the start of the tournament.

In [67]:
ratings_wc_start = ratings_wc[ratings_wc['date'] == wc_start]

# merge ratings with results table
results_ratings_wc1 = results_wc.merge(ratings_wc_start, how='left',
                                     left_on='Team', right_on='team')
results_ratings_wc2 = results_ratings_wc1.merge(ratings_wc_start, how='left',
                                             left_on='Opponent', right_on='team')



In [68]:
# check that every team has a rating
assert results_ratings_wc2.shape[0] == results_wc.shape[0]

In [69]:
# drop useless columns
results_ratings_wc2.drop(['Team', 'Opponent', 'team_x', 'team_y', 'date_y', 'date_x'], 
                         axis=1, inplace=True)

# reorder columns
results_ratings_wc2.sort_index(axis=1, inplace=True)
results_ratings_wc2.head()

Unnamed: 0,attack_x,attack_y,bup_dribbling_x,bup_dribbling_y,bup_passing_x,bup_passing_y,bup_speed_x,bup_speed_y,cc_crossing_x,cc_crossing_y,...,overall_x,overall_y,prestige_x,prestige_y,start_age_x,start_age_y,value_euros_millions_x,value_euros_millions_y,wage_euros_thousands_x,wage_euros_thousands_y
0,80.0,71.0,77.0,68.0,49.0,66.0,50.0,69.0,37.0,48.0,...,79.0,72.0,6.0,4.0,27.82,28.0,,,,
1,72.0,86.0,34.0,42.0,49.0,36.0,52.0,38.0,64.0,43.0,...,76.0,80.0,5.0,7.0,27.64,26.09,,,,
2,72.0,79.0,52.0,67.0,38.0,69.0,38.0,69.0,58.0,37.0,...,76.0,74.0,3.0,3.0,26.55,26.27,,,,
3,85.0,85.0,68.0,41.0,50.0,28.0,51.0,32.0,58.0,32.0,...,84.0,85.0,8.0,9.0,26.82,28.36,,,,
4,85.0,69.0,52.0,46.0,30.0,49.0,35.0,49.0,53.0,46.0,...,85.0,72.0,9.0,5.0,23.18,26.45,,,,


In [70]:
ratings_base_wc = results_ratings_wc2.loc[:,['home_win']]
ratings_base_wc.head()

Unnamed: 0,home_win
0,1
1,0
2,0
3,2
4,1


In [71]:
diff_ratings_wc = results_ratings_wc2.drop('home_win', axis=1)
diff_ratings_wc = diff_ratings_wc.diff(axis=1)
ncol_wc = diff_ratings_wc.shape[1]

# we only want every alternate column
diff_ratings_wc = diff_ratings_wc.iloc[:,list(np.arange(1,ncol_wc, 2))]

# we want our statistics to be from perspective of home teams
diff_ratings_wc = diff_ratings_wc*-1
diff_ratings_wc.head()

Unnamed: 0,attack_y,bup_dribbling_y,bup_passing_y,bup_speed_y,cc_crossing_y,cc_passing_y,cc_shooting_y,d_aggresion_y,d_pressure_y,d_width_y,defence_y,full_age_y,goalkeeeper_overall_y,growth_y,midfield_y,overall_y,prestige_y,start_age_y,value_euros_millions_y,wage_euros_thousands_y
0,9.0,9.0,-17.0,-19.0,-11.0,-9.0,-27.0,1.0,-0.0,-17.0,6.0,-0.13,11.0,-0.0,6.0,7.0,2.0,-0.18,,
1,-14.0,-8.0,13.0,14.0,21.0,25.0,-12.0,-9.0,-0.0,-30.0,-5.0,0.3,-15.0,-0.425,-0.0,-4.0,-2.0,1.55,,
2,-7.0,-15.0,-31.0,-31.0,21.0,-9.0,-9.0,9.0,17.0,46.0,7.0,-0.3,-3.0,-0.175,6.0,2.0,-0.0,0.28,,
3,-0.0,27.0,22.0,19.0,26.0,27.0,42.0,-4.0,-9.0,-14.0,-3.0,-0.5,-6.0,1.0,-3.0,-1.0,-1.0,-1.54,,
4,16.0,6.0,-19.0,-14.0,7.0,-28.0,-31.0,-5.0,4.0,1.0,11.0,-1.68,11.0,1.05,12.0,13.0,4.0,-3.27,,


In [72]:
diff_ratings_wc.columns = columns

# compile datframe
test = pd.concat([ratings_base_wc, diff_ratings_wc], axis=1)
test.head()

Unnamed: 0,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,...,defence_diff,full_age_diff,goalkeeper_overall_diff,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,avg_value_euros_millions_diff,avg_wage_euros_thousands_diff
0,1,9.0,9.0,-17.0,-19.0,-11.0,-9.0,-27.0,1.0,-0.0,...,6.0,-0.13,11.0,-0.0,6.0,7.0,2.0,-0.18,,
1,0,-14.0,-8.0,13.0,14.0,21.0,25.0,-12.0,-9.0,-0.0,...,-5.0,0.3,-15.0,-0.425,-0.0,-4.0,-2.0,1.55,,
2,0,-7.0,-15.0,-31.0,-31.0,21.0,-9.0,-9.0,9.0,17.0,...,7.0,-0.3,-3.0,-0.175,6.0,2.0,-0.0,0.28,,
3,2,-0.0,27.0,22.0,19.0,26.0,27.0,42.0,-4.0,-9.0,...,-3.0,-0.5,-6.0,1.0,-3.0,-1.0,-1.0,-1.54,,
4,1,16.0,6.0,-19.0,-14.0,7.0,-28.0,-31.0,-5.0,4.0,...,11.0,-1.68,11.0,1.05,12.0,13.0,4.0,-3.27,,


In [73]:
# save to csv
test.to_csv('data/test_team.csv', index = False)