In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
%matplotlib inline

In [7]:
results = pd.read_csv('../data/fifa/international_results.csv')
results['date'] = pd.to_datetime(results['date'], utc=True)

#change column name to avoid confusion later when merging.
results['game_date'] = results['date']
results = results.drop(['date'], axis = 1)


# restrict dates
earliest_date = '2006-01-01'
latest_date = '2018-06-14'
results = results[(results['game_date'] > earliest_date) & (results['game_date'] < latest_date)]
results.head()

Unnamed: 0,home_team,away_team,home_score,away_score,tournament,city,country,neutral,game_date
27908,Qatar,Libya,2,0,Friendly,Doha,Qatar,False,2006-01-02 00:00:00+00:00
27909,Egypt,Zimbabwe,2,0,Friendly,Alexandria,Egypt,False,2006-01-05 00:00:00+00:00
27910,Guinea,Togo,1,0,Friendly,Viry-Châtillon,France,True,2006-01-07 00:00:00+00:00
27911,Morocco,Congo DR,3,0,Friendly,Rabat,Morocco,False,2006-01-09 00:00:00+00:00
27912,Ghana,Togo,0,1,Friendly,Monastir,Tunisia,True,2006-01-11 00:00:00+00:00


In [8]:
# response variable
score_diff = results['home_score'] - results['away_score']
results['home_win'] = [-1 if score < 0 else 1 if score > 0 else 0 for score in score_diff]

In [9]:
ratings = pd.read_csv('../data/sofifa_final.csv')
ratings['rate_date'] = pd.to_datetime(ratings['date'])
ratings = ratings.drop(['date'], axis = 1)

# restrict dates
ratings = ratings[(ratings['rate_date'] > earliest_date) & (ratings['rate_date'] < latest_date)]
ratings.head()

Unnamed: 0,team,overall,attack,midfield,defence,prestige,start_age,full_age,bup_speed,bup_dribbling,...,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width,wage_euros_thousands,value_euros_millions,growth,goalkeeper_overall,rate_date
48,Brazil,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,73.0,...,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0,2018-06-11
49,Brazil,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,73.0,...,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0,2018-06-07
50,Brazil,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,73.0,...,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0,2018-06-04
51,Brazil,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,73.0,...,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0,2018-05-31
52,Brazil,85.0,86.0,83.0,85.0,10.0,26.73,27.09,51.0,73.0,...,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0,2018-05-28


In [11]:
fifa_rankings = pd.read_csv('../data/fifa_ranking.csv')
fifa_rankings['rank_date'] = pd.to_datetime(fifa_rankings['rank_date'])
fifa_rankings = fifa_rankings[['rank', 'country_full', 'rank_date', 'confederation']]
fifa_rankings.head()

Unnamed: 0,rank,country_full,rank_date,confederation
0,1,Germany,1993-08-08,UEFA
1,2,Italy,1993-08-08,UEFA
2,3,Switzerland,1993-08-08,UEFA
3,4,Sweden,1993-08-08,UEFA
4,5,Argentina,1993-08-08,CONMEBOL


We now have to find for each match the latest team ratings and ranking.

In [12]:
dates = pd.Series(ratings.rate_date.unique())
teams = ratings.team.unique()

In [13]:
# check that we found the corresponding team from team ratings in results
for t in teams:
    try:
        assert np.sum(results.home_team.unique() == t) == 1
    except:
        print('Cannot Find {} in Results Home'.format(t))

for t in teams:
    try:
        assert np.sum(results.away_team.unique() == t) == 1
    except:
        print('Cannot Find {} in Results Away'.format(t))
        
for t in teams:
    try:
        assert np.sum(fifa_rankings.country_full.unique() == t) == 1
    except:
        print('Cannot Find {} in Rankings'.format(t))

Cannot Find Republic of Ireland in Results Home
Cannot Find United States in Results Home
Cannot Find China PR in Results Home
Cannot Find Côte d'Ivoire in Results Home
Cannot Find Republic of Ireland in Results Away
Cannot Find United States in Results Away
Cannot Find China PR in Results Away
Cannot Find Côte d'Ivoire in Results Away
Cannot Find United States in Rankings
Cannot Find Iran in Rankings


Before we proceed with anything else, let's check that country names are spelled similarly. We find that 'Republic of Ireland', 'United States', 'China PR', 'Iran', and 'Côte d'Ivoire' might be spelled differently. We manually substitute these in.

In [14]:
ratings.replace('Republic of Ireland', 'Ireland', inplace=True)
ratings.replace('United States', 'USA', inplace=True)
#We have no political affiliations with this replacement. 
ratings.replace('China PR', 'China', inplace=True)
ratings.replace("Côte d'Ivoire", 'Ivory Coast', inplace=True)
fifa_rankings.replace('Republic of Ireland', 'Ireland', inplace=True)
fifa_rankings.replace('China PR', 'China', inplace = True)
fifa_rankings.replace("IR Iran", "Iran", inplace = True)
fifa_rankings.replace("Côte d'Ivoire", 'Ivory Coast', inplace=True)


Now we finally have two datasets of teams spelled in the same way.

In [15]:
#Find closest date of ratings and rankings for teams in each match
home_ratings_date = []
away_ratings_date = []
for home, away, date in zip(results['home_team'], results['away_team'],
                                        results['game_date']):
    home_ratings_date.append(ratings.loc[(ratings['rate_date'] < date.to_datetime64()) 
                                         & (ratings['team'] == home), 'rate_date'].max())
    away_ratings_date.append(ratings.loc[(ratings['rate_date'] < date.to_datetime64()) 
                                         & (ratings['team'] == away), 'rate_date'].max())


In [None]:
#Find closest date of ratings and rankings for teams in each match
home_rankings_date = []
away_rankings_date = []
for home, away, date in zip(results['home_team'], results['away_team'],
                                        results['game_date']):
    home_rankings_date.append(fifa_rankings.loc[(fifa_rankings['rank_date'] < date.to_datetime64()) 
                                         & (fifa_rankings['country_full'] == home), 'rank_date'].max())
    away_rankings_date.append(fifa_rankings.loc[(fifa_rankings['rank_date'] < date.to_datetime64()) 
                                         & (fifa_rankings['country_full'] == away), 'rank_date'].max())


In [None]:
results['closest_home_rating_date'] = home_ratings_date
results['closest_away_rating_date'] = away_ratings_date

results['closest_home_ranking_date'] = home_rankings_date
results['closest_away_ranking_date'] = away_rankings_date

In [None]:
results.tail()

Now that we have the closest matching data for both of our datasets, we can merge on team and date. We will first do this for the ratings dataset. 

Note that for both the ratings and rankings tables we have merge twice in order to account for both home and away teams.

In [None]:
#Reset index to keep track of original index
results_copy = results.copy()
results = results.reset_index()

results_ratings1 = results.merge(ratings, how='inner', 
                                left_on=['closest_home_rating_date', 'home_team'],
                                right_on=['rate_date', 'team'])

results_ratings2 = results_ratings1.merge(ratings, how='inner',
                                       left_on=['closest_away_rating_date', 'away_team'],
                                       right_on=['rate_date', 'team'])

results_ratings2.head()

In [None]:
results_ratings2.shape

In [None]:
results_ratings2.home_team.unique()

We find that we have 1897 observations that can still be used after matching team ratings with the results dataframe with an inner merge. We now clean up the merged dataframe slightly.

In [482]:
# drop useless columns
results_ratings3 = results_ratings2.drop(['index', 'home_team', 'away_team', 'tournament', 
                       'city', 'country', 'neutral', 'closest_home_rating_date', 'closest_away_rating_date',
                       'closest_home_ranking_date', 'closest_away_ranking_date',
                      'team_x', 'rate_date_y', 'team_y', 'game_date',
                      'home_score', 'away_score', 'rate_date_x'], axis=1)

# reorder columns
results_ratings3.sort_index(axis=1, inplace=True)
results_ratings3.head()

Unnamed: 0,attack_x,attack_y,bup_dribbling_x,bup_dribbling_y,bup_passing_x,bup_passing_y,bup_speed_x,bup_speed_y,cc_crossing_x,cc_crossing_y,...,overall_x,overall_y,prestige_x,prestige_y,start_age_x,start_age_y,value_euros_millions_x,value_euros_millions_y,wage_euros_thousands_x,wage_euros_thousands_y
0,77.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,78.0,81.0,13.0,15.0,27.0,27.27,0.0,0.0,0.0,0.0
1,74.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,74.0,81.0,11.0,15.0,28.36,27.27,0.0,0.0,0.0,0.0
2,72.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,74.0,81.0,6.0,15.0,28.18,27.27,0.0,0.0,0.0,0.0
3,77.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,78.0,69.0,13.0,5.0,27.0,27.27,0.0,0.0,0.0,0.0
4,70.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,73.0,76.0,10.0,11.0,25.82,26.55,0.0,0.0,0.0,0.0


Our final training data is basically a difference in the various team ratings. Now let's clean up the dataframe for our training.

In [483]:
ratings_base = results_ratings3.loc[:,['home_win']]
ratings_base.head()

Unnamed: 0,home_win
0,1
1,1
2,0
3,0
4,1


In [484]:
diff_ratings = results_ratings3.drop('home_win', axis=1)
#Diff every column with column beforehand. 
diff_ratings = diff_ratings.diff(axis=1)
ncol = diff_ratings.shape[1]

# Since we diff every column with column beforehand, we only want every alternate column since those are the true diff
diff_ratings = diff_ratings.iloc[:,list(np.arange(1,ncol, 2))]

# we want our statistics to be from perspective of home team
diff_ratings = diff_ratings*-1


diff_ratings.head()

Unnamed: 0,attack_y,bup_dribbling_y,bup_passing_y,bup_speed_y,cc_crossing_y,cc_passing_y,cc_shooting_y,d_aggresion_y,d_pressure_y,d_width_y,defence_y,full_age_y,goalkeeeper_overall_y,growth_y,midfield_y,overall_y,prestige_y,start_age_y,value_euros_millions_y,wage_euros_thousands_y
0,-1.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-6.0,0.57,1.0,1.0,-3.0,-3.0,-2.0,-0.27,-0.0,-0.0
1,-4.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-13.0,-0.3,5.0,-0.26087,-10.0,-7.0,-4.0,1.09,-0.0,-0.0
2,-6.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-12.0,0.43,4.0,-1.26087,-13.0,-7.0,-9.0,0.91,-0.0,-0.0
3,13.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,10.0,0.87,5.0,0.695652,10.0,9.0,8.0,-0.27,-0.0,-0.0
4,-6.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-5.0,-0.22,9.0,-1.217391,-7.0,-3.0,-1.0,-0.73,-0.0,-0.0


In [485]:
columns = [column[:-2] + '_diff' for column in diff_ratings.columns]

diff_ratings.columns = columns

#Also want difference in attack of one team and defense of the other
diff_ratings['attack_home_defence_away_diff'] = results_ratings3['attack_x'] - results_ratings3['defence_y']
diff_ratings['attack_away_defence_home_diff'] = results_ratings3['attack_y'] - results_ratings3['defence_x']

# compile datframe
train = pd.concat([results_ratings2[['index', 'game_date', 'home_team', 'away_team', 'country', 'neutral']],
                                    ratings_base, diff_ratings], axis=1)
train.head()


Unnamed: 0,index,game_date,home_team,away_team,country,neutral,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,...,goalkeeeper_overall_diff,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff
0,28374,2006-09-01 00:00:00+00:00,Denmark,Portugal,Denmark,False,1,-1.0,-0.0,-0.0,...,1.0,1.0,-3.0,-3.0,-2.0,-0.27,-0.0,-0.0,-4.0,3.0
1,28584,2006-10-11 00:00:00+00:00,Poland,Portugal,Poland,False,1,-4.0,-0.0,-0.0,...,5.0,-0.26087,-10.0,-7.0,-4.0,1.09,-0.0,-0.0,-7.0,10.0
2,28437,2006-09-06 00:00:00+00:00,Finland,Portugal,Finland,False,0,-6.0,-0.0,-0.0,...,4.0,-1.26087,-13.0,-7.0,-9.0,0.91,-0.0,-0.0,-9.0,9.0
3,28517,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,Denmark,False,0,13.0,-0.0,-0.0,...,5.0,0.695652,10.0,9.0,8.0,-0.27,-0.0,-0.0,12.0,-11.0
4,28560,2006-10-11 00:00:00+00:00,Austria,Switzerland,Austria,False,1,-6.0,-0.0,-0.0,...,9.0,-1.217391,-7.0,-3.0,-1.0,-0.73,-0.0,-0.0,-7.0,4.0


Now we do the same for rankings.

In [489]:
#Now merge rankings
results_rankings1 = results.merge(fifa_rankings, how='inner', 
                                left_on=['closest_home_ranking_date', 'home_team'],
                                right_on=['rank_date', 'country_full'])

results_rankings2 = results_rankings1.merge(fifa_rankings, how='inner', 
                                left_on=['closest_away_ranking_date', 'away_team'],
                                right_on=['rank_date', 'country_full'])


In [490]:
# drop useless columns
results_rankings3 = results_rankings2.drop(['home_team', 'away_team', 'tournament', 
                       'city', 'country', 'neutral', 'closest_home_rating_date', 'closest_away_rating_date',
                      'closest_home_ranking_date', 'closest_away_ranking_date',
                      'country_full_x', 'rank_date_y', 'country_full_y', 'game_date',
                      'home_score', 'away_score', 'rank_date_x', 'confederation_x', 
                       'confederation_y', 'home_win', 'index'], axis=1)

# reorder columns
results_rankings3.sort_index(axis=1, inplace=True)
results_rankings3[['rank_x', 'rank_y']].head()

Unnamed: 0,rank_x,rank_y
0,95,80
1,28,80
2,32,53
3,36,53
4,32,49


In [491]:
#Diff every column with column beforehand. 
diff_rankings = results_rankings3.diff(axis=1)
ncol = diff_rankings.shape[1]

# Since we diff every column with column beforehand, we only want every alternate column since those are the true diff
diff_rankings = diff_rankings.iloc[:,list(np.arange(1,ncol, 2))]

# we want our statistics to be from perspective of home team
diff_rankings = diff_rankings*-1
columns = [column[:-2] + "_diff" for column in diff_rankings.columns]

diff_rankings.columns = columns
diff_rankings['index'] = results_rankings2['index']

diff_rankings.head()

Unnamed: 0,rank_diff,index
0,15.0,27908
1,-52.0,27913
2,-21.0,27909
3,-17.0,27915
4,-17.0,27914


In [492]:
train = train.merge(diff_rankings, left_on=['index'], right_on=['index'], how = 'inner').drop(['index'], axis = 1)

In [493]:
train.shape

(1897, 29)

In [494]:
train.head()

Unnamed: 0,game_date,home_team,away_team,country,neutral,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,...,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff,rank_diff
0,2006-09-01 00:00:00+00:00,Denmark,Portugal,Denmark,False,1,-1.0,-0.0,-0.0,-0.0,...,1.0,-3.0,-3.0,-2.0,-0.27,-0.0,-0.0,-4.0,3.0,9.0
1,2006-10-11 00:00:00+00:00,Poland,Portugal,Poland,False,1,-4.0,-0.0,-0.0,-0.0,...,-0.26087,-10.0,-7.0,-4.0,1.09,-0.0,-0.0,-7.0,10.0,26.0
2,2006-09-06 00:00:00+00:00,Finland,Portugal,Finland,False,0,-6.0,-0.0,-0.0,-0.0,...,-1.26087,-13.0,-7.0,-9.0,0.91,-0.0,-0.0,-9.0,9.0,59.0
3,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,Denmark,False,0,13.0,-0.0,-0.0,-0.0,...,0.695652,10.0,9.0,8.0,-0.27,-0.0,-0.0,12.0,-11.0,-42.0
4,2006-10-11 00:00:00+00:00,Austria,Switzerland,Austria,False,1,-6.0,-0.0,-0.0,-0.0,...,-1.217391,-7.0,-3.0,-1.0,-0.73,-0.0,-0.0,-7.0,4.0,56.0


In [495]:
# save to csv
train.to_csv('../data/train_team.csv', index = False)

We now create our test set with actual world cup data.

In [507]:
ratings_wc = pd.read_csv('../data/team_stats_final.csv')
ratings_wc['date'] = pd.to_datetime(ratings_wc['date'], utc=True)

# restrict dates
latest_date = '2018-06-14'
wc_start = '2018-06-16' # first WC rating
wc_end = '2018-07-15'
# restrict dates
ratings_wc = ratings_wc[(ratings_wc['date'] >= wc_start) & (ratings_wc['date'] <= wc_end)]

ratings_wc.head()

Unnamed: 0,team,date,attack,defence,full_age,midfield,overall,prestige,start_age,bup_speed,...,cc_passing,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width,wage_euros_thousands,value_euros_millions,growth,goalkeeeper_overall
0,Brazil,2018-07-15 00:00:00+00:00,87.0,84.0,27.43,86.0,86.0,10.0,27.55,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,0.0,0.0,1.625,84.0
1,England,2018-07-15 00:00:00+00:00,83.0,80.0,25.88,81.0,81.0,8.0,24.64,36.0,...,31.0,41.0,41.0,43.0,55.0,49.0,0.0,0.0,2.575,80.0
2,Italy,2018-07-15 00:00:00+00:00,81.0,82.0,25.96,81.0,82.0,9.0,27.36,73.0,...,64.0,80.0,78.0,29.0,26.0,32.0,0.0,0.0,3.26087,88.0
3,Spain,2018-07-15 00:00:00+00:00,84.0,85.0,27.18,86.0,85.0,9.0,27.27,32.0,...,27.0,32.0,20.0,75.0,62.0,52.0,0.0,0.0,1.775,91.0
4,France,2018-07-15 00:00:00+00:00,86.0,81.0,25.05,84.0,85.0,9.0,24.82,35.0,...,24.0,53.0,35.0,47.0,47.0,67.0,0.0,0.0,3.45,87.0


In [508]:
ratings_teams_wc = ratings_wc.team.unique()

In [509]:
#https://gitlab.com/djh_or/2018-world-cup-stats/blob/master/world_cup_2018_stats.csv
results_wc = pd.read_csv("../data/world_cup_2018_stats.csv")
results_wc.head()

Unnamed: 0,Game,Group,Team,Opponent,Home/Away,Score,WDL,Pens?,Goals For,Goals Against,...,Passes Completed,Distance Covered km,Balls recovered,Tackles,Blocks,Clearances,Yellow cards,Red Cards,Second Yellow Card leading to Red Card,Fouls Committed
0,1,A,Russia,Saudi Arabia,home,5-0,W,,5,0,...,240,118,53,9,3,19,1,0,0,22
1,1,A,Saudi Arabia,Russia,away,5-0,L,,0,5,...,442,105,48,16,3,31,1,0,0,10
2,2,A,Egypt,Uruguay,home,0-1,L,,0,1,...,308,112,57,12,4,32,2,0,0,12
3,2,A,Uruguay,Egypt,away,0-1,W,,1,0,...,508,111,54,8,2,22,0,0,0,6
4,3,B,Morocco,IR Iran,home,0-1,L,,0,1,...,371,101,38,9,1,16,1,0,0,22


Before we proceed, let's ensure that our country names match up similarly.

In [510]:
# check that we found the corresponding team from team ratings in results
# but now we reverse it such that we try to find a rating and ranking for each wc team
for t in results_wc.Team.unique():
    try:
        assert np.sum(ratings_teams_wc == t) == 1
    except:
        print('Cannot Find in Ratings {}'.format(t))
        
for t in results_wc.Team.unique():
    try:
        assert np.sum(fifa_rankings.country_full.unique() == t) == 1
    except:
        print('Cannot Find in Rankings {}'.format(t))

Cannot Find in Ratings IR Iran
Cannot Find in Rankings IR Iran


In [511]:
results_wc.replace('IR Iran', 'Iran', inplace=True)

In [512]:
tournament_round = results_wc.Group
results_wc = results_wc.loc[:,['Team', 'Opponent', 'WDL']]
results_wc['home_win'] = [-1 if score == 'L' else 1 if score == 'W' else 0 for score in results_wc.WDL]
results_wc.drop('WDL', axis=1, inplace=True)
results_wc.tail()

Unnamed: 0,Team,Opponent,home_win
123,England,Croatia,-1
124,Belgium,England,1
125,England,Belgium,-1
126,France,Croatia,1
127,Croatia,France,-1


In [513]:
#Add these columns for later.
#Will figure out dates of each match later.
results_wc['game_date'] = None
results_wc['home_team'] = results_wc['Team']
results_wc['away_team'] = results_wc['Opponent']
#Make rows where away_team is Russia to home_team since World Cup is in Russia
switch_teams = results_wc[results_wc['Opponent'] == 'Russia'].copy()
results_wc.loc[results_wc['Opponent'] == 'Russia', 'home_team'] = 'Russia'
results_wc.loc[results_wc['Opponent'] == 'Russia', 'away_team'] = switch_teams['Team']
results_wc['country'] = 'Russia'
results_wc['neutral'] = [True if home != 'Russia' else False for home in results_wc.home_team]
results_wc = results_wc.drop(['Team', 'Opponent'], axis = 1)
results_wc.head()

Unnamed: 0,home_win,game_date,home_team,away_team,country,neutral
0,1,,Russia,Saudi Arabia,Russia,False
1,-1,,Russia,Saudi Arabia,Russia,False
2,-1,,Egypt,Uruguay,Russia,True
3,1,,Uruguay,Egypt,Russia,True
4,-1,,Morocco,Iran,Russia,True


Now we can match the ratings and rankings data to our test dataset as well. We note that there might be some value in using the updated FIFA ratings and rankings at each time of the match. However, for simplicity, let's just assume that teams have the same rating and rankings throughout the tournament. We will take the rating and rankings at the start of the tournament.

In [514]:
rankings_wc = fifa_rankings[fifa_rankings['rank_date'] == fifa_rankings['rank_date'].max()]
results_rankings_wc1 = results_wc.merge(rankings_wc, how = 'left', left_on = 'home_team', right_on = 'country_full')
results_rankings_wc2 = results_rankings_wc1.merge(rankings_wc, how = 'left', left_on = 'away_team',
                                                  right_on = 'country_full')

In [515]:
# drop useless columns
results_rankings_wc3 = results_rankings_wc2.drop(['home_team', 'away_team','country_full_x', 'rank_date_y', 'country_full_y', 
                      'rank_date_x', 'confederation_x', 'confederation_y',
                        'country', 'game_date', 'neutral'], axis=1)

# reorder columns
results_rankings_wc3.sort_index(axis=1, inplace=True)
results_rankings_wc3.head()

Unnamed: 0,home_win,rank_x,rank_y
0,1,70,67
1,-1,70,67
2,-1,45,14
3,1,14,45
4,-1,41,37


In [516]:
diff_rankings_wc = results_rankings_wc3.drop('home_win', axis=1)
diff_rankings_wc = diff_rankings_wc.diff(axis=1)
ncol_wc = diff_rankings_wc.shape[1]

# we only want every alternate column
diff_rankings_wc = diff_rankings_wc.iloc[:,list(np.arange(1,ncol_wc, 2))]

# we want our statistics to be from perspective of home teams
diff_rankings_wc = diff_rankings_wc*-1

diff_rankings_wc.columns = [column[:-2] + '_diff' for column in diff_rankings_wc.columns]
diff_rankings_wc.head()

Unnamed: 0,rank_diff
0,3.0
1,3.0
2,31.0
3,-31.0
4,4.0


In [517]:
ratings_wc_start = ratings_wc[ratings_wc['date'] == wc_start]

# merge ratings with results table
results_ratings_wc1 = results_wc.merge(ratings_wc_start, how='left',
                                     left_on='home_team', right_on='team')
results_ratings_wc2 = results_ratings_wc1.merge(ratings_wc_start, how='left',
                                             left_on='away_team', right_on='team')


In [518]:
# drop useless columns
results_ratings_wc3 = results_ratings_wc2.drop(['home_team', 'away_team', 'team_x', 'team_y', 'date_y', 'date_x',
                                               'neutral', 'country', 'game_date'], 
                         axis=1)

# reorder columns
results_ratings_wc3.sort_index(axis=1, inplace=True)
results_ratings_wc3.head()

Unnamed: 0,attack_x,attack_y,bup_dribbling_x,bup_dribbling_y,bup_passing_x,bup_passing_y,bup_speed_x,bup_speed_y,cc_crossing_x,cc_crossing_y,...,overall_x,overall_y,prestige_x,prestige_y,start_age_x,start_age_y,value_euros_millions_x,value_euros_millions_y,wage_euros_thousands_x,wage_euros_thousands_y
0,80.0,71.0,77.0,68.0,49.0,66.0,50.0,69.0,37.0,48.0,...,79.0,72.0,6.0,4.0,27.82,28.0,0.0,0.0,0.0,0.0
1,80.0,71.0,77.0,68.0,49.0,66.0,50.0,69.0,37.0,48.0,...,79.0,72.0,6.0,4.0,27.82,28.0,0.0,0.0,0.0,0.0
2,72.0,86.0,34.0,42.0,49.0,36.0,52.0,38.0,64.0,43.0,...,76.0,80.0,5.0,7.0,27.64,26.09,0.0,0.0,0.0,0.0
3,86.0,72.0,42.0,34.0,36.0,49.0,38.0,52.0,43.0,64.0,...,80.0,76.0,7.0,5.0,26.09,27.64,0.0,0.0,0.0,0.0
4,72.0,79.0,52.0,67.0,38.0,69.0,38.0,69.0,58.0,37.0,...,76.0,74.0,3.0,3.0,26.55,26.27,0.0,0.0,0.0,0.0


In [519]:
ratings_base_wc = results_ratings_wc3.loc[:,['home_win']]
ratings_base_wc.head()

Unnamed: 0,home_win
0,1
1,-1
2,-1
3,1
4,-1


In [520]:
diff_ratings_wc = results_ratings_wc3.drop('home_win', axis=1)
diff_ratings_wc = diff_ratings_wc.diff(axis=1)
ncol_wc = diff_ratings_wc.shape[1]

# we only want every alternate column
diff_ratings_wc = diff_ratings_wc.iloc[:,list(np.arange(1,ncol_wc, 2))]

# we want our statistics to be from perspective of home teams
diff_ratings_wc = diff_ratings_wc*-1
diff_ratings_wc.columns = [column[:-2] + '_diff' for column in diff_ratings_wc.columns]

#Also want difference in attack of one team and defense of the other
diff_ratings_wc['attack_home_defence_away_diff'] = results_ratings_wc3['attack_x'] - results_ratings_wc3['defence_y']
diff_ratings_wc['attack_away_defence_home_diff'] = results_ratings_wc3['attack_y'] - results_ratings_wc3['defence_x']

diff_ratings_wc.head()

Unnamed: 0,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,d_width_diff,...,goalkeeeper_overall_diff,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff
0,9.0,9.0,-17.0,-19.0,-11.0,-9.0,-27.0,1.0,-0.0,-17.0,...,11.0,-0.0,6.0,7.0,2.0,-0.18,-0.0,-0.0,9.0,-6.0
1,9.0,9.0,-17.0,-19.0,-11.0,-9.0,-27.0,1.0,-0.0,-17.0,...,11.0,-0.0,6.0,7.0,2.0,-0.18,-0.0,-0.0,9.0,-6.0
2,-14.0,-8.0,13.0,14.0,21.0,25.0,-12.0,-9.0,-0.0,-30.0,...,-15.0,-0.425,-0.0,-4.0,-2.0,1.55,-0.0,-0.0,-7.0,12.0
3,14.0,8.0,-13.0,-14.0,-21.0,-25.0,12.0,9.0,-0.0,30.0,...,15.0,0.425,-0.0,4.0,2.0,-1.55,-0.0,-0.0,12.0,-7.0
4,-7.0,-15.0,-31.0,-31.0,21.0,-9.0,-9.0,9.0,17.0,46.0,...,-3.0,-0.175,6.0,2.0,-0.0,0.28,-0.0,-0.0,2.0,2.0


In [521]:

# compile datframe

test = pd.concat([results_rankings_wc2[['game_date', 'home_team', 'away_team', 'country', 'neutral']],
                  ratings_base_wc, diff_ratings_wc, diff_rankings_wc], axis=1)
#Assert columns of train and test are in same order
assert len(test.columns) == len(train.columns)
assert list(test.columns) == list(train.columns)
test.head()

Unnamed: 0,game_date,home_team,away_team,country,neutral,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,...,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff,rank_diff
0,,Russia,Saudi Arabia,Russia,False,1,9.0,9.0,-17.0,-19.0,...,-0.0,6.0,7.0,2.0,-0.18,-0.0,-0.0,9.0,-6.0,3.0
1,,Russia,Saudi Arabia,Russia,False,-1,9.0,9.0,-17.0,-19.0,...,-0.0,6.0,7.0,2.0,-0.18,-0.0,-0.0,9.0,-6.0,3.0
2,,Egypt,Uruguay,Russia,True,-1,-14.0,-8.0,13.0,14.0,...,-0.425,-0.0,-4.0,-2.0,1.55,-0.0,-0.0,-7.0,12.0,31.0
3,,Uruguay,Egypt,Russia,True,1,14.0,8.0,-13.0,-14.0,...,0.425,-0.0,4.0,2.0,-1.55,-0.0,-0.0,12.0,-7.0,-31.0
4,,Morocco,Iran,Russia,True,-1,-7.0,-15.0,-31.0,-31.0,...,-0.175,6.0,2.0,-0.0,0.28,-0.0,-0.0,2.0,2.0,4.0


In [522]:
test['Group'] = tournament_round

In [523]:
#get rid of even rows since same match as odd rows.
test = test[test.index % 2 == 0]

In [524]:
test.head()

Unnamed: 0,game_date,home_team,away_team,country,neutral,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,...,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff,rank_diff,Group
0,,Russia,Saudi Arabia,Russia,False,1,9.0,9.0,-17.0,-19.0,...,6.0,7.0,2.0,-0.18,-0.0,-0.0,9.0,-6.0,3.0,A
2,,Egypt,Uruguay,Russia,True,-1,-14.0,-8.0,13.0,14.0,...,-0.0,-4.0,-2.0,1.55,-0.0,-0.0,-7.0,12.0,31.0,A
4,,Morocco,Iran,Russia,True,-1,-7.0,-15.0,-31.0,-31.0,...,6.0,2.0,-0.0,0.28,-0.0,-0.0,2.0,2.0,4.0,B
6,,Portugal,Spain,Russia,True,0,-0.0,27.0,22.0,19.0,...,-3.0,-1.0,-1.0,-1.54,-0.0,-0.0,0.0,3.0,-6.0,B
8,,France,Australia,Russia,True,1,16.0,6.0,-19.0,-14.0,...,12.0,13.0,4.0,-3.27,-0.0,-0.0,15.0,-12.0,-29.0,C


In [525]:
# save to csv
test.to_csv("../data/test_team.csv", index = False)