In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
%matplotlib inline

In [2]:
# Load the historical international match results and parse the match date
# as a timezone-aware (UTC) timestamp.
results = pd.read_csv('data/fifa/international_results.csv')
results['date'] = pd.to_datetime(results['date'], utc=True)

# restrict dates (both bounds are exclusive)
earliest_date = '2010-01-01'
latest_date = '2018-06-14'
# .copy() makes the filtered frame independent of the full table, so the
# columns added to `results` in later cells are written to a real frame
# rather than to a slice (avoids pandas' SettingWithCopy ambiguity).
results = results[(results['date'] > earliest_date) & (results['date'] < latest_date)].copy()
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
31700,2010-01-02 00:00:00+00:00,Iran,Korea DPR,1,0,Friendly,Doha,Qatar,True
31701,2010-01-02 00:00:00+00:00,Qatar,Mali,0,0,Friendly,Doha,Qatar,False
31702,2010-01-02 00:00:00+00:00,Syria,Zimbabwe,6,0,Friendly,Kuala Lumpur,Malaysia,True
31703,2010-01-02 00:00:00+00:00,Yemen,Tajikistan,0,1,Friendly,Sana'a,Yemen,False
31704,2010-01-03 00:00:00+00:00,Angola,Gambia,1,1,Friendly,Vila Real de Santo António,Portugal,True


In [3]:
# response variable: 0 = away win, 1 = home win, 2 = draw
score_diff = results['home_score'] - results['away_score']
results['home_win'] = np.select([score_diff < 0, score_diff > 0], [0, 1], default=2)

In [4]:
# Load the team ratings, parse their snapshot date, and keep only the
# snapshots strictly inside the (earliest_date, latest_date) window.
ratings = (
    pd.read_csv('data/team_stats_final.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .loc[lambda frame: (frame['date'] > earliest_date) & (frame['date'] < latest_date)]
)
ratings.head()

Unnamed: 0,team,date,attack,defence,full_age,midfield,overall,prestige,start_age,bup_speed,...,cc_passing,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width,wage_euros_thousands,value_euros_millions,growth,goalkeeeper_overall
1364,Brazil,2018-05-28,86.0,85.0,27.09,83.0,85.0,10.0,26.73,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,123.608696,32.5,1.826087,84.0
1365,England,2018-05-28,84.0,81.0,24.65,81.0,82.0,8.0,25.0,49.0,...,55.0,41.0,49.0,43.0,55.0,49.0,100.391304,21.065217,3.608696,78.0
1366,Italy,2018-05-28,85.0,85.0,26.04,81.0,83.0,9.0,28.45,73.0,...,64.0,80.0,78.0,29.0,33.0,32.0,91.521739,24.869565,3.173913,89.0
1367,Spain,2018-05-28,84.0,86.0,27.04,86.0,86.0,9.0,28.18,32.0,...,27.0,32.0,20.0,75.0,62.0,52.0,168.695652,39.23913,1.826087,91.0
1368,France,2018-05-28,83.0,82.0,25.65,85.0,84.0,9.0,25.18,35.0,...,24.0,53.0,35.0,47.0,47.0,67.0,110.782609,31.304348,3.173913,88.0


In [5]:
# Load the FIFA ranking table and parse the ranking publication date.
fifa_rankings = pd.read_csv('data/fifa_ranking.csv').assign(
    rank_date=lambda frame: pd.to_datetime(frame['rank_date'])
)
fifa_rankings.head()

Unnamed: 0,rank,country_full,country_abrv,total_points,previous_points,rank_change,cur_year_avg,cur_year_avg_weighted,last_year_avg,last_year_avg_weighted,two_year_ago_avg,two_year_ago_weighted,three_year_ago_avg,three_year_ago_weighted,confederation,rank_date
0,1,Germany,GER,0.0,57,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UEFA,1993-08-08
1,2,Italy,ITA,0.0,57,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UEFA,1993-08-08
2,3,Switzerland,SUI,0.0,50,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UEFA,1993-08-08
3,4,Sweden,SWE,0.0,55,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,UEFA,1993-08-08
4,5,Argentina,ARG,0.0,51,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CONMEBOL,1993-08-08


We now have to find for each match the latest team ratings and ranking.

In [6]:
# Distinct rating snapshot dates (as a Series) and distinct rated teams.
dates = pd.Series(ratings['date'].unique())
teams = ratings['team'].unique()

In [7]:
# Check that every rated team appears, with exactly the same spelling, in the
# results (home and away) and in the FIFA rankings. Missing names are printed
# so the spellings can be reconciled before merging.
# Explicit membership tests are used instead of assert + bare except, so a
# genuine error (e.g. a missing column) is raised rather than being reported
# as a missing team.
name_sources = [
    ('Results Home', results.home_team),
    ('Results Away', results.away_team),
    ('Rankings', fifa_rankings.country_full),
]

for label, names in name_sources:
    known_names = set(names.unique())
    for t in teams:
        if t not in known_names:
            print('Cannot Find {} in {}'.format(t, label))

Cannot Find Republic of Ireland in Results Home
Cannot Find United States in Results Home
Cannot Find China PR in Results Home
Cannot Find Côte d'Ivoire in Results Home
Cannot Find Republic of Ireland in Results Away
Cannot Find United States in Results Away
Cannot Find China PR in Results Away
Cannot Find Côte d'Ivoire in Results Away
Cannot Find United States in Rankings
Cannot Find Iran in Rankings


In [8]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
import re

def get_countries_by_continent():
    """Scrape Simple English Wikipedia for a country -> continent lookup.

    Returns
    -------
    country_to_continent : dict
        Maps each cleaned country name to the text of the section heading
        it was listed under.
    continents : list of str
        The section heading texts, in page order.

    Raises
    ------
    requests.HTTPError
        If the page request does not succeed.
    """
    url = "https://simple.wikipedia.org/wiki/List_of_countries_by_continents"
    my_page = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing an error page.
    my_page.raise_for_status()
    soup = BeautifulSoup(my_page.content, "lxml")
    continents_obj = soup.findAll('span', {'class': 'mw-headline'})
    # The last two headings are dropped.
    # NOTE(review): presumably these are non-continent sections - confirm
    # against the live page layout.
    continents_obj = continents_obj[:-2]
    country_to_continent = {}

    continents = []
    for continent in continents_obj:
        continents.append(continent.text)
        raw_countries = continent.findNext('ul').findAll('li')
        for c in raw_countries:
            # Keep only the text before the first hyphen.
            country_name = c.text.split("-")[0]
            # Remove secondary names "(...)" and footnote markers "[...]".
            # Truncate only when the marker is actually present: slicing
            # with the -1 "not found" result would chop the last character.
            for marker in ('(', '['):
                cut = country_name.find(marker)
                if cut != -1:
                    country_name = country_name[:cut]
            country_name = country_name.replace('*', '').strip()
            country_to_continent[country_name] = continent.text

    return country_to_continent, continents

In [9]:
country_to_continent,continents = get_countries_by_continent()

In [10]:
# Re-key the scraped lookup to the spellings used by the match data.
# Each pop() raises KeyError if the scraped name is absent.
scraped_to_match_name = [
    ("Côte d'Ivoire", 'Ivory Coast'),
    ("United Kingdom", 'England'),
    ("South Korea", 'Korea Republic'),
    ("United States of America", 'USA'),
    ("Republic of Ireland", 'Ireland'),
]
for scraped_name, match_name in scraped_to_match_name:
    country_to_continent[match_name] = country_to_continent.pop(scraped_name)

# Teams that share a continent with an entry already present.
country_to_continent['Northern Ireland'] = country_to_continent['Ireland']
country_to_continent['Iceland'] = country_to_continent.pop("Eastern Iceland")
for home_nation in ('Wales', 'Scotland'):
    country_to_continent[home_nation] = country_to_continent['England']



In [11]:
# All country names that have a continent assigned.
my_set = set(country_to_continent)

Before we proceed with anything else, let's make sure that country names are spelled consistently across datasets. The check above shows that 'Republic of Ireland', 'United States', 'China PR', and 'Côte d'Ivoire' from the ratings data have no exact match in the results data, so we rename them below.

In [12]:
# Normalise team spellings so that ratings and rankings can both be joined
# to the results table by name.
team_aliases = {
    'Republic of Ireland': 'Ireland',
    'United States': 'USA',
    'China PR': 'China',
    "Côte d'Ivoire": 'Ivory Coast',
}
# Only the name column is rewritten, and a new frame is bound instead of
# mutating the date-filtered slice in place.
ratings = ratings.assign(team=ratings['team'].replace(team_aliases))

# The name check earlier in the notebook found 'Republic of Ireland',
# 'China PR' and "Côte d'Ivoire" in the rankings under the same spelling as
# the ratings, so the rankings need the same renames; otherwise the inner
# merge of results with rankings drops every match involving those teams.
ranking_aliases = dict(team_aliases)
ranking_aliases['IR Iran'] = 'Iran'
fifa_rankings = fifa_rankings.assign(
    country_full=fifa_rankings['country_full'].replace(ranking_aliases)
)

In [13]:
# List rated teams that still have no continent entry (expected: none).
teams_without_continent = [c for c in ratings.team.unique() if c not in my_set]
for c in teams_without_continent:
    print(c)



In [14]:
# Continent lookup restricted to teams that have ratings.
# The set of rated teams is built once instead of recomputing
# ratings.team.unique() for every key.
rated_teams = set(ratings.team.unique())
countries = {
    name: continent
    for name, continent in country_to_continent.items()
    if name in rated_teams
}

In [15]:
def scrape_gdp_pp():
    """Scrape GDP (PPP) per capita by country from English Wikipedia.

    Reads the first table with class "wikitable sortable" on the page and
    takes the name from the second cell and the value from the third cell
    of each data row.

    Returns
    -------
    dict
        Maps country name (str) to GDP per capita (int).

    Raises
    ------
    requests.HTTPError
        If the page request does not succeed.
    IndexError
        If no matching table is found on the page.
    """
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita"
    my_page = requests.get(url)
    # Fail loudly on an HTTP error instead of parsing an error page.
    my_page.raise_for_status()
    soup = BeautifulSoup(my_page.content, "lxml")
    tables = soup.find_all('table', {'class': 'wikitable sortable'})
    country_rows = tables[0].findAll('tr')
    countries_data = {}

    # The first row is skipped as the header.
    for country in country_rows[1:]:
        country_data = country.findAll('td')
        # Rows without the name and value cells (e.g. extra header or
        # spacer rows) carry no data and would otherwise raise IndexError.
        if len(country_data) < 3:
            continue
        # NFKD folds compatibility characters such as non-breaking spaces.
        # NOTE(review): it also decomposes accented letters, so keys like
        # "Côte d'Ivoire" may not compare equal to precomposed literals -
        # confirm against the lookups made on the returned dict.
        country_name = unicodedata.normalize("NFKD", country_data[1].text).strip()
        # Drop footnote markers such as "[5]" and thousands separators
        # before converting; skip rows whose value is not a plain number.
        raw_value = re.sub(r'\[[^\]]*\]', '', country_data[2].text)
        raw_value = raw_value.replace(',', '').strip()
        if not raw_value.isdigit():
            continue

        countries_data[country_name] = int(raw_value)

    return countries_data

In [16]:
gdp_countries = scrape_gdp_pp()

In [17]:
gdp_countries

{'Qatar': 124927,
 'Macau': 114430,
 'Luxembourg': 109192,
 'Singapore': 90531,
 'Brunei': 76743,
 'Ireland': 72632,
 'Norway': 70590,
 'Kuwait': 69669,
 'United Arab Emirates': 68245,
 'Switzerland': 61360,
 'Hong Kong': 61016,
 'San Marino': 60359,
 'United States': 59495,
 'Saudi Arabia': 55263,
 'Netherlands': 53582,
 'Iceland': 52150,
 'Bahrain': 51846,
 'Sweden': 51264,
 'Germany': 50206,
 'Australia': 49882,
 'Taiwan': 49827,
 'Denmark': 49613,
 'Austria': 49247,
 'Canada': 48141,
 'Belgium': 46301,
 'Oman': 45464,
 'Finland': 44050,
 'United Kingdom': 43620,
 'France': 43550,
 'Japan': 42659,
 'Malta': 42532,
 'South Korea': 39387,
 'New Zealand': 38502,
 'Spain': 38171,
 'Italy': 37970,
 'Puerto Rico': 37895,
 'Cyprus': 36557,
 'Israel': 36250,
 'Czech Republic': 35223,
 'Equatorial Guinea': 34865,
 'Slovenia': 34063,
 'Slovakia': 32895,
 'Lithuania': 31935,
 'Estonia': 31473,
 'Trinidad and Tobago': 31154,
 'Portugal': 30258,
 'Poland': 29251,
 'Hungary': 28910,
 'Malaysia': 

In [18]:
# Re-key the scraped GDP table to the spellings used by the match data.
# Each pop() raises KeyError if the scraped name is absent.
gdp_scraped_to_match_name = [
    ("Côte d'Ivoire", 'Ivory Coast'),
    ("United Kingdom", 'England'),
    ("South Korea", 'Korea Republic'),
    ("United States", 'USA'),
]
for scraped_name, match_name in gdp_scraped_to_match_name:
    gdp_countries[match_name] = gdp_countries.pop(scraped_name)

# Teams that reuse the GDP figure of an entry already present.
gdp_countries['Northern Ireland'] = gdp_countries['Ireland']
for home_nation in ('Wales', 'Scotland'):
    gdp_countries[home_nation] = gdp_countries['England']



In [19]:
# List rated teams that have no GDP figure (expected: none).
countryset = set(gdp_countries)
for k in countries.keys():
    if k in countryset:
        continue
    print(k)

In [20]:
continents

['Africa',
 'Antarctica',
 'Asia',
 'Europe',
 'America',
 'North America',
 'Central America and the Antilles',
 'South America',
 'Oceania',
 'Australia',
 'Australasian']

In [21]:
# List GDP entries that have no continent assigned.
for country in gdp_countries:
    if country in country_to_continent:
        continue
    print(country)

Macau
Hong Kong
Bahamas, The
World[n 1]
Congo, Rep.
Timor-Leste
Micronesia
São Tomé and Príncipe
Guinea-Bissau
Gambia, The
Congo, Dem. Rep.


Now we finally have two datasets of teams spelled in the same way.

In [22]:
# find the latest date, strictly before the match, for which we have data
def get_latest_date(match_date, dates):
    """Return the most recent entry of `dates` strictly earlier than `match_date`.

    Parameters
    ----------
    match_date : pandas.Timestamp
        Date of the match; converted with ``to_datetime64()`` before comparing.
    dates : pandas.Series
        Candidate datetime values.

    Returns
    -------
    The maximum of the earlier dates; NaT when no entry precedes the match.
    """
    cutoff = match_date.to_datetime64()
    earlier_dates = dates[dates < cutoff]
    return earlier_dates.max()

# For every match, look up the latest rating snapshot and the latest ranking
# publication that precede it.
# The rankings table repeats each rank_date once per country, so the dates
# are de-duplicated once here instead of scanning the whole column for every
# match; the resulting maximum is unchanged.
rank_dates = pd.Series(fifa_rankings.rank_date.unique())
results['closest_rating_date'] = results['date'].apply(get_latest_date, dates=dates)
results['closest_ranking_date'] = results['date'].apply(get_latest_date, dates=rank_dates)


In [23]:
results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_win,closest_rating_date,closest_ranking_date
39601,2018-06-10 00:00:00+00:00,Austria,Brazil,0,3,Friendly,Vienna,Austria,False,0,2018-06-07,2018-06-07
39602,2018-06-11 00:00:00+00:00,Korea Republic,Senegal,0,2,Friendly,Grödig,Austria,True,0,2018-06-07,2018-06-07
39603,2018-06-11 00:00:00+00:00,Belgium,Costa Rica,4,1,Friendly,Brussels,Belgium,False,1,2018-06-07,2018-06-07
39604,2018-06-12 00:00:00+00:00,Japan,Paraguay,4,2,Friendly,Innsbruck,Austria,True,1,2018-06-11,2018-06-07
39605,2018-06-12 00:00:00+00:00,Poland,Lithuania,4,0,Friendly,Warsaw,Poland,False,1,2018-06-11,2018-06-07


Now that we have the closest matching data for both of our datasets, we can merge on team and date. We will first do this for the ratings dataset. 

Note that for both the ratings and rankings tables we have to merge twice in order to account for both home and away teams.

In [24]:
# Move the original row labels into an 'index' column so each match can
# still be identified after the merges.
results = results.reset_index()

# First pass: attach the home team's ratings.
results_ratings1 = pd.merge(
    results, ratings,
    how='inner',
    left_on=['closest_rating_date', 'home_team'],
    right_on=['date', 'team'],
)

# Second pass: attach the away team's ratings. Overlapping column names get
# pandas' default suffixes, '_x' (home, from the left frame) and '_y' (away).
results_ratings2 = pd.merge(
    results_ratings1, ratings,
    how='inner',
    left_on=['closest_rating_date', 'away_team'],
    right_on=['date', 'team'],
)

results_ratings2.head()

Unnamed: 0,index,date_x,home_team,away_team,home_score,away_score,tournament,city,country,neutral,...,cc_passing_y,cc_crossing_y,cc_shooting_y,d_pressure_y,d_aggresion_y,d_width_y,wage_euros_thousands_y,value_euros_millions_y,growth_y,goalkeeeper_overall_y
0,31786,2010-02-24 00:00:00+00:00,Mexico,Bolivia,5,0,Friendly,San Francisco,USA,True,...,53.0,72.0,64.0,47.0,33.0,64.0,0.869565,0.275,4.391304,66.0
1,31823,2010-03-03 00:00:00+00:00,Mexico,New Zealand,2,0,Friendly,Pasadena,USA,True,...,70.0,70.0,70.0,70.0,70.0,35.0,0.0,0.0,8.266667,60.0
2,31878,2010-05-24 00:00:00+00:00,Australia,New Zealand,2,1,Friendly,Melbourne,Australia,False,...,70.0,70.0,70.0,70.0,70.0,35.0,0.0,0.0,8.266667,60.0
3,32000,2010-06-20 00:00:00+00:00,Italy,New Zealand,1,1,FIFA World Cup,Nelspruit,South Africa,True,...,70.0,70.0,70.0,70.0,70.0,35.0,0.0,0.0,8.266667,60.0
4,31949,2010-06-04 00:00:00+00:00,Slovenia,New Zealand,3,1,Friendly,Maribor,Slovenia,False,...,70.0,70.0,70.0,70.0,70.0,35.0,0.0,0.0,8.266667,60.0


In [25]:
# # Keeping track of index of matches for inner join
# gdp_col = []
# for col1,col2,idx in zip(results_ratings2['home_team'].items(),results_ratings2['away_team'].items(),results_ratings2['index'].items()):
#     team1 = col1[1]
#     team2 = col2[1]   
#     if country_to_continent[team1] != country_to_continent[team2]:
#         gdp_col.append({'idx':idx[1],'gdp_diff':0})
#     elif gdp_countries[team1] > gdp_countries[team2]:
#         gdp_col.append({'idx':idx[1],'gdp_diff':1})
#     else:
#         gdp_col.append({'idx':idx[1],'gdp_diff':-1})
    

In [26]:
# Build the GDP feature per match, keyed by the original results row label
# ('idx') so it can be inner-joined back onto the training frame.
# The value is 0 when the teams are on different continents, otherwise the
# ratio of home-team GDP to away-team GDP.
gdp_col = []
match_rows = zip(results_ratings2['home_team'],
                 results_ratings2['away_team'],
                 results_ratings2['index'])
for team1, team2, idx in match_rows:
    if country_to_continent[team1] == country_to_continent[team2]:
        gdp_value = float(gdp_countries[team1]) / gdp_countries[team2]
    else:
        gdp_value = 0
    gdp_col.append({'idx': idx, 'gdp_diff': gdp_value})
    

In [27]:
# One row per merged match: 'idx' holds the original results row label and
# 'gdp_diff' the GDP feature collected in gdp_col.
df_gdp = pd.DataFrame(gdp_col)
# Disabled: treat the GDP feature as categorical instead of numeric.
# df_gdp['gdp_diff'] = df_gdp['gdp_diff'].astype('category')

In [28]:
results_ratings2

Unnamed: 0,index,date_x,home_team,away_team,home_score,away_score,tournament,city,country,neutral,...,cc_passing_y,cc_crossing_y,cc_shooting_y,d_pressure_y,d_aggresion_y,d_width_y,wage_euros_thousands_y,value_euros_millions_y,growth_y,goalkeeeper_overall_y
0,31786,2010-02-24 00:00:00+00:00,Mexico,Bolivia,5,0,Friendly,San Francisco,USA,True,...,53.0,72.0,64.0,47.0,33.0,64.0,0.869565,0.275000,4.391304,66.0
1,31823,2010-03-03 00:00:00+00:00,Mexico,New Zealand,2,0,Friendly,Pasadena,USA,True,...,70.0,70.0,70.0,70.0,70.0,35.0,0.000000,0.000000,8.266667,60.0
2,31878,2010-05-24 00:00:00+00:00,Australia,New Zealand,2,1,Friendly,Melbourne,Australia,False,...,70.0,70.0,70.0,70.0,70.0,35.0,0.000000,0.000000,8.266667,60.0
3,32000,2010-06-20 00:00:00+00:00,Italy,New Zealand,1,1,FIFA World Cup,Nelspruit,South Africa,True,...,70.0,70.0,70.0,70.0,70.0,35.0,0.000000,0.000000,8.266667,60.0
4,31949,2010-06-04 00:00:00+00:00,Slovenia,New Zealand,3,1,Friendly,Maribor,Slovenia,False,...,70.0,70.0,70.0,70.0,70.0,35.0,0.000000,0.000000,8.266667,60.0
5,32017,2010-06-24 00:00:00+00:00,Paraguay,New Zealand,0,0,FIFA World Cup,Polokwane,South Africa,True,...,70.0,70.0,70.0,70.0,70.0,35.0,0.000000,0.000000,8.266667,60.0
6,31849,2010-03-24 00:00:00+00:00,Mexico,Iceland,0,0,Friendly,Charlotte,USA,True,...,67.0,49.0,33.0,47.0,47.0,33.0,14.000000,2.938043,2.565217,69.0
7,31859,2010-05-07 00:00:00+00:00,Mexico,Ecuador,0,0,Friendly,East Rutherford,USA,True,...,65.0,45.0,70.0,65.0,65.0,60.0,0.000000,0.000000,7.900000,65.0
8,31865,2010-05-16 00:00:00+00:00,Korea Republic,Ecuador,2,0,Friendly,Seoul,Korea Republic,False,...,65.0,45.0,70.0,65.0,65.0,60.0,0.000000,0.000000,7.900000,65.0
9,31860,2010-05-10 00:00:00+00:00,Mexico,Senegal,1,0,Friendly,Chicago,USA,True,...,59.0,53.0,48.0,48.0,59.0,48.0,0.000000,0.000000,2.575000,69.0


In [29]:
results_ratings2.shape

(1871, 57)

In [30]:
# results_ratings2.dtypes


We find that we have 1871 observations that can still be used after matching team ratings with the results dataframe with an inner merge. We now clean up the merged dataframe slightly.

In [31]:
# Identifier, date and score columns are not model features; only home_win
# and the paired '_x' / '_y' rating columns are kept.
non_feature_columns = [
    'index', 'home_team', 'away_team', 'tournament',
    'city', 'country', 'neutral', 'closest_rating_date', 'closest_ranking_date',
    'team_x', 'date_y', 'team_y', 'date',
    'home_score', 'away_score', 'date_x',
]
# Sorting the column labels puts each attribute's '_x' and '_y' side by side.
results_ratings3 = results_ratings2.drop(columns=non_feature_columns).sort_index(axis=1)
results_ratings3.head()

Unnamed: 0,attack_x,attack_y,bup_dribbling_x,bup_dribbling_y,bup_passing_x,bup_passing_y,bup_speed_x,bup_speed_y,cc_crossing_x,cc_crossing_y,...,overall_x,overall_y,prestige_x,prestige_y,start_age_x,start_age_y,value_euros_millions_x,value_euros_millions_y,wage_euros_thousands_x,wage_euros_thousands_y
0,73.0,67.0,0.0,52.0,45.0,52.0,65.0,50.0,55.0,72.0,...,77.0,67.0,15.0,1.0,27.09,25.18,0.0,0.275,0.0,0.869565
1,73.0,60.0,0.0,0.0,45.0,70.0,65.0,70.0,55.0,70.0,...,77.0,62.0,15.0,6.0,27.09,29.18,0.0,0.0,0.0,0.0
2,71.0,60.0,0.0,0.0,70.0,70.0,70.0,70.0,60.0,70.0,...,75.0,62.0,11.0,6.0,30.36,29.18,0.0,0.0,0.0,0.0
3,83.0,60.0,0.0,0.0,30.0,70.0,67.0,70.0,60.0,70.0,...,83.0,62.0,19.0,6.0,28.91,29.18,0.0,0.0,0.0,0.0
4,70.0,60.0,0.0,0.0,70.0,70.0,70.0,70.0,65.0,70.0,...,70.0,62.0,5.0,6.0,25.64,29.18,0.0,0.0,0.0,0.0


Our final training data is basically a difference in the various team ratings. Now let's clean up the dataframe for our training.

In [32]:
# Single-column frame holding the response variable.
ratings_base = results_ratings3[['home_win']]
ratings_base.head()

Unnamed: 0,home_win
0,1
1,1
2,1
3,2
4,1


In [33]:
# results_ratings3

In [34]:
# Home-minus-away difference for every rating attribute, so that all
# statistics are from the perspective of the home team.
paired = results_ratings3.drop('home_win', axis=1)

# Pair the columns by name ('<attr>_x' is home, '<attr>_y' is away) instead of
# relying on diff(axis=1) over adjacent sorted columns, which silently
# produces wrong features if any column is unpaired or out of order.
home_cols = [c for c in paired.columns if c.endswith('_x')]
away_cols = [c[:-2] + '_y' for c in home_cols]
assert sorted(home_cols + away_cols) == sorted(paired.columns), \
    'every rating column must have both an _x and a _y version'

ncol = paired.shape[1]

# The result keeps the '_y' labels, matching what the alternate-column
# selection produced, so the renaming in the next cell still applies.
diff_ratings = pd.DataFrame(
    paired[home_cols].values.astype(float) - paired[away_cols].values.astype(float),
    index=paired.index,
    columns=away_cols,
)

diff_ratings.head()

Unnamed: 0,attack_y,bup_dribbling_y,bup_passing_y,bup_speed_y,cc_crossing_y,cc_passing_y,cc_shooting_y,d_aggresion_y,d_pressure_y,d_width_y,defence_y,full_age_y,goalkeeeper_overall_y,growth_y,midfield_y,overall_y,prestige_y,start_age_y,value_euros_millions_y,wage_euros_thousands_y
0,6.0,-52.0,-7.0,15.0,-17.0,12.0,1.0,27.0,18.0,-4.0,9.0,1.46,13.0,2.208696,6.0,10.0,14.0,1.91,-0.275,-0.869565
1,13.0,-0.0,-25.0,-5.0,-15.0,-5.0,-5.0,-10.0,-5.0,25.0,16.0,0.23,19.0,-1.666667,17.0,15.0,9.0,-2.09,-0.0,-0.0
2,11.0,-0.0,-0.0,-0.0,-10.0,-0.0,-0.0,-0.0,-20.0,15.0,10.0,1.53,19.0,-1.466667,14.0,13.0,5.0,1.18,-0.0,-0.0
3,23.0,-0.0,-40.0,-3.0,-10.0,-15.0,-30.0,-40.0,-40.0,15.0,22.0,1.2,28.0,-3.433333,24.0,21.0,13.0,-0.27,-0.0,-0.0
4,10.0,-0.0,-0.0,-0.0,-5.0,-15.0,-0.0,-0.0,-20.0,15.0,9.0,-0.3,18.0,0.3,10.0,8.0,-1.0,-3.54,-0.0,-0.0


In [35]:
# Replace the two-character merge suffix of every column with '_diff'.
columns = ['{}_diff'.format(column[:-2]) for column in diff_ratings.columns]
diff_ratings.columns = columns

# Cross features: one team's attack against the other team's defence.
home_attack, away_defence = results_ratings3['attack_x'], results_ratings3['defence_y']
away_attack, home_defence = results_ratings3['attack_y'], results_ratings3['defence_x']
diff_ratings['attack_home_defence_away_diff'] = home_attack.sub(away_defence)
diff_ratings['attack_away_defence_home_diff'] = away_attack.sub(home_defence)

# compile dataframe: response, features, and the original match label
train = pd.concat(
    [ratings_base, diff_ratings, results_ratings2['index']],
    axis=1,
)
train.head()

Unnamed: 0,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,...,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff,index
0,1,6.0,-52.0,-7.0,15.0,-17.0,12.0,1.0,27.0,18.0,...,2.208696,6.0,10.0,14.0,1.91,-0.275,-0.869565,8.0,-7.0,31786
1,1,13.0,-0.0,-25.0,-5.0,-15.0,-5.0,-5.0,-10.0,-5.0,...,-1.666667,17.0,15.0,9.0,-2.09,-0.0,-0.0,15.0,-14.0,31823
2,1,11.0,-0.0,-0.0,-0.0,-10.0,-0.0,-0.0,-0.0,-20.0,...,-1.466667,14.0,13.0,5.0,1.18,-0.0,-0.0,13.0,-8.0,31878
3,2,23.0,-0.0,-40.0,-3.0,-10.0,-15.0,-30.0,-40.0,-40.0,...,-3.433333,24.0,21.0,13.0,-0.27,-0.0,-0.0,25.0,-20.0,32000
4,1,10.0,-0.0,-0.0,-0.0,-5.0,-15.0,-0.0,-0.0,-20.0,...,0.3,10.0,8.0,-1.0,-3.54,-0.0,-0.0,12.0,-7.0,31949


In [36]:

# Attach the GDP feature by matching the original match label on both sides.
train = pd.merge(train, df_gdp, how='inner', left_on='index', right_on='idx')
train.head()

Unnamed: 0,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,...,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff,index,gdp_diff,idx
0,1,6.0,-52.0,-7.0,15.0,-17.0,12.0,1.0,27.0,18.0,...,10.0,14.0,1.91,-0.275,-0.869565,8.0,-7.0,31786,0.0,31786
1,1,13.0,-0.0,-25.0,-5.0,-15.0,-5.0,-5.0,-10.0,-5.0,...,15.0,9.0,-2.09,-0.0,-0.0,15.0,-14.0,31823,0.0,31823
2,1,11.0,-0.0,-0.0,-0.0,-10.0,-0.0,-0.0,-0.0,-20.0,...,13.0,5.0,1.18,-0.0,-0.0,13.0,-8.0,31878,1.295569,31878
3,2,23.0,-0.0,-40.0,-3.0,-10.0,-15.0,-30.0,-40.0,-40.0,...,21.0,13.0,-0.27,-0.0,-0.0,25.0,-20.0,32000,0.0,32000
4,1,10.0,-0.0,-0.0,-0.0,-5.0,-15.0,-0.0,-0.0,-20.0,...,8.0,-1.0,-3.54,-0.0,-0.0,12.0,-7.0,31949,0.0,31949


In [37]:
# train.dtypes

Now we do the same for rankings.

In [38]:
results.head()

Unnamed: 0,index,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_win,closest_rating_date,closest_ranking_date
0,31700,2010-01-02 00:00:00+00:00,Iran,Korea DPR,1,0,Friendly,Doha,Qatar,True,1,NaT,2009-12-16
1,31701,2010-01-02 00:00:00+00:00,Qatar,Mali,0,0,Friendly,Doha,Qatar,False,2,NaT,2009-12-16
2,31702,2010-01-02 00:00:00+00:00,Syria,Zimbabwe,6,0,Friendly,Kuala Lumpur,Malaysia,True,1,NaT,2009-12-16
3,31703,2010-01-02 00:00:00+00:00,Yemen,Tajikistan,0,1,Friendly,Sana'a,Yemen,False,0,NaT,2009-12-16
4,31704,2010-01-03 00:00:00+00:00,Angola,Gambia,1,1,Friendly,Vila Real de Santo António,Portugal,True,2,NaT,2009-12-16


In [39]:
# Now merge rankings.
# First pass: attach the home team's ranking row.
results_rankings1 = pd.merge(
    results, fifa_rankings,
    how='inner',
    left_on=['closest_ranking_date', 'home_team'],
    right_on=['rank_date', 'country_full'],
)

# Second pass: attach the away team's ranking row. Overlapping column names
# get pandas' default suffixes, '_x' (home) and '_y' (away).
results_rankings2 = pd.merge(
    results_rankings1, fifa_rankings,
    how='inner',
    left_on=['closest_ranking_date', 'away_team'],
    right_on=['rank_date', 'country_full'],
)


In [40]:
# Keep only the paired '_x' / '_y' ranking columns; identifiers, dates,
# scores and the response are dropped.
non_ranking_columns = [
    'home_team', 'away_team', 'tournament',
    'city', 'country', 'neutral', 'closest_rating_date', 'closest_ranking_date',
    'country_full_x', 'rank_date_y', 'country_full_y', 'date',
    'home_score', 'away_score', 'rank_date_x', 'confederation_x',
    'confederation_y', 'country_abrv_x', 'country_abrv_y', 'home_win', 'index',
]
# Sorting the column labels puts each attribute's '_x' and '_y' side by side.
results_rankings3 = results_rankings2.drop(columns=non_ranking_columns).sort_index(axis=1)
results_rankings3[['rank_x', 'rank_y']].head()

Unnamed: 0,rank_x,rank_y
0,64,86
1,86,47
2,95,47
3,24,47
4,91,109


In [41]:
# Home-minus-away difference for every ranking attribute, so that all
# statistics are from the perspective of the home team.
# Columns are paired by name ('<attr>_x' is home, '<attr>_y' is away) instead
# of relying on diff(axis=1) over adjacent sorted columns, which silently
# produces wrong features if any column is unpaired or out of order.
home_cols = [c for c in results_rankings3.columns if c.endswith('_x')]
away_cols = [c[:-2] + '_y' for c in home_cols]
assert sorted(home_cols + away_cols) == sorted(results_rankings3.columns), \
    'every ranking column must have both an _x and a _y version'

ncol = results_rankings3.shape[1]

columns = [column[:-2] + "_diff" for column in away_cols]
diff_rankings = pd.DataFrame(
    results_rankings3[home_cols].values.astype(float)
    - results_rankings3[away_cols].values.astype(float),
    index=results_rankings3.index,
    columns=columns,
)
# Carry the original match label along for the join with the ratings features.
diff_rankings['index'] = results_rankings2['index']

diff_rankings.head()

Unnamed: 0,cur_year_avg_weighted_diff,cur_year_avg_diff,last_year_avg_weighted_diff,last_year_avg_diff,previous_points_diff,rank_change_diff,rank_diff,three_year_ago_avg_diff,three_year_ago_weighted_diff,total_points_diff,two_year_ago_avg_diff,two_year_ago_weighted_diff,index
0,-0.0,-0.0,-0.0,-0.0,119.0,1.0,-22.0,-0.0,-0.0,-0.0,-0.0,-0.0,31700
1,-0.0,-0.0,-0.0,-0.0,-288.0,1.0,39.0,-0.0,-0.0,-0.0,-0.0,-0.0,31701
2,-0.0,-0.0,-0.0,-0.0,-325.0,-1.0,48.0,-0.0,-0.0,-0.0,-0.0,-0.0,31723
3,-0.0,-0.0,-0.0,-0.0,158.0,5.0,-23.0,-0.0,-0.0,-0.0,-0.0,-0.0,31705
4,-0.0,-0.0,-0.0,-0.0,83.0,1.0,-18.0,-0.0,-0.0,-0.0,-0.0,-0.0,31702


In [42]:
# Join the ranking features on the original match label, then discard the label.
train = pd.merge(train, diff_rankings, on='index', how='inner').drop(columns='index')

In [43]:
# 'idx' was only the join key of the GDP feature.
train = train.drop(columns='idx')

In [44]:
train.head()

Unnamed: 0,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,...,last_year_avg_weighted_diff,last_year_avg_diff,previous_points_diff,rank_change_diff,rank_diff,three_year_ago_avg_diff,three_year_ago_weighted_diff,total_points_diff,two_year_ago_avg_diff,two_year_ago_weighted_diff
0,1,6.0,-52.0,-7.0,15.0,-17.0,12.0,1.0,27.0,18.0,...,-0.0,-0.0,358.0,-1.0,-38.0,-0.0,-0.0,-0.0,-0.0,-0.0
1,1,13.0,-0.0,-25.0,-5.0,-15.0,-5.0,-5.0,-10.0,-5.0,...,-0.0,-0.0,531.0,-3.0,-62.0,-0.0,-0.0,-0.0,-0.0,-0.0
2,1,11.0,-0.0,-0.0,-0.0,-10.0,-0.0,-0.0,-0.0,-20.0,...,-0.0,-0.0,489.0,-2.0,-58.0,-0.0,-0.0,-0.0,-0.0,-0.0
3,2,23.0,-0.0,-40.0,-3.0,-10.0,-15.0,-30.0,-40.0,-40.0,...,-0.0,-0.0,771.0,-0.0,-73.0,-0.0,-0.0,-0.0,-0.0,-0.0
4,1,10.0,-0.0,-0.0,-0.0,-5.0,-15.0,-0.0,-0.0,-20.0,...,-0.0,-0.0,447.0,-2.0,-53.0,-0.0,-0.0,-0.0,-0.0,-0.0


In [45]:
# train['gdp_diff']

In [46]:
train.dtypes

home_win                           int64
attack_diff                      float64
bup_dribbling_diff               float64
bup_passing_diff                 float64
bup_speed_diff                   float64
cc_crossing_diff                 float64
cc_passing_diff                  float64
cc_shooting_diff                 float64
d_aggresion_diff                 float64
d_pressure_diff                  float64
d_width_diff                     float64
defence_diff                     float64
full_age_diff                    float64
goalkeeeper_overall_diff         float64
growth_diff                      float64
midfield_diff                    float64
overall_diff                     float64
prestige_diff                    float64
start_age_diff                   float64
value_euros_millions_diff        float64
wage_euros_thousands_diff        float64
attack_home_defence_away_diff    float64
attack_away_defence_home_diff    float64
gdp_diff                         float64
cur_year_avg_wei

In [47]:
# save to csv
train.to_csv('data/train_team.csv', index = False)

We now create our test set with actual world cup data.

In [48]:
# Reload the full ratings file; dates are parsed as timezone-aware (UTC).
ratings_wc = pd.read_csv('data/team_stats_final.csv')
ratings_wc['date'] = pd.to_datetime(ratings_wc['date'], utc=True)

# World Cup window, inclusive on both ends
latest_date = '2018-06-14'
wc_start = '2018-06-16' # first WC rating
wc_end = '2018-07-15'
ratings_wc = ratings_wc[ratings_wc['date'].between(wc_start, wc_end)]

ratings_wc.head()

Unnamed: 0,team,date,attack,defence,full_age,midfield,overall,prestige,start_age,bup_speed,...,cc_passing,cc_crossing,cc_shooting,d_pressure,d_aggresion,d_width,wage_euros_thousands,value_euros_millions,growth,goalkeeeper_overall
0,Brazil,2018-07-15 00:00:00+00:00,87.0,84.0,27.43,86.0,86.0,10.0,27.55,51.0,...,39.0,33.0,67.0,76.0,73.0,63.0,0.0,0.0,1.625,84.0
1,England,2018-07-15 00:00:00+00:00,83.0,80.0,25.88,81.0,81.0,8.0,24.64,36.0,...,31.0,41.0,41.0,43.0,55.0,49.0,0.0,0.0,2.575,80.0
2,Italy,2018-07-15 00:00:00+00:00,81.0,82.0,25.96,81.0,82.0,9.0,27.36,73.0,...,64.0,80.0,78.0,29.0,26.0,32.0,0.0,0.0,3.26087,88.0
3,Spain,2018-07-15 00:00:00+00:00,84.0,85.0,27.18,86.0,85.0,9.0,27.27,32.0,...,27.0,32.0,20.0,75.0,62.0,52.0,0.0,0.0,1.775,91.0
4,France,2018-07-15 00:00:00+00:00,86.0,81.0,25.05,84.0,85.0,9.0,24.82,35.0,...,24.0,53.0,35.0,47.0,47.0,67.0,0.0,0.0,3.45,87.0


In [49]:
ratings_teams_wc = ratings_wc.team.unique()

In [50]:
#https://gitlab.com/djh_or/2018-world-cup-stats/blob/master/world_cup_2018_stats.csv
results_wc = pd.read_csv("data/world_cup_2018_stats.csv")
results_wc.head()

Unnamed: 0,Game,Group,Team,Opponent,Home/Away,Score,WDL,Pens?,Goals For,Goals Against,...,Passes Completed,Distance Covered km,Balls recovered,Tackles,Blocks,Clearances,Yellow cards,Red Cards,Second Yellow Card leading to Red Card,Fouls Committed
0,1,A,Russia,Saudi Arabia,home,5-0,W,,5,0,...,240,118,53,9,3,19,1,0,0,22
1,1,A,Saudi Arabia,Russia,away,5-0,L,,0,5,...,442,105,48,16,3,31,1,0,0,10
2,2,A,Egypt,Uruguay,home,0-1,L,,0,1,...,308,112,57,12,4,32,2,0,0,12
3,2,A,Uruguay,Egypt,away,0-1,W,,1,0,...,508,111,54,8,2,22,0,0,0,6
4,3,B,Morocco,IR Iran,home,0-1,L,,0,1,...,371,101,38,9,1,16,1,0,0,22


Before we proceed, let's check that the country names in the World Cup results match those in the ratings and rankings tables.

In [51]:
# Check that every World Cup team has, under exactly the same spelling, a
# rating and a ranking (the reverse direction of the earlier check).
# Explicit membership tests are used instead of assert + bare except, so a
# genuine error (e.g. a missing column) is raised rather than being reported
# as a missing team.
wc_teams = results_wc.Team.unique()
rated_wc_names = set(ratings_teams_wc)
ranked_names = set(fifa_rankings.country_full.unique())

for t in wc_teams:
    if t not in rated_wc_names:
        print('Cannot Find in Ratings {}'.format(t))

for t in wc_teams:
    if t not in ranked_names:
        print('Cannot Find in Rankings {}'.format(t))

Cannot Find in Ratings IR Iran
Cannot Find in Rankings IR Iran


In [52]:
# Use the same spelling for Iran as the ratings and rankings tables.
results_wc = results_wc.replace({'IR Iran': 'Iran'})


In [53]:
# # Repeat the same GDP calculation for Test set

# gdp_col_test = []
# for col1,col2 in zip(results_wc['Team'].items(),results_wc['Opponent'].items()):
#     team1 = col1[1]
#     team2 = col2[1]   
#     if country_to_continent[team1] != country_to_continent[team2]:
#         gdp_col_test.append(0)
#     elif gdp_countries[team1] > gdp_countries[team2]:
#         gdp_col_test.append(1)
#     else:
#         gdp_col_test.append(-1)
    

In [54]:
# Repeat the same GDP calculation for the test set: 0 when the teams are on
# different continents, otherwise the ratio of Team GDP to Opponent GDP.

gdp_col_test = []
for team1, team2 in zip(results_wc['Team'], results_wc['Opponent']):
    if country_to_continent[team1] == country_to_continent[team2]:
        gdp_value = float(gdp_countries[team1]) / gdp_countries[team2]
    else:
        gdp_value = 0
    gdp_col_test.append(gdp_value)
    

In [55]:
len(gdp_col_test)


128

In [56]:
# Keep the group/round label aside, then reduce the World Cup table to the
# two team names and the encoded outcome (0 = loss, 1 = win, 2 = anything else).
tournament_round = results_wc['Group']
outcome_codes = {'L': 0, 'W': 1}
results_wc = results_wc.loc[:, ['Team', 'Opponent', 'WDL']]
results_wc['home_win'] = [outcome_codes.get(score, 2) for score in results_wc['WDL']]
results_wc = results_wc.drop(columns='WDL')
results_wc.tail()



Unnamed: 0,Team,Opponent,home_win
123,England,Croatia,0
124,Belgium,England,1
125,England,Belgium,0
126,France,Croatia,1
127,Croatia,France,0


Now we can match the ratings and rankings data to our test dataset as well. We note that there might be some value in using the updated FIFA ratings and rankings at each time of the match. However, for simplicity, let's just assume that teams have the same rating and rankings throughout the tournament. We will take the rating and rankings at the start of the tournament.

In [57]:
# Use the latest ranking published strictly before the World Cup start date.
wc_ranking_date = get_latest_date(pd.to_datetime(wc_start), fifa_rankings.rank_date)
rankings_wc = fifa_rankings[fifa_rankings.rank_date == wc_ranking_date]

# Left merges keep every World Cup row; '_x' columns describe Team and
# '_y' columns describe Opponent.
results_rankings_wc1 = pd.merge(results_wc, rankings_wc, how='left',
                                left_on='Team', right_on='country_full')
results_rankings_wc2 = pd.merge(results_rankings_wc1, rankings_wc, how='left',
                                left_on='Opponent', right_on='country_full')
results_rankings_wc2.columns


Index(['Team', 'Opponent', 'home_win', 'rank_x', 'country_full_x',
       'country_abrv_x', 'total_points_x', 'previous_points_x',
       'rank_change_x', 'cur_year_avg_x', 'cur_year_avg_weighted_x',
       'last_year_avg_x', 'last_year_avg_weighted_x', 'two_year_ago_avg_x',
       'two_year_ago_weighted_x', 'three_year_ago_avg_x',
       'three_year_ago_weighted_x', 'confederation_x', 'rank_date_x', 'rank_y',
       'country_full_y', 'country_abrv_y', 'total_points_y',
       'previous_points_y', 'rank_change_y', 'cur_year_avg_y',
       'cur_year_avg_weighted_y', 'last_year_avg_y',
       'last_year_avg_weighted_y', 'two_year_ago_avg_y',
       'two_year_ago_weighted_y', 'three_year_ago_avg_y',
       'three_year_ago_weighted_y', 'confederation_y', 'rank_date_y'],
      dtype='object')

In [58]:
# Strip identifier/metadata columns that are not model inputs, then sort the remaining
# columns alphabetically so each statistic's _x and _y columns end up side by side
results_rankings_wc3 = (
    results_rankings_wc2
    .drop(columns=['Team', 'Opponent', 'country_full_x', 'rank_date_y', 'country_full_y',
                   'rank_date_x', 'confederation_x', 'confederation_y',
                   'country_abrv_x', 'country_abrv_y'])
    .sort_index(axis=1)
)
results_rankings_wc3.head()

Unnamed: 0,cur_year_avg_weighted_x,cur_year_avg_weighted_y,cur_year_avg_x,cur_year_avg_y,home_win,last_year_avg_weighted_x,last_year_avg_weighted_y,last_year_avg_x,last_year_avg_y,previous_points_x,...,three_year_ago_avg_x,three_year_ago_avg_y,three_year_ago_weighted_x,three_year_ago_weighted_y,total_points_x,total_points_y,two_year_ago_avg_x,two_year_ago_avg_y,two_year_ago_weighted_x,two_year_ago_weighted_y
0,166.07,143.89,166.07,143.89,1,108.09,180.1,216.18,360.2,493,...,368.5,135.09,73.7,27.02,456.53,465.28,362.25,380.91,108.68,114.27
1,143.89,166.07,143.89,166.07,0,180.1,108.09,360.2,216.18,462,...,135.09,368.5,27.02,73.7,465.28,456.53,380.91,362.25,114.27,108.68
2,208.27,486.28,208.27,486.28,0,291.04,243.84,582.07,487.69,636,...,253.18,599.26,50.64,119.85,649.43,1018.41,331.62,561.47,99.48,168.44
3,486.28,208.27,486.28,208.27,1,243.84,291.04,487.69,582.07,976,...,599.26,253.18,119.85,50.64,1018.41,649.43,561.47,331.62,168.44,99.48
4,369.1,290.29,369.1,290.29,0,172.65,246.02,345.29,492.04,681,...,170.43,398.22,34.09,79.64,685.86,708.35,366.78,308.01,110.03,92.4


In [59]:
# Column-wise differences: every column becomes (itself - its left neighbour)
diff_rankings_wc = results_rankings_wc3.drop(columns='home_win').diff(axis=1)
ncol_wc = diff_rankings_wc.shape[1]

diff_rankings_wc = (
    diff_rankings_wc
    # positions 1, 3, 5, ... are the _y columns, which now hold (_y - _x)
    .iloc[:, 1:ncol_wc:2]
    # flip the sign so each statistic reads (_x - _y), i.e. from the home team's perspective
    .mul(-1)
    # replace the two-character suffix with '_diff'
    .rename(columns=lambda name: name[:-2] + '_diff')
)
diff_rankings_wc.head()

Unnamed: 0,cur_year_avg_weighted_diff,cur_year_avg_diff,last_year_avg_weighted_diff,last_year_avg_diff,previous_points_diff,rank_change_diff,rank_diff,three_year_ago_avg_diff,three_year_ago_weighted_diff,total_points_diff,two_year_ago_avg_diff,two_year_ago_weighted_diff
0,22.18,22.18,-72.01,-144.02,31.0,-4.0,3.0,233.41,46.68,-8.75,-18.66,-5.59
1,-22.18,-22.18,72.01,144.02,-31.0,4.0,-3.0,-233.41,-46.68,8.75,18.66,5.59
2,-278.01,-278.01,47.2,94.38,-340.0,-2.0,31.0,-346.08,-69.21,-368.98,-229.85,-68.96
3,278.01,278.01,-47.2,-94.38,340.0,2.0,-31.0,346.08,69.21,368.98,229.85,68.96
4,78.81,78.81,-73.37,-146.75,-46.0,2.0,4.0,-227.79,-45.55,-22.49,58.77,17.63


In [60]:
# Ratings rows dated exactly at the start of the World Cup
ratings_wc_start = ratings_wc.loc[ratings_wc['date'].eq(wc_start)]

# Left-join those ratings onto the results, first for Team and then for Opponent
results_ratings_wc1 = pd.merge(results_wc, ratings_wc_start, how='left',
                               left_on='Team', right_on='team')
results_ratings_wc2 = pd.merge(results_ratings_wc1, ratings_wc_start, how='left',
                               left_on='Opponent', right_on='team')


In [61]:
# Remove the team-name and date columns, then sort columns alphabetically so that
# each rating's _x and _y columns are adjacent
results_ratings_wc3 = (
    results_ratings_wc2
    .drop(columns=['Team', 'Opponent', 'team_x', 'team_y', 'date_y', 'date_x'])
    .sort_index(axis=1)
)
results_ratings_wc3.head()

Unnamed: 0,attack_x,attack_y,bup_dribbling_x,bup_dribbling_y,bup_passing_x,bup_passing_y,bup_speed_x,bup_speed_y,cc_crossing_x,cc_crossing_y,...,overall_x,overall_y,prestige_x,prestige_y,start_age_x,start_age_y,value_euros_millions_x,value_euros_millions_y,wage_euros_thousands_x,wage_euros_thousands_y
0,80.0,71.0,77.0,68.0,49.0,66.0,50.0,69.0,37.0,48.0,...,79.0,72.0,6.0,4.0,27.82,28.0,0.0,0.0,0.0,0.0
1,71.0,80.0,68.0,77.0,66.0,49.0,69.0,50.0,48.0,37.0,...,72.0,79.0,4.0,6.0,28.0,27.82,0.0,0.0,0.0,0.0
2,72.0,86.0,34.0,42.0,49.0,36.0,52.0,38.0,64.0,43.0,...,76.0,80.0,5.0,7.0,27.64,26.09,0.0,0.0,0.0,0.0
3,86.0,72.0,42.0,34.0,36.0,49.0,38.0,52.0,43.0,64.0,...,80.0,76.0,7.0,5.0,26.09,27.64,0.0,0.0,0.0,0.0
4,72.0,79.0,52.0,67.0,38.0,69.0,38.0,69.0,58.0,37.0,...,76.0,74.0,3.0,3.0,26.55,26.27,0.0,0.0,0.0,0.0


In [62]:
# Target column on its own, kept as a one-column DataFrame
ratings_base_wc = results_ratings_wc3[['home_win']]
ratings_base_wc.head()

Unnamed: 0,home_win
0,1
1,0
2,0
3,1
4,0


In [63]:
# Column-wise differences: every column becomes (itself - its left neighbour)
diff_ratings_wc = results_ratings_wc3.drop(columns='home_win').diff(axis=1)
ncol_wc = diff_ratings_wc.shape[1]

diff_ratings_wc = (
    diff_ratings_wc
    # positions 1, 3, 5, ... are the _y columns, which now hold (_y - _x)
    .iloc[:, 1:ncol_wc:2]
    # flip the sign so each statistic reads (_x - _y), i.e. from the home team's perspective
    .mul(-1)
    # replace the two-character suffix with '_diff'
    .rename(columns=lambda name: name[:-2] + '_diff')
    # cross terms: one side's attack measured against the other side's defence
    .assign(
        attack_home_defence_away_diff=results_ratings_wc3['attack_x'] - results_ratings_wc3['defence_y'],
        attack_away_defence_home_diff=results_ratings_wc3['attack_y'] - results_ratings_wc3['defence_x'],
    )
)

diff_ratings_wc.head()

Unnamed: 0,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,d_width_diff,...,goalkeeeper_overall_diff,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff
0,9.0,9.0,-17.0,-19.0,-11.0,-9.0,-27.0,1.0,-0.0,-17.0,...,11.0,-0.0,6.0,7.0,2.0,-0.18,-0.0,-0.0,9.0,-6.0
1,-9.0,-9.0,17.0,19.0,11.0,9.0,27.0,-1.0,-0.0,17.0,...,-11.0,-0.0,-6.0,-7.0,-2.0,0.18,-0.0,-0.0,-6.0,9.0
2,-14.0,-8.0,13.0,14.0,21.0,25.0,-12.0,-9.0,-0.0,-30.0,...,-15.0,-0.425,-0.0,-4.0,-2.0,1.55,-0.0,-0.0,-7.0,12.0
3,14.0,8.0,-13.0,-14.0,-21.0,-25.0,12.0,9.0,-0.0,30.0,...,15.0,0.425,-0.0,4.0,2.0,-1.55,-0.0,-0.0,12.0,-7.0
4,-7.0,-15.0,-31.0,-31.0,21.0,-9.0,-9.0,9.0,17.0,46.0,...,-3.0,-0.175,6.0,2.0,-0.0,0.28,-0.0,-0.0,2.0,2.0


In [64]:
# Assemble the test frame side by side (aligned on the row index):
# the home_win label, the rating differences and the ranking differences
test = pd.concat([ratings_base_wc, diff_ratings_wc, diff_rankings_wc], axis=1)

In [65]:
# Number of columns in test vs. train
test.shape[1], train.shape[1]


(35, 36)

In [66]:
# Attach the GDP feature as plain numeric values (the alternative of casting it to
# 'category' is not used). Assignment aligns the Series to test by index label, so this
# presumes test carries a default 0..n-1 index -- TODO confirm.
test['gdp_diff'] = pd.Series(gdp_col_test)

In [67]:
# Put the columns of both frames into the same alphabetical order
test = test.sort_index(axis=1)
train = train.sort_index(axis=1)

In [68]:
# Show the training columns for a visual comparison with the test columns
train.columns

Index(['attack_away_defence_home_diff', 'attack_diff',
       'attack_home_defence_away_diff', 'bup_dribbling_diff',
       'bup_passing_diff', 'bup_speed_diff', 'cc_crossing_diff',
       'cc_passing_diff', 'cc_shooting_diff', 'cur_year_avg_diff',
       'cur_year_avg_weighted_diff', 'd_aggresion_diff', 'd_pressure_diff',
       'd_width_diff', 'defence_diff', 'full_age_diff', 'gdp_diff',
       'goalkeeeper_overall_diff', 'growth_diff', 'home_win',
       'last_year_avg_diff', 'last_year_avg_weighted_diff', 'midfield_diff',
       'overall_diff', 'prestige_diff', 'previous_points_diff',
       'rank_change_diff', 'rank_diff', 'start_age_diff',
       'three_year_ago_avg_diff', 'three_year_ago_weighted_diff',
       'total_points_diff', 'two_year_ago_avg_diff',
       'two_year_ago_weighted_diff', 'value_euros_millions_diff',
       'wage_euros_thousands_diff'],
      dtype='object')

In [69]:
# Show the test columns for a visual comparison with the training columns
test.columns

Index(['attack_away_defence_home_diff', 'attack_diff',
       'attack_home_defence_away_diff', 'bup_dribbling_diff',
       'bup_passing_diff', 'bup_speed_diff', 'cc_crossing_diff',
       'cc_passing_diff', 'cc_shooting_diff', 'cur_year_avg_diff',
       'cur_year_avg_weighted_diff', 'd_aggresion_diff', 'd_pressure_diff',
       'd_width_diff', 'defence_diff', 'full_age_diff', 'gdp_diff',
       'goalkeeeper_overall_diff', 'growth_diff', 'home_win',
       'last_year_avg_diff', 'last_year_avg_weighted_diff', 'midfield_diff',
       'overall_diff', 'prestige_diff', 'previous_points_diff',
       'rank_change_diff', 'rank_diff', 'start_age_diff',
       'three_year_ago_avg_diff', 'three_year_ago_weighted_diff',
       'total_points_diff', 'two_year_ago_avg_diff',
       'two_year_ago_weighted_diff', 'value_euros_millions_diff',
       'wage_euros_thousands_diff'],
      dtype='object')

In [70]:

# Guard: train and test must expose the same columns, in the same order
assert test.shape[1] == train.shape[1]
assert test.columns.tolist() == train.columns.tolist()

In [71]:
# Attach the tournament stage saved earlier from results_wc.Group (aligned by index label).
# This extra column is added only to test, after the train/test column comparison.
test['Group'] = tournament_round

In [72]:
# Each match appears twice in consecutive rows (once from each team's side), so keep
# only the rows whose index label is even and drop the odd-labelled duplicates
test = test.loc[test.index % 2 == 0]

In [73]:
# Write the final test set to CSV; index=False leaves the row index out of the file
test.to_csv("data/test_team.csv", index = False)