# Wikipedia Scraping for Country Information

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
%matplotlib inline

In [3]:
import unicodedata
from bs4 import BeautifulSoup
import requests

In [4]:
# Read the train and test match tables from the data directory.
train, test = (
    pd.read_csv('../data/train_team.csv'),
    pd.read_csv('../data/test_team.csv'),
)

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,game_date,home_team,away_team,home_score,away_score,tournament,country,neutral,overall_diff,attack_away_defence_home_diff,...,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,d_width_diff,home_win,x_rating,y_rating
0,2006-09-01 00:00:00+00:00,Denmark,Portugal,4,2,Friendly,Denmark,False,-3.0,3.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1,78.0,81.0
1,2006-10-11 00:00:00+00:00,Poland,Portugal,2,1,UEFA Euro qualification,Poland,False,-7.0,10.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1,74.0,81.0
2,2006-09-06 00:00:00+00:00,Finland,Portugal,1,1,UEFA Euro qualification,Finland,False,-7.0,9.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0,74.0,81.0
3,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,0,0,UEFA Euro qualification,Denmark,False,9.0,-11.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0,78.0,69.0
4,2006-10-11 00:00:00+00:00,Austria,Switzerland,2,1,Friendly,Austria,False,-3.0,4.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1,73.0,76.0


In [6]:
def _clean_country_name(raw_text):
    '''
    Reduce the text of one list item to a bare country name.

    Keeps the part before the first "-", cuts any "(...)" or "[...]" suffix,
    removes "*" markers and trims surrounding whitespace.
    '''
    # NOTE(review): entries appear to look like "Country - Capital"; splitting on a
    # bare "-" also cuts hyphenated country names - verify against the live page.
    name = raw_text.split("-")[0]
    # Remove secondary names and footnote markers. A cut is only applied when the
    # marker is actually present: slicing with find()'s -1 would drop the last character.
    for marker in ('(', '['):
        cut = name.find(marker)
        if cut != -1:
            name = name[:cut]
    # Strip last so that whitespace left behind by a removed "*" is trimmed as well.
    return name.replace('*', '').strip()


def get_countries_by_continent():
    '''
    Scrape wikipedia to get the name of countries grouped by the continent they belong to.

    Returns:
        (country_to_continent, continents): a dict that maps each cleaned country
        name to the text of the headline it was listed under, and the list of
        headline texts in page order.

    Raises:
        requests.HTTPError: if the page request comes back with an error status.
    '''
    url = "https://simple.wikipedia.org/wiki/List_of_countries_by_continents"
    my_page = requests.get(url, timeout=30)
    # Fail here with a clear error instead of parsing an error page.
    my_page.raise_for_status()
    soup = BeautifulSoup(my_page.content, "lxml")

    # The last two headlines are dropped - presumably non-continent sections; verify.
    headlines = soup.find_all('span', {'class': 'mw-headline'})[:-2]

    country_to_continent = {}
    continents = []
    for headline in headlines:
        continents.append(headline.text)
        # The countries of a continent are the items of the first list after its headline.
        for item in headline.find_next('ul').find_all('li'):
            country_to_continent[_clean_country_name(item.text)] = headline.text

    return country_to_continent, continents

In [7]:
# Scrape the country -> continent mapping together with the list of continent headlines.
country_to_continent,continents = get_countries_by_continent()

In [8]:
# Inspect the scraped continent headlines.
continents

['Africa',
 'Antarctica',
 'Asia',
 'Europe',
 'American',
 'North',
 'Central America and the Antilles',
 'South America',
 'Oceania',
 'Australia',
 'Australasian']

We want to ensure that the country names that we get from wikipedia properly map to the country names in the dataset. More specifically, we want to ensure that all the countries that show up in the dataset also appear in the country_to_continent object.

In [9]:
# Scraped Wikipedia name -> name used in the match dataset.
renamed_countries = {
    "Côte d'Ivoire": 'Ivory Coast',
    "United Kingdom": 'England',
    "South Korea": 'Korea Republic',
    "United States of America": 'USA',
    "Republic of Ireland": 'Ireland',
    "Eastern Iceland": 'Iceland',
}
for wiki_name, dataset_name in renamed_countries.items():
    if wiki_name in country_to_continent:
        country_to_continent[dataset_name] = country_to_continent.pop(wiki_name)
    elif dataset_name not in country_to_continent:
        # Neither spelling is present: the scrape did not produce this country.
        raise KeyError(wiki_name)
    # Otherwise the rename was already applied, so re-running this cell is harmless.

# Teams that reuse the continent of another entry.
continent_source = {
    'Northern Ireland': 'Ireland',
    'Wales': 'England',
    'Scotland': 'England',
}
for team_name, source_name in continent_source.items():
    country_to_continent[team_name] = country_to_continent[source_name]

country_to_continent['Réunion'] = 'Africa'

We run the following lines to print out all the countries that aren't mapped correctly and manually fix them with the above code. 

In [10]:
# Every country name that currently has a continent assigned.
my_set = set(country_to_continent)

In [11]:
# Every country the dataset refers to: match hosts plus both sides of each match.
# A set union is used instead of zipping the two unique() arrays, because zip()
# stops at the shorter array and would silently drop the remaining teams.
relevant_countries = (
    set(train['country'].unique())
    | set(train['home_team'].unique())
    | set(train['away_team'].unique())
)


In [12]:
# Print dataset countries that still have no continent (no output means all are mapped).
unmapped = [name for name in relevant_countries if name not in my_set]
for name in unmapped:
    print(name)

Now, all the countries are properly mapped. Next, we want to find the GDP per capita of each country for a particular year.

First, we create a mapping of country names, this time with a function so that it can easily be used inside another function.

In [13]:
# Country name as scraped from the GDP page -> name used for lookups in this notebook.
# NOTE(review): "Ireland" is stored under "Northern Ireland"; a later cell copies the
# "Northern Ireland" value back onto "Ireland" - confirm this round trip is intended.
mapping = {"Côte d'Ivoire":'Ivory Coast',"United Kingdom":"England","Korea, South":"Korea Republic",
           "United States": "USA", "Ireland": "Northern Ireland"}
           
def map_countries(cur_str, mapping):
    '''
    Translate a country name through mapping.
    Returns mapping[cur_str] when cur_str is a key of mapping, otherwise cur_str unchanged.
    '''
    return mapping[cur_str] if cur_str in mapping else cur_str
    

Since our dataset has matches starting from 2006, we want to collect GDP of countries from 2006 onwards since GDP may change over the years. We will use the year data to synchronize with match date later.

In [14]:

def _gdp_country_name(name_cell):
    '''Normalise the text of a country cell and translate it to the dataset's naming.'''
    country_name = unicodedata.normalize("NFKD", name_cell.text).strip()
    return map_countries(country_name, mapping)


def _record_gdp_cells(gdp_date_obj, country_name, gdp_cells, years):
    '''Store one country's GDP cells under the matching years, imputing blank cells.'''
    for gdp_cell, year in zip(gdp_cells, years):
        gdp_amount = gdp_cell.text.replace(",", "").strip()

        # Impute missing data (Egypt) with previous year data and the previous percentage increase.
        # This needs the two preceding years to be recorded already, else a KeyError is raised.
        if gdp_amount == "":
            prev_year = gdp_date_obj[year-1][country_name]
            prev2_year = gdp_date_obj[year-2][country_name]
            gdp_amount = prev_year * (prev_year / prev2_year)

        gdp_date_obj.setdefault(year, {})[country_name] = float(gdp_amount)


def scrape_gdp_pp_by_year():
    '''
    Scrape GDP per capita by year.

    Only countries contained in the module-level relevant_countries are kept;
    names are translated with the module-level mapping first.

    Returns:
        (gdp_date_obj, country_set):
        gdp_date_obj maps a year to a dict which maps the country name to its GDP,
        for example {2010: {'USA': 60000.0}}.
        country_set holds the countries recorded from the first (2006-2009) table so
        we can check that all relevant countries' data have been collected.

    Raises:
        requests.HTTPError: if the page request comes back with an error status.
    '''
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_past_and_projected_GDP_(nominal)_per_capita"
    my_page = requests.get(url, timeout=30)
    # Fail here with a clear error instead of indexing into an error page.
    my_page.raise_for_status()
    soup = BeautifulSoup(my_page.content, "lxml")
    sections = soup.find_all('h2')

    gdp_date_obj = {}
    country_set = set()

    # Both tables are read as a flat list of <td> cells, 11 cells per country row.
    # NOTE(review): the section indices, the triple find_next('table') and the row
    # width all depend on the page layout at scraping time - re-check if the page changed.
    cells_per_row = 11

    # The wiki page divides the GDP data for years that we want into 2 different tables
    # so the tables are processed one at a time.

    # First table: the last 4 cells of each row are matched to 2006-2009.
    first_years = [2006, 2007, 2008, 2009]
    first_cells = sections[3].find_next('table').find_next('table').find_next('table').find_all('td')
    for start in range(0, len(first_cells), cells_per_row):
        country_name = _gdp_country_name(first_cells[start])

        # Discard the country if it is not in the dataset
        if country_name not in relevant_countries:
            continue

        country_set.add(country_name)
        row_end = start + cells_per_row
        _record_gdp_cells(gdp_date_obj, country_name,
                          first_cells[row_end - len(first_years):row_end], first_years)

    # Second table: the cells between the name cell and the last cell are matched to 2010-2018.
    second_years = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
    second_cells = sections[4].find_next('table').find_next('table').find_next('table').find_all('td')
    for start in range(0, len(second_cells), cells_per_row):
        row = second_cells[start:start + cells_per_row]
        country_name = _gdp_country_name(row[0])
        if country_name not in relevant_countries:
            continue

        _record_gdp_cells(gdp_date_obj, country_name, row[1:-1], second_years)

    return gdp_date_obj, country_set

In [15]:
# Scrape the per-year GDP tables; country_set lists the countries that were recorded.
gdp_date_obj,country_set = scrape_gdp_pp_by_year()

Check that each country has a GDP value for each year

In [16]:
# Every year must cover exactly the same countries as 2006. Comparing the key sets
# (not only their sizes) also catches years with equal counts but different countries.
expected_countries = set(gdp_date_obj[2006])
for year in range(2006, 2019):
    mismatched = expected_countries ^ set(gdp_date_obj[year])
    assert not mismatched, "GDP coverage for %d differs from 2006: %s" % (year, sorted(mismatched))

In [17]:
# Dataset countries that are absent from the scraper's country_set.
unmappable_countries = {name for name in relevant_countries if name not in country_set}
unmappable_countries


{'Ireland', 'Monaco', 'Réunion', 'Scotland', 'Wales'}

Some countries just don't have GDP information recorded on Wikipedia. There is not much we can do about that in code, so we manually look up their GDP values (except for Ireland, which we simply set equal to Northern Ireland). We were not able to find yearly figures for these countries, so we use the same value for all years, which should be acceptable since these teams aren't that significant in the World Cup anyway.

In [18]:
# Hardcoded values from manual searching
gdp_pp_scotland = 43740
gdp_pp_wales = 24226
gdp_pp_monaco = 168000
gdp_pp_reunion = 23501


In [19]:
# The manually looked-up figures are reused unchanged for every year;
# Ireland takes the value stored under Northern Ireland for that year.
manual_gdp = {
    'Scotland': gdp_pp_scotland,
    'Monaco': gdp_pp_monaco,
    'Wales': gdp_pp_wales,
    'Réunion': gdp_pp_reunion,
}
for year in range(2006, 2019):
    gdp_for_year = gdp_date_obj[year]
    gdp_for_year['Ireland'] = gdp_for_year['Northern Ireland']
    gdp_for_year.update(manual_gdp)

Now we're done with the pre-processing and data collection, and it's time to synchronize with our dataset and build out our features. Since we already have a mapping of country to continent, we will also create a feature that indicates some form of home team advantage. We will give a country a home team advantage if it is either the host or from the same continent as the host (while the other team is not).

Therefore, in total, we will create 3 features: (1) Raw GDP difference, (2) Log of GDP ratio of the 2 teams within the same continent*, (3) Indicator of home team advantage.

*The GDP ratio is not symmetric around 0, since a ratio cannot produce negative values. To ensure that 0 encodes a lack of information and that team 1 and team 2 are evaluated in the same way when their roles are swapped, we induce symmetry by taking the log of the ratio.


In [20]:
# Raw gdp difference
raw_gdp_train = []
# GDP within same continent
gdp_col_train = []
# Indicator of home team advantage
home_team_train = []



for team1, team2, date, home, neutral in zip(train['home_team'], train['away_team'],train['game_date'], train['country'],train['neutral']):

    date = int(date.split('-')[0])
    if country_to_continent[team1] != country_to_continent[team2]:
        # if not from same continent, set to 0
        gdp_col_train.append(0)    
    else:
        gdp_col_train.append(np.log(gdp_date_obj[date][team1]/gdp_date_obj[date][team2]))
    
    

    if team1 == home:
        home_team_train.append(1)
    elif (neutral and country_to_continent[team1] == country_to_continent[home]
          and country_to_continent[team2] != country_to_continent[home]):
        home_team_train.append(1)
    elif team2 == home:
        home_team_train.append(-1)
    elif (neutral and country_to_continent[team1] != country_to_continent[home]
          and country_to_continent[team2] == country_to_continent[home]):
        home_team_train.append(-1)        
    else:
        home_team_train.append(0)
        
    raw_gdp_train.append(gdp_date_obj[date][team1]-gdp_date_obj[date][team2])

In [21]:
# Attach the three engineered features to the training frame.
train_features = [
    ('gdp_diff', gdp_col_train),
    ('is_home', home_team_train),
    ('raw_gdp_diff', raw_gdp_train),
]
for column, values in train_features:
    train[column] = values

In [22]:
# Number of training matches with is_home == -1 and with is_home == 0.
len(train.loc[train['is_home']==-1]),len(train.loc[train['is_home']==0])

(42, 298)

In [23]:
# Preview the training frame with the new gdp_diff, is_home and raw_gdp_diff columns.
train.head()

Unnamed: 0,game_date,home_team,away_team,home_score,away_score,tournament,country,neutral,overall_diff,attack_away_defence_home_diff,...,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,d_width_diff,home_win,x_rating,y_rating,gdp_diff,is_home,raw_gdp_diff
0,2006-09-01 00:00:00+00:00,Denmark,Portugal,4,2,Friendly,Denmark,False,-3.0,3.0,...,-0.0,-0.0,-0.0,-0.0,1,78.0,81.0,0.965957,1,32284.0
1,2006-10-11 00:00:00+00:00,Poland,Portugal,2,1,UEFA Euro qualification,Poland,False,-7.0,10.0,...,-0.0,-0.0,-0.0,-0.0,1,74.0,81.0,-0.79087,1,-10843.0
2,2006-09-06 00:00:00+00:00,Finland,Portugal,1,1,UEFA Euro qualification,Finland,False,-7.0,9.0,...,-0.0,-0.0,-0.0,-0.0,0,74.0,81.0,0.731759,1,21401.0
3,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,0,0,UEFA Euro qualification,Denmark,False,9.0,-11.0,...,-0.0,-0.0,-0.0,-0.0,0,78.0,69.0,-0.031258,1,-1655.0
4,2006-10-11 00:00:00+00:00,Austria,Switzerland,2,1,Friendly,Austria,False,-3.0,4.0,...,-0.0,-0.0,-0.0,-0.0,1,73.0,76.0,-0.356411,1,-17329.0


Now we do the same for the test set.

In [24]:
# Same three features for the test set, always using the 2018 GDP figures.
# Unlike the training loop, no neutral-venue flag is consulted here.
gdp_col_test = []
home_team_test = []
raw_gdp_test = []

test_columns = zip(test['home_team'], test['away_team'], test['country'])

for team1, team2, home in test_columns:
    continent1 = country_to_continent[team1]
    continent2 = country_to_continent[team2]
    gdp_2018 = gdp_date_obj[2018]

    if continent1 == continent2:
        gdp_col_test.append(np.log(float(gdp_2018[team1]) / gdp_2018[team2]))
    else:
        gdp_col_test.append(0)

    if team1 == home:
        advantage = 1
    else:
        host_continent = country_to_continent[home]
        team1_local = continent1 == host_continent
        team2_local = continent2 == host_continent

        if team1_local and not team2_local:
            advantage = 1
        elif team2 == home:
            advantage = -1
        elif team2_local and not team1_local:
            advantage = -1
        else:
            advantage = 0
    home_team_test.append(advantage)

    raw_gdp_test.append(gdp_2018[team1] - gdp_2018[team2])

In [25]:
# Number of test matches that received a home-advantage value.
len(home_team_test)

64

In [26]:
# Attach the three engineered features to the test frame.
test_features = [
    ('gdp_diff', gdp_col_test),
    ('is_home', home_team_test),
    ('raw_gdp_diff', raw_gdp_test),
]
for column, values in test_features:
    test[column] = values

In [27]:
# Number of test matches whose home_win value equals the is_home indicator.
len(test[['home_team','away_team','home_win','is_home']].loc[test['home_win']==test['is_home']])

26

In [28]:
# Show the outcome next to the home-advantage indicator for the test matches.
test[['home_team','away_team','home_win','is_home']]

Unnamed: 0,home_team,away_team,home_win,is_home
0,Russia,Saudi Arabia,1,1
1,Egypt,Uruguay,-1,0
2,Morocco,Iran,-1,0
3,Portugal,Spain,0,0
4,France,Australia,1,1
5,Argentina,Iceland,0,-1
6,Peru,Denmark,-1,-1
7,Croatia,Nigeria,1,1
8,Costa Rica,Serbia,-1,-1
9,Germany,Mexico,-1,1


In [29]:
# Write both frames, including the new GDP features, back to the data directory.
outputs = (
    (train, '../data/train_gdp.csv'),
    (test, '../data/test_gdp.csv'),
)
for frame, path in outputs:
    frame.to_csv(path, index=False)