# Wikipedia Scraping

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
%matplotlib inline

In [56]:
# Read the team-level training and test tables from the shared data directory.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_team.csv', '../data/test_team.csv')
]

In [57]:
# Preview the first rows of the freshly loaded training table.
train.head()

Unnamed: 0,game_date,home_team,away_team,country,neutral,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,...,growth_diff,midfield_diff,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff,rank_diff
0,2006-09-01 00:00:00+00:00,Denmark,Portugal,Denmark,False,1,-1.0,-0.0,-0.0,-0.0,...,1.0,-3.0,-3.0,-2.0,-0.27,-0.0,-0.0,-4.0,3.0,9.0
1,2006-10-11 00:00:00+00:00,Poland,Portugal,Poland,False,1,-4.0,-0.0,-0.0,-0.0,...,-0.26087,-10.0,-7.0,-4.0,1.09,-0.0,-0.0,-7.0,10.0,26.0
2,2006-09-06 00:00:00+00:00,Finland,Portugal,Finland,False,0,-6.0,-0.0,-0.0,-0.0,...,-1.26087,-13.0,-7.0,-9.0,0.91,-0.0,-0.0,-9.0,9.0,59.0
3,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,Denmark,False,0,13.0,-0.0,-0.0,-0.0,...,0.695652,10.0,9.0,8.0,-0.27,-0.0,-0.0,12.0,-11.0,-42.0
4,2006-10-11 00:00:00+00:00,Austria,Switzerland,Austria,False,1,-6.0,-0.0,-0.0,-0.0,...,-1.217391,-7.0,-3.0,-1.0,-0.73,-0.0,-0.0,-7.0,4.0,56.0


In [58]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
import re

def _clean_country_name(raw_entry):
    """Reduce the text of one list item to a bare country name.

    Keeps only the text before the first '-', cuts it at the first '(' and
    the first '[' when present, removes '*' markers and trims whitespace.

    NOTE: splitting on '-' also shortens hyphenated names
    (e.g. "Guinea-Bissau" becomes "Guinea"); kept as in the original scrape.
    """
    name = raw_entry.split("-")[0]
    # Drop secondary names "(...)" and footnote markers "[...]".
    # Each marker is only applied when it was actually found: slicing with
    # the -1 returned by str.find() would silently drop the last character.
    for marker in ("(", "["):
        cut = name.find(marker)
        if cut != -1:
            name = name[:cut]
    return name.replace('*', '').strip()


def get_countries_by_continent():
    """Scrape Simple-English Wikipedia's list of countries by continent.

    Every 'mw-headline' span except the last two is treated as a continent
    heading; the items of the first <ul> following each heading are taken
    as that continent's countries.

    Returns:
        (country_to_continent, continents): a dict mapping the cleaned
        country name to the heading text, and the list of heading texts in
        page order.
    """
    url = "https://simple.wikipedia.org/wiki/List_of_countries_by_continents"
    my_page = requests.get(url)
    soup = BeautifulSoup(my_page.content, "lxml")
    headings = soup.find_all('span', {'class': 'mw-headline'})
    # The last two headlines are discarded
    # (presumably trailing non-continent sections; verify against the live page).
    headings = headings[:-2]

    country_to_continent = {}
    continents = []
    for heading in headings:
        continents.append(heading.text)
        for item in heading.find_next('ul').find_all('li'):
            country_to_continent[_clean_country_name(item.text)] = heading.text

    return country_to_continent, continents

In [59]:
# Run the continent scrape (performs a live HTTP request to Wikipedia).
country_to_continent,continents = get_countries_by_continent()

In [60]:
# Inspect the section headings that were treated as continents.
continents

['Africa',
 'Antarctica',
 'Asia',
 'Europe',
 'America',
 'North America',
 'Central America and the Antilles',
 'South America',
 'Oceania',
 'Australia',
 'Australasian']

In [61]:
# country_to_continent

In [62]:
# Re-key scraped Wikipedia names to the spellings used in the match data.
# pop() removes the Wikipedia key and raises KeyError if it is absent,
# so this cell can only be run once per scrape.
for dataset_name, wiki_name in [
    ('Ivory Coast', "Côte d'Ivoire"),
    ('England', "United Kingdom"),
    ('Korea Republic', "South Korea"),
    ('USA', "United States of America"),
    ('Ireland', "Republic of Ireland"),
]:
    country_to_continent[dataset_name] = country_to_continent.pop(wiki_name)

# Northern Ireland shares Ireland's continent.
country_to_continent['Northern Ireland'] = country_to_continent['Ireland']
country_to_continent['Iceland'] = country_to_continent.pop("Eastern Iceland")

# Wales and Scotland reuse England's continent.
for home_nation in ('Wales', 'Scotland'):
    country_to_continent[home_nation] = country_to_continent['England']

# Réunion is assigned by hand.
country_to_continent['Réunion'] = 'Africa'

In [63]:
# Collect every country name that has a continent assigned.
my_set = set(country_to_continent)

In [64]:
# my_set

In [65]:
# Every name that appears in the training data as host country, home team or
# away team. The three columns are merged independently: zipping the two
# unique() arrays would stop at the shorter one and silently drop teams.
relevant_countries = set(train.country.unique())
relevant_countries.update(train.home_team.unique())
relevant_countries.update(train.away_team.unique())


In [66]:
# Print any training-data name without a continent; no output means full coverage.
names_without_continent = [name for name in relevant_countries if name not in my_set]
for name in names_without_continent:
    print(name)

In [68]:
def scrape_gdp_pp():
    """Scrape Wikipedia's "List of countries by GDP (PPP) per capita" page.

    Only the first 'wikitable sortable' table is read. In each row after the
    first, the second cell supplies the country name (NFKD-normalised and
    stripped) and the third cell the value, parsed as an int once commas
    are removed.

    Returns:
        dict mapping country name to the integer value.
    """
    page = requests.get("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita")
    soup = BeautifulSoup(page.content, "lxml")
    first_table = soup.find_all('table', {'class': 'wikitable sortable'})[0]

    gdp_by_country = {}
    # The first row is skipped (presumably the header row).
    for row in first_table.findAll('tr')[1:]:
        cells = row.findAll('td')
        label = unicodedata.normalize("NFKD", cells[1].text).strip()
        gdp_by_country[label] = int(cells[2].text.strip().replace(',', ''))

    return gdp_by_country

In [69]:
# Names as they appear in the scraped GDP table -> names used in the match data.
mapping = {
    "Côte d'Ivoire": 'Ivory Coast',
    "United Kingdom": "England",
    "Korea, South": "Korea Republic",
    "United States": "USA",
    "Ireland": "Northern Ireland",
}


def map_countries(cur_str, mapping):
    """Return mapping[cur_str] when present, otherwise cur_str unchanged."""
    return mapping.get(cur_str, cur_str)
    

In [70]:
def scrape_gdp_pp_by_year():
    """Scrape yearly GDP-per-capita figures for the countries in the match data.

    Reads Wikipedia's "past and projected GDP (nominal) per capita" page.
    Years 2006-2009 come from the last four cells of each 11-cell row in the
    third table after the fourth <h2>; years 2010-2018 come from cells 1..9
    of each 11-cell row in the third table after the fifth <h2>.

    Names are NFKD-normalised, stripped and passed through the module-level
    `map_countries`/`mapping`; rows whose name is not in the module-level
    `relevant_countries` are skipped.

    Returns:
        (gdp_date_obj, country_set): `gdp_date_obj` maps
        year -> {country -> float}; `country_set` holds the names kept from
        the FIRST table only.

    Raises:
        KeyError: a blank cell needs imputing but the two preceding years
            are not stored yet for that country.
    """
    # Result layout: {year -> {country -> value}}
    gdp_date_obj = {}
    country_set = set()

    url = "https://en.wikipedia.org/wiki/List_of_countries_by_past_and_projected_GDP_(nominal)_per_capita"
    my_page = requests.get(url)
    soup = BeautifulSoup(my_page.content, "lxml")
    sections = soup.find_all('h2')

    def third_table_cells(heading):
        # All <td> cells of the third <table> following the heading.
        return heading.find_next('table').find_next('table').find_next('table').find_all('td')

    def ingest(cells, row_width, value_cells, years, seen=None):
        # Walk the flat cell list one fixed-width row at a time.
        for start in range(0, len(cells), row_width):
            row = cells[start:start + row_width]
            country_name = unicodedata.normalize("NFKD", row[0].text).strip()
            country_name = map_countries(country_name, mapping)
            if country_name not in relevant_countries:
                continue

            if seen is not None:
                seen.add(country_name)

            for country_gdp, year in zip(row[value_cells], years):
                yearly = gdp_date_obj.setdefault(year, {})
                gdp_amount = country_gdp.text.replace(",", "").strip()

                # Impute missing data (Egypt): previous year's value scaled by
                # the previous year's growth ratio.
                if gdp_amount == "":
                    prev_year = gdp_date_obj[year - 1][country_name]
                    prev2_year = gdp_date_obj[year - 2][country_name]
                    gdp_amount = prev_year * (prev_year / prev2_year)
                yearly[country_name] = float(gdp_amount)

    offset = 4
    interval_sz = 11

    # First table: only the trailing `offset` cells of each row are used.
    ingest(third_table_cells(sections[3]), interval_sz,
           slice(interval_sz - offset, interval_sz),
           [2006, 2007, 2008, 2009], seen=country_set)

    # Second table: everything between the name cell and the final cell.
    ingest(third_table_cells(sections[4]), interval_sz,
           slice(1, -1),
           [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

    return gdp_date_obj, country_set

In [71]:
# Run the per-year GDP scrape (performs a live HTTP request to Wikipedia).
gdp_date_obj,country_set = scrape_gdp_pp_by_year()

In [72]:
# Sanity check: each year 2006-2018 must hold as many countries as 2006 does.
keys = len(gdp_date_obj[2006])
for i in range(2006, 2019):
    assert len(gdp_date_obj[i]) == keys

In [73]:
# Names needed by the training data that the GDP scrape's first table did not provide.
unmappable_countries = {k for k in relevant_countries if k not in country_set}
unmappable_countries


{'Ireland', 'Monaco', 'Réunion', 'Scotland', 'Wales'}

In [74]:
# Hand-entered figures for teams the scrape could not supply; they are written
# into every year of gdp_date_obj in the following cell.
# NOTE(review): presumably GDP per capita in the same unit as the scraped
# table; source, currency and reference year are not recorded here. Confirm.
gdp_pp_scotland = 43740
gdp_pp_wales = 24226
gdp_pp_monaco = 168000
gdp_pp_reunion = 23501


In [75]:
# Back-fill the teams missing from the scrape. Ireland reuses the value stored
# under 'Northern Ireland'; the others get the same constant in every year.
for year in range(2006, 2019):
    yearly = gdp_date_obj[year]
    yearly['Ireland'] = yearly['Northern Ireland']
    for team, constant in (
        ('Scotland', gdp_pp_scotland),
        ('Monaco', gdp_pp_monaco),
        ('Wales', gdp_pp_wales),
        ('Réunion', gdp_pp_reunion),
    ):
        yearly[team] = constant

In [172]:
# Build three per-match feature lists for the training set, using the GDP
# figures of the year in which each game was played:
#   gdp_col_train   - log(home GDP / away GDP), or 0 when the two teams are
#                     on different continents
#   home_team_train - +1 / -1 / 0 home-advantage flag (rules below)
#   raw_gdp_train   - home GDP minus away GDP
gdp_col_train, home_team_train, raw_gdp_train = [], [], []

train_matches = zip(train['home_team'], train['away_team'], train['game_date'],
                    train['country'], train['neutral'])
for team1, team2, date, home, neutral in train_matches:
    # game_date strings start with the year, e.g. "2006-09-01 ...".
    date = int(date.split('-')[0])

    if country_to_continent[team1] == country_to_continent[team2]:
        log_ratio = np.log(gdp_date_obj[date][team1] / gdp_date_obj[date][team2])
    else:
        log_ratio = 0
    gdp_col_train.append(log_ratio)

    # +1: team1 plays in its own country, or the venue is neutral and only
    #     team1 shares the host's continent.
    # -1: the mirrored situation for team2.
    #  0: neither side has the edge.
    if team1 == home or (
            neutral and country_to_continent[team1] == country_to_continent[home]
            and country_to_continent[team2] != country_to_continent[home]):
        advantage = 1
    elif team2 == home or (
            neutral and country_to_continent[team1] != country_to_continent[home]
            and country_to_continent[team2] == country_to_continent[home]):
        advantage = -1
    else:
        advantage = 0
    home_team_train.append(advantage)

    raw_gdp_train.append(gdp_date_obj[date][team1] - gdp_date_obj[date][team2])

In [147]:
# train.loc[train['neutral']]

In [173]:
# Attach the three feature lists to the training frame as new columns.
for column_name, values in (
    ('gdp_diff', gdp_col_train),
    ('is_home', home_team_train),
    ('raw_gdp_diff', raw_gdp_train),
):
    train[column_name] = values

In [153]:
# Number of training rows with is_home == -1 and with is_home == 0, respectively.
len(train.loc[train['is_home']==-1]),len(train.loc[train['is_home']==0])

(42, 298)

In [154]:
# Preview the training table again now that the new feature columns are attached.
train.head()

Unnamed: 0,game_date,home_team,away_team,country,neutral,home_win,attack_diff,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,...,overall_diff,prestige_diff,start_age_diff,value_euros_millions_diff,wage_euros_thousands_diff,attack_home_defence_away_diff,attack_away_defence_home_diff,rank_diff,gdp_diff,is_home
0,2006-09-01 00:00:00+00:00,Denmark,Portugal,Denmark,False,1,-1.0,-0.0,-0.0,-0.0,...,-3.0,-2.0,-0.27,-0.0,-0.0,-4.0,3.0,9.0,0.965957,1
1,2006-10-11 00:00:00+00:00,Poland,Portugal,Poland,False,1,-4.0,-0.0,-0.0,-0.0,...,-7.0,-4.0,1.09,-0.0,-0.0,-7.0,10.0,26.0,-0.79087,1
2,2006-09-06 00:00:00+00:00,Finland,Portugal,Finland,False,0,-6.0,-0.0,-0.0,-0.0,...,-7.0,-9.0,0.91,-0.0,-0.0,-9.0,9.0,59.0,0.731759,1
3,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,Denmark,False,0,13.0,-0.0,-0.0,-0.0,...,9.0,8.0,-0.27,-0.0,-0.0,12.0,-11.0,-42.0,-0.031258,1
4,2006-10-11 00:00:00+00:00,Austria,Switzerland,Austria,False,1,-6.0,-0.0,-0.0,-0.0,...,-3.0,-1.0,-0.73,-0.0,-0.0,-7.0,4.0,56.0,-0.356411,1


In [171]:
# Same three features for the test set. Every match uses the 2018 GDP figures,
# and the continent-based home rule is applied without a `neutral` flag.
gdp_col_test, home_team_test, raw_gdp_test = [], [], []

test_matches = zip(test['home_team'], test['away_team'], test['country'])
for team1, team2, home in test_matches:
    if country_to_continent[team1] == country_to_continent[team2]:
        log_ratio = np.log(float(gdp_date_obj[2018][team1]) / gdp_date_obj[2018][team2])
    else:
        log_ratio = 0
    gdp_col_test.append(log_ratio)

    # +1: team1 is the host, or only team1 shares the host's continent.
    # -1: the mirrored situation for team2.
    #  0: neither side has the edge.
    if team1 == home or (
            country_to_continent[team1] == country_to_continent[home]
            and country_to_continent[team2] != country_to_continent[home]):
        advantage = 1
    elif team2 == home or (
            country_to_continent[team1] != country_to_continent[home]
            and country_to_continent[team2] == country_to_continent[home]):
        advantage = -1
    else:
        advantage = 0
    home_team_test.append(advantage)

    raw_gdp_test.append(gdp_date_obj[2018][team1] - gdp_date_obj[2018][team2])

In [165]:
# Number of test matches for which a home-advantage flag was produced.
len(home_team_test)

64

In [174]:
# Attach the three feature lists to the test frame as new columns.
for column_name, values in (
    ('gdp_diff', gdp_col_test),
    ('is_home', home_team_test),
    ('raw_gdp_diff', raw_gdp_test),
):
    test[column_name] = values

In [175]:
# Count the test rows whose home_win value equals the is_home flag.
len(test[['home_team','away_team','home_win','is_home']].loc[test['home_win']==test['is_home']])

26

In [176]:
# Show the teams side by side with the outcome and the home-advantage flag.
test[['home_team','away_team','home_win','is_home']]

Unnamed: 0,home_team,away_team,home_win,is_home
0,Russia,Saudi Arabia,1,1
1,Egypt,Uruguay,-1,0
2,Morocco,Iran,-1,0
3,Portugal,Spain,0,0
4,France,Australia,1,1
5,Argentina,Iceland,0,-1
6,Peru,Denmark,-1,-1
7,Croatia,Nigeria,1,1
8,Costa Rica,Serbia,-1,-1
9,Germany,Mexico,-1,1


In [177]:
# Persist both enriched tables without writing the pandas index.
for frame, destination in (
    (train, '../data/train_merged.csv'),
    (test, '../data/test_merged.csv'),
):
    frame.to_csv(destination, index=False)