# Wikipedia Scraping

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
%matplotlib inline

In [2]:
train = pd.read_csv('../data/train_team.csv')
test = pd.read_csv('../data/test_team.csv')

In [3]:
train.head()

Unnamed: 0,game_date,home_team,away_team,home_score,away_score,tournament,country,neutral,overall_diff,attack_away_defence_home_diff,...,bup_dribbling_diff,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,d_width_diff,home_win
0,2006-09-01 00:00:00+00:00,Denmark,Portugal,4,2,Friendly,Denmark,False,-3.0,3.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1
1,2006-10-11 00:00:00+00:00,Poland,Portugal,2,1,UEFA Euro qualification,Poland,False,-7.0,10.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1
2,2006-09-06 00:00:00+00:00,Finland,Portugal,1,1,UEFA Euro qualification,Finland,False,-7.0,9.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0
3,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,0,0,UEFA Euro qualification,Denmark,False,9.0,-11.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0
4,2006-10-11 00:00:00+00:00,Austria,Switzerland,2,1,Friendly,Austria,False,-3.0,4.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1


In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
import re

def get_countries_by_continent():    
    url = "https://simple.wikipedia.org/wiki/List_of_countries_by_continents"
    my_page = requests.get(url)
    soup = BeautifulSoup(my_page.content, "lxml")
    continents_obj = soup.findAll('span',{'class':'mw-headline'})
    continents_obj = continents_obj[:-2]
    country_to_continent = {}
    
    continents = []
    for continent in continents_obj:
        continents.append(continent.text)
        raw_countries = continent.findNext('ul')
        raw_countries = raw_countries.findAll('li')
        for c in raw_countries:
            country_name = c.text.split("-")[0]
            # remove secondary names
            idx = country_name.find('(')
            idx2 = country_name.find('[')
            if idx != -1:
                country_name = country_name[:idx]
            if idx != -2:
                country_name = country_name[:idx2].strip()
            country_name = country_name.replace('*','')
            country_to_continent[country_name] = continent.text
            
    
    return country_to_continent,continents

In [5]:
country_to_continent,continents = get_countries_by_continent()

In [6]:
continents

['Africa',
 'Antarctica',
 'Asia',
 'Europe',
 'America',
 'North America',
 'Central America and the Antilles',
 'South America',
 'Oceania',
 'Australia',
 'Australasian']

In [7]:
country_to_continent

{'Algeria': 'Africa',
 'Angola': 'Africa',
 'Benin': 'Africa',
 'Botswana': 'Africa',
 'Burkina Faso': 'Africa',
 'Burundi': 'Africa',
 'Cameroon': 'Africa',
 'Cape Verde': 'Africa',
 'Central African Republic': 'Africa',
 'Chad': 'Africa',
 'Comoros': 'Africa',
 'Republic of the Congo': 'Africa',
 'Democratic Republic of the Congo': 'Africa',
 "Côte d'Ivoire": 'Africa',
 'Djibouti': 'Africa',
 'Egypt': 'Africa',
 'Equatorial Guinea': 'Africa',
 'Eritrea': 'Africa',
 'Ethiopia': 'Africa',
 'Gabon': 'Africa',
 'The Gambia': 'Africa',
 'Ghana': 'Africa',
 'Guinea': 'Africa',
 'Guine': 'Africa',
 'Kenya': 'Africa',
 'Lesotho': 'Africa',
 'Liberia': 'Africa',
 'Libya': 'Africa',
 'Madagascar': 'Africa',
 'Malawi': 'Africa',
 'Mali': 'Africa',
 'Mauritania': 'Africa',
 'Mauritius': 'Africa',
 'Morocco': 'Africa',
 'Mozambique': 'Africa',
 'Namibia': 'Africa',
 'Niger': 'Africa',
 'Nigeria': 'Africa',
 'Rwanda': 'Africa',
 'São Tomé and Príncipe': 'Africa',
 'Senegal': 'Africa',
 'Seychelles

In [8]:
country_to_continent['Ivory Coast'] = country_to_continent.pop("Côte d'Ivoire")
country_to_continent['England'] = country_to_continent.pop("United Kingdom")
country_to_continent['Korea Republic'] = country_to_continent.pop("South Korea")
country_to_continent['USA'] = country_to_continent.pop("United States of America")
country_to_continent['Ireland'] = country_to_continent.pop("Republic of Ireland")
country_to_continent['Northern Ireland'] = country_to_continent['Ireland']
country_to_continent['Iceland'] = country_to_continent.pop("Eastern Iceland")
country_to_continent['Wales'] = country_to_continent['England']
country_to_continent['Scotland'] = country_to_continent['England']

In [9]:
#Get set of countries
my_set = set()
for k,_ in country_to_continent.items():
    my_set.add(k)

In [10]:
my_set

{'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australi',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czech Republic',
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'East Timor',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'England',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Federated States of Micronesia',
 'Fiji',
 'Finland',
 'Flore',
 'France',
 'French Guiana',
 'Gabon',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 '

In [11]:
#Ensure all countries properly mapped
for c in train.country.unique():
    if c not in my_set:
        print(c)
        
for c in train.home_team.unique():
    if c not in my_set:
        print(c)

for c in train.away_team.unique():
    if c not in my_set:
        print(c)

Réunion


In [12]:
countries = {}
for k,v in country_to_continent.items():
    if k in train.home_team.unique():
         countries[k] = v

In [13]:
def scrape_gdp_pp():
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita"
    my_page = requests.get(url)
    soup = BeautifulSoup(my_page.content, "lxml")
    tables = soup.find_all('table',{'class':'wikitable sortable'})
    country_rows = tables[0].findAll('tr')
    countries_data = {}

    for country in country_rows[1:]:    
        country_data = country.findAll('td')
        name = country_data[1].text
        country_name = unicodedata.normalize("NFKD", name)
        country_name = country_name.strip()
        gdp_pp = int(country_data[2].text.strip().replace(',',''))

        countries_data[country_name] = gdp_pp

    return countries_data

In [14]:
gdp_countries = scrape_gdp_pp()
gdp_countries['Ivory Coast'] = gdp_countries.pop("Côte d'Ivoire")
gdp_countries['England'] = gdp_countries.pop("United Kingdom")
gdp_countries['Korea Republic'] = gdp_countries.pop("South Korea")
gdp_countries['USA'] = gdp_countries.pop("United States")
gdp_countries['Northern Ireland'] = gdp_countries['Ireland']
gdp_countries['Wales'] = gdp_countries['England']
gdp_countries['Scotland'] = gdp_countries['England']


In [15]:
countryset = set(gdp_countries.keys())
for k,v in countries.items():
    if k not in countryset:
        print(k)

In [16]:
for country,_ in gdp_countries.items():
    if country not in country_to_continent:
        print(country)

Macau
Bahamas, The
World[n 1]
Congo, Rep.
Timor-Leste
Micronesia
São Tomé and Príncipe
Guinea-Bissau
Gambia, The
Congo, Dem. Rep.


In [17]:
gdp_col = []
for col1,col2 in zip(train['home_team'].items(), train['away_team'].items()):
    team1 = col1[1]
    team2 = col2[1]   
    if country_to_continent[team1] != country_to_continent[team2]:
        gdp_col.append(0)
    else:
        gdp_col.append(float(gdp_countries[team1])/gdp_countries[team2])
train['gdp_diff'] = gdp_col

In [18]:
train.head()

Unnamed: 0,game_date,home_team,away_team,home_score,away_score,tournament,country,neutral,overall_diff,attack_away_defence_home_diff,...,bup_passing_diff,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,d_width_diff,home_win,gdp_diff
0,2006-09-01 00:00:00+00:00,Denmark,Portugal,4,2,Friendly,Denmark,False,-3.0,3.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1,1.639666
1,2006-10-11 00:00:00+00:00,Poland,Portugal,2,1,UEFA Euro qualification,Poland,False,-7.0,10.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1,0.96672
2,2006-09-06 00:00:00+00:00,Finland,Portugal,1,1,UEFA Euro qualification,Finland,False,-7.0,9.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0,1.455813
3,2006-10-07 00:00:00+00:00,Denmark,Northern Ireland,0,0,UEFA Euro qualification,Denmark,False,9.0,-11.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0,0.683074
4,2006-10-11 00:00:00+00:00,Austria,Switzerland,2,1,Friendly,Austria,False,-3.0,4.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,1,0.802591


In [19]:
gdp_col = []
for col1,col2 in zip(test['home_team'].items(), test['away_team'].items()):
    team1 = col1[1]
    team2 = col2[1]   
    if country_to_continent[team1] != country_to_continent[team2]:
        gdp_col.append(0)
    else:
        gdp_col.append(float(gdp_countries[team1])/gdp_countries[team2])
test['gdp_diff'] = gdp_col

In [20]:
test.head()

Unnamed: 0,game_date,home_team,away_team,home_score,away_score,tournament,country,neutral,overall_diff,attack_away_defence_home_diff,...,bup_speed_diff,cc_crossing_diff,cc_passing_diff,cc_shooting_diff,d_aggresion_diff,d_pressure_diff,d_width_diff,home_win,Group,gdp_diff
0,,Russia,Saudi Arabia,5,0,FIFA World Cup,Russia,False,7.0,-6.0,...,-19.0,-11.0,-9.0,-27.0,1.0,-0.0,-17.0,1,A,0.0
1,,Egypt,Uruguay,0,1,FIFA World Cup,Russia,True,-4.0,12.0,...,14.0,21.0,25.0,-12.0,-9.0,-0.0,-30.0,-1,A,0.0
2,,Morocco,Iran,0,1,FIFA World Cup,Russia,True,2.0,2.0,...,-31.0,21.0,-9.0,-9.0,9.0,17.0,46.0,-1,B,0.0
3,,Portugal,Spain,3,3,FIFA World Cup,Russia,True,-1.0,3.0,...,19.0,26.0,27.0,42.0,-4.0,-9.0,-14.0,0,B,0.792696
4,,France,Australia,2,1,FIFA World Cup,Russia,True,13.0,-12.0,...,-14.0,7.0,-28.0,-31.0,-5.0,4.0,1.0,1,C,0.0


In [21]:
train.to_csv('../data/train_merged.csv', index = False)
test.to_csv('../data/test_merged.csv', index = False)