# Scrape World Cup Squads

In [3]:
import numpy as np
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
from map_countries import map_country
%matplotlib inline 

In [15]:
def scrape_wikpedia_wc_players(year):
    """Scrape the squad tables of the Wikipedia "<year> FIFA World Cup squads" page.

    Parameters
    ----------
    year : int or str
        Tournament year; it is interpolated into the page URL and stored in
        the ``year`` column of the result.

    Returns
    -------
    pandas.DataFrame or None
        One row per player with the columns ``Player``, ``Country``, ``Pos``,
        ``Birthday``, ``Age``, ``Caps``, ``Club``, ``Club Country`` and ``year``.
        ``None`` is returned when the page does not answer with HTTP 200.
    """
    columns = ['Player', 'Country', 'Pos', 'Birthday', 'Age', 'Caps', 'Club', 'Club Country']
    url = "https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_squads".format(year)
    # Pause before every request (presumably to stay polite towards Wikipedia).
    time.sleep(5)
    my_page = requests.get(url)
    if my_page.status_code != 200:
        # Callers must be prepared for a missing result.
        return None

    soup = BeautifulSoup(my_page.content, "lxml")
    tables = soup.find_all('table', {'class': 'sortable wikitable plainrowheaders'})
    # Country names are read from the table-of-contents entries; only the first 32 are kept.
    countries = soup.find_all('li', {'class': 'toclevel-2'})[:32]

    # Collect plain records and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []
    for i, table in enumerate(tables):
        # The i-th squad table is paired with the i-th table-of-contents entry.
        country = countries[i].find('span', {'class': 'toctext'}).text
        # The first row of each table holds the column names, so skip it.
        for player in table.find_all('tr')[1:]:
            name = player.find('th').find('a').text
            td_tags = player.find_all('td')
            position = td_tags[1].find('a').text
            # Fixed-position slices of the birth-date cell text; these rely on
            # the layout of that cell staying the same.
            birthday = td_tags[2].text[2:12]
            age = td_tags[2].text[-4:-2]
            caps = td_tags[3].text.replace('\n', "")
            # Only 2018 has a goals column, so it is not kept.
            # The last cell is the club. Some players are free agents, in
            # which case the second link is missing.
            try:
                club = td_tags[-1].find_all('a')[1].text
                club_country = td_tags[-1].find('a')['title']
            except IndexError:
                club = None
                club_country = None
            records.append([name, country, position, birthday, age, caps, club, club_country])

    players_df = pd.DataFrame(records, columns=columns)
    players_df['year'] = year
    return players_df

In [16]:
# Scrape every tournament, pass both country columns through map_country
# and write one CSV per year.
years = [2010, 2014, 2018]
for year in years:
    wc_players = scrape_wikpedia_wc_players(year)
    for country_col in ('Country', 'Club Country'):
        wc_players[country_col] = wc_players[country_col].apply(map_country)
    out_path = "data/cleaned/wc_players_{}.csv".format(year)
    wc_players.to_csv(out_path, index=False)

## Map Test Data to Sofifa Dataset

In [5]:
# Reload the 2018 squad file written by the scraping loop above.
wc_players_2018 = pd.read_csv("data/cleaned/wc_players_2018.csv")

In [106]:
# Preview the first rows of the 2018 squad data.
# NOTE(review): the saved output already contains `name_converted`, which is only
# added further down, so this cell was last run out of order.
wc_players_2018.head()

Unnamed: 0,Player,Country,Pos,Birthday,Age,Caps,Club,Club Country,year,name_converted
0,Essam El-Hadary,egypt,GK,1973-01-15,45,158,Al-Taawoun,saudi arabia,2018,EElHadary
1,Ali Gabr,egypt,DF,1989-01-01,29,21,West Bromwich Albion,england,2018,AGabr
2,Ahmed Elmohamady,egypt,DF,1987-09-09,30,78,Aston Villa,england,2018,AElmohamady
3,Omar Gaber,egypt,MF,1992-01-30,26,24,Los Angeles FC,usa,2018,OGaber
4,Sam Morsy,egypt,MF,1991-09-10,26,5,Wigan Athletic,england,2018,SMorsy


In [94]:
# Load the Sofifa export without its saved index column, then pass the
# nationality through map_country, as is done for the squad country columns.
sofifa_df = pd.read_csv('data/sofifa/CompleteDataset.csv').drop(columns=['Unnamed: 0'])
sofifa_df['Nationality'] = sofifa_df['Nationality'].map(map_country)


  interactivity=interactivity, compiler=compiler, result=result)


In [107]:
# Preview the first rows of the Sofifa data.
# NOTE(review): the saved output already contains `name_converted`, which is only
# added further down, so this cell was last run out of order.
sofifa_df.head()

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,...,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST,name_converted
0,Cristiano Ronaldo,32,https://cdn.sofifa.org/48/18/players/20801.png,portugal,https://cdn.sofifa.org/flags/38.png,94,94,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,€95.5M,...,53.0,82.0,62.0,91.0,89.0,92.0,91.0,66.0,92.0,CRonaldo
1,L. Messi,30,https://cdn.sofifa.org/48/18/players/158023.png,argentina,https://cdn.sofifa.org/flags/52.png,93,93,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,€105M,...,45.0,84.0,59.0,92.0,90.0,88.0,91.0,62.0,88.0,LMessi
2,Neymar,25,https://cdn.sofifa.org/48/18/players/190871.png,brazil,https://cdn.sofifa.org/flags/54.png,92,94,Paris Saint-Germain,https://cdn.sofifa.org/24/18/teams/73.png,€123M,...,46.0,79.0,59.0,88.0,87.0,84.0,89.0,64.0,84.0,Neymar
3,L. Suárez,30,https://cdn.sofifa.org/48/18/players/176580.png,uruguay,https://cdn.sofifa.org/flags/60.png,92,92,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,€97M,...,58.0,80.0,65.0,88.0,85.0,88.0,87.0,68.0,88.0,LSuárez
4,M. Neuer,31,https://cdn.sofifa.org/48/18/players/167495.png,germany,https://cdn.sofifa.org/flags/21.png,92,92,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,€61M,...,,,,,,,,,,MNeuer


In [114]:
# Distinct raw values of the column at position 26. The output shows strings,
# some carrying a "+n" / "-n" suffix (e.g. '69+1', '71-2'), so the column is not
# purely numeric. NOTE(review): the positional index 26 is a magic number;
# selecting the column by name would be clearer.
sofifa_df.iloc[:, 26].unique()

array(['14', '15', '33', '91', '8', '86', '7', '5', '10', '12', '11', '4',
       '90', '87', '13', '9', '2', '6', '82', '1', '16', '83', '80', '85',
       '81', '78', '79', '3', '84', '77', '76', '74', '32', '75', '70',
       '17', '19', '73', '72', '69', '71', '65', '18', '71-2', '69+1',
       '20', '66', '68', '67', '24', '21', '64', '66-2', '69+2', '63',
       '60', '62', '55', '64+1', '61', '57', '59', '65+4', '23', '60+3',
       '66-1', '65+1', '58', '30', '63-1', '70+2', '62-1', '56', '51',
       '53', '64+2', '62+4', '49', '54', '61-1', '27', '47', '52', '60+1',
       '58+2', '50', '58+4', '48', '59+2', '46', '45', '44', '51+3',
       '45-1', '42', '40', '43', '41', '38'], dtype=object)

In [97]:
# Look up how Sofifa spells Essam El-Hadary: the matching row reads
# "E. El Hadary" (initial, and a space instead of the hyphen).
sofifa_df[sofifa_df.Name.str.contains('Hadary')]

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,...,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
5314,E. El Hadary,44,https://cdn.sofifa.org/48/18/players/188033.png,egypt,https://cdn.sofifa.org/flags/111.png,70,70,Al Taawoun,https://cdn.sofifa.org/24/18/teams/112393.png,€120K,...,,,,,,,,,,


In [98]:
def to_common_name(name):
    """Reduce a player name to a merge key: first initial + remaining name parts.

    Hyphens are removed and the name is split on whitespace. When more than one
    part remains, the key is the first character of the first part followed by
    all other parts joined without separator, e.g.
    ``'Essam El-Hadary' -> 'EElHadary'`` and ``'E. El Hadary' -> 'EElHadary'``.

    Parameters
    ----------
    name : str
        Player name as spelled in either data source.

    Returns
    -------
    str
        The merge key; for a single-word name, the name itself (hyphens kept)
        with surrounding whitespace stripped.
    """
    # The WC squad page hyphenates some last names ("El-Hadary") where Sofifa
    # uses a space ("El Hadary"). Dropping the hyphen and joining the parts
    # makes both spellings collapse to the same key.
    # Splitting on any whitespace (not on a single " ") keeps leading, trailing
    # or doubled spaces from producing empty parts.
    parts = name.replace("-", "").split()

    if len(parts) > 1:
        # Only the first character of the first part is used, so "Essam" and
        # "E." both contribute "E".
        return parts[0][0] + ''.join(parts[1:])

    # Only a last name / nickname is present. Return the original name, since
    # the Sofifa name and the WC squad name seem to correspond in this case.
    return name.strip()


# The cells below call this helper as `to_sofifa_name`; bind that name as well so
# the notebook runs on a fresh kernel.
to_sofifa_name = to_common_name

In [99]:
# Sanity check with the World Cup squad spelling (full first name, hyphenated last name).
to_sofifa_name('Essam El-Hadary')

'EElHadary'

In [100]:
# Sanity check with the Sofifa spelling (initial, space-separated last name).
to_sofifa_name('E. El Hadary')

'EElHadary'

In [101]:
# Derive the merge key for every World Cup squad player and store it on the frame.
wc_names_converted = wc_players_2018['Player'].map(to_sofifa_name)
wc_players_2018['name_converted'] = wc_names_converted

In [102]:
#Sofifa also inconsistent with its own convention sometimes (i.e. Cristiano Ronaldo instead of C. Ronaldo)
#So just run the algorithm on sofifa as well. 
sofifa_names_converted = sofifa_df['Name'].apply(to_sofifa_name)
sofifa_df['name_converted'] = sofifa_names_converted

In [130]:
# Left-join the squad players onto Sofifa using the derived name key plus the
# country, so every squad player is kept even without a Sofifa match.
wc_players_sofifa_merged = pd.merge(
    wc_players_2018,
    sofifa_df,
    how='left',
    left_on=['name_converted', 'Country'],
    right_on=['name_converted', 'Nationality'],
)
wc_players_sofifa_merged.head()

Unnamed: 0,Player,Country,Pos,Birthday,Age_x,Caps,Club_x,Club Country,year,name_converted,...,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
0,Essam El-Hadary,egypt,GK,1973-01-15,45,158,Al-Taawoun,saudi arabia,2018,EElHadary,...,,,,,,,,,,
1,Ali Gabr,egypt,DF,1989-01-01,29,21,West Bromwich Albion,england,2018,AGabr,...,61.0,66.0,50.0,61.0,42.0,46.0,45.0,43.0,58.0,45.0
2,Ahmed Elmohamady,egypt,DF,1987-09-09,30,78,Aston Villa,england,2018,AElmohamady,...,71.0,68.0,72.0,70.0,71.0,73.0,70.0,72.0,72.0,70.0
3,Omar Gaber,egypt,MF,1992-01-30,26,24,Los Angeles FC,usa,2018,OGaber,...,68.0,67.0,65.0,68.0,62.0,65.0,62.0,63.0,68.0,62.0
4,Sam Morsy,egypt,MF,1991-09-10,26,5,Wigan Athletic,england,2018,SMorsy,...,63.0,65.0,67.0,67.0,64.0,65.0,61.0,64.0,64.0,61.0


In [131]:
# Size of the squad data before the merge, as a baseline for the merged row count.
wc_players_2018.shape

(736, 10)

In [132]:
# Size after the left merge. More rows than in wc_players_2018 means that some
# (name key, country) pairs matched more than one Sofifa row.
wc_players_sofifa_merged.shape

(788, 84)

In [139]:
# Rows per player name after the merge. A count above 1 points to a player who
# matched several Sofifa rows (or to a name shared by several squad players).
wc_players_sofifa_merged.Player.value_counts()

Danilo                  5
Paulinho                4
Danny Rose              3
Marcelo                 3
Sebastian Larsson       3
Rodrigo                 3
Thomas Müller           3
Dejan Lovren            2
Saúl                    2
John Guidetti           2
Mansoor Al-Harbi        2
Anton Miranchuk         2
Juan Cuadrado           2
James Rodríguez         2
Jesse Lingard           2
Osama Hawsawi           2
Douglas Costa           2
Cristian Rodríguez      2
Timo Werner             2
Nicolai Jørgensen       2
Alfred Gomis            2
Milan Badelj            2
Ismaïla Sarr            2
Camilo Vargas           2
Phil Jones              2
Denis Cheryshev         2
Adrien Silva            2
Adama Mbengue           2
Lee Yong                2
Kosuke Nakamura         2
                       ..
Mohammed Al-Breik       1
Stephan Lichtsteiner    1
Ashley Young            1
Sebastian Rudy          1
Ruben Loftus-Cheek      1
Alberto Rodríguez       1
Naomichi Ueda           1
Edison Flore

In [142]:
# For players whose name contains 'Rodrigo', show the club from the squad data
# (Club_x) next to the club from Sofifa (Club_y) to compare the duplicated matches.
wc_players_sofifa_merged[wc_players_sofifa_merged.Player.str.contains('Rodrigo')][['Club_x', 'Club_y']]

Unnamed: 0,Club_x,Club_y
81,Juventus,Juventus
183,Valencia,Valencia CF
184,Valencia,Valencia CF
185,Valencia,Villarreal CF
