In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup, Comment
from pathlib import Path
%matplotlib inline

# Global plot styling: matplotlib 'fivethirtyeight' style sheet plus seaborn's
# 'notebook' scaling context.
plt.style.use('fivethirtyeight')
sns.set_context('notebook')

# Web Scraping

Data source: https://sofifa.com

In [30]:
# referenced from https://realpython.com/python-web-scraping-practical-introduction/
def simple_get(url):
    """
    Attempts to scrape the content at 'url' by making a HTTP GET request.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.

    Returns
    -------
    bytes or None
        The raw response body when `is_good_response` accepts the response
        (HTTP 200 with an HTML-like Content-Type). None when the response is
        rejected or when the request itself fails (connection error, invalid
        URL, ...); in the failure case the error is printed first.
    """
    import requests
    from contextlib import closing

    try:
        # closing() guarantees the streamed connection is released even if
        # the response is rejected before its body is read.
        with closing(requests.get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except requests.exceptions.RequestException as error:
        # RequestException is the base class of the errors requests raises;
        # the previous `except AssertionError` never matched any of them.
        print(error)
        print('Error in scraping of url')
        return None


def is_good_response(resp):
    """
    Returns True if response is some kind of HTML/XML.

    A response counts as good when its status code is 200 and its
    Content-Type header contains the substring 'html' (case-insensitive).

    Parameters
    ----------
    resp : object
        Response-like object exposing `status_code` and a mapping `headers`.

    Returns
    -------
    bool
        False for any non-200 status, for a non-HTML content type, and for a
        response that carries no Content-Type header at all.
    """
    # .get() rather than [] so a missing header is treated as "not HTML"
    # instead of raising KeyError; `or ''` also covers an explicit None value.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


In [31]:
# Find all the href attributes for each national team

def get_nation_href(date_url):
    """Collect the href of every national team listed on sofifa.

    `date_url` is the query-string segment appended to
    'https://sofifa.com/teams/national' to select a specific game update.

    Returns a dictionary mapping each team's link text to its href. Only
    anchors whose href starts with '/team/' are considered, and when a team
    name appears more than once the first href encountered is kept.
    """
    page = simple_get('https://sofifa.com/teams/national'+date_url)
    soup = BeautifulSoup(page, 'html.parser')
    team_link = re.compile("^/team/.+")

    teams_href = {}
    for anchor in soup.find_all('a', attrs={'href': team_link}):
        # setdefault leaves an already-recorded team untouched
        teams_href.setdefault(anchor.get_text(), anchor.get('href'))

    return teams_href

In [32]:
# Fetch the national-team index page and build the {team name: href} mapping.
teams_href = get_nation_href('?v=WC18&e=159126&set=true') # corresponds to FIFA WC18 Expansion Jun 16

We now have the URLs of all the national teams. Note that, as collected, these URLs link to the most recent ratings of the teams. Each team page also links to its players, so we start by collecting the URLs of all players in the national teams.

In [33]:
def get_players_href(teams_href, date_url):
    """Collect the player names and player hrefs found on each team page.

    `teams_href` maps team name -> team href (as returned by get_nation_href)
    and `date_url` is the query-string segment selecting a specific game
    update; one page is fetched per team.

    Returns a dictionary keyed by team name whose values are dictionaries with
    'href' (the original team href), 'players' (unique link texts, in page
    order) and 'players_href' (unique hrefs, in page order). The two lists are
    de-duplicated independently of each other, so they are not guaranteed to
    have the same length or to line up index-by-index.
    """
    player_link = re.compile("^/player/.+")

    teams = {}
    for team, href in teams_href.items():
        page = simple_get('https://sofifa.com'+href+date_url)
        soup = BeautifulSoup(page, 'html.parser')

        names = []
        hrefs = []
        for anchor in soup.find_all('a', attrs={'href': player_link}):
            target = anchor.get('href')
            if target not in hrefs:
                hrefs.append(target)
            label = anchor.get_text()
            if label not in names:
                names.append(label)

        teams[team] = {'href': href, 'players': names, 'players_href': hrefs}

    return teams

In [34]:
# Visit every national-team page (one request per team) and record its
# player names and player hrefs.
teams = get_players_href(teams_href, '?v=WC18&e=159126&set=true') # corresponds to FIFA WC18 Expansion Jun 16

Now that we have the URLs of all the players in the national teams, we can collect the individual player data. For now, we collect only the key summary data for each player.

In [41]:
# still does not sort teams and clubs correctly
def get_players(teams, date_url):
    """Scrape the summary data of every player in the given teams.

    Parameters
    ----------
    teams : dict
        Output of get_players_href: team name -> dict holding a
        'players_href' list of player hrefs.
    date_url : str
        Query-string segment appended to each player URL to select a specific
        game update.

    Returns
    -------
    pandas.DataFrame
        One row per player href with the columns ID, name, full_name,
        position, age, overall, potential, value, wage, club, club_rating,
        team and team_rating. overall, potential and the two ratings are
        floats; the remaining scraped fields are kept as strings.

    Notes
    -----
    One HTTP request is made per player. There is no error handling: a page
    that cannot be fetched or that lacks the expected elements raises.

    Known limitation: when a player page shows a single team link, that link
    is stored as `team` (with `club` None) even if it is actually the club.
    """
    label_class = re.compile("label.+")
    team_link = re.compile("^/team/.*")

    summary = []
    for tname, team_info in teams.items():
        for player in team_info['players_href']:
            url_player = 'https://sofifa.com'+player+date_url
            html_player = BeautifulSoup(simple_get(url_player), 'html.parser')

            h1 = html_player.find('h1')
            # The first <div> after the heading holds the full name, the
            # position spans and the age text; look it up once.
            info = h1.find_next('div')
            info_spans = info.find_all('span')

            name_id = h1.get_text()
            name = name_id.split('(')[0]
            ID = re.search(r'\((.+)\)', name_id).group(1).split(' ')[-1]
            full_name = info.next_element.strip()

            # account for multiple preferred positions; join with single
            # spaces and strip so no trailing space is left behind (the
            # previous code called .strip() but discarded its result)
            position = ' '.join(p.get_text() for p in info_spans).strip()

            age = re.search(r'Age (\d+) .+' , info_spans[-1].next_sibling).group(1)

            # account for +/- signs: ratings are read from the labelled spans
            # only, value and wage from the last two spans of the stats card
            stats = html_player.find('div', attrs={'class': 'card-body stats'})
            span = stats.find_all('span')
            span_label = stats.find_all('span', attrs={'class': label_class})

            overall = float(span_label[0].get_text())
            potential = float(span_label[1].get_text())
            value = span[-2].get_text()
            wage = span[-1].get_text()

            # account for different order of clubs and national teams
            club_nation = html_player.find_all('a', attrs={'href': team_link})
            if len(club_nation) > 1:
                # account for players who are on loan (more than two team
                # links): the second entry of interest then sits at index 2
                other = 2 if len(club_nation) > 2 else 1
                if club_nation[0].get_text() == tname:
                    t, c = 0, other
                else:
                    t, c = other, 0
                club = club_nation[c].get_text()
                club_rating = float(club_nation[c].find_next('span').get_text())
                team = club_nation[t].get_text()
                team_rating = float(club_nation[t].find_next('span').get_text())
            else:
                club = None
                club_rating = np.nan
                team = club_nation[0].get_text()
                team_rating = float(club_nation[0].find_next('span').get_text())

            summary.append([ID, name, full_name, position,
                            age, overall, potential, value,
                            wage, club, club_rating, team, team_rating])

    summary_dat = pd.DataFrame(summary, columns=['ID', 'name', 'full_name', 'position',
                                                 'age', 'overall', 'potential', 'value',
                                                 'wage', 'club', 'club_rating', 'team', 'team_rating'])

    return summary_dat
 

In [42]:
# For the players found in the FIFA WC18 Expansion Jun 16 squads, get their
# stats from the FIFA 18 Jun 14 update. This makes one HTTP request per player.
players = get_players(teams, '?v=18&e=159124&set=true') 

In [44]:
# Preview the first 40 rows of the scraped player table.
players.head(40)

Unnamed: 0,ID,name,full_name,position,age,overall,potential,value,wage,club,club_rating,team,team_rating
0,212188,T. Werner,Timo Werner,ST,21,83.0,87.0,€34.5M,€68K,,,RB Leipzig,78.0
1,176635,M. Özil,Mesut Özil,CAM RW,28,87.0,87.0,€51M,€205K,Arsenal,82.0,Germany,85.0
2,188350,M. Reus,Marco Reus,LM LW ST CAM,28,85.0,85.0,€39M,€98K,,,Borussia Dortmund,81.0
3,189596,T. Müller,Thomas Müller,CF RW CAM ST,27,86.0,86.0,€47.5M,€190K,FC Bayern München,86.0,Germany,85.0
4,179846,S. Khedira,Sami Khedira,CM CDM,30,84.0,84.0,€29M,€160K,Juventus,85.0,Germany,85.0
5,182521,T. Kroos,Toni Kroos,CM CDM,27,90.0,90.0,€79M,€340K,Real Madrid,86.0,Germany,85.0
6,208334,J. Hector,Jonas Hector,LB CDM,27,80.0,80.0,€12M,€42K,1. FC Köln,76.0,Germany,85.0
7,178603,M. Hummels,Mats Hummels (Mats Hummels),CB,28,90.0,90.0,€62.5M,€215K,FC Bayern München,86.0,Germany,85.0
8,183907,J. Boateng,Jérôme Boateng,CB,28,87.0,87.0,€41M,€165K,FC Bayern München,86.0,Germany,85.0
9,212622,J. Kimmich,Joshua Kimmich,RB CM CB CDM,22,84.0,88.0,€33.5M,€115K,FC Bayern München,86.0,Germany,85.0


In [45]:
# Persist the scraped table. Create the target directory first: to_csv does
# not create missing parent directories, and failing here would throw away
# the whole scrape.
players_csv = Path('../datasets/sofifa/sofifa_players.csv')
players_csv.parent.mkdir(parents=True, exist_ok=True)
players.to_csv(players_csv, index=False)