In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import time
from bs4 import BeautifulSoup, Comment
from pathlib import Path
from tqdm import tqdm
%matplotlib inline

plt.style.use('fivethirtyeight')
sns.set_context('notebook')

# Web Scraping

https://sofifa.com

In [2]:
# referenced from https://realpython.com/python-web-scraping-practical-introduction/
def simple_get(url):
    """
    Attempts to scrape the content at 'url' by making a HTTP GET request. 
    If the content-type of the response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    import requests
    from contextlib import closing
    
    try:
        with closing(requests.get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except AssertionError as error:
        print(error)
        print('Error in scraping of url')


def is_good_response(resp):
    """
    Returns True if response is some kind of HTML/XML
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
           and content_type is not None
           and content_type.find('html') > -1)


We note that the information on the website is categorized by FIFA update dates. For each FIFA update date, we have different national teams and national players with different statistics. We note that there does not seem to be a strict order to the update dates. Some months have several updates, while others have only 1 update. We have all the data starting from August 2006.

In [3]:
def get_date_href():
    url = 'https://sofifa.com/'
    html = BeautifulSoup(simple_get(url), 'html.parser')
    
    dates_href = {}
    
    # for each month
    for m in html.find_all('div', attrs={'class': 'card-title h5'})[:-8]: # note the last 8 items are not dates
        month = m.get_text()
        
        # for each day of the month
        for d in m.find_next('div').find_all('a'):
            day = d.get_text()
            date = day+' '+month
            href = d.get('href')
            
            dates_href[date] = href
    
    return dates_href

In [4]:
dates_href = get_date_href()

In [5]:
# Find all the href attributes for each national team

def get_nation_href(date_url):
    """Given a date url corresponding to a specific update of FIFA 18, returns a dictionary 
    containing the teams and their hrefs"""
    url = 'https://sofifa.com/teams/national'+date_url   
    html = BeautifulSoup(simple_get(url), 'html.parser')

    teams_href = {}

    for link in html.find_all('a', attrs={'href': re.compile("^/team/.+")}):
        if link.get_text() not in teams_href:
            teams_href[link.get_text()] = link.get('href')
            
    return teams_href

In [6]:
teams_href = get_nation_href('?v=WC18&e=159126&set=true') # corresponds to FIFA WC18 Expansion Jun 16
len(teams_href)

49

We note that at different dates, there may be different teams represented in the game. As such, we loop through all the possible dates to find the full list of all teams that have ever been represented in the game.

In [7]:
full_teams_href = {}

for d, dhref in dates_href.items():    
    teams_href = get_nation_href(dhref)
    # merges dictionaries
    full_teams_href = {**teams_href, **full_teams_href}

In [8]:
len(full_teams_href)

62

We now have the urls of all 62 national teams that has ever appeared in the game. We note that the current urls link to the most recent ratings of the teams. We can find the urls of all players in the national teams from this page as well. We now wish to collect team specific data for all these teams across all the different dates.

In [65]:
url_team = 'https://sofifa.com/team/1337/germany/'
html_team = BeautifulSoup(simple_get(url_team), 'html.parser')
html_team.find(string='Whole Team Average Age').next_element

'25.35'

In [22]:
team_stats = {}
for date, dhref in tqdm.tqdm(dates_href.items()):
    for team, thref in full_teams_href.items():
        url_team = 'https://sofifa.com'+thref+dhref
        html_team = BeautifulSoup(simple_get(url_team), 'html.parser')
        
        span = html_team.find('div', attrs={'class': 'card-body stats'}).find_all('span')
        span_label = (html_team
                      .find('div', attrs={'class': 'card-body stats'})
                      .find_all('span', attrs={'class': re.compile("label.+")}))
        
        overall = float(span_label[0].get_text())
        attack = float(span_label[1].get_text())
        midfield = float(span_label[2].get_text())
        defence = float(span_label[3].get_text())

        prestige = float(html_team.find(string='International Prestige').find_next('span').get_text())
        start_age = float(html_team.find(string='Starting XI Average Age').next_element)
        full_age = float(html_team.find(string='Whole Team Average Age').next_element)
        
        team_stats[(team, date)] = {
            'overall': overall,
            'attack': attack,
            'midfield': midfield,
            'defence': defence,
            'prestige': prestige,
            'start_age': start_age,
            'full_age': full_age
        }
    


  0%|                                                                                          | 0/432 [00:00<?, ?it/s]
  0%|▏                                                                              | 1/432 [01:27<10:27:22, 87.34s/it]
  0%|▎                                                                              | 2/432 [02:54<10:25:28, 87.28s/it]
  1%|▌                                                                              | 3/432 [04:24<10:29:02, 87.98s/it]
  1%|▋                                                                              | 4/432 [05:52<10:29:12, 88.21s/it]
  1%|▉                                                                              | 5/432 [07:16<10:18:46, 86.95s/it]
  1%|█                                                                              | 6/432 [08:48<10:26:57, 88.30s/it]
  2%|█▎                                                                             | 7/432 [10:22<10:38:08, 90.09s/it]
  2%|█▍                                

 16%|████████████                                                                 | 68/432 [1:26:48<7:36:31, 75.25s/it]
 16%|████████████▎                                                                | 69/432 [1:27:57<7:24:34, 73.48s/it]
 16%|████████████▍                                                                | 70/432 [1:29:03<7:09:42, 71.22s/it]
 16%|████████████▋                                                                | 71/432 [1:30:12<7:04:18, 70.52s/it]
 17%|████████████▊                                                                | 72/432 [1:31:19<6:56:42, 69.45s/it]
 17%|█████████████                                                                | 73/432 [1:32:25<6:49:19, 68.41s/it]
 17%|█████████████▏                                                               | 74/432 [1:33:43<7:04:40, 71.17s/it]
 17%|█████████████▎                                                               | 75/432 [1:34:57<7:08:47, 72.07s/it]
 18%|█████████████▌                     

 31%|███████████████████████▉                                                    | 136/432 [2:48:08<6:48:34, 82.82s/it]
 32%|████████████████████████                                                    | 137/432 [2:49:19<6:30:13, 79.37s/it]
 32%|████████████████████████▎                                                   | 138/432 [2:50:44<6:37:09, 81.05s/it]
 32%|████████████████████████▍                                                   | 139/432 [2:52:06<6:37:09, 81.33s/it]
 32%|████████████████████████▋                                                   | 140/432 [2:53:14<6:16:03, 77.27s/it]
 33%|████████████████████████▊                                                   | 141/432 [2:54:19<5:56:29, 73.50s/it]
 33%|████████████████████████▉                                                   | 142/432 [2:55:32<5:55:48, 73.62s/it]
 33%|█████████████████████████▏                                                  | 143/432 [2:56:40<5:46:26, 71.92s/it]
 33%|█████████████████████████▎         

 47%|███████████████████████████████████▉                                        | 204/432 [4:13:37<4:47:31, 75.67s/it]
 47%|████████████████████████████████████                                        | 205/432 [4:14:49<4:42:17, 74.61s/it]
 48%|████████████████████████████████████▏                                       | 206/432 [4:16:04<4:42:08, 74.90s/it]
 48%|████████████████████████████████████▍                                       | 207/432 [4:17:18<4:39:05, 74.42s/it]
 48%|████████████████████████████████████▌                                       | 208/432 [4:18:31<4:36:18, 74.01s/it]
 48%|████████████████████████████████████▊                                       | 209/432 [4:19:44<4:33:55, 73.70s/it]
 49%|████████████████████████████████████▉                                       | 210/432 [4:21:05<4:40:36, 75.84s/it]
 49%|█████████████████████████████████████                                       | 211/432 [4:22:08<4:25:32, 72.09s/it]
 49%|███████████████████████████████████

 63%|███████████████████████████████████████████████▊                            | 272/432 [5:27:21<2:57:59, 66.75s/it]
 63%|████████████████████████████████████████████████                            | 273/432 [5:28:42<3:08:39, 71.19s/it]
 63%|████████████████████████████████████████████████▏                           | 274/432 [5:29:57<3:10:13, 72.24s/it]
 64%|████████████████████████████████████████████████▍                           | 275/432 [5:31:11<3:10:28, 72.79s/it]
 64%|████████████████████████████████████████████████▌                           | 276/432 [5:32:55<3:33:31, 82.12s/it]
 64%|████████████████████████████████████████████████▋                           | 277/432 [5:34:13<3:29:12, 80.99s/it]
 64%|████████████████████████████████████████████████▉                           | 278/432 [5:35:30<3:24:46, 79.78s/it]
 65%|█████████████████████████████████████████████████                           | 279/432 [5:36:46<3:20:02, 78.45s/it]
 65%|███████████████████████████████████

 79%|███████████████████████████████████████████████████████████▊                | 340/432 [6:41:18<1:26:37, 56.50s/it]
 79%|███████████████████████████████████████████████████████████▉                | 341/432 [6:42:17<1:26:45, 57.21s/it]
 79%|████████████████████████████████████████████████████████████▏               | 342/432 [6:43:12<1:24:50, 56.56s/it]
 79%|████████████████████████████████████████████████████████████▎               | 343/432 [6:44:09<1:24:01, 56.64s/it]
 80%|████████████████████████████████████████████████████████████▌               | 344/432 [6:45:03<1:22:06, 55.99s/it]
 80%|████████████████████████████████████████████████████████████▋               | 345/432 [6:46:00<1:21:29, 56.20s/it]
 80%|████████████████████████████████████████████████████████████▊               | 346/432 [6:46:59<1:21:39, 56.97s/it]
 80%|█████████████████████████████████████████████████████████████               | 347/432 [6:48:07<1:25:30, 60.36s/it]
 81%|███████████████████████████████████

 94%|█████████████████████████████████████████████████████████████████████████▋    | 408/432 [7:57:07<32:25, 81.08s/it]
 95%|█████████████████████████████████████████████████████████████████████████▊    | 409/432 [7:58:33<31:38, 82.53s/it]
 95%|██████████████████████████████████████████████████████████████████████████    | 410/432 [7:59:50<29:39, 80.87s/it]
 95%|██████████████████████████████████████████████████████████████████████████▏   | 411/432 [8:01:08<28:00, 80.04s/it]
 95%|██████████████████████████████████████████████████████████████████████████▍   | 412/432 [8:02:26<26:25, 79.27s/it]
 96%|██████████████████████████████████████████████████████████████████████████▌   | 413/432 [8:03:45<25:03, 79.13s/it]
 96%|██████████████████████████████████████████████████████████████████████████▊   | 414/432 [8:05:01<23:31, 78.40s/it]
 96%|██████████████████████████████████████████████████████████████████████████▉   | 415/432 [8:06:23<22:29, 79.37s/it]
 96%|███████████████████████████████████

In [56]:
team_stats_df = pd.DataFrame(team_stats).T.reset_index()
team_stats_df.columns = ['team', 'date', 'attack', 'defence', 'full_age', 
                         'midfield', 'overall', 'prestige', 'start_age']

In [58]:
team_stats_df.to_csv('../datasets/sofifa/team_stats.csv',index=False)

We could also obtain the player data for each of the national teams at each of the dates represented. However, we first write a function to find all the player information for a given national team at a given point in time.

In [59]:
def get_players_href(team_href, date_url):
    """Given a team href and a segment of a date url, corresponding to a specific update of FIFA 18
    returns a list of all players in the team, and a list of all 
    players href in the team, for all teams"""
    url_team = 'https://sofifa.com'+team_href+date_url 
    html_team = BeautifulSoup(simple_get(url_team), 'html.parser')

    player_href = {}
    for link in html_team.find_all('a', attrs={'href': re.compile("^/player/.+")}):
        if link.get_text() not in player_href:
            player_href[link.get_text()] = link.get('href')
    
    return player_href

We now have the players href for a national team at a given point in time. We now have to obtain all the information for each of the players from these hrefs.

In [60]:
def get_players(player_href, date_url):
    """Given a players href,  and a segment of a date url, corresponding to a specific update of FIFA 18,
    returns the summary data for that player"""

    url_player = 'https://sofifa.com'+player_href+date_url
    html_player = BeautifulSoup(simple_get(url_player), 'html.parser')

    h1 = html_player.find('h1')

    name_id = h1.get_text()
    name = name_id.split('(')[0]
    ID = re.search(r'\((.+)\)', name_id).group(1).split(' ')[-1]
    full_name = h1.find_next('div').next_element.strip()
    
    # account for multiple preferred positions
    position = ''
    for p in h1.find_next('div').find_all('span'):
        pos = p.get_text()+' '
        position += pos
    position.strip()

    age = re.search(r'Age (\d+) .+' , h1.find_next('div').find_all('span')[-1].next_sibling).group(1)
            
    # account for +/- signs
    span = html_player.find('div', attrs={'class': 'card-body stats'}).find_all('span')
    span_label = (html_player
                  .find('div', attrs={'class': 'card-body stats'})
                  .find_all('span', attrs={'class': re.compile("label.+")}))
            
    overall = float(span_label[0].get_text())
    potential = float(span_label[1].get_text())
    value = span[-2].get_text()
    wage = span[-1].get_text()
    
    # account for different order of clubs and national teams
    club_nation = html_player.find_all('figure', attrs={'class': "avatar avatar-sm"})
    cn0 = club_nation[0].find_next('a', attrs={'href': re.compile("^/team/.*")})
    if len(club_nation) > 1:
        cn1 = club_nation[1].find_next('a', attrs={'href': re.compile("^/team/.*")})
        if cn0.get_text() == tname:
            club = cn1.get_text()
            club_rating = float(cn1.find_next('span').get_text())
            team = cn0.get_text()
            team_rating = float(cn0.find_next('span').get_text())
        else:
            club = cn0.get_text()
            club_rating = float(cn0.find_next('span').get_text())
            team = cn1.get_text()
            team_rating = float(cn1.find_next('span').get_text())
    else:
        if cn0.get_text() == tname:
            club = None
            club_rating = np.nan
            team = cn0.get_text()
            team_rating = float(cn0.find_next('span').get_text())
        else:
            club = cn0.get_text()
            club_rating = float(cn0.find_next('span').get_text())
            team = tname
            team_rating = np.nan
            
    # player statistics
    r1 = html_player.find('div', attrs={'class': 'mt-2 mb-2'})
    r1_dat = r1.find_all('span', attrs={'class': re.compile('^label.+')})
    r2_dat = (r1.find_next('div', attrs={'class': 'mb-2'})
              .find_all('span', attrs={'class': re.compile('^label.+')}))
            
    stats = [val for val in r1_dat]+[val for val in r2_dat]
            
    return ID, name, full_name, position, age, overall, potential, value, wage, club, club_rating, team, team_rating, stats
 

In [62]:
summary = {}
for date, dhref in tqdm.tqdm(dates_href.items()):
    for team, thref in full_teams_href.items():
        for player, phref in get_players_href(thref, dhref).items():
            ID, name, full_name, position, age, overall, potential, value, wage, club, club_rating, team, team_rating, stats = get_players(phref, dhref)
            summary[(ID, date)] = {
                'name': name,
                'full_name': full_name,
                'position': position,
                'age': age,
                'overall': overall,
                'potential': potential,
                'value': value,
                'wage': wage,
                'club': club,
                'club_rating': club_rating,
                'team': team,
                'team_rating': team_rating,
                'stats': stats
            }


  0%|                                                                                          | 0/432 [00:00<?, ?it/s]


NameError: name 'tname' is not defined