In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import time
from bs4 import BeautifulSoup, Comment
from pathlib import Path
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options


%matplotlib inline

# Global plot appearance: matplotlib's 'fivethirtyeight' style sheet plus
# seaborn's 'notebook' scaling context.
plt.style.use('fivethirtyeight')
sns.set_context('notebook')

# Web Scraping

https://sofifa.com

In [2]:
# referenced from https://realpython.com/python-web-scraping-practical-introduction/
def simple_get(url, timeout=30):
    """
    Attempts to scrape the content at 'url' by making a HTTP GET request.
    If the response passes `is_good_response` (status 200 and an HTML
    content type), return the raw response body, otherwise return None.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to `requests.get` so that a stalled connection raises
        instead of blocking the scrape forever. Defaults to 30 seconds.

    Returns
    -------
    bytes or None
        `resp.content` for a good response; None for a non-HTML / non-200
        response or when the request itself fails.
    """
    from contextlib import closing

    try:
        # closing() guarantees the streamed connection is released
        with closing(requests.get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except requests.RequestException as error:
        # Connection errors, timeouts, invalid URLs, ... all derive from
        # RequestException; report them and signal failure with None.
        print(error)
        print('Error in scraping of url')
        return None


def is_good_response(resp):
    """
    Returns True if the response looks like a successfully served HTML page.

    Parameters
    ----------
    resp : object
        Response exposing `status_code` and a mapping-like `headers`
        (e.g. a `requests.Response`).

    Returns
    -------
    bool
        True when the status code is 200 and the Content-Type header
        contains 'html' (case-insensitive); False otherwise, including when
        the header is missing.
    """
    # A server may omit Content-Type; treat that as "not HTML" rather than
    # letting a KeyError escape.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


We note that the information on the website is categorized by FIFA update dates. For each FIFA update date, we have different national teams and national players with different statistics. We note that there does not seem to be a strict order to the update dates. Some months have several updates, while others have only 1 update. We have all the data starting from August 2006.

In [3]:
def get_date_href():
    """Scrape the sofifa.com front page for the game-update dates.

    Returns a dictionary whose keys are '<day> <month>' strings (the text of
    a day link, a space, and the text of its month heading) and whose values
    are the href attributes of the day links.
    """
    soup = BeautifulSoup(simple_get('https://sofifa.com/'), 'html.parser')

    # note the last 8 'card-title h5' items are not dates
    month_headings = soup.find_all('div', attrs={'class': 'card-title h5'})[:-8]

    # one entry per day link found in the <div> that follows each month heading
    return {
        day_link.get_text()+' '+heading.get_text(): day_link.get('href')
        for heading in month_headings
        for day_link in heading.find_next('div').find_all('a')
    }

In [4]:
# '<day> <month>' -> href for every update date scraped from the front page
dates_href = get_date_href()

In [5]:
# Find all the href attributes for each national team

def get_nation_href(date_url):
    """Collect the team links shown on the national-teams listing for one update.

    `date_url` is the query-string suffix identifying a game update (for
    example '?v=WC18&e=159126&set=true'); it is appended to
    'https://sofifa.com/teams/national'. Returns a dictionary mapping the text
    of each '/team/...' link to its href, keeping the first href seen when a
    link text repeats.
    """
    listing_url = 'https://sofifa.com/teams/national'+date_url
    soup = BeautifulSoup(simple_get(listing_url), 'html.parser')

    team_link = re.compile("^/team/.+")
    hrefs_by_team = {}
    for anchor in soup.find_all('a', attrs={'href': team_link}):
        # setdefault leaves an already-recorded team untouched
        hrefs_by_team.setdefault(anchor.get_text(), anchor.get('href'))

    return hrefs_by_team

In [6]:
# Spot check on a single update: how many team links does that listing contain?
teams_href = get_nation_href('?v=WC18&e=159126&set=true') # corresponds to FIFA WC18 Expansion Jun 16
len(teams_href)

49

We note that at different dates, there may be different teams represented in the game. As such, we loop through all the possible dates to find the full list of all teams that have ever been represented in the game.

In [7]:
# Union of team name -> href over every update date in dates_href.
full_teams_href = {}

# Only the hrefs are needed here, so iterate over the values. A dedicated
# loop-local name is used so the single-date `teams_href` result shown in the
# earlier cell is not silently overwritten.
for date_href in dates_href.values():
    date_teams_href = get_nation_href(date_href)
    # merges dictionaries; on a repeated team name the href that was
    # collected first (from an earlier entry of dates_href) is kept
    full_teams_href = {**date_teams_href, **full_teams_href}

In [8]:
# Number of distinct team names collected across all update dates
len(full_teams_href)

62

We now have the URLs of all 62 national teams that have ever appeared in the game. Note that these URLs link to the most recent ratings of each team. The URLs of all players in a national team can also be found from the same page. We now wish to collect team-specific data for all these teams across all the different dates.

However, we should not collect team data for every one of the dates gathered previously: not all teams are in the game at each version update. Instead, we should collect historical data for each team only at the versions in which it appears. Thankfully, we can access the specific version history of each team.

In [10]:
# Build (team, version label) -> href for every version offered in each team
# page's <select> drop-down. The page is loaded through Selenium and the
# rendered source is then parsed with BeautifulSoup.
url = 'https://sofifa.com'
team_ver_href = {}
for t, thref in full_teams_href.items():
    # create a new session
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(5)
        driver.get(url+thref)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page fails;
        # otherwise every team leaves a Chrome process running.
        driver.quit()

    html_ver = BeautifulSoup(page_source, 'lxml')
    # The first <option> is skipped -- presumably a placeholder or the
    # currently selected entry; TODO confirm against the live page.
    html_ver_dat = html_ver.select('select > option')[1:]

    # option text is the version label, option value is the link to that version
    for option in html_ver_dat:
        team_ver_href[(t, option.get_text())] = option.get('value')
    

We can then scrape the data for each version of every team that has ever appeared in FIFA. We scrape the team statistics and the individual player statistics that we think will be important in our model analysis in a single pass, and subsequently clean up the data.

In [13]:
def _parse_team_page(html_team):
    """Extract the team-level and player-level fields from one parsed team page.

    Parameters
    ----------
    html_team : BeautifulSoup
        Parsed HTML of a single team/version page.

    Returns
    -------
    dict
        'overall', 'attack', 'midfield', 'defence', 'prestige', 'start_age'
        and 'full_age' as floats; 'ext_stats', 'name', 'player_overall',
        'potential', 'value' and 'wage' as lists of the raw strings found on
        the page (left uncleaned for later processing).
    """
    # the stats card holds the four headline ratings as 'label...' spans,
    # read here in the order overall, attack, midfield, defence
    stats_card = html_team.find('div', attrs={'class': 'card-body stats'})
    span_label = stats_card.find_all('span', attrs={'class': re.compile("label.+")})

    return {
        # basic statistics
        'overall': float(span_label[0].get_text()),
        'attack': float(span_label[1].get_text()),
        'midfield': float(span_label[2].get_text()),
        'defence': float(span_label[3].get_text()),
        'prestige': float(html_team.find(string='International Prestige').find_next('span').get_text()),
        # the age values are the text nodes directly following their captions
        'start_age': float(html_team.find(string='Starting XI Average Age').next_element),
        'full_age': float(html_team.find(string='Whole Team Average Age').next_element),
        # other stats lumped together
        'ext_stats': [i.get_text() for i in html_team.select('span[class="float-right"] > span[class="label"]')],
        # individual player stats lumped together
        'name': [i.get_text() for i in html_team.select('td > div > a ~ a')],
        'player_overall': [i.get_text() for i in html_team.select('div[class="col-digit col-oa"] > span')],
        'potential': [i.get_text() for i in html_team.select('div[class="col-digit col-pt"] > span')],
        'value': [i.get_text() for i in html_team.select('div[class="col-digit col-vl"]')],
        'wage': [i.get_text() for i in html_team.select('div[class="col-digit col-wg"]')],
    }


# (team, version label) -> scraped fields for that team page
team_stats = {}
# pages that could not be downloaded, kept so they can be retried later
failed_team_pages = []
for tdate, vhref in tqdm(team_ver_href.items()):
    url_team = 'https://sofifa.com'+vhref
    page = simple_get(url_team)
    if page is None:
        # simple_get signals a failed or non-HTML response with None; skip the
        # page instead of aborting a scrape that takes hours to complete
        failed_team_pages.append((tdate, url_team))
        continue

    team_stats[tdate] = _parse_team_page(BeautifulSoup(page, 'lxml'))

if failed_team_pages:
    print('Skipped {} team pages that could not be downloaded'.format(len(failed_team_pages)))

100%|██████████████████████████████████████████████████████████████████████████| 20594/20594 [6:01:53<00:00,  1.26s/it]


In [18]:
# team_stats is keyed by (team, version label) tuples, so after transposing
# the two key levels come out of reset_index() as 'level_0' and 'level_1'.
sofifa_data = (
    pd.DataFrame(team_stats)
    .T
    .reset_index()
    .rename({'level_0': 'team', 'level_1': 'date'}, axis=1)
)

# save to csv
output_path = Path('data/sofifa_data.csv')
# create the output directory first so a missing folder cannot discard the scrape
output_path.parent.mkdir(parents=True, exist_ok=True)
sofifa_data.to_csv(output_path, index=False)