In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import time
from bs4 import BeautifulSoup, Comment
from pathlib import Path
from tqdm import tqdm
%matplotlib inline

# Notebook-wide plotting defaults: matplotlib's 'fivethirtyeight' style sheet
# and seaborn's 'notebook' scaling context.
plt.style.use('fivethirtyeight')
sns.set_context('notebook')

# Web Scraping

https://sofifa.com

Because scraping every individual player's full data takes a long time, we propose a simpler scraping methodology that collects only some basic key information for each player. For each player we target the difference between their overall and potential ratings (averaged per team, as a measure of the strength of a team's youth/potential), as well as their value and wage. 

Note that we did not scrape the huge amount of individual player statistics (pace, shooting, passing, etc.). Instead, we chose to rely on the broad summarized team statistics, the information that was mentioned above, and some other 'atypical' statistics that impose our own assumptions on our model. If those assumptions are true, they could possibly do much better than typical FIFA statistics. If the assumptions do not hold, they nonetheless give us greater intuition into the factors useful in predicting World Cup matches.

An example of an 'atypical' statistic is simply the rating of each team's starting goalkeeper. This follows the assumption that defence wins championships. By using such statistics, we can gain greater intuition into these assumptions.

We reuse the same functions from our team-scraping notebook.

In [3]:
# referenced from https://realpython.com/python-web-scraping-practical-introduction/
def simple_get(url, timeout=None):
    """
    Fetch 'url' with an HTTP GET request.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. The default ``None`` keeps the
        previous behaviour of waiting indefinitely for the server.

    Returns
    -------
    bytes or None
        The response content when ``is_good_response`` accepts the response
        (status 200 with an HTML-like content type); ``None`` when the
        response is rejected or the request itself fails.
    """
    import requests
    from contextlib import closing

    try:
        # stream=True defers the body download; closing() releases the
        # connection even when the response is rejected.
        with closing(requests.get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except requests.exceptions.RequestException as error:
        # requests reports network failures (DNS, connection, timeout, ...)
        # as RequestException subclasses, never as AssertionError.
        print(error)
        print('Error in scraping of url')
        return None


def is_good_response(resp):
    """
    Returns True if response is some kind of HTML/XML.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (case-insensitive). A response without a
    Content-Type header is rejected instead of raising ``KeyError``.
    """
    # .get() yields None when the header is absent, so the None check below
    # is meaningful (it was unreachable after an unconditional .lower()).
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())


In [4]:
def get_date_href():
    """
    Scrape the sofifa.com front page and return a dict that maps each
    update date, formatted as '<day> <month>', to the href of its link.
    """
    soup = BeautifulSoup(simple_get('https://sofifa.com/'), 'html.parser')

    # Each month is a 'card-title h5' div; the last 8 matches are not dates.
    month_headers = soup.find_all('div', attrs={'class': 'card-title h5'})[:-8]

    href_by_date = {}
    for header in month_headers:
        month_label = header.get_text()
        # The day links live in the first div that follows the month header.
        day_links = header.find_next('div').find_all('a')
        href_by_date.update({
            link.get_text() + ' ' + month_label: link.get('href')
            for link in day_links
        })

    return href_by_date

In [5]:
# Map every update date listed on the sofifa.com front page to its href
# (one live HTTP request).
dates_href = get_date_href()

In [6]:
# Find all the href attributes for each national team

def get_nation_href(date_url):
    """
    Return a dict mapping national team names to their hrefs.

    'date_url' is the query-string suffix that selects a specific update
    of FIFA 18; it is appended to the national-teams listing URL. When the
    same team name appears in several links, the first href found is kept.
    """
    page = simple_get('https://sofifa.com/teams/national' + date_url)
    soup = BeautifulSoup(page, 'html.parser')

    team_link = re.compile("^/team/.+")
    href_by_team = {}
    for anchor in soup.find_all('a', attrs={'href': team_link}):
        # setdefault only stores the href for names not seen before.
        href_by_team.setdefault(anchor.get_text(), anchor.get('href'))

    return href_by_team

In [7]:
# Sanity check on a single update before looping over every date: list the
# national teams for one snapshot and count them.
teams_href = get_nation_href('?v=WC18&e=159126&set=true') # corresponds to FIFA WC18 Expansion Jun 16
len(teams_href)

49

In [8]:
# Union of the national teams seen across every update date. A team that
# appears on several dates keeps the href from the first date it was seen.
full_teams_href = {}

# Only the hrefs are needed here, so iterate over the values. A dedicated
# name avoids overwriting the single-date `teams_href` shown in an earlier cell.
for date_href in dates_href.values():
    teams_on_date = get_nation_href(date_href)
    # merges dictionaries; entries already in full_teams_href take priority
    full_teams_href = {**teams_on_date, **full_teams_href}

In [9]:
# Number of distinct national teams found across all update dates.
len(full_teams_href)

62

We note that value and wage are zero for World Cup dates.

In [63]:
# Scrape every (team, date) page. This is one HTTP request per combination,
# so the full run is very long (the recorded run shows roughly a minute per date).
team_stats = {}
failed_pages = []  # (team, date, url) for every page that could not be fetched

# CSS selector for each field collected from a team page. The key order is
# kept as-is because it determines the order of the stat columns downstream.
field_selectors = {
    'team_stats': 'span[class="float-right"] > span[class="label"]',
    'name': 'td > div > a ~ a',
    'overall': 'div[class="col-digit col-oa"] > span',
    'potential': 'div[class="col-digit col-pt"] > span',
    'value': 'div[class="col-digit col-vl"]',
    'wage': 'div[class="col-digit col-wg"]',
}

for date, dhref in tqdm(dates_href.items()):
    for team, thref in full_teams_href.items():
        url_team = 'https://sofifa.com'+thref+dhref
        page = simple_get(url_team)

        # simple_get returns None for non-HTML or non-200 responses. Skip the
        # page rather than let BeautifulSoup(None) raise and abort the run.
        if page is None:
            failed_pages.append((team, date, url_team))
            continue

        html_team = BeautifulSoup(page, 'html.parser')

        team_stats[(team, date)] = {
            field: [node.get_text() for node in html_team.select(selector)]
            for field, selector in field_selectors.items()
        }

if failed_pages:
    print('Skipped {} page(s) that could not be scraped'.format(len(failed_pages)))


  0%|                                                                                          | 0/434 [00:00<?, ?it/s]
  0%|▏                                                                               | 1/434 [01:12<8:44:28, 72.67s/it]
  0%|▎                                                                               | 2/434 [02:12<8:14:36, 68.70s/it]
  1%|▌                                                                               | 3/434 [03:10<7:51:51, 65.69s/it]
  1%|▋                                                                               | 4/434 [04:09<7:35:26, 63.55s/it]
  1%|▉                                                                               | 5/434 [05:06<7:19:55, 61.53s/it]
  1%|█                                                                               | 6/434 [06:09<7:22:19, 62.01s/it]
  2%|█▎                                                                              | 7/434 [07:06<7:11:57, 60.70s/it]
  2%|█▍                                

 16%|████████████                                                                 | 68/434 [1:20:34<5:33:37, 54.69s/it]
 16%|████████████▏                                                                | 69/434 [1:21:28<5:31:29, 54.49s/it]
 16%|████████████▍                                                                | 70/434 [1:22:22<5:29:06, 54.25s/it]
 16%|████████████▌                                                                | 71/434 [1:23:15<5:26:31, 53.97s/it]
 17%|████████████▊                                                                | 72/434 [1:24:09<5:24:04, 53.71s/it]
 17%|████████████▉                                                                | 73/434 [1:25:02<5:22:41, 53.63s/it]
 17%|█████████████▏                                                               | 74/434 [1:25:57<5:24:06, 54.02s/it]
 17%|█████████████▎                                                               | 75/434 [1:26:50<5:21:56, 53.81s/it]
 18%|█████████████▍                     

 31%|███████████████████████▊                                                    | 136/434 [2:21:41<4:23:14, 53.00s/it]
 32%|███████████████████████▉                                                    | 137/434 [2:22:34<4:23:06, 53.15s/it]
 32%|████████████████████████▏                                                   | 138/434 [2:23:29<4:23:58, 53.51s/it]
 32%|████████████████████████▎                                                   | 139/434 [2:24:21<4:21:37, 53.21s/it]
 32%|████████████████████████▌                                                   | 140/434 [2:25:15<4:21:11, 53.30s/it]
 32%|████████████████████████▋                                                   | 141/434 [2:26:06<4:18:09, 52.86s/it]
 33%|████████████████████████▊                                                   | 142/434 [2:27:00<4:17:44, 52.96s/it]
 33%|█████████████████████████                                                   | 143/434 [2:27:52<4:16:22, 52.86s/it]
 33%|█████████████████████████▏         

 47%|███████████████████████████████████▋                                        | 204/434 [3:23:04<3:29:45, 54.72s/it]
 47%|███████████████████████████████████▉                                        | 205/434 [3:23:58<3:27:55, 54.48s/it]
 47%|████████████████████████████████████                                        | 206/434 [3:24:51<3:25:35, 54.10s/it]
 48%|████████████████████████████████████▏                                       | 207/434 [3:25:49<3:29:29, 55.37s/it]
 48%|████████████████████████████████████▍                                       | 208/434 [3:26:43<3:26:51, 54.92s/it]
 48%|████████████████████████████████████▌                                       | 209/434 [3:27:36<3:24:04, 54.42s/it]
 48%|████████████████████████████████████▊                                       | 210/434 [3:28:30<3:22:41, 54.29s/it]
 49%|████████████████████████████████████▉                                       | 211/434 [3:29:23<3:20:11, 53.86s/it]
 49%|███████████████████████████████████

 63%|███████████████████████████████████████████████▋                            | 272/434 [4:24:26<2:27:03, 54.47s/it]
 63%|███████████████████████████████████████████████▊                            | 273/434 [4:25:20<2:25:28, 54.21s/it]
 63%|███████████████████████████████████████████████▉                            | 274/434 [4:26:13<2:24:00, 54.00s/it]
 63%|████████████████████████████████████████████████▏                           | 275/434 [4:27:07<2:23:20, 54.09s/it]
 64%|████████████████████████████████████████████████▎                           | 276/434 [4:28:02<2:22:49, 54.24s/it]
 64%|████████████████████████████████████████████████▌                           | 277/434 [4:28:57<2:22:11, 54.34s/it]
 64%|████████████████████████████████████████████████▋                           | 278/434 [4:30:20<2:44:05, 63.11s/it]
 64%|████████████████████████████████████████████████▊                           | 279/434 [4:31:15<2:36:28, 60.57s/it]
 65%|███████████████████████████████████

 78%|███████████████████████████████████████████████████████████▌                | 340/434 [5:27:54<1:27:45, 56.01s/it]
 79%|███████████████████████████████████████████████████████████▋                | 341/434 [5:28:49<1:26:21, 55.72s/it]
 79%|███████████████████████████████████████████████████████████▉                | 342/434 [5:29:44<1:25:03, 55.47s/it]
 79%|████████████████████████████████████████████████████████████                | 343/434 [5:30:41<1:25:02, 56.07s/it]
 79%|████████████████████████████████████████████████████████████▏               | 344/434 [5:31:36<1:23:45, 55.83s/it]
 79%|████████████████████████████████████████████████████████████▍               | 345/434 [5:32:31<1:22:09, 55.38s/it]
 80%|████████████████████████████████████████████████████████████▌               | 346/434 [5:33:25<1:20:52, 55.15s/it]
 80%|████████████████████████████████████████████████████████████▊               | 347/434 [5:34:22<1:20:25, 55.46s/it]
 80%|███████████████████████████████████

 94%|█████████████████████████████████████████████████████████████████████████▎    | 408/434 [6:31:06<24:13, 55.90s/it]
 94%|█████████████████████████████████████████████████████████████████████████▌    | 409/434 [6:32:02<23:16, 55.88s/it]
 94%|█████████████████████████████████████████████████████████████████████████▋    | 410/434 [6:32:59<22:25, 56.06s/it]
 95%|█████████████████████████████████████████████████████████████████████████▊    | 411/434 [6:33:55<21:32, 56.21s/it]
 95%|██████████████████████████████████████████████████████████████████████████    | 412/434 [6:34:52<20:40, 56.37s/it]
 95%|██████████████████████████████████████████████████████████████████████████▏   | 413/434 [6:35:48<19:38, 56.10s/it]
 95%|██████████████████████████████████████████████████████████████████████████▍   | 414/434 [6:36:45<18:48, 56.42s/it]
 96%|██████████████████████████████████████████████████████████████████████████▌   | 415/434 [6:37:42<17:59, 56.79s/it]
 96%|███████████████████████████████████

In [69]:
# One row per (team, date). The dict keys are (team, date) tuples, so after
# the transpose they form a two-level index that reset_index() exposes as
# 'level_0' and 'level_1'.
#
# Only those two columns are renamed. The stat columns keep the labels they
# already carry from the inner dict keys, so the result no longer depends on
# whether pandas sorts those keys or keeps insertion order. Assigning all
# eight names by position would mislabel the data in the latter case.
team_stats_df = (
    pd.DataFrame(team_stats).T
    .reset_index()
    .rename(columns={'level_0': 'team', 'level_1': 'date'})
)

# Fix the column order explicitly so the saved CSV layout is stable.
team_stats_df = team_stats_df[['team', 'date', 'name', 'overall', 'potential',
                               'team_stats', 'value', 'wage']]

In [70]:
# Save the scraped table so the long scraping cell does not need to be re-run.
# index=False omits the DataFrame's row index from the file.
team_stats_df.to_csv('../datasets/sofifa/player_stats.csv',index=False)

In [71]:
# Read the saved file back as a quick check of the round trip.
# NOTE(review): the list-valued columns come back as strings (e.g. "['84', '77', ...]"),
# so they need parsing before use — confirm in the downstream notebook.
pd.read_csv('../datasets/sofifa/player_stats.csv')

Unnamed: 0,team,date,name,overall,potential,team_stats,value,wage
0,Brazil,15 Jul 2018,"['Alisson', 'Fagner', 'Thiago Silva', 'Miranda...","['84', '77', '86', '87', '87', '84', '87', '88...","['88', '77', '86', '87', '87', '84', '87', '91...","['51', '73', '50', '39', '33', '67', '76', '73...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
1,England,15 Jul 2018,"['J. Pickford', 'K. Trippier', 'K. Walker', 'J...","['80', '80', '83', '80', '79', '79', '80', '80...","['87', '81', '84', '86', '84', '79', '83', '82...","['36', '20', '39', '31', '41', '41', '43', '55...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
2,Italy,15 Jul 2018,"['G. Buffon', 'A. Florenzi', 'G. Chiellini', '...","['88', '82', '86', '81', '77', '80', '81', '85...","['88', '82', '86', '88', '81', '80', '84', '88...","['73', '60', '67', '64', '80', '78', '29', '26...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
3,Spain,15 Jul 2018,"['De Gea', 'Nacho Fernández', 'Piqué', 'Sergio...","['91', '82', '87', '90', '85', '85', '87', '87...","['93', '84', '87', '90', '85', '88', '87', '87...","['32', '41', '28', '27', '32', '20', '75', '62...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
4,France,15 Jul 2018,"['H. Lloris', 'B. Pavard', 'R. Varane', 'S. Um...","['87', '78', '85', '85', '79', '85', '86', '88...","['87', '84', '90', '89', '89', '94', '91', '90...","['35', '52', '30', '24', '53', '35', '47', '47...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
5,Germany,15 Jul 2018,"['M. Neuer', 'J. Kimmich', 'J. Boateng', 'M. H...","['91', '85', '87', '90', '81', '85', '90', '86...","['91', '89', '87', '90', '82', '85', '90', '86...","['28', '54', '34', '20', '40', '26', '66', '60...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
6,Argentina,15 Jul 2018,"['F. Armani', 'G. Mercado', 'N. Otamendi', 'M....","['77', '77', '86', '81', '79', '81', '77', '82...","['77', '77', '86', '82', '85', '81', '77', '82...","['36', '50', '40', '36', '36', '36', '76', '66...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
7,Portugal,15 Jul 2018,"['Rui Patrício', 'Ricardo Pereira', 'Pepe', 'J...","['85', '80', '88', '82', '80', '84', '84', '84...","['85', '85', '88', '82', '85', '87', '90', '84...","['51', '68', '50', '54', '58', '62', '66', '58...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
8,Turkey,15 Jul 2018,"['V. Babacan', 'G. Gönül', 'M. Topal', 'C. Söy...","['77', '76', '78', '75', '75', '76', '78', '75...","['77', '76', '78', '84', '75', '82', '83', '75...","['37', '51', '37', '32', '62', '52', '66', '62...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
9,Nigeria,15 Jul 2018,"['F. Uzoho', 'V. Moses', 'L. Balogun', 'W. Tro...","['68', '80', '74', '74', '72', '69', '79', '77...","['82', '80', '75', '79', '79', '73', '86', '77...","['68', '75', '71', '79', '55', '74', '58', '75...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0...","['€0', '€0', '€0', '€0', '€0', '€0', '€0', '€0..."
