# Crawling Data

This file is part of my work on Udacity's Nano Degree Programme.

As capstone project I compare the performance of a machine learning model in predicting matches of the 2020 UEFA European Football Championship with my personal bets in a football guessing game played on the platform www.kicktipp.de

In this notebook we acquire football statistics of matches played in
- UEFA Nations League 2018/19
- UEFA Nations League 2020/21
- UEFA Euro 2020 qualifying
and the results of the matches played in the 2020 UEFA European Football Championship.

This is done by crawling wikipedia and the corresponding games on the UEFA page. The parsing of the HTML code is done using BeautifulSoup.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
import time
import json

In [3]:
import requests
from urllib.parse import urljoin

In [4]:
from bs4 import BeautifulSoup

In [5]:
path_output = '../data/'

## Auxiliary Functions

In [6]:
def request_page_and_generate_soup(url):
    '''
    INPUT:  URL of a web page
    OUTPUT: Soup made by BeautifulSoup that contains the HTML-code of the page
    '''
    
    try:
        # A delay of three seconds such that we do not make too many requests in a short time by mistake
        time.sleep(3)

        content = requests.get(url)
        text    = content.text
        soup    = BeautifulSoup(text)

        return soup
    
    except:
        return None

## Function to Extract all URLs of UEFA Games from Wikipedia

In [7]:
def find_all_match_urls_from_wiki_page(wiki_url):
    '''
    INPUT:       URL of a wikipedia page
    OUTPUT:      URLs of football games on the UEFA web page
        
    DESCRIPTION: We perform a http-request to wikipedia. (A time delay of three seconds is included to avoid making
                 get-requests too often in a second.)
                 The source code of the page is parsed using BeautifulSoup. Then, we extract all URLs to games on the UEFA 
                 web page. They all have te name 'Report'.
    '''
    all_urls_matches = []
    
    try:
        wiki_soup = request_page_and_generate_soup(wiki_url)
        
        # The name of the links of the matches on the UEFA web page is 'Report'
        all_urls_matches = [link['href'] for link in wiki_soup.find_all('a') if link.text == 'Report']
    except:
        pass
    
    return all_urls_matches

## Process all UEFA pages

In [8]:
def get_feature(soup, elem_type, parameter, name):
    '''
    INPUT:  Soup and different HTML-parameters.
    OUTPUT: The feature from the soup specified by the HTML-elements.
    '''
    try:
        return soup.find(elem_type, {parameter : name}).text
    except:
        return np.nan

In [9]:
def extract_stats_from_stats_page(uefa_stats_page_soup):
    '''
    INPUT:  Soup of the corresponding uefa-page containing the statistics of a football game.
    OUTPUT: Several information from the statistics stored in a dictionary.
    '''
    
    # The features we want to extract from the page and the corresponding HTML-information
    dict_feat = {'goals_home' : ("div", "data-bind", "text: homeGoalsScored"),
                 'goals_away' : ("div", "data-bind", "text: awayGoalsScored"),
                 'attempts_total_home' : ("div", "data-bind", "text: _.find(homeTeam.statistics,{name:'attempts'}) ? _.find(homeTeam.statistics,{name:'attempts'}).value : 0"),
                 'attempts_total_away' : ("div", "data-bind", "text: _.find(awayTeam.statistics,{name:'attempts'}) ? _.find(awayTeam.statistics,{name:'attempts'}).value : 0"),
                 'attempts_off_target_home' : ("div", "data-bind", "text: homeAttempsOff"),
                 'attempts_off_target_away' : ("div", "data-bind", "text: awayAttempsOff"),
                 'attempts_on_target_home' : ("div", "data-bind", "text: homeAttempsOn"),
                 'attempts_on_target_away' : ("div", "data-bind", "text: awayAttempsOn"),
                 'attempts_blocked_home' : ("div", "data-bind", "text: _.find(homeTeam.statistics,{name:'attempts_blocked'}) ? _.find(homeTeam.statistics,{name:'attempts_blocked'}).value : 0"),
                 'attempts_blocked_away' : ("div", "data-bind", "text: _.find(awayTeam.statistics,{name:'attempts_blocked'}) ? _.find(awayTeam.statistics,{name:'attempts_blocked'}).value : 0"),
                 'corners_home' : ("div", "data-bind", "text: homeCorner"),
                 'corners_away' : ("div", "data-bind", "text: awayCorner"),
                 'offsides_home' : ("div", "data-bind", "text: homeOffside"),
                 'offsides_away' : ("div", "data-bind", "text: awayOffside"),
                 'possession_home' : ("div", "data-bind", "text: homeBallPossession + '%'"),
                 'possession_away' : ("div", "data-bind", "text: awayBallPossession + '%'"),
                 'passes_home' : ("div", "data-bind", "text: _.find(homeTeam.statistics,{name:'passes_attempted'}) ? _.find(homeTeam.statistics,{name:'passes_attempted'}).value : 0"),
                 'passes_away' : ("div", "data-bind", "text: _.find(awayTeam.statistics,{name:'passes_attempted'}) ? _.find(awayTeam.statistics,{name:'passes_attempted'}).value : 0"),
                 'passes_completed_home' : ("div", "data-bind", "text: _.find(homeTeam.statistics,{name:'passes_completed'}) ? _.find(homeTeam.statistics,{name:'passes_completed'}).value : 0"),
                 'passes_completed_away' : ("div", "data-bind", "text: _.find(awayTeam.statistics,{name:'passes_completed'}) ? _.find(awayTeam.statistics,{name:'passes_completed'}).value : 0"),
                 'balls_recovered_home' : ("div", "data-bind", "text: homeRecoveredBalls"),
                 'balls_recovered_away' : ("div", "data-bind", "text: awayRecoveredBalls"),
                 'tackles_home' : ("div", "data-bind", "text: _.find(homeTeam.statistics,{name:'tackles'}) ? _.find(homeTeam.statistics,{name:'tackles'}).value : 0"),
                 'tackles_away' : ("div", "data-bind", "text: _.find(awayTeam.statistics,{name:'tackles'}) ? _.find(awayTeam.statistics,{name:'tackles'}).value : 0"),
                 'blocks_home' : ("div", "data-bind", "text: _.find(homeTeam.statistics,{name:'blocked'}) ? _.find(homeTeam.statistics,{name:'blocked'}).value : 0"),
                 'blocks_away' : ("div", "data-bind", "text: _.find(awayTeam.statistics,{name:'blocked'}) ? _.find(awayTeam.statistics,{name:'blocked'}).value : 0"),
                 'clearances_home' : ("div", "data-bind", "text: _.find(homeTeam.statistics,{name:'clearance_completed'}) ? _.find(homeTeam.statistics,{name:'clearance_completed'}).value : 0"),
                 'clearances_away' : ("div", "data-bind", "text: _.find(awayTeam.statistics,{name:'clearance_completed'}) ? _.find(awayTeam.statistics,{name:'clearance_completed'}).value : 0"),
                 'passes_accuracy_home' : ("span", "data-bind", "text: homePassesCompletion"),
                 'passes_accuracy_away' : ("span", "data-bind", "text: awayPassesCompletion")
               }

    # The dictionary that stores the information.
    dict_feat_info = {}

    # Extract the information from the soup
    for feat in dict_feat:
        dict_feat_info[feat] = get_feature(uefa_stats_page_soup, dict_feat[feat][0], dict_feat[feat][1], dict_feat[feat][2])

    return dict_feat_info

In [10]:
def extract_stats_from_main_page(uefa_main_page_soup):
    '''
    INPUT:  Soup of the UEFA main page in case that there is no specific statistics page.
    OUTPUT: Several information from the statistics stored in a dictionary.
    '''
    
    # The features we want to extract from the page and the corresponding HTML-information
    dict_feat = {'goals_home' : ("div", "class", "goals-scored--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'goals_away' : ("div", "class", "goals-scored--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'attempts_blocked_home' : ("div", "class", "blocked--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'attempts_blocked_away' : ("div", "class", "blocked--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'corners_home' : ("div", "class", "corner--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'corners_away' : ("div", "class", "corner--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'offsides_home' : ("div", "class", "offside--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'offsides_away' : ("div", "class", "offside--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'possession_home' : ("div", "class", "ball-possession-value graph-circle-number-value graph-circle-number-value__home-team"),
                 'possession_away' : ("div", "class", "ball-possession-value graph-circle-number-value graph-circle-number-value__away-team"),
                 'passes_home' : ("div", "class", "passes-value graph-bar--number-value graph-bar--number-value__home-team"),
                 'passes_away' : ("div", "class", "passes-value graph-bar--number-value graph-bar--number-value__away-team"),
                 'passes_completed_home' : ("div", "class", "passes-completed-value graph-bar--number-value graph-bar--number-value__home-team"),
                 'passes_completed_away' : ("div", "class", "passes-completed-value graph-bar--number-value graph-bar--number-value__away-team"),
                 'balls_recovered_home' : ("div", "class", "balls-recovered--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'balls_recovered_away' : ("div", "class", "balls-recovered--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'tackles_home' : ("div", "class", "taclkles--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'tackles_away' : ("div", "class", "taclkles--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'blocks_home' : ("div", "class", "blocks-completed--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'blocks_away' : ("div", "class", "blocks-completed--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'clearances_home' : ("div", "class", "clearances-completed--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'clearances_away' : ("div", "class", "clearances-completed--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'passes_accuracy_home' : ("span", "data-bind", "text: homePassesCompletion"),
                 'passes_accuracy_away' : ("span", "data-bind", "text: awayPassesCompletion")
                }
    
    # The following dictionary stores the information on the processed game.
    dict_feat_info = {}
    
    for feat in dict_feat:
        dict_feat_info[feat] = get_feature(uefa_main_page_soup, dict_feat[feat][0], dict_feat[feat][1], dict_feat[feat][2])
    
    # Due to a different design of the web page, we have to handle the information about the attemps in a different way
    # than in the case that there is a specific containing the statistics.
    
    try:
        attempts_home = [e.text for e in uefa_main_page_soup.find_all('div', {'class' : 'total-attempts--value graph-bar--number-value graph-bar--number-value__home-team'})]
        attempts_away = [e.text for e in uefa_main_page_soup.find_all('div', {'class' : 'total-attempts--value graph-bar--number-value graph-bar--number-value__away-team'})]
        
        dict_feat_info['attempts_total_home'] = attempts_home[0]
        dict_feat_info['attempts_on_target_home'] = attempts_home[1]
        dict_feat_info['attempts_off_target_home'] = attempts_home[2]
        
        dict_feat_info['attempts_total_away'] = attempts_away[0]
        dict_feat_info['attempts_on_target_away'] = attempts_away[1]
        dict_feat_info['attempts_off_target_away'] = attempts_away[2]
        
    except:
        feat_attempts = ['attempts_total_home', 'attempts_on_target_home', 'attempts_off_target_home',
                         'attempts_total_away', 'attempts_on_target_away', 'attempts_off_target_away'
                        ]
        
        for feat in feat_attempts:
            dict_feat_info[feat] = np.nan
            
    return dict_feat_info

In [11]:
def get_teams_and_date(soup):
    '''
    INPUT: Soup
    OUTPUT: Outputs a dictionary that contains the name of the home team, the away team and the date of the match.
    '''
    dic_teams_and_date = {}
    
    try:
        scripts = soup.find_all('script', {'type' : 'application/ld+json'})
        information = [json.loads(script.text) for script in scripts]
        
        dic_teams_and_date['team_home'] = information[1]['homeTeam']['name']
        dic_teams_and_date['team_away'] = information[1]['awayTeam']['name']
        dic_teams_and_date['date'] = information[1]['startDate']
        
    except:
        dic_teams_and_date['team_home'] = np.nan
        dic_teams_and_date['team_away'] = np.nan
        dic_teams_and_date['date'] = np.nan
        
    return dic_teams_and_date

In [12]:
def get_url_of_stats_page(uefa_main_page_url, uefa_main_page_soup):
    '''
    INPUT:  URL of the main page of a game played.
    OUPTUT: URL of the corresponding statistics page in case it exists.
    '''
    try:
        url_of_stats_rel = [link['href'] for link in uefa_main_page_soup.find_all('a') if link.text.strip() == 'Stats'][0]
        url_of_stats_abs = urljoin(uefa_main_page_url, url_of_stats_rel)

        return url_of_stats_abs
    except:
        return None

In [13]:
def extract_information_from_uefa(uefa_main_page_url):
    '''
    INPUT:  URL of the main page of a played game.
    OUTPUT: Outputs a dictionary that stores features for this game (home team, away team, date, several game statistics...)
    
    DESCRIPTION: At first we request the main page and extract the date and the teams. Unfortunately, the UEFA changed their
                 page design over time. Therefore, we investigate if there is a link to another page containing the statistics.
                 If that is true, then we have to make another request and process this statistics page afterwards. Otherwise,
                 we can parse the information from the main page.
    '''
    
    print('Process the following url: ', uefa_main_page_url)
    
    # The following dictionary will contain all information on the game and will be returned by the function in the end.
    dict_result = {}
    dict_result['url'] = uefa_main_page_url
    
    # Get Main Page Soup
    print('Request main page...')
    uefa_main_page_soup = request_page_and_generate_soup(uefa_main_page_url)
    
    # Parse Date, Home Team and Away Team
    dict_teams_and_date = get_teams_and_date(uefa_main_page_soup)
        
    # Try to find a link to a statistics page
    url_of_stats_page = get_url_of_stats_page(uefa_main_page_url, uefa_main_page_soup)
    
    # Extract features
    if url_of_stats_page == None:
        print('Extract features from main page...')
        dict_info = extract_stats_from_main_page(uefa_main_page_soup)
    else:
        print('Request stats page...')
        uefa_stats_page_soup = request_page_and_generate_soup(url_of_stats_page)
        dict_info = extract_stats_from_stats_page(uefa_stats_page_soup)
    
    # Update 
    dict_result.update(dict_teams_and_date)
    dict_result.update(dict_info)
    
    print()
        
    return dict_result

In [14]:
def get_teams_and_result(soup):
    '''
    INPUT: Soup
    OUTPUT: Outputs a dictionary that contains the name of home team, the away team and the date of the match.
    
    DESCRIPTION: This function is used for grabbing the the relevant information from the games played in UEFA Euro 2020.
                 Here, we are only interested in the teams and the result. When extracting the result, we have to take into
                 account that we are interested in the result after a possible penalty shootout.
    '''
    dic_teams_and_result = {}
    
    try:
        scripts = soup.find_all('script', {'type' : 'application/ld+json'})
        information = [json.loads(script.text) for script in scripts]
        
        dic_teams_and_result['team_home'] = information[1]['homeTeam']['name']
        dic_teams_and_result['team_away'] = information[1]['awayTeam']['name']
        dic_teams_and_result['goals_home'] = int(information[1]['name'].split()[1].split('-')[0])
        dic_teams_and_result['goals_away'] = int(information[1]['name'].split()[1].split('-')[1])
        
    except:
        dic_teams_and_result['team_home'] = np.nan
        dic_teams_and_result['team_away'] = np.nan
        dic_teams_and_result['goals_home'] = np.nan
        dic_teams_and_result['goals_away'] = np.nan
        
    try:
        result_penalties = soup.find('div', {'class' : 'js-post-match-penalties-score post-match-penalties-score'}).text
        goals_home_after_p = result_penalties.strip()[1:].split()[0].split('-')[0]
        goals_away_after_p = result_penalties.strip()[1:].split()[0].split('-')[1]
        
        dic_teams_and_result['goals_home'] += int(goals_home_after_p)
        dic_teams_and_result['goals_away'] += int(goals_away_after_p)
        
    except:
        pass
        
    return dic_teams_and_result

In [15]:
def extract_teams_and_result_from_uefa(uefa_main_page_url):
    '''
    INPUT:  URL of a game played in UEFA Euro 2020.
    OUTPUT: Dictionary containing the teams, which played the game and the result.
    '''
    
    print('Process the following url: ', uefa_main_page_url)
    
    dict_result = {}
    dict_result['url'] = uefa_main_page_url
    
    # Get Main Page Soup
    print('Request main page...')
    uefa_main_page_soup = request_page_and_generate_soup(uefa_main_page_url)
    
    
    # Parse Date, Home Team and Away Team
    dict_teams_and_result = get_teams_and_result(uefa_main_page_soup)
    
    dict_result.update(dict_teams_and_result)
    
    return dict_result

### Function to Aggregate Information

In [22]:
def generate_df_from_information(list_of_dicts):
    '''
    INPUT:  List of dictionaries, which contain the information and statistics of matches
    OUTPUT: All information aggregated in a single dataframe and a list of urls, where the crawling failed.
    '''
    
    df = pd.DataFrame(list_of_dicts)
    
    # All URLs where the crawling failed
    urls_failure = df[df['goals_home'].isnull()]['url']
    
    df = df[df['goals_home'].notnull()]
    
    return df, urls_failure

## Crawl Data

### Crawl Matches Played before UEFA Euro 2020

In this part we crawl all the data of the games played before the UEFA Euro 2020. The statistics of these games serve as basis for the training of a ML model.

#### Crawl data from wikipedia

In [17]:
# The URLs of the wikipedia pages that contain the links to the pages of the UEFA containing the match information
wiki_urls = ['https://en.wikipedia.org/wiki/2020%E2%80%9321_UEFA_Nations_League_A', 
             'https://en.wikipedia.org/wiki/2020%E2%80%9321_UEFA_Nations_League_B',
             'https://en.wikipedia.org/wiki/2020%E2%80%9321_UEFA_Nations_League_C',
             'https://en.wikipedia.org/wiki/2020%E2%80%9321_UEFA_Nations_League_D',
             'https://en.wikipedia.org/wiki/2018%E2%80%9319_UEFA_Nations_League_A',
             'https://en.wikipedia.org/wiki/2018%E2%80%9319_UEFA_Nations_League_B',
             'https://en.wikipedia.org/wiki/2018%E2%80%9319_UEFA_Nations_League_C',
             'https://en.wikipedia.org/wiki/2018%E2%80%9319_UEFA_Nations_League_D',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_A',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_B',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_C',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_D',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_E',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_F',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_G',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_H',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_I',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_J',
             'https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_play-offs'
             ]

In [18]:
uefa_all_match_urls = [find_all_match_urls_from_wiki_page(wiki_url) for wiki_url in wiki_urls]
uefa_all_match_urls_flattened = [url for urls_competition in uefa_all_match_urls for url in urls_competition]

#### Crawl data from UEFA: First attempt

In [19]:
game_information = [extract_information_from_uefa(url) for url in uefa_all_match_urls_flattened]

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029980/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029979/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030010/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030007/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030025/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030024/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030054/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030055/
Request main page...
Request stats p

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030008/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030013/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030028/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030027/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030057/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030056/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030100/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030101/
Request main 


Process the following url:  https://www.uefa.com/uefanationsleague/match/2030076/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030091/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030089/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030118/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030119/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029985/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029986/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030006/
Request main page...
Request stats 

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024037/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000956/match=2024419/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000956/match=2024420/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000957/match=2024421/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000958/match=2024422/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024053/index.html
R

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024153/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024146/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024155/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024148/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024145/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024154/index.html
R

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024106/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024108/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024107/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024110/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024109/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024112/index.html
R

Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026094/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026095/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2025993/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2025994/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026017/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026018/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026046/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefae

Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026035/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026036/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026037/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026061/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026062/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026063/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026073/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefae

Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026056/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026057/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026074/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026081/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026087/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026158/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026159/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefae

Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026090/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026163/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026164/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026165/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026166/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026173/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026174/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefae

In [23]:
df_1, urls_failures_1 = generate_df_from_information(game_information)

In [25]:
urls_failures_1

47    https://www.uefa.com/uefanationsleague/match/2...
48    https://www.uefa.com/uefanationsleague/match/2...
49    https://www.uefa.com/uefanationsleague/match/2...
50    https://www.uefa.com/uefanationsleague/match/2...
51    https://www.uefa.com/uefanationsleague/match/2...
61    https://www.uefa.com/uefanationsleague/match/2...
Name: url, dtype: object

#### Crawl data from UEFA: Second attempt

In [26]:
# Usually, we fail on crawling a couple of matches.
# Therefore, we make a second attempt on pages where we failed.

In [27]:
game_information_2 = [extract_information_from_uefa(url) for url in urls_failures_1]

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030114/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030758/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030759/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030760/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030761/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030096/
Request main page...
Request stats page...



In [29]:
df_2, urls_failures_2 = generate_df_from_information(game_information_2)

In [30]:
urls_failures_2

0    https://www.uefa.com/uefanationsleague/match/2...
1    https://www.uefa.com/uefanationsleague/match/2...
2    https://www.uefa.com/uefanationsleague/match/2...
3    https://www.uefa.com/uefanationsleague/match/2...
4    https://www.uefa.com/uefanationsleague/match/2...
5    https://www.uefa.com/uefanationsleague/match/2...
Name: url, dtype: object

#### Merge Data From Both Attempts and Export

In [31]:
# Aggregate the information gained in both attempts in a single dataframe
df_information = pd.concat([df_1, df_2]).reset_index(drop = True)

In [32]:
# Export the data into the folder data.
file_name_matches_training = 'matches_training_ml.xlsx'

df_information.to_excel(os.path.join(path_output, file_name_matches_training), index = False, encoding = 'iso-8859-1')

### Crawl Results UEFA Euro 2021

Here, we crawl the information of the games played at UEFA Euro 2020. For these matches we are only interested in the teams taking part and the results of the matches. These games will form the validation set in the end.

In [33]:
em_matches_urls = find_all_match_urls_from_wiki_page('https://en.wikipedia.org/wiki/UEFA_Euro_2020')

In [34]:
# Crawl information from matches of UEFA Euro 2020
matches_information_em = [extract_teams_and_result_from_uefa(url) for url in em_matches_urls]

Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024447/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024448/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024457/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024458/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024467/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024468/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024449/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024450/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024460/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024459/
Request main page...
Process th

In [35]:
df_matches_em_crawling = pd.DataFrame(data = matches_information_em)

In [36]:
# In some cases the crawling failed
df_matches_em_crawling[df_matches_em_crawling['team_home'].isnull()]

Unnamed: 0,url,team_home,team_away,goals_home,goals_away
16,https://www.uefa.com/uefaeuro-2020/match/2024445/,,,,
23,https://www.uefa.com/uefaeuro-2020/match/2024471/,,,,
46,https://www.uefa.com/uefaeuro-2020/match/2024488/,,,,


In [37]:
# The information of the missing games is added by hand
data_missing_matches = [{'url' : 'https://www.uefa.com/uefaeuro-2020/match/2024445/',
                         'team_home' : 'North Macedonia',
                         'team_away' : 'Netherlands',
                         'goals_home' : 0,
                         'goals_away' : 3
                        },
                        {'url' : 'https://www.uefa.com/uefaeuro-2020/match/2024471/',
                         'team_home' : 'Czech Republic',
                         'team_away' : 'England',
                         'goals_home' : 0,
                         'goals_away' : 1,
                        },
                        {'url' : 'https://www.uefa.com/uefaeuro-2020/match/2024488/',
                         'team_home' : 'Czech Republic',
                         'team_away' : 'Denmark',
                         'goals_home' : 1,
                         'goals_away' : 2,
                        }
                       ]

In [38]:
df_matches_missing = pd.DataFrame(data = data_missing_matches)

In [39]:
# Aggregation of the crawled match information and the data added by hand
df_matches_em = pd.concat([df_matches_em_crawling[df_matches_em_crawling['team_home'].notnull()], df_matches_missing], axis = 0) \
                                                                                        .reset_index(drop = True)

In [40]:
cols_to_int = ['goals_home', 'goals_away']

for col in cols_to_int:
    df_matches_em[col] = df_matches_em[col].astype(int)

In [41]:
df_matches_em.head(5)

Unnamed: 0,url,team_home,team_away,goals_home,goals_away
0,https://www.uefa.com/uefaeuro-2020/match/2024447/,Turkey,Italy,0,3
1,https://www.uefa.com/uefaeuro-2020/match/2024448/,Wales,Switzerland,1,1
2,https://www.uefa.com/uefaeuro-2020/match/2024457/,Turkey,Wales,0,2
3,https://www.uefa.com/uefaeuro-2020/match/2024458/,Italy,Switzerland,3,0
4,https://www.uefa.com/uefaeuro-2020/match/2024467/,Switzerland,Turkey,3,1


#### Export

In [42]:
file_name_euro_2020 = 'matches_euro_2020.xlsx'

df_matches_em.to_excel(os.path.join(path_output, file_name_euro_2020), index = False, encoding = 'iso-8859-1')