In [15]:
import numpy as np
import pandas as pd

In [16]:
import time
import json

In [17]:
import requests
from urllib.parse import urljoin

In [18]:
from bs4 import BeautifulSoup

In [19]:
def request_page_and_generate_soup(url, delay = 3, timeout = 30, parser = 'html.parser'):
    '''
    INPUT:  url     - URL to a web page
            delay   - Seconds to wait before the request is sent (default: 3)
            timeout - Seconds after which an unanswered request is aborted (default: 30)
            parser  - Name of the parser handed to BeautifulSoup (default: Python's built-in 'html.parser')
    OUTPUT: Soup made by BeautifulSoup that contains the HTML-code of the page.
            None if the page could not be requested or parsed.
    '''

    try:
        # A delay such that we do not make too many requests in a short time by mistake
        time.sleep(delay)

        # Without a timeout a single unresponsive server would block the whole crawl
        response = requests.get(url, timeout = timeout)

        # The parser is named explicitly such that the result does not depend on
        # which optional parsers happen to be installed
        return BeautifulSoup(response.text, parser)

    except Exception as error:
        # Best effort: the callers treat None as "page not available".
        # We catch Exception instead of using a bare except such that a
        # KeyboardInterrupt can still stop a long-running crawl.
        print('Request failed for', url, ':', error)
        return None

## Extract all URLs to UEFA Games from Wikipedia

In [20]:
def find_all_match_urls_from_wiki_page(wiki_url):
    '''
    INPUT:       URL to a wikipedia page
    OUTPUT:      List of the URLs to football games on the UEFA web page (empty if the page could not be loaded)
        
    DESCRIPTION: We perform a http-request to wikipedia via request_page_and_generate_soup.
                 Then, we extract the target of every link whose text is exactly 'Report'.
                 These are the links to the games on the UEFA web page.
    '''
    wiki_soup = request_page_and_generate_soup(wiki_url)

    # The request helper returns None if the page could not be loaded
    if wiki_soup is None:
        return []

    # href = True skips anchors without a target, so that a single broken
    # 'Report' link does not discard all other links of the page
    return [link['href'] for link in wiki_soup.find_all('a', href = True) if link.text == 'Report']

## Process all UEFA pages

In [21]:
def get_feature(soup, elem_type, parameter, name):
    '''
    INPUT:  soup      - Soup to search in (may be None if the page could not be loaded)
            elem_type - Name of the HTML-element, e.g. 'div'
            parameter - Name of the HTML-attribute the element is identified by, e.g. 'class'
            name      - Value this attribute must have
    OUTPUT: The text of the first matching element. np.nan if there is no soup or no such element.
    '''
    # No page, no feature
    if soup is None:
        return np.nan

    element = soup.find(elem_type, {parameter : name})

    # find returns None if nothing matches
    if element is None:
        return np.nan

    return element.text

In [22]:
def extract_stats_from_stats_page(uefa_stats_page_soup):
    '''
    INPUT:  Soup of the UEFA page that contains the statistics of a football game.
    OUTPUT: Dictionary with one entry per statistic and team ('<statistic>_home' and '<statistic>_away').
            Each value is whatever get_feature returns for the corresponding HTML-element.
    '''

    def bound_to(suffix):
        # data-bind value of the form "text: <side><suffix>", e.g. "text: homeCorner"
        return lambda side: "text: " + side + suffix

    def looked_up(statistic):
        # data-bind value that searches <side>Team.statistics for the entry named `statistic`
        def binding(side):
            lookup = "_.find(" + side + "Team.statistics,{name:'" + statistic + "'})"
            return "text: " + lookup + " ? " + lookup + ".value : 0"
        return binding

    # (prefix of the feature, HTML-element, builder of the data-bind value).
    # The order of this list defines the order of the keys in the result.
    feature_specs = [
        ('goals',               'div',  bound_to('GoalsScored')),
        ('attempts_total',      'div',  looked_up('attempts')),
        ('attempts_off_target', 'div',  bound_to('AttempsOff')),
        ('attempts_on_target',  'div',  bound_to('AttempsOn')),
        ('attempts_blocked',    'div',  looked_up('attempts_blocked')),
        ('corners',             'div',  bound_to('Corner')),
        ('offsides',            'div',  bound_to('Offside')),
        ('possession',          'div',  bound_to("BallPossession + '%'")),
        ('passes',              'div',  looked_up('passes_attempted')),
        ('passes_completed',    'div',  looked_up('passes_completed')),
        ('balls_recovered',     'div',  bound_to('RecoveredBalls')),
        ('tackles',             'div',  looked_up('tackles')),
        ('blocks',              'div',  looked_up('blocked')),
        ('clearances',          'div',  looked_up('clearance_completed')),
        ('passes_accuracy',     'span', bound_to('PassesCompletion')),
    ]

    # Home first, then away - for every statistic
    dict_feat_info = {}
    for prefix, elem_type, make_binding in feature_specs:
        for side in ('home', 'away'):
            dict_feat_info[prefix + '_' + side] = get_feature(uefa_stats_page_soup, elem_type, 'data-bind', make_binding(side))

    return dict_feat_info

In [23]:
def extract_stats_from_main_page(uefa_main_page_soup):
    '''
    INPUT:  Soup of the UEFA main page in case that there is no specific statistics page
            (may be None if the page could not be loaded).
    OUTPUT: Several information from the statistics stored in a dictionary. A feature that
            cannot be found on the page is stored as np.nan.
    '''
    
    # The features we want to extract from the page and the corresponding HTML-information
    dict_feat = {'goals_home' : ("div", "class", "goals-scored--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'goals_away' : ("div", "class", "goals-scored--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'attempts_blocked_home' : ("div", "class", "blocked--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'attempts_blocked_away' : ("div", "class", "blocked--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'corners_home' : ("div", "class", "corner--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'corners_away' : ("div", "class", "corner--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'offsides_home' : ("div", "class", "offside--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'offsides_away' : ("div", "class", "offside--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'possession_home' : ("div", "class", "ball-possession-value graph-circle-number-value graph-circle-number-value__home-team"),
                 'possession_away' : ("div", "class", "ball-possession-value graph-circle-number-value graph-circle-number-value__away-team"),
                 'passes_home' : ("div", "class", "passes-value graph-bar--number-value graph-bar--number-value__home-team"),
                 'passes_away' : ("div", "class", "passes-value graph-bar--number-value graph-bar--number-value__away-team"),
                 'passes_completed_home' : ("div", "class", "passes-completed-value graph-bar--number-value graph-bar--number-value__home-team"),
                 'passes_completed_away' : ("div", "class", "passes-completed-value graph-bar--number-value graph-bar--number-value__away-team"),
                 'balls_recovered_home' : ("div", "class", "balls-recovered--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'balls_recovered_away' : ("div", "class", "balls-recovered--value graph-bar--number-value graph-bar--number-value__away-team"),
                 # NOTE(review): 'taclkles' is kept exactly as in the original selector - presumably
                 # this is the spelling used on the page; verify before "correcting" it
                 'tackles_home' : ("div", "class", "taclkles--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'tackles_away' : ("div", "class", "taclkles--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'blocks_home' : ("div", "class", "blocks-completed--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'blocks_away' : ("div", "class", "blocks-completed--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'clearances_home' : ("div", "class", "clearances-completed--value graph-bar--number-value graph-bar--number-value__home-team"),
                 'clearances_away' : ("div", "class", "clearances-completed--value graph-bar--number-value graph-bar--number-value__away-team"),
                 'passes_accuracy_home' : ("span", "data-bind", "text: homePassesCompletion"),
                 'passes_accuracy_away' : ("span", "data-bind", "text: awayPassesCompletion")
                }

    # The dictionary that stores the information.
    dict_feat_info = {}
    
    for feat, (elem_type, parameter, name) in dict_feat.items():
        dict_feat_info[feat] = get_feature(uefa_main_page_soup, elem_type, parameter, name)
    
    # In this case we have to handle the information about the attempts in a different way:
    # all three attempt values of a team share one class. They are read by position
    # (total, on target, off target).
    attempts_class = 'total-attempts--value graph-bar--number-value graph-bar--number-value__{}-team'
    attempts_feats = ['attempts_total', 'attempts_on_target', 'attempts_off_target']
    sides = ['home', 'away']

    attempts = {}
    for side in sides:
        if uefa_main_page_soup is None:
            attempts[side] = []
        else:
            attempts[side] = [e.text for e in uefa_main_page_soup.find_all('div', {'class' : attempts_class.format(side)})]

    # As before, the attempts are only used if they are complete for both teams.
    # Otherwise all six features are marked as missing.
    attempts_complete = all(len(attempts[side]) >= len(attempts_feats) for side in sides)

    for side in sides:
        for position, feat in enumerate(attempts_feats):
            dict_feat_info[feat + '_' + side] = attempts[side][position] if attempts_complete else np.nan
            
    return dict_feat_info

In [24]:
def get_teams_and_date(soup):
    '''
    INPUT:  Soup of a match page (may be None if the page could not be loaded).
    OUTPUT: Outputs a dictionary that contains the name of the home team ('team_home'), the away team
            ('team_away') and the date of the match ('date'). If one of them cannot be read, then all
            three values are np.nan.
    '''
    try:
        scripts = soup.find_all('script', {'type' : 'application/ld+json'})
        information = [json.loads(script.text) for script in scripts]

        # The code relies on the second JSON-LD block being the one that describes the match
        # (presumably a property of the UEFA pages - verify if the extraction starts to fail)
        match = information[1]

        return {'team_home' : match['homeTeam']['name'],
                'team_away' : match['awayTeam']['name'],
                'date' : match['startDate']
               }

    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # No soup, too few JSON-LD blocks, missing keys or invalid JSON (JSONDecodeError is a ValueError).
        # Other exceptions - in particular KeyboardInterrupt - are not swallowed anymore.
        return {'team_home' : np.nan,
                'team_away' : np.nan,
                'date' : np.nan
               }

In [25]:
def get_url_of_stats_page(uefa_main_page_url, uefa_main_page_soup):
    '''
    INPUT:  uefa_main_page_url  - URL of the main page of a game played.
            uefa_main_page_soup - Soup of this main page (may be None if the page could not be loaded).
    OUTPUT: Absolute URL of the corresponding statistics page in case it exists. Otherwise None.
    '''
    if uefa_main_page_soup is None:
        return None

    for link in uefa_main_page_soup.find_all('a'):
        target = link.get('href')

        # The first link named 'Stats' that actually has a target wins
        if target is not None and link.text.strip() == 'Stats':
            # The target may be relative to the main page
            return urljoin(uefa_main_page_url, target)

    return None

In [26]:
def extract_information_from_uefa(uefa_main_page_url):
    '''
    INPUT:  URL of the main page of a played game.
    OUTPUT: Outputs a dictionary that stores features for this game: 'url', 'team_home', 'team_away', 'date'
            and the game statistics. Features that could not be extracted are np.nan.
    
    DESCRIPTION: At first we request the main page and extract the date and the teams. Unfortunately, the UEFA changed their
                 page design over time. Therefore, we investigate if there is a link to another page containing the statistics.
                 If that is true, then we have to make another request and process this statistics page afterwards. Otherwise,
                 we can parse the information from the main page.
                 The progress is printed such that a long crawl can be followed.
    '''
    
    print('Process the following url: ', uefa_main_page_url)
    
    dict_result = {'url' : uefa_main_page_url}
    
    # Get Main Page Soup (None if the request failed; the helpers below can cope with that)
    print('Request main page...')
    uefa_main_page_soup = request_page_and_generate_soup(uefa_main_page_url)
    
    # Parse Date, Home Team and Away Team
    dict_teams_and_date = get_teams_and_date(uefa_main_page_soup)
        
    # Try to find a link to a statistics page
    url_of_stats_page = get_url_of_stats_page(uefa_main_page_url, uefa_main_page_soup)
    
    # Extract features
    if url_of_stats_page is None:
        print('Extract features from main page...')
        dict_info = extract_stats_from_main_page(uefa_main_page_soup)
    else:
        print('Request stats page...')
        uefa_stats_page_soup = request_page_and_generate_soup(url_of_stats_page)
        dict_info = extract_stats_from_stats_page(uefa_stats_page_soup)
    
    # Update: first the teams and the date, then the statistics
    dict_result.update(dict_teams_and_date)
    dict_result.update(dict_info)
    
    # Empty line that separates the log of this game from the next one
    print()
        
    return dict_result

### Preprocessing

In [72]:
def generate_df_from_information(list_of_dicts):
    '''
    INPUT:  List of dictionaries, one per game, as returned by extract_information_from_uefa.
    OUTPUT: A tuple (df, urls_failure):
            df           - DataFrame with all games for which 'goals_home' could be extracted.
            urls_failure - Series with the URLs of all other games, i.e. the games to crawl again.
    '''
    df = pd.DataFrame(list_of_dicts)

    # Nothing was crawled: there is no column to filter on and no failure to report
    if 'goals_home' not in df.columns:
        return df, pd.Series([], dtype = object)

    # extract_information_from_uefa stores the address under 'url'.
    # The spelling 'URL' is still accepted for data that uses it.
    url_column = 'url' if 'url' in df.columns else 'URL'

    # A game without 'goals_home' counts as failed
    is_failure = df['goals_home'].isnull()

    return df[~is_failure], df.loc[is_failure, url_column]

## Crawl Data

### Crawl data from wikipedia

In [27]:
# Wikipedia pages that list the games (each with a 'Report' link):
# both Nations League seasons (leagues A-D), the Euro 2020 qualifying groups A-J and the play-offs
wiki_urls = (
    ['https://en.wikipedia.org/wiki/' + season + '_UEFA_Nations_League_' + league
     for season in ('2020%E2%80%9321', '2018%E2%80%9319')
     for league in 'ABCD']
    + ['https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_Group_' + group
       for group in 'ABCDEFGHIJ']
    + ['https://en.wikipedia.org/wiki/UEFA_Euro_2020_qualifying_play-offs']
)

In [28]:
# One list of match URLs per Wikipedia page, in the order of wiki_urls
uefa_all_match_urls = list(map(find_all_match_urls_from_wiki_page, wiki_urls))

In [29]:
# Show the collected URLs (a list of lists: one inner list per Wikipedia page)
uefa_all_match_urls

[['https://www.uefa.com/uefanationsleague/match/2029980/',
  'https://www.uefa.com/uefanationsleague/match/2029979/',
  'https://www.uefa.com/uefanationsleague/match/2030010/',
  'https://www.uefa.com/uefanationsleague/match/2030007/',
  'https://www.uefa.com/uefanationsleague/match/2030025/',
  'https://www.uefa.com/uefanationsleague/match/2030024/',
  'https://www.uefa.com/uefanationsleague/match/2030054/',
  'https://www.uefa.com/uefanationsleague/match/2030055/',
  'https://www.uefa.com/uefanationsleague/match/2030098/',
  'https://www.uefa.com/uefanationsleague/match/2030099/',
  'https://www.uefa.com/uefanationsleague/match/2030124/',
  'https://www.uefa.com/uefanationsleague/match/2030121/',
  'https://www.uefa.com/uefanationsleague/match/2029987/',
  'https://www.uefa.com/uefanationsleague/match/2029993/',
  'https://www.uefa.com/uefanationsleague/match/2030017/',
  'https://www.uefa.com/uefanationsleague/match/2030019/',
  'https://www.uefa.com/uefanationsleague/match/2030034/

In [31]:
# Put the URLs of all competitions into one flat list (order is kept)
uefa_all_match_urls_flattened = []
for urls_competition in uefa_all_match_urls:
    uefa_all_match_urls_flattened.extend(urls_competition)

### Crawl data from UEFA: First attempt

In [40]:
# Crawl every game: one dictionary of features per URL
game_information = list(map(extract_information_from_uefa, uefa_all_match_urls_flattened))

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029980/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029979/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030010/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030007/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030025/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030024/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030054/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030055/
Request main page...
Request stats p

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030008/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030013/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030028/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030027/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030057/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030056/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030100/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030101/
Request main 


Process the following url:  https://www.uefa.com/uefanationsleague/match/2030076/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030091/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030089/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030118/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030119/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029985/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2029986/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030006/
Request main page...
Request stats 

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024037/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000956/match=2024419/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000956/match=2024420/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000957/match=2024421/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000958/match=2024422/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024053/index.html
R

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024153/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024146/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024155/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024148/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024145/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024154/index.html
R

Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024106/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024108/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024107/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024110/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024109/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024112/index.html
R

Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026094/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026095/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2025993/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2025994/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026017/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026018/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026046/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefae


Process the following url:  https://www.uefa.com/uefaeuro/match/2026011/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026035/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026036/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026037/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026061/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026062/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026063/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026073/
Request main pag

Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026051/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026056/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026057/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026074/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026081/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026087/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026158/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefae

Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026085/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026090/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026163/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026164/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026165/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026166/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefaeuro/match/2026173/
Request main page...
Extract features from main page...

Process the following url:  https://www.uefa.com/uefae

In [61]:
# Split the first attempt into the successfully crawled games and the URLs that have to be crawled again
df_1, urls_failures_1 = generate_df_from_information(list_of_dicts = game_information)

### Crawl data from UEFA: Second attempt

In [51]:
# Second attempt: crawl only the games that failed in the first attempt
game_information_2 = list(map(extract_information_from_uefa, urls_failures_1))

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030114/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030758/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030759/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030760/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030761/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/match/2030096/
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=2019/matches/round=2000959/match=2024080/index.html
Request main page...
Request stats page...

Process the following url:  https://www.uefa.com/uefanationsleague/season=

In [67]:
# NOTE(review): `game_information_second_attempt` is not defined in any visible cell of this notebook;
# on a fresh kernel this line raises a NameError. Presumably it stems from a deleted cell - TODO confirm.
# The assignment replaces the `game_information_2` created by the second crawl.
game_information_2 = game_information_second_attempt

In [68]:
# Split the second attempt in the same way; urls_failures_2 are the games that failed twice
df_2, urls_failures_2 = generate_df_from_information(list_of_dicts = game_information_2)

### Merge data from both attempts

In [70]:
# Stack the games of both attempts and number the rows from 0 again
df_information = pd.concat([df_1, df_2], ignore_index = True)

In [73]:
# Store the crawled raw data. The `encoding` keyword is not passed anymore: to_excel never used it,
# it was deprecated in pandas 1.5 and removed in pandas 2.0, where it raises a TypeError.
df_information.to_excel('data_from_uefa.xlsx', index = False)

## Data wrangling

In [75]:
cols_possession = ['possession_home', 'possession_away']

# Remove the trailing '%' of the possession values.
# Only a real '%' is removed (instead of blindly cutting the last character), so the cell can be
# run again without destroying digits, and missing values (np.nan) are passed through unchanged.
for col in cols_possession:
    df_information[col] = df_information[col].apply(lambda value : value.rstrip('%') if isinstance(value, str) else value)

In [78]:
# Convert the date of the match to a timestamp.
# The crawler functions of this notebook name the column 'date', this cell expected 'Date';
# the column is therefore looked up independently of upper and lower case.
for col in df_information.columns:
    if col.lower() == 'date':
        df_information[col] = pd.to_datetime(df_information[col])

In [80]:
cols_blacklist = ['URL', 'Team_Home', 'Team_Away', 'Date']

# The crawler functions of this notebook produce these columns in lower case ('url', 'team_home', ...).
# The comparison ignores upper and lower case such that the non-numeric columns are skipped either way.
cols_blacklist_lower = {col.lower() for col in cols_blacklist}

# All remaining columns are game statistics and are converted to integers
for col in df_information.columns:
    if col.lower() not in cols_blacklist_lower:
        df_information[col] = df_information[col].astype(int)

In [99]:
# The team columns are called 'team_home' / 'team_away' when they come from the crawler functions
# of this notebook and 'Team_Home' / 'Team_Away' otherwise - both spellings are found here
cols_teams = [col for col in df_information.columns if col.lower() in ('team_home', 'team_away')]

# Use one name for the team that appears under two names in the data
for col in cols_teams:
    df_information[col] = df_information[col].apply(lambda team : team if team != 'FYR Macedonia' else 'North Macedonia')

## Feature engineering

In [100]:
# Base table for the feature engineering: no URL column and lower-case column names.
# errors = 'ignore' makes the cell work for both spellings of the URL column ('URL' and the 'url'
# produced by the crawler functions); rename skips names that do not exist.
# drop returns a new DataFrame, so df_information itself is not changed.
df_feat = (df_information
           .drop(columns = ['URL', 'url'], errors = 'ignore')
           .rename(columns = {'Team_Home' : 'team_home',
                              'Team_Away' : 'team_away',
                              'Date' : 'date'
                             }))

In [101]:
# Order the games by home team and, per home team, chronologically ...
df_feat_home = df_feat.sort_values(by = ['team_home', 'date'])
# ... and number the rows from 0 again
df_feat_home = df_feat_home.reset_index(drop = True)

In [103]:
# First row of every home team, i.e. its earliest home game in the sorted table
df_feat_home.groupby('team_home').head(1)

Unnamed: 0,team_home,team_away,date,goals_home,goals_away,attempts_total_home,attempts_total_away,attempts_off_target_home,attempts_off_target_away,attempts_on_target_home,...,balls_recovered_home,balls_recovered_away,tackles_home,tackles_away,blocks_home,blocks_away,clearances_home,clearances_away,passes_accuracy_home,passes_accuracy_away
0,Albania,Israel,2018-09-07 18:45:00+00:00,1,0,14,14,3,6,5,...,29,33,7,10,4,6,0,0,87,86
10,Andorra,Kazakhstan,2018-09-10 18:45:00+00:00,1,1,2,13,1,7,1,...,16,30,8,6,2,0,0,0,52,81
21,Armenia,Liechtenstein,2018-09-06 16:00:00+00:00,2,1,28,5,8,3,13,...,19,15,1,6,1,7,0,0,83,51
32,Austria,Northern Ireland,2018-10-12 18:45:00+00:00,1,0,10,8,4,5,3,...,27,34,1,2,2,3,0,0,77,70
42,Azerbaijan,Kosovo,2018-09-07 16:00:00+00:00,0,0,9,3,7,3,2,...,35,27,2,9,0,0,0,0,80,71
52,Belarus,San Marino,2018-09-08 16:00:00+00:00,5,0,35,3,18,1,12,...,37,25,5,11,1,5,0,0,90,64
62,Belgium,Switzerland,2018-10-12 18:45:00+00:00,2,1,19,16,7,3,7,...,35,37,4,10,8,5,0,0,86,88
72,Bosnia and Herzegovina,Austria,2018-09-11 18:45:00+00:00,1,0,6,13,1,7,1,...,58,45,4,8,5,4,0,0,79,79
83,Bulgaria,Norway,2018-09-09 16:00:00+00:00,1,0,3,13,1,3,2,...,35,29,6,8,6,0,0,0,71,75
94,Croatia,England,2018-10-12 18:45:00+00:00,0,0,9,5,3,3,3,...,34,28,6,1,0,3,0,0,79,83


In [None]:
# NOTE(review): `df` is not assigned at notebook level in any visible cell and this cell was never
# executed (`In [None]`) - it looks like a leftover; TODO confirm and remove.
df

In [85]:
# NOTE(review): without parentheses this only evaluates to the bound method; nothing is sorted.
# It looks like an unfinished cell - TODO confirm and complete or remove.
df_feat.sort_values

Unnamed: 0,url,team_home,team_away,date,goals_home,goals_away,attempts_total_home,attempts_total_away,attempts_off_target_home,attempts_off_target_away,...,balls_recovered_home,balls_recovered_away,tackles_home,tackles_away,blocks_home,blocks_away,clearances_home,clearances_away,passes_accuracy_home,passes_accuracy_away
0,https://www.uefa.com/uefanationsleague/match/2...,Italy,Bosnia and Herzegovina,2020-09-04 18:45:00+00:00,1,1,19,9,11,4,...,33,33,2,1,1,4,0,0,87,78
1,https://www.uefa.com/uefanationsleague/match/2...,Netherlands,Poland,2020-09-04 18:45:00+00:00,1,0,15,2,6,1,...,30,33,3,4,0,5,0,0,87,77
2,https://www.uefa.com/uefanationsleague/match/2...,Bosnia and Herzegovina,Poland,2020-09-07 18:45:00+00:00,1,2,7,14,4,7,...,28,25,3,6,2,1,0,0,79,86
3,https://www.uefa.com/uefanationsleague/match/2...,Netherlands,Italy,2020-09-07 18:45:00+00:00,0,1,11,17,4,10,...,36,44,7,4,3,4,0,0,80,89
4,https://www.uefa.com/uefanationsleague/match/2...,Bosnia and Herzegovina,Netherlands,2020-10-11 16:00:00+00:00,0,0,6,14,2,8,...,38,42,3,2,2,3,0,0,78,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,https://www.uefa.com/uefanationsleague/season=...,Russia,Sweden,2018-10-11 19:45:00+00:00,0,0,12,10,6,6,...,42,38,5,8,3,4,0,0,83,77
560,https://www.uefa.com/uefanationsleague/season=...,Russia,Turkey,2018-10-14 16:00:00+00:00,2,0,11,9,5,5,...,44,38,2,3,0,1,0,0,75,80
561,https://www.uefa.com/uefanationsleague/season=...,Northern Ireland,Bosnia and Herzegovina,2018-09-08 13:00:00+00:00,1,2,26,5,8,2,...,31,31,3,7,0,10,0,0,84,72
562,https://www.uefa.com/uefaeuro/match/2026101/,Switzerland,Georgia,2019-11-15 19:45:00+00:00,1,0,25,14,7,6,...,43,41,4,5,4,8,0,0,90,80


Unnamed: 0,URL,Team_Home,Team_Away,Date,goals_home,goals_away,attempts_total_home,attempts_total_away,attempts_off_target_home,attempts_off_target_away,...,balls_recovered_home,balls_recovered_away,tackles_home,tackles_away,blocks_home,blocks_away,clearances_home,clearances_away,passes_accuracy_home,passes_accuracy_away
0,https://www.uefa.com/uefanationsleague/match/2...,Italy,Bosnia and Herzegovina,2020-09-04 18:45:00+00:00,1,1,19,9,11,4,...,33,33,2,1,1,4,0,0,87,78
1,https://www.uefa.com/uefanationsleague/match/2...,Netherlands,Poland,2020-09-04 18:45:00+00:00,1,0,15,2,6,1,...,30,33,3,4,0,5,0,0,87,77
2,https://www.uefa.com/uefanationsleague/match/2...,Bosnia and Herzegovina,Poland,2020-09-07 18:45:00+00:00,1,2,7,14,4,7,...,28,25,3,6,2,1,0,0,79,86
3,https://www.uefa.com/uefanationsleague/match/2...,Netherlands,Italy,2020-09-07 18:45:00+00:00,0,1,11,17,4,10,...,36,44,7,4,3,4,0,0,80,89
4,https://www.uefa.com/uefanationsleague/match/2...,Bosnia and Herzegovina,Netherlands,2020-10-11 16:00:00+00:00,0,0,6,14,2,8,...,38,42,3,2,2,3,0,0,78,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,https://www.uefa.com/uefanationsleague/season=...,Russia,Sweden,2018-10-11 19:45:00+00:00,0,0,12,10,6,6,...,42,38,5,8,3,4,0,0,83,77
560,https://www.uefa.com/uefanationsleague/season=...,Russia,Turkey,2018-10-14 16:00:00+00:00,2,0,11,9,5,5,...,44,38,2,3,0,1,0,0,75,80
561,https://www.uefa.com/uefanationsleague/season=...,Northern Ireland,Bosnia and Herzegovina,2018-09-08 13:00:00+00:00,1,2,26,5,8,2,...,31,31,3,7,0,10,0,0,84,72
562,https://www.uefa.com/uefaeuro/match/2026101/,Switzerland,Georgia,2019-11-15 19:45:00+00:00,1,0,25,14,7,6,...,43,41,4,5,4,8,0,0,90,80
