In [2]:
import numpy as np
import pandas as pd

In [3]:
import time
import json

In [4]:
import requests
from urllib.parse import urljoin

In [5]:
from bs4 import BeautifulSoup

In [6]:
def request_page_and_generate_soup(url):
    '''
    INPUT:  URL to a web page
    OUTPUT: Soup made by BeautifulSoup that contains the HTML-code of the page
    '''
    
    try:
        # A delay of three seconds such that we do not make too many requests in a short time by mistake
        time.sleep(3)

        content = requests.get(url)
        text    = content.text
        soup    = BeautifulSoup(text)

        return soup
    
    except:
        return None

In [120]:
def get_teams_and_result(soup):
    '''
    INPUT: Soup
    OUTPUT: Outputs a dictionary that contains the name of home team, the away team and the date of the match.
    '''
    dic_teams_and_date = {}
    
    try:
        scripts = soup.find_all('script', {'type' : 'application/ld+json'})
        information = [json.loads(script.text) for script in scripts]
        
        dic_teams_and_date['team_home'] = information[1]['homeTeam']['name']
        dic_teams_and_date['team_away'] = information[1]['awayTeam']['name']
        dic_teams_and_date['goals_home'] = int(information[1]['name'].split()[1].split('-')[0])
        dic_teams_and_date['goals_away'] = int(information[1]['name'].split()[1].split('-')[1])
        
    except:
        dic_teams_and_date['team_home'] = np.nan
        dic_teams_and_date['team_away'] = np.nan
        dic_teams_and_date['goals_home'] = np.nan
        dic_teams_and_date['goals_away'] = np.nan
        
    try:
        result_penalties = soup.find('div', {'class' : 'js-post-match-penalties-score post-match-penalties-score'}).text
        goals_home_after_p = result_penalties.strip()[1:].split()[0].split('-')[0]
        goals_away_after_p = result_penalties.strip()[1:].split()[0].split('-')[1]
        
        dic_teams_and_date['goals_home'] += int(goals_home_after_p)
        dic_teams_and_date['goals_away'] += int(goals_away_after_p)
        
    except:
        pass
        
    return dic_teams_and_date

In [102]:
em_matches_urls = find_all_match_urls_from_wiki_page('https://en.wikipedia.org/wiki/UEFA_Euro_2020')

In [121]:
def extract_teams_and_result_from_uefa(uefa_main_page_url):
    '''
    INPUT:  ...
    OUTPUT: ...
    
    DESCRIPTION: ...
    '''
    
    print('Process the following url: ', uefa_main_page_url)
    
    dict_result = {}
    dict_result['url'] = uefa_main_page_url
    
    # Get Main Page Soup
    print('Request main page...')
    uefa_main_page_soup = request_page_and_generate_soup(uefa_main_page_url)
    
    
    # Parse Date, Home Team and Away Team
    dict_teams_and_date = get_teams_and_result(uefa_main_page_soup)
    
    dict_result.update(dict_teams_and_date)
    
    return dict_result

In [122]:
game_information_em = [extract_teams_and_result_from_uefa(url) for url in em_matches_urls]

Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024447/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024448/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024457/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024458/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024467/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024468/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024449/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024450/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024460/
Request main page...
Process the following url:  https://www.uefa.com/uefaeuro-2020/match/2024459/
Request main page...
Process th

In [129]:
df_matches_em_crawling = pd.DataFrame(data = game_information_em)

In [130]:
df_matches_em_crawling[df_matches_em_crawling['team_home'].isnull()]

Unnamed: 0,url,team_home,team_away,goals_home,goals_away
16,https://www.uefa.com/uefaeuro-2020/match/2024445/,,,,
23,https://www.uefa.com/uefaeuro-2020/match/2024471/,,,,
46,https://www.uefa.com/uefaeuro-2020/match/2024488/,,,,


In [128]:
data_missing_matches = [{'url' : 'https://www.uefa.com/uefaeuro-2020/match/2024445/',
                         'team_home' : 'North Macedonia',
                         'team_away' : 'Netherlands',
                         'goals_home' : 0,
                         'goals_away' : 3
                        },
                        {'url' : 'https://www.uefa.com/uefaeuro-2020/match/2024471/',
                         'team_home' : 'Czech Republic',
                         'team_away' : 'England',
                         'goals_home' : 0,
                         'goals_away' : 1,
                        },
                        {'url' : 'https://www.uefa.com/uefaeuro-2020/match/2024488/',
                         'team_home' : 'Czech Republic',
                         'team_away' : 'Denmark',
                         'goals_home' : 1,
                         'goals_away' : 2,
                        }
                       ]

In [131]:
df_matches_missing = pd.DataFrame(data = data_missing_matches)

In [134]:
df_matches_em = pd.concat([df_matches_em_crawling[df_matches_em_crawling['team_home'].notnull()], df_matches_missing], axis = 0) \
                                                                                        .reset_index(drop = True)

In [136]:
cols_to_int = ['goals_home', 'goals_away']

for col in cols_to_int:
    df_matches_em[col] = df_matches_em[col].astype(int)

In [137]:
df_matches_em

Unnamed: 0,url,team_home,team_away,goals_home,goals_away
0,https://www.uefa.com/uefaeuro-2020/match/2024447/,Turkey,Italy,0,3
1,https://www.uefa.com/uefaeuro-2020/match/2024448/,Wales,Switzerland,1,1
2,https://www.uefa.com/uefaeuro-2020/match/2024457/,Turkey,Wales,0,2
3,https://www.uefa.com/uefaeuro-2020/match/2024458/,Italy,Switzerland,3,0
4,https://www.uefa.com/uefaeuro-2020/match/2024467/,Switzerland,Turkey,3,1
5,https://www.uefa.com/uefaeuro-2020/match/2024468/,Italy,Wales,1,0
6,https://www.uefa.com/uefaeuro-2020/match/2024449/,Denmark,Finland,0,1
7,https://www.uefa.com/uefaeuro-2020/match/2024450/,Belgium,Russia,3,0
8,https://www.uefa.com/uefaeuro-2020/match/2024460/,Finland,Russia,0,1
9,https://www.uefa.com/uefaeuro-2020/match/2024459/,Denmark,Belgium,1,2


In [139]:
df_matches_em.to_excel('../data/matches_em.xlsx', index = False, encoding = 'iso-8859-1')