In [1]:
# %% Install dependencies on Google Colab (remove the # sign in front of the syntaxes to install the dependencies to your environment)
#!pip install selenium
#!apt-get update
#!apt install chromium-chromedriver

In [2]:
# %% import needed libraries
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
from time import sleep

import warnings
warnings.filterwarnings('ignore')

In [3]:
# %% Set the url link
url = "https://www.flashscore.com/football/england/premier-league/results/"

In [4]:
# %% Configure web driver for the scrape job

options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=options)

In [5]:
# %% Initiate connection to the url
html = driver.get(url)
# html = html.text
sleep(5)

In [6]:
# %% Initiate beautiful soup
soup = bs(driver.page_source, 'lxml')

In [7]:
# %% Find the class that holds soccer information
divs = soup.find('div', {'class': 'sportName soccer'})

In [8]:
# %% Create an empty list to store the match statistics.
stat_column = []

In [9]:
# %%  Fetch goal details
def parse_goal(bsTag, ownGoal):
    goalTime = bsTag.find(
        'div', {'class': 'smv__timeBox'}).text.replace("'", '')
    scorer = bsTag.find('a', {'class': 'smv__playerName'}).text
    if ownGoal:
        assist = 'NA'
        isOwnGoal = True
    else:
        try:
            assist = bsTag.find_all('div')[-1].find('a').text
        except:
            try:
                assist = bsTag.find('div', {'class': "smv__subIncident"}).text.replace(
                    '(', '').replace(')', '')
            except:
                assist = 'Not assisted'
        isOwnGoal = False
    return [goalTime, scorer, assist, isOwnGoal]


In [10]:
#%% Fetch card details
def parse_card(bsTag, card_type):
    cardTime = bsTag.find(
        'div', {'class': 'smv__timeBox'}).text.replace("'", '')
    player = bsTag.find('a', {'class': 'smv__playerName'}).text
    if card_type == 'red':
        isRed = True
    else:
        isRed = False
    why = bsTag.find_all('div')[-1].text.replace('(', '').replace(')', '')
    return [cardTime, isRed, why]


In [11]:
#%% Fetch substitution data
def parse_substitution(bsTag):
    subTime = bsTag.find(
        'div', {'class': 'smv__timeBox'}).text.replace("'", '')
    try:
        player = bsTag.find('a', {'class': 'smv__playerName'}).text
    except:
        player = 'error'
    try:
        outPlayer = bsTag.find(
            'a', {'class': 'smv__subDown smv__playerName'}).text
    except:
        # bsTag.find('div',{'class':'smv__incidentSubOut '}).find('a').text
        outPlayer = 'error'

    return [subTime, player, outPlayer]


In [12]:
# %%
all_div = divs.find_all(recursive=False)

In [13]:
# %%
match = []


In [14]:
# %%

In [15]:
# %%
rounds = []


In [16]:
# %%
for i in all_div[1:]:
    if len(rounds) > 7:
        break

    if i.text.startswith('Round'):
        rounds.append(i.text)
    else:
        match.append(i)


In [17]:
# %% Define a function to fetch match data
def get_stats(all_stat):
    first_sec = []
    home_event = []
    away_event = []

    for i in all_stat:

        if 'section__title' in i.get('class'):
            score = i.text.split('Half')[1]
            first_sec.append(score)

        elif 'smv__empty' in i.get('class'):
            continue
        else:
            if 'smv__homeParticipant' in i.get('class'):
                event_type = i.find('svg').get('class')
                if event_type[0] == 'soccer':
                    if len(event_type) > 1:
                        itsOwnGoal = True
                    else:
                        itsOwnGoal = False
                    ans = parse_goal(i, itsOwnGoal)
                    print(ans)
                    home_event.append(ans)
                elif event_type[0] == 'card-ico':
                    try:
                        if 'yellow' in event_type[1]:
                            ans = parse_card(i, 'yellow')
                        else:
                            ans = parse_card(i, 'red')
                    except:
                        ans = parse_card(i, 'red')
                        print(ans)
                        home_event.append(ans)
                else:
                    ans = parse_substitution(i)
                    print(ans)
                    home_event.append(ans)
            else:
                event_type = i.find('svg').get('class')
                if event_type[0] == 'soccer':
                    if len(event_type) > 1:
                        itsOwnGoal = True
                    else:
                        itsOwnGoal = False
                    ans = parse_goal(i, itsOwnGoal)
                    print(ans)
                    away_event.append(ans)
                elif event_type[0] == 'card-ico':
                    try:
                        if 'yellow' in event_type[1]:
                            ans = parse_card(i, 'yellow')
                        else:
                            ans = parse_card(i, 'red')
                            print(ans)
                    except:
                        ans = parse_card(i, 'red')
                        print(ans)
                    away_event.append(ans)
                else:
                    ans = parse_substitution(i)
                    print(ans)
                    away_event.append(ans)
    return first_sec, home_event, away_event


In [18]:
# %% Define a column for fetch match statistics.
def get_stats_match():
    global stat_column
    soup2 = bs(driver.page_source, 'lxml')
    x = soup2.find_all('div', {'class': 'stat__category'})
    match_stat = []
    y = [i.find_all(recursive=True) for i in x]
    for l in y:
        temp = [i.text for i in l]
        match_stat.append([temp[0], temp[2]])
        if temp[1] not in stat_column:
            stat_column.append(temp[1])
        print(match_stat)
    return match_stat


In [19]:
# %% Create an empty list to store the scraped data
all_data = []


In [20]:
# %% Scrape the respective match data and append them to the empty list (all_data).
for k in match:
    try:
        home = k.find('div', {
                      "class": "event__participant event__participant--home fontExtraBold"}).text
        away = k.find(
            'div', {"class": "event__participant event__participant--away"}).text
    except:
        try:
            home = k.find(
                'div', {"class": "event__participant event__participant--home"}).text
            away = k.find('div', {
                          "class": "event__participant event__participant--away fontExtraBold"}).text
        except:
            home = k.find(
                'div', {"class": "event__participant event__participant--home"}).text
            away = k.find(
                'div', {"class": "event__participant event__participant--away"}).text
    match_id = k.get('id').rsplit('_')[-1]
    driver.get(
        f'https://www.flashscore.com/match/{match_id}/#/match-summary/match-summary')
    sleep(5)
    stats_soup = bs(driver.page_source, 'lxml')

    stat = stats_soup.find('div', {'class': "smv__verticalSections section"})
#   stats_soup.find_all('div',{'class':'participant__participantName participant__overflow '})
    all_stat = stat.find_all(recursive=False)
    first_sec, home_event, away_event = get_stats(all_stat)
    test_url = f'https://www.flashscore.com/match/{match_id}/#/match-summary' + \
        '/match-statistics/0'
    driver.get(test_url)
    sleep(3)
    # stats goes here
#     ext = '/match-statistics/0'
    all_stat = get_stats_match()
    all_data.append([home, away, first_sec, home_event, away_event, all_stat])
#    print(match_id)


['38', 'Rashford M.', 'Eriksen C.', False]
['46', 'Areola A.', 'Fabianski L.']
['57', 'Antonio M.', 'Scamacca G.']
['61', 'McTominay S.', 'Elanga A.']
['77', 'Fornals P.', 'Downes F.']
['79', 'Fred', 'Eriksen C.']
[['53%', '47%']]
[['53%', '47%'], ['16', '13']]
[['53%', '47%'], ['16', '13'], ['3', '5']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['7', '2']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['7', '2'], ['6', '6']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['7', '2'], ['6', '6'], ['9', '12']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['7', '2'], ['6', '6'], ['9', '12'], ['5', '10']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['7', '2'], ['6', '6'], ['9', '12'], ['5', '10'], ['1', '2']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['7', '2'], ['6', '6'], ['9', '12'], ['5', '10'], ['1', '2'], ['12', '19']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['7', '2'], ['6', '6'], ['9', '12'], ['5', '10'], ['1', '2'], ['12', '19'], ['5', '2']]
[['53%', '47%'], ['16', '13'], ['3', '5'], ['

In [21]:
# %%


In [22]:
# %% Convert the scraped data into a dataframe
df = pd.DataFrame(data=all_data, columns=[
                  'Home Team', 'Away Team', 'HT/FT', 'Home Events', 'Away Events', 'Game Stats'])
df.head()

Unnamed: 0,Home Team,Away Team,HT/FT,Home Events,Away Events,Game Stats
0,Manchester Utd,West Ham,"[1 - 0, 0 - 0]","[[38, Rashford M., Eriksen C., False], [61, Mc...","[[36, False, Tripping], [46, Areola A., Fabian...","[[53%, 47%], [16, 13], [3, 5], [7, 2], [6, 6],..."
1,Arsenal,Nottingham,"[1 - 0, 4 - 0]","[[5, Martinelli G., Saka B., False], [27, Nels...","[[44, False, Tripping], [56, Johnson B., Gibbs...","[[69%, 31%], [24, 5], [10, 2], [5, 2], [9, 1],..."
2,Liverpool,Leeds,"[1 - 1, 0 - 1]","[[14, Salah M., Robertson A., False], [60, Jon...","[[4, Rodrigo, Not assisted, False], [52, Bamfo...","[[69%, 31%], [22, 14], [10, 6], [8, 6], [4, 2]..."
3,Fulham,Everton,"[0 - 0, 0 - 0]","[[67, Wilson H., Kebano N.], [76, Cairney T., ...","[[64, False, Tripping], [64, Patterson N., Col...","[[57%, 43%], [24, 9], [6, 4], [10, 3], [8, 2],..."
4,Bournemouth,Tottenham,"[1 - 0, 1 - 3]","[[22, Moore K., Tavernier M., False], [49, Moo...","[[46, Lucas Moura, Skipp O.], [57, Sessegnon R...","[[31%, 69%], [6, 23], [4, 7], [2, 10], [0, 6],..."


In [38]:
df.tail()

Unnamed: 0,Home Team,Away Team,Home Events,Away Events,First Half Score,Second Half Score,Ball Possession Home,Ball Possession Away,Goal Attempts Home,Goal Attempts Away,...,Yellow Cards Home,Yellow Cards Away,Total Passes Home,Total Passes Away,Completed Passes Home,Completed Passes Away,Tackles Home,Tackles Away,Attacks Home,Attacks Away
61,Tottenham,Leicester,"[[8, Kane H., Kulusevski D., False], [21, Dier...","[[6, Tielemans Y., Penalty, False], [41, Maddi...",2 - 2,4 - 0,44%,56%,16,19,...,0,2,482,632,403,552,14,16,68,146
62,Newcastle,Bournemouth,"[[67, Isak A., Penalty, False], [71, Murphy J....","[[34, False, Foul], [45+3, False, Foul], [62, ...",0 - 0,1 - 1,72%,28%,20,10,...,2,2,643,245,556,165,16,18,155,63
63,Wolves,Manchester City,"[[70, Hwang Hee-Chan, Guedes G.], [70, Traore ...","[[1, Grealish J., De Bruyne K., False], [13, F...",0 - 2,0 - 1,41%,59%,6,16,...,1,0,2,1,422,622,353,563,19,19
64,Aston Villa,Southampton,"[[41, Ramsey J., Not assisted, False], [44, Do...","[[46, Larios J., Perraud R.], [46, Aribo J., D...",1 - 0,0 - 0,53%,47%,11,7,...,3,1,420,376,312,277,35,28,146,106
65,Nottingham,Fulham,"[[11, Awoniyi T., Yates R., False], [63, Linga...","[[40, False, Tripping], [54, Adarabioyo T., Wi...",1 - 0,1 - 3,41%,59%,11,15,...,2,4,375,555,279,460,10,12,83,132


In [23]:
# %%
df['First Half Score'] = df['HT/FT'].apply(lambda x: x[0])
df['Second Half Score'] = df['HT/FT'].apply(lambda x: x[1])


In [24]:
# %% Drop the Half Time/Full Time Score column.
df.drop('HT/FT', axis=1, inplace=True)


In [25]:
# %%
for i, j in enumerate(stat_column[:-2]):
    print(j)
    df[f'{j} Home'] = df['Game Stats'].apply(lambda x: x[i][0])
    df[f'{j} Away'] = df['Game Stats'].apply(lambda x: x[i][1])
df.drop('Game Stats', axis=1, inplace=True)


Ball Possession
Goal Attempts
Shots on Goal
Shots off Goal
Blocked Shots
Free Kicks
Corner Kicks
Offsides
Throw-in
Goalkeeper Saves
Fouls
Yellow Cards
Total Passes
Completed Passes
Tackles
Attacks


In [26]:
# %% Create separate dataframes for both home and away events.
df_home = df[['Home Team', 'Away Team',
              'First Half Score',	'Second Half Score', 'Home Events']]
df_away = df[['Home Team', 'Away Team',
              'First Half Score',	'Second Half Score', 'Away Events']]


In [27]:
# %% Unpivot the separated dataframes.
df_home_melted = pd.melt(df_home, id_vars=[
                         'Home Team', 'Away Team', 'First Half Score',	'Second Half Score', ],
                         var_name='Event', value_name='Value')
df_away_melted = pd.melt(df_away, id_vars=[
                         'Home Team', 'Away Team', 'First Half Score',	'Second Half Score', ],
                         var_name='Event', value_name='Value')


In [28]:
# %% Remove the brackets from the column contents, and remove trailing commas from the texts.
df_home_melted = df_home_melted['Value'].apply(lambda x: pd.Series(
    str(x).split("]")).str.replace(r'[][]+', '', regex=True).str.lstrip(','))
df_away_melted = df_away_melted['Value'].apply(lambda x: pd.Series(
    str(x).split("]")).str.replace(r'[][]+', '', regex=True).str.lstrip(','))


In [29]:
# %% Rename columns
df_home_melted = df_home_melted.rename({0: 'Home Event_1', 1: 'Home Event_2', 2: 'Home Event_3', 3: 'Home Event_4', 4: 'Home Event_5',
                                        5: 'Home Event_6', 6: 'Home Event_7', 7: 'Home Event_8', 8: 'Home Event_9', 9: 'Home Event_10',
                                        10: 'Home Event_11', 11: 'Home Event_12', 12: 'Home Event_13', 13: 'Home Event_14'}, axis=1)
df_away_melted = df_away_melted.rename({0: 'Away Event_1', 1: 'Away Event_2', 2: 'Away Event_3', 3: 'Away Event_4', 4: 'Away Event_5',
                                        5: 'Away Event_6', 6: 'Away Event_7', 7: 'Away Event_8', 8: 'Away Event_9', 9: 'Away Event_10',
                                        10: 'Away Event_11', 11: 'Away Event_12', 12: 'Away Event_13', 13: 'Away Event_14'}, axis=1)


In [30]:
# %% Drop the original event columns.
df_home = df_home.drop('Home Events', axis=1).reset_index().merge(
    df_home_melted.reset_index()).set_index('index')
df_away = df_away.drop('Away Events', axis=1).reset_index().merge(
    df_away_melted.reset_index()).set_index('index')


In [31]:
# %% Merge the separated events (home and away) into a single dataframe.
match_data = df_home.merge(df_away.drop(
    ['First Half Score', 'Second Half Score'], axis=1), on=['Home Team', 'Away Team'])
try:
  for i in range(15):
    if f'Home Event_{i}' in match_data:
      match_data[f'Home Event_{i}'] = match_data[f'Home Event_{i}']
    else:
      match_data[f'Home Event_{i}'] = pd.Series(
          [np.nan for x in range(len(match_data.index))]
      )
  for i in range(15):
    if f'Away Event_{i}' in match_data:
      match_data[f'Away Event_{i}'] = match_data[f'Away Event_{i}']
    else:
      match_data[f'Away Event_{i}'] = pd.Series(
          [np.nan for x in range(len(match_data.index))]
      )
except:
  pass
match_data['Match'] = match_data['Home Team'] + \
    " " + "VS" + " " + match_data['Away Team']
match_data = match_data[['Match', 'Home Team', 'Away Team', 'First Half Score', 'Second Half Score', 'Home Event_1',
                         'Away Event_1', 'Home Event_2', 'Away Event_2', 'Home Event_3', 'Away Event_3', 'Home Event_4',
                         'Away Event_4', 'Home Event_5', 'Away Event_5', 'Home Event_6', 'Away Event_6', 'Home Event_7',
                         'Away Event_7', 'Home Event_8', 'Away Event_8', 'Home Event_9', 'Away Event_9', 'Home Event_10',
                         'Away Event_10', 'Home Event_11', 'Away Event_11', 'Home Event_11', 'Away Event_12', 'Home Event_13',
                         'Away Event_13', 'Home Event_14',  'Away Event_14']]


In [32]:
# %% Merge the derived match data with the original dataframe.
match_data = df.drop(['Home Events', 'Away Events'], axis=1).merge(match_data.drop(
    ['First Half Score', 'Second Half Score'], axis=1), on=['Home Team', 'Away Team'])


In [33]:
# %% Unpivot match_data by converting the events header into a single column, and another column that hold the information.
# This reduces the number of columns and increases the number of rows.
match_data_melted = pd.melt(match_data, id_vars=['Match', 'Home Team', 'Away Team', 'First Half Score', 'Second Half Score',
                                                 'Ball Possession Home', 'Ball Possession Away', 'Goal Attempts Home',
                                                 'Goal Attempts Away', 'Shots on Goal Home', 'Shots on Goal Away',
                                                 'Shots off Goal Home', 'Shots off Goal Away', 'Blocked Shots Home',
                                                 'Blocked Shots Away', 'Free Kicks Home', 'Free Kicks Away',
                                                 'Corner Kicks Home', 'Corner Kicks Away', 'Offsides Home',
                                                 'Offsides Away', 'Throw-in Home', 'Throw-in Away',
                                                 'Goalkeeper Saves Home', 'Goalkeeper Saves Away', 'Fouls Home',
                                                 'Fouls Away', 'Yellow Cards Home', 'Yellow Cards Away',
                                                 'Total Passes Home', 'Total Passes Away', 'Completed Passes Home',
                                                 'Completed Passes Away', 'Tackles Home', 'Tackles Away', 'Attacks Home',
                                                 'Attacks Away', ], var_name='Event', value_name='Details')


In [34]:
# %% Split the 'Details' columns to extract the information contained in it
match_data_melted[['Minute', 'Event/Player', 'Type/Assist', 'Goal Status']
                  ] = match_data_melted['Details'].apply(lambda x: pd.Series(str(x).split(",")))
match_data_melted = match_data_melted.drop('Details', axis=1)
match_data_melted = match_data_melted.replace("'", '', regex=True)


In [35]:
# %% Extract goals from the data.
goals = match_data_melted.dropna(subset=['Goal Status'])
goals = goals.rename(
    {'Event/Player': 'Scorer', 'Type/Assist': 'Assist'}, axis=1)


In [36]:
# %% Convert the dataframes into csv files.
match_data_melted.to_csv('FlashScore_Matches.csv', index=False, header=True)
goals.to_csv('FlashScore_goals.csv', index=False, header=True)


In [37]:
# %%