# Imports

In [265]:
import pandas as pd
import time

# Vaastav fbref

In [266]:
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
import time
import csv

class MatchData:
    """Container for one row of a player's match log.

    ``comp``, ``date`` and ``round`` start as empty strings and ``data`` as
    an empty dict; callers fill them in after construction.
    """

    def __init__(self) -> None:
        self.comp: str = ""
        self.date: str = ""
        self.round: str = ""
        # Per-instance dict, so rows never share state.
        self.data: dict = {}


class PlayerData:
    """Accumulator for everything collected about a single player.

    All attributes start empty and are populated by the scraping functions.
    """

    def __init__(self) -> None:
        # One stats dict is appended per table row seen for the player.
        self.data: list = []
        self.base_url: str = ""
        self.matches_links: list = []
        self.matches: list = []
        self.match_stat_set: set = set()

def get_data(url):
    """Download *url* and return the tables hidden inside an HTML comment.

    The page is fetched once; any status other than 200 raises ``Exception``.
    Every HTML comment containing ``'<table'`` is parsed, but each such
    comment replaces the previous result, so the returned list holds the
    tables of the *last* matching comment only (an empty list if none match).
    """
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception("Response was code " + str(response.status_code))

    page = BeautifulSoup(response.text, 'html.parser')
    hidden_nodes = page.find_all(string=lambda node: isinstance(node, Comment))

    tables = []
    for node in hidden_nodes:
        if '<table' not in node:
            continue
        # Overwrites rather than extends: the last matching comment wins.
        tables = BeautifulSoup(node, 'html.parser').find_all('table')
    return tables

def get_table_data(url):
    """Fetch *url* until it answers with status 200 and return its first table.

    There is no retry limit: after every non-200 response the function
    sleeps for 5 seconds and tries again. Each attempt is announced on
    stdout. Indexing the first table raises ``IndexError`` when the page
    contains no ``<table>`` element.
    """
    while True:
        print("Getting data for: " + url)
        response = requests.get(url)
        if response.status_code == 200:
            break
        time.sleep(5)

    page = BeautifulSoup(response.text, 'html.parser')
    return page.find_all('table')[0]

def get_matches_data(player):
    """Scrape every match-log page linked from *player* and store the rows.

    For each URL in ``player.matches_links`` the first table of the page is
    fetched (all pages are downloaded before any is parsed). Every kept row
    becomes a ``MatchData`` whose ``data`` dict maps ``data-stat`` names to
    the first child of the corresponding cell. Results are written to
    ``player.matches`` and ``player.match_stat_set``; nothing is returned.

    A row missing a 'date', 'round' or 'comp' entry raises ``KeyError``.
    """
    # Cells whose value is taken from the text of an <a> element.
    linked_stats = ('date', 'round', 'comp', 'opponent', 'squad')

    tables = [get_table_data(link) for link in player.matches_links]

    matches = []
    seen_stats = set()
    for table in tables:
        for row in table.tbody.find_all('tr'):
            row_classes = row.get('class')
            # Rows with a class are skipped, except those marked 'unused_sub'.
            if row_classes and 'unused_sub' not in row_classes:
                continue

            record = {}
            for cell in row.find_all('td') + row.find_all('th'):
                stat = cell.get('data-stat')
                seen_stats.add(stat)
                if stat == 'match_report':
                    continue
                if stat in linked_stats:
                    # Every child is inspected; the last one holding a
                    # non-empty anchor supplies the value.
                    for child in cell.contents:
                        anchors = BeautifulSoup(str(child), 'html.parser').find_all('a')
                        if anchors and anchors[0].contents:
                            record[stat] = anchors[0].contents[0]
                elif cell.contents:
                    record[stat] = cell.contents[0]

            match = MatchData()
            match.date = record['date']
            match.round = record['round']
            match.comp = record['comp']
            match.data = record
            matches.append(match)

    player.matches = matches
    player.match_stat_set = seen_stats

def get_epl_players():
    """Scrape the FBref Premier League stats page into per-player records.

    Returns a ``(players, stat_names)`` tuple. ``players`` maps the player id
    (fourth ``/``-separated piece of the player link) to a ``PlayerData``;
    a player found on several rows gets one stats dict per row appended to
    ``data``, while only the first row's matches link is kept.
    ``stat_names`` is the set of ``data-stat`` names that were stored.
    """
    site_root = "https://fbref.com"

    def first_anchor(cell):
        # Re-parse the cell's first child and return its first <a> element.
        return BeautifulSoup(str(cell.contents[0]), 'html.parser').find_all('a')[0]

    stats_table = get_data("https://fbref.com/en/comps/9/stats/Premier-League-Stats")[0]

    players = {}
    stat_names = set()
    for row in stats_table.tbody.find_all('tr'):
        # Any row carrying a CSS class is skipped.
        if row.get('class'):
            continue

        base_url = ""
        matches_link = ""
        player_id = ""
        stats = {}
        for cell in row.find_all('td'):
            stat = cell.get('data-stat')
            if stat == "nationality":
                continue
            if stat == "matches":
                # Only the link is kept; the cell is not stored as a stat.
                matches_link = site_root + first_anchor(cell).get('href')
                continue

            if stat == 'player':
                anchor = first_anchor(cell)
                href = anchor.get('href')
                base_url = site_root + href
                player_id = href.split('/')[3]
                value = anchor.contents[0]
            elif stat == 'squad':
                value = first_anchor(cell).contents[0]
            elif stat == 'minutes':
                value = cell.contents[0]
                # Only comma-grouped values are converted to int; values
                # without a comma are stored as found.
                if ',' in value:
                    value = int(value.replace(',', ''))
            else:
                value = cell.contents[0]

            stats[stat] = value
            stat_names.add(stat)

        player = players.get(player_id)
        if player is None:
            player = PlayerData()
        player.base_url = base_url
        if not player.matches_links:
            player.matches_links.append(matches_link)
        player.data.append(stats)
        players[player_id] = player

    return players, stat_names

# Getting raw data

In [267]:
def get_url(url):
    """Return the first HTML table found at *url* as a DataFrame.

    The table is read with ``header=1``, i.e. its second row supplies the
    column names.
    """
    return pd.read_html(url, header=1)[0]

In [268]:
# Character substitutions that turn a player's display name into the ASCII,
# hyphen-separated form used in the match-log URL. Applied in one pass with
# str.translate; characters not listed here are left untouched.
_PLAYER_NAME_TRANSLATION = str.maketrans({
    " ": "-",
    "ï": "i", "é": "e", "á": "a", "ó": "o", "Á": "A", "ø": "o",
    "ć": "c", "í": "i", "ú": "u", "Ć": "C", "ã": "a", "ğ": "g",
    "ş": "s", "É": "E", "ñ": "n", "Ł": "L", "ń": "n", "ß": "ss",
    "ç": "c", "İ": "I", "č": "c", "ö": "o", "ë": "e", "š": "s",
    "ä": "a", "Ç": "C", "ü": "u", "Ø": "O",
})


def get_logs_for_players_in_season(players, season, players_names=None):
    """Download and concatenate match logs for the selected players.

    Args:
        players: mapping of player id to an object whose ``data[0]['player']``
            holds the display name.
        season: season string inserted into the URL (e.g. ``'2022-2023'``).
        players_names: names to fetch, in the translated form produced by
            this function (spaces become hyphens, listed accents removed),
            e.g. ``"Mohamed-Salah"``. ``None`` or an empty collection
            selects nobody.

    Returns:
        A DataFrame with the selected players' rows that have a non-null
        ``Date``. When no player is selected, an empty frame with the single
        placeholder column ``'A'`` is returned.

    A 5 second pause follows every download.
    """
    # A None default avoids sharing one mutable list between calls.
    wanted_names = () if players_names is None else players_names

    logs = pd.DataFrame({'A': []})
    for player_id, player in players.items():
        player_name = player.data[0]['player'].translate(_PLAYER_NAME_TRANSLATION)
        if player_name not in wanted_names:
            continue
        print("Getting data for " + player_name)
        url = (
            'https://fbref.com/en/players/' + str(player_id)
            + '/matchlogs/' + season + '/summary/' + player_name + '-Match-Logs'
        )
        new_player_df = get_url(url)
        # dropping NaN rows
        new_player_df = new_player_df[new_player_df['Date'].notna()]
        if not logs.empty:
            logs = pd.concat([logs, new_player_df])
        else:
            # Replaces the placeholder (or a still-empty result) outright.
            logs = new_player_df

        time.sleep(5)
    return logs

In [269]:
# Scrape the Premier League stats table: `players` maps player id to its
# PlayerData, `stats` is the set of stat names that were collected.
players, stats = get_epl_players()

In [270]:
%%time
# Fetch the 2022-2023 match logs for Mohamed Salah only; names are given in
# the hyphenated form the function compares against.
logs = get_logs_for_players_in_season(players, '2022-2023', ["Mohamed-Salah"])

Getting data for Mohamed-Salah
CPU times: user 51.9 ms, sys: 3.14 ms, total: 55.1 ms
Wall time: 5.31 s


# Modifying data

In [271]:
# only Premier League
# .copy() detaches the filtered rows from the full log, so the column
# assignments made afterwards write to an independent frame instead of a
# slice (which would trigger pandas' SettingWithCopyWarning).
logs = logs[logs["Comp"] == "Premier League"].copy()

In [272]:
# Break "Result" (e.g. "D 2–2") into the outcome letter and the score line,
# then the score line into integer goal counts for each side.
logs[['WDL', 'GoalsTeams']] = logs['Result'].str.split(' ', expand=True)
logs[['Team Score', 'Opp Score']] = logs['GoalsTeams'].str.split('–', expand=True)
logs = logs.astype({'Team Score': int, 'Opp Score': int})
logs = logs.drop(columns=['Result'])

In [284]:
# derived columns
# rough FPL score (only meant for Salah): 1 base point, +1 for 60 or more
# minutes, 5 per goal, 3 per assist, +1 when the opponent did not score
logs["FPL"] = (
    1
    + logs["Min"].ge(60)
    + logs["Gls"].mul(5)
    + logs["Ast"].mul(3)
    + logs["Opp Score"].eq(0)
)
# 1 for home fixtures, 0 otherwise
logs["Was Home"] = logs["Venue"].eq("Home").astype(int)
# second space-separated token of "Round" (the number in "Matchweek N")
logs["GW"] = logs["Round"].str.split(" ", expand=True)[1]

In [285]:
# Preview the first six rows after the transformations above.
logs.head(6)

Unnamed: 0,Date,Day,Comp,Round,Venue,Squad,Opponent,Start,Pos,Min,...,Succ,Att.1,Match Report,WDL,GoalsTeams,Team Score,Opp Score,FPL,Was Home,GW
1,2022-08-06,Sat,Premier League,Matchweek 1,Away,Liverpool,Fulham,Y,RW,90.0,...,2.0,6.0,Match Report,D,2–2,2,2,7.0,0,1
2,2022-08-15,Mon,Premier League,Matchweek 2,Home,Liverpool,Crystal Palace,Y,"RW,FW",90.0,...,1.0,6.0,Match Report,D,1–1,1,1,2.0,1,2
3,2022-08-22,Mon,Premier League,Matchweek 3,Away,Liverpool,Manchester Utd,Y,RW,90.0,...,2.0,4.0,Match Report,L,1–2,1,2,7.0,0,3
4,2022-08-27,Sat,Premier League,Matchweek 4,Home,Liverpool,Bournemouth,Y,RW,90.0,...,0.0,3.0,Match Report,W,9–0,9,0,3.0,1,4
5,2022-08-31,Wed,Premier League,Matchweek 5,Home,Liverpool,Newcastle Utd,Y,RW,90.0,...,3.0,6.0,Match Report,W,2–1,2,1,8.0,1,5
6,2022-09-03,Sat,Premier League,Matchweek 6,Away,Liverpool,Everton,Y,RW,90.0,...,1.0,2.0,Match Report,D,0–0,0,0,3.0,0,6


# Features

In [282]:
# Column groups used to select a view of `logs`.
# NOTE(review): judging by the names, `features` are the model inputs and
# `to_predict` the target, with `info` as descriptive context — confirm
# against the modelling code, which is not visible here.
info = ["Date", "Day", "Round", "Was Home"]
features = ["Min", "Gls", "Sh", "SoT", "xG", "npxG", "xAG"]
to_predict = ["FPL"]

In [283]:
# Show only the selected columns, in the order info, features, to_predict.
logs[info + features + to_predict]

Unnamed: 0,Date,Day,Round,Was Home,Min,Gls,Sh,SoT,xG,npxG,xAG,FPL
1,2022-08-06,Sat,Matchweek 1,0,90.0,1.0,2.0,1.0,0.4,0.4,0.1,7.0
2,2022-08-15,Mon,Matchweek 2,1,90.0,0.0,3.0,1.0,0.3,0.3,0.8,2.0
3,2022-08-22,Mon,Matchweek 3,0,90.0,1.0,3.0,1.0,0.3,0.3,0.3,7.0
4,2022-08-27,Sat,Matchweek 4,1,90.0,0.0,4.0,1.0,1.2,1.2,0.1,3.0
5,2022-08-31,Wed,Matchweek 5,1,90.0,0.0,2.0,0.0,0.2,0.2,0.4,8.0
6,2022-09-03,Sat,Matchweek 6,0,90.0,0.0,3.0,1.0,0.2,0.2,0.2,3.0
11,2022-10-01,Sat,Matchweek 9,1,90.0,0.0,5.0,2.0,0.6,0.6,0.1,5.0
13,2022-10-09,Sun,Matchweek 10,0,68.0,0.0,1.0,0.0,0.1,0.1,0.0,2.0
15,2022-10-16,Sun,Matchweek 11,1,89.0,1.0,3.0,1.0,0.9,0.9,0.0,8.0
16,2022-10-19,Wed,Matchweek 12,1,90.0,0.0,6.0,2.0,0.4,0.4,0.0,3.0
