# Imports

In [1]:
import os
# NOTE: machine-specific working directory; the relative "data/..." paths used
# later in the notebook are resolved against it.
# Raw string: in a normal literal "\P" and "\F" are invalid escape sequences,
# which newer Python versions warn about. The resulting path is unchanged.
os.chdir(r"D:\PulpitE\FPL_ML")

In [2]:
import pandas as pd
import time
from vaastav.fbref import *

# Getting raw data

In [3]:
def get_url(url):
    """Return the first HTML table found at ``url`` as a DataFrame.

    ``header=1`` tells pandas to take the column names from the table's
    second row (row index 1).
    """
    tables = pd.read_html(url, header=1)
    first_table = tables[0]
    return first_table

In [4]:
# Display-name characters and the text that replaces them when building the
# name part of a match-log URL. Kept as an explicit table so the produced
# slugs are exactly those of the original replace chain.
_SLUG_REPLACEMENTS = {
    " ": "-",
    "á": "a", "ã": "a", "ä": "a", "Á": "A",
    "ç": "c", "ć": "c", "č": "c", "Ç": "C", "Ć": "C",
    "é": "e", "ë": "e", "É": "E",
    "ğ": "g",
    "í": "i", "ï": "i", "İ": "I",
    "ñ": "n", "ń": "n",
    "ó": "o", "ö": "o", "ø": "o", "Ø": "O",
    "ş": "s", "š": "s", "ß": "ss",
    "ú": "u", "ü": "u",
    "Ł": "L",
}


def _player_url_slug(display_name):
    """Return ``display_name`` with spaces hyphenated and listed accents replaced."""
    slug = display_name
    for original, replacement in _SLUG_REPLACEMENTS.items():
        slug = slug.replace(original, replacement)
    return slug


def get_logs_for_players_in_season(players, season, players_names=None, delay_seconds=5):
    """Download and stack the match logs of selected players for one season.

    Parameters
    ----------
    players : mapping
        Maps a player id to an object whose ``data[0]['player']`` is the
        player's display name.
    season : str
        Season segment inserted into the URL, e.g. ``'2022-2023'``.
    players_names : container of str, optional
        Slugified names (e.g. ``"Mohamed-Salah"``) to download. Only players
        whose slug is in this container are fetched; when omitted or empty,
        nothing is fetched.
    delay_seconds : float, optional
        Pause after each request (default 5, the previously hard-coded value).

    Returns
    -------
    pandas.DataFrame
        Rows of all fetched logs that have a non-null ``Date``, with each
        player's original index preserved. If nothing was fetched, an empty
        frame with the single placeholder column ``'A'`` is returned, as
        before.
    """
    # None instead of a mutable [] default; semantics are unchanged.
    wanted_names = players_names if players_names is not None else ()

    fetched = []
    for player_id, player in players.items():
        player_name = _player_url_slug(player.data[0]['player'])
        if player_name not in wanted_names:
            continue
        print("Getting data for " + player_name)
        url = (
            'https://fbref.com/en/players/' + str(player_id)
            + '/matchlogs/' + season
            + '/summary/' + player_name + '-Match-Logs'
        )
        player_logs = get_url(url)
        # Drop rows without a date (NaN rows in the scraped table).
        fetched.append(player_logs[player_logs['Date'].notna()])

        time.sleep(delay_seconds)

    # Concatenate once instead of growing the frame inside the loop.
    non_empty = [frame for frame in fetched if not frame.empty]
    if non_empty:
        return pd.concat(non_empty)
    if fetched:
        # Everything fetched was empty: return the last one, keeping its columns.
        return fetched[-1]
    # Placeholder kept for backward compatibility with the original return value.
    return pd.DataFrame({'A': []})

In [5]:
# get_epl_players is not defined in this notebook; presumably it comes from the
# `from vaastav.fbref import *` star import (TODO confirm).
# `players` is consumed below as an id -> player mapping; `stats` is not used
# in the cells shown here.
players, stats = get_epl_players()

In [6]:
%%time
# Scrape the 2022-2023 match logs for one player. Names are compared after the
# function has hyphenated and transliterated them, so pass the slug form.
logs = get_logs_for_players_in_season(players, '2022-2023', ["Mohamed-Salah"])

Getting data for Mohamed-Salah
CPU times: total: 46.9 ms
Wall time: 5.14 s


# Modifying data

In [7]:
# Keep only Premier League fixtures. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells write to
# `logs` itself rather than to a slice of the scraped table (this avoids
# pandas' SettingWithCopyWarning).
logs = logs[logs.Comp == "Premier League"].copy()

In [8]:
# Break the "Result" text (e.g. "D 2–2") into the outcome letter, the score
# string, and the two goal counts as integers; the source column is then dropped.
outcome_and_score = logs['Result'].str.split(' ', expand=True)
logs[['WDL', 'GoalsTeams']] = outcome_and_score
goals = logs['GoalsTeams'].str.split('–', expand=True)
logs[['Team Score', 'Opp Score']] = goals
for score_column in ('Team Score', 'Opp Score'):
    logs[score_column] = logs[score_column].astype(int)
logs = logs.drop(columns=['Result'])

In [9]:
# Derived columns.
# "FPL" is only an approximation of the fantasy score, written with Salah in
# mind: 1 base point, +1 for 60 or more minutes, 5 per goal, 3 per assist and
# 1 per clean sheet.
kept_clean_sheet = logs["Opp Score"] == 0
played_at_home = logs["Venue"] == "Home"
logs["CS"] = kept_clean_sheet.astype(int)
logs["Was Home"] = played_at_home.astype(int)
# "Matchweek 1" -> "1" (second whitespace-separated token; stays a string)
logs["GW"] = logs['Round'].str.split(' ', expand=True)[1]
appearance_points = 1 + (logs["Min"] >= 60)
logs["FPL"] = appearance_points + 5 * logs["Gls"] + 3 * logs["Ast"] + logs["CS"]

In [10]:
# Preview the first six rows to check the columns derived in the cells above
logs.head(6)

Unnamed: 0,Date,Day,Comp,Round,Venue,Squad,Opponent,Start,Pos,Min,...,Att.1,Match Report,WDL,GoalsTeams,Team Score,Opp Score,CS,Was Home,GW,FPL
1,2022-08-06,Sat,Premier League,Matchweek 1,Away,Liverpool,Fulham,Y,RW,90.0,...,6.0,Match Report,D,2–2,2,2,0,0,1,7.0
2,2022-08-15,Mon,Premier League,Matchweek 2,Home,Liverpool,Crystal Palace,Y,"RW,FW",90.0,...,6.0,Match Report,D,1–1,1,1,0,1,2,2.0
3,2022-08-22,Mon,Premier League,Matchweek 3,Away,Liverpool,Manchester Utd,Y,RW,90.0,...,4.0,Match Report,L,1–2,1,2,0,0,3,7.0
4,2022-08-27,Sat,Premier League,Matchweek 4,Home,Liverpool,Bournemouth,Y,RW,90.0,...,3.0,Match Report,W,9–0,9,0,1,1,4,3.0
5,2022-08-31,Wed,Premier League,Matchweek 5,Home,Liverpool,Newcastle Utd,Y,RW,90.0,...,6.0,Match Report,W,2–1,2,1,0,1,5,8.0
6,2022-09-03,Sat,Premier League,Matchweek 6,Away,Liverpool,Everton,Y,RW,90.0,...,2.0,Match Report,D,0–0,0,0,1,0,6,3.0


# Saving to csv

In [11]:
# Save the full processed table. to_csv does not create missing directories,
# so make sure the relative "data" folder exists first.
os.makedirs("data", exist_ok=True)
logs.to_csv("data/logs_all.csv")

# Features

In [12]:
# Descriptive columns kept alongside the numeric data
info = [
    "Date",
    "Day",
    "GW",
    "Was Home",
]

# Per-match statistics selected as features
features = [
    "Min",
    "Gls",
    "Sh",
    "SoT",
    "xG",
    "npxG",
    "xAG",
    "CS",
]

# Target column
to_predict = [
    "FPL",
]

In [13]:
# Show only the selected columns, in order: info, features, then the target
logs[info + features + to_predict]

Unnamed: 0,Date,Day,GW,Was Home,Min,Gls,Sh,SoT,xG,npxG,xAG,CS,FPL
1,2022-08-06,Sat,1,0,90.0,1.0,2.0,1.0,0.4,0.4,0.1,0,7.0
2,2022-08-15,Mon,2,1,90.0,0.0,3.0,1.0,0.3,0.3,0.8,0,2.0
3,2022-08-22,Mon,3,0,90.0,1.0,3.0,1.0,0.3,0.3,0.3,0,7.0
4,2022-08-27,Sat,4,1,90.0,0.0,4.0,1.0,1.2,1.2,0.1,1,3.0
5,2022-08-31,Wed,5,1,90.0,0.0,2.0,0.0,0.2,0.2,0.4,0,8.0
6,2022-09-03,Sat,6,0,90.0,0.0,3.0,1.0,0.2,0.2,0.2,1,3.0
11,2022-10-01,Sat,9,1,90.0,0.0,5.0,2.0,0.6,0.6,0.1,0,5.0
13,2022-10-09,Sun,10,0,68.0,0.0,1.0,0.0,0.1,0.1,0.0,0,2.0
15,2022-10-16,Sun,11,1,89.0,1.0,3.0,1.0,0.9,0.9,0.0,1,8.0
16,2022-10-19,Wed,12,1,90.0,0.0,6.0,2.0,0.4,0.4,0.0,1,3.0


In [14]:
# Save only the info, feature and target columns. to_csv does not create
# missing directories, so make sure the relative "data" folder exists first.
os.makedirs("data", exist_ok=True)
logs[info + features + to_predict].to_csv("data/logs_features.csv")