# Imports

In [1]:
import os
# Work from the project root so the relative "data/..." paths used below resolve.
# Raw string: "\P" and "\F" are not valid escape sequences in a normal literal.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
os.chdir(r"D:\PulpitE\FPL_ML")

In [2]:
import pandas as pd
import time
from vaastav.fbref import *

# Getting raw data

In [3]:
def get_url(url):
    """Return the first HTML table found at ``url`` as a DataFrame.

    The page is parsed with ``pd.read_html(..., header=1)``, so row 1
    (0-indexed) of the table supplies the column names.
    """
    tables = pd.read_html(url, header=1)
    first_table = tables[0]
    return first_table

In [4]:
# Characters rewritten when turning a player's display name into the name
# part of the match-log URL: spaces become hyphens, listed letters become ASCII.
_URL_NAME_TRANSLATION = str.maketrans({
    " ": "-",
    "ï": "i", "é": "e", "á": "a", "ó": "o", "Á": "A", "ø": "o",
    "ć": "c", "í": "i", "ú": "u", "Ć": "C", "ã": "a", "ğ": "g",
    "ş": "s", "É": "E", "ñ": "n", "Ł": "L", "ń": "n", "ß": "ss",
    "ç": "c", "İ": "I", "č": "c", "ö": "o", "ë": "e", "š": "s",
    "ä": "a", "Ç": "C", "ü": "u", "Ø": "O",
})


def _to_url_name(display_name):
    """Return ``display_name`` in the hyphenated, ASCII-folded form used in the URL."""
    return display_name.translate(_URL_NAME_TRANSLATION)


def get_logs_for_players_in_season(players, season, players_names=(),
                                   competition="Premier League",
                                   max_matches=15, pause_seconds=4):
    """Download and stack the recent match logs of the selected players.

    Parameters
    ----------
    players : mapping
        Maps a player id (used in the URL) to an object whose
        ``data[0]['player']`` holds the player's display name.
    season : str
        Season segment of the URL, e.g. ``'2022-2023'``.
    players_names : container of str, optional
        URL-style names (e.g. ``"Mohamed-Salah"``) to fetch. Players whose
        converted name is not in this container are skipped, so the default
        (empty) selects nobody.
    competition : str, optional
        Only rows whose ``Comp`` column equals this value are kept.
    max_matches : int, optional
        Keep at most this many of the last remaining rows per player.
    pause_seconds : float, optional
        Delay after each download; values <= 0 disable the pause.

    Returns
    -------
    pandas.DataFrame
        The per-player rows stacked in iteration order with an added ``Name``
        column. Each player's original row index is preserved (so index
        values repeat across players). An empty DataFrame is returned when
        no rows were collected.
    """
    frames = []
    for player_id, player in players.items():
        player_name = _to_url_name(player.data[0]['player'])
        if player_name not in players_names:
            continue
        print("Getting data for " + player_name)
        url = (
            'https://fbref.com/en/players/' + str(player_id)
            + '/matchlogs/' + season + '/summary/' + player_name + '-Match-Logs'
        )
        player_df = get_url(url)

        # Drop rows without a date, then keep only the requested competition.
        keep = player_df['Date'].notna() & (player_df['Comp'] == competition)
        player_df = player_df[keep].tail(max_matches).assign(Name=player_name)

        # Collect the pieces and concatenate once at the end instead of
        # re-copying the accumulated frame on every iteration.
        if not player_df.empty:
            frames.append(player_df)

        # Pause between requests (the original hard-coded 4 seconds).
        if pause_seconds > 0:
            time.sleep(pause_seconds)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [5]:
# `players` is iterated below as a mapping of player id -> player object;
# `stats` is not used in the visible cells.
# NOTE(review): get_epl_players is not defined in this notebook — presumably it
# comes from the star import of vaastav.fbref; confirm.
players, stats = get_epl_players()

In [6]:
%%time
# URL-style (hyphenated) names of the players whose match logs are downloaded.
player_names = [
    "Mohamed-Salah",
    "Miguel-Almiron",
    "Bukayo-Saka",
    "Kevin-De-Bruyne",
    "Wilfried-Zaha",
    "Mason-Mount",
    "Bruno-Fernandes",
]
logs = get_logs_for_players_in_season(players, '2022-2023', players_names=player_names)

Getting data for Miguel-Almiron
Getting data for Kevin-De-Bruyne
Getting data for Bruno-Fernandes
Getting data for Mason-Mount
Getting data for Bukayo-Saka
Getting data for Mohamed-Salah
Getting data for Wilfried-Zaha
CPU times: total: 359 ms
Wall time: 28.9 s


# Modifying data

In [8]:
# Break "Result" (e.g. "D 1–1") into the outcome letter and the score string,
# then break the score on the en dash into the two integer goal counts.
outcome_and_score = logs['Result'].str.split(' ', expand=True)
logs[['WDL', 'GoalsTeams']] = outcome_and_score
team_and_opp_goals = logs['GoalsTeams'].str.split('–', expand=True)
logs[['Team Score', 'Opp Score']] = team_and_opp_goals
for score_col in ('Team Score', 'Opp Score'):
    logs[score_col] = logs[score_col].astype(int)
# "Result" is now fully represented by the new columns.
logs = logs.drop(columns=['Result'])

In [9]:
# Minutes, goals and assists are cast to float for the arithmetic below.
for stat_col in ("Min", "Gls", "Ast"):
    logs[stat_col] = logs[stat_col].astype(float)

In [10]:
# Derived columns, stored as 0.0 / 1.0 flags:
# CS = opponent scored nothing, Was Home = match played at home.
logs["CS"] = logs["Opp Score"].eq(0).astype(float)
logs["Was Home"] = logs["Venue"].eq("Home").astype(float)
# Second token of "Round" (e.g. "Matchweek 4" -> "4"); stays a string.
logs["GW"] = logs['Round'].str.split(' ', expand=True)[1]

# Approximated FPL points: 1 base point, +1 for 60 or more minutes,
# 5 per goal, 3 per assist and 1 for a clean sheet.
# NOTE(review): the base point and the clean-sheet point are added for every
# row regardless of minutes played — confirm this approximation is intended.
played_sixty = logs["Min"] >= 60
logs["FPL"] = 1 + played_sixty + (5 * logs["Gls"]) + (3 * logs["Ast"]) + (1 * logs["CS"])

In [11]:
# Preview the logs with the derived columns (WDL, scores, CS, Was Home, GW, FPL).
logs

Unnamed: 0,Date,Day,Comp,Round,Venue,Squad,Opponent,Start,Pos,Min,...,Match Report,Name,WDL,GoalsTeams,Team Score,Opp Score,CS,Was Home,GW,FPL
4,2022-08-28,Sun,Premier League,Matchweek 4,Away,Newcastle Utd,Wolves,Y,RW,83.0,...,Match Report,Miguel-Almiron,D,1–1,1,1,0.0,0.0,4,2.0
5,2022-08-31,Wed,Premier League,Matchweek 5,Away,Newcastle Utd,Liverpool,Y,RW,83.0,...,Match Report,Miguel-Almiron,L,1–2,1,2,0.0,0.0,5,2.0
6,2022-09-03,Sat,Premier League,Matchweek 6,Home,Newcastle Utd,Crystal Palace,Y,RW,69.0,...,Match Report,Miguel-Almiron,D,0–0,0,0,1.0,1.0,6,3.0
8,2022-09-17,Sat,Premier League,Matchweek 8,Home,Newcastle Utd,Bournemouth,Y,RW,88.0,...,Match Report,Miguel-Almiron,D,1–1,1,1,0.0,1.0,8,2.0
11,2022-10-01,Sat,Premier League,Matchweek 9,Away,Newcastle Utd,Fulham,Y,RW,90.0,...,Match Report,Miguel-Almiron,W,4–1,4,1,0.0,0.0,9,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,2022-11-06,Sun,Premier League,Matchweek 15,Away,Crystal Palace,West Ham,Y,FW,90.0,...,Match Report,Wilfried-Zaha,W,2–1,2,1,0.0,0.0,15,10.0
16,2022-11-12,Sat,Premier League,Matchweek 16,Away,Crystal Palace,Nott'ham Forest,Y,"FW,LW,LM",90.0,...,Match Report,Wilfried-Zaha,L,0–1,0,1,0.0,0.0,16,2.0
18,2022-12-26,Mon,Premier League,Matchweek 17,Home,Crystal Palace,Fulham,Y,"LW,FW",90.0,...,Match Report,Wilfried-Zaha,L,0–3,0,3,0.0,1.0,17,2.0
19,2022-12-31,Sat,Premier League,Matchweek 18,Away,Crystal Palace,Bournemouth,Y,LW,90.0,...,Match Report,Wilfried-Zaha,W,2–0,2,0,1.0,0.0,18,3.0


# Saving to csv

In [12]:
# Write every column of the processed logs; the path is relative to the
# working directory set with os.chdir at the top of the notebook.
logs.to_csv("data/logs_all.csv")

# Features

In [13]:
# Column groups; their concatenation selects the columns that are displayed
# and exported to data/logs_features.csv in the cells below.
# Match context columns.
info = ["Date", "Day", "GW", "Was Home"]
# Per-match statistics — presumably the model inputs; confirm.
features = ["Min", "Gls", "Sh", "SoT", "xG", "npxG", "xAG", "CS"]
# The approximated FPL points column computed above.
to_predict = ["FPL"]

In [14]:
# Preview only the selected columns, in the order info -> features -> to_predict.
logs[info + features + to_predict]

Unnamed: 0,Date,Day,GW,Was Home,Min,Gls,Sh,SoT,xG,npxG,xAG,CS,FPL
4,2022-08-28,Sun,4,0.0,83.0,0.0,3,0,0.2,0.2,0.0,0.0,2.0
5,2022-08-31,Wed,5,0.0,83.0,0.0,0,0,0.0,0.0,0.0,0.0,2.0
6,2022-09-03,Sat,6,1.0,69.0,0.0,3,0,0.2,0.2,0.1,1.0,3.0
8,2022-09-17,Sat,8,1.0,88.0,0.0,2,1,0.2,0.2,0.1,0.0,2.0
11,2022-10-01,Sat,9,0.0,90.0,2.0,4,2,1.0,1.0,0.0,0.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,2022-11-06,Sun,15,0.0,90.0,1.0,4.0,2.0,0.3,0.3,0.1,0.0,10.0
16,2022-11-12,Sat,16,0.0,90.0,0.0,2.0,0.0,0.9,0.1,0.1,0.0,2.0
18,2022-12-26,Mon,17,1.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
19,2022-12-31,Sat,18,0.0,90.0,0.0,1.0,0.0,0.1,0.1,0.6,1.0,3.0


In [15]:
# Export the selected columns only (same selection as the preview above).
logs[info + features + to_predict].to_csv("data/logs_features.csv")