# Imports

In [1]:
import os
# Switch the working directory so the relative "data/..." paths used further
# down resolve. Raw string: the backslashes of a Windows path must not be read
# as escape sequences ("\P" and "\F" are invalid escapes and raise a
# SyntaxWarning on recent Python versions).
# NOTE(review): this absolute path is machine-specific — consider making it configurable.
os.chdir(r"D:\PulpitE\FPL_ML")

In [2]:
import pandas as pd
import time
from vaastav.fbref import *
from src.match_names import neutralize_name

# Getting raw data

In [3]:
# Debug switch read by get_logs_for_players_in_season: when True, every player
# whose neutralized name is not "Mohamed-Salah" is skipped; when False, all
# players are scraped.
only_Salah = False

In [4]:
def get_url(url):
    """Return the first HTML table found at ``url`` as a DataFrame.

    The page is parsed with ``pd.read_html(..., header=1)``, so row index 1 of
    the table supplies the column names. Any further tables on the page are
    ignored.
    """
    tables = pd.read_html(url, header=1)
    first_table = tables[0]
    return first_table

In [5]:
def get_shorter_season(season):
    """Shorten a season label such as "2021-2022" to "2021-22".

    The label is split on "-"; the first piece is kept whole and the first two
    characters of the second piece are dropped.
    """
    pieces = season.split("-")
    first_year, second_year = pieces[0], pieces[1]
    return f"{first_year}-{second_year[2:]}"

In [6]:
def get_logs_for_players_in_season(players, season, players_names=None):
    """Scrape Premier League match logs for every player in one season.

    Parameters
    ----------
    players : mapping
        Maps an fbref player id to an object whose ``data[0]['player']`` holds
        the player's name.
    season : str
        Season as it appears in the fbref URL, e.g. ``'2021-2022'``.
    players_names : optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        All Premier League rows that could be fetched, with ``Name`` and
        ``Season`` columns added. Row labels of the per-player tables are
        kept as they are. An empty DataFrame is returned when nothing was
        collected.

    Notes
    -----
    Honors the module-level ``only_Salah`` switch: when it is truthy, only
    "Mohamed-Salah" is fetched. A failed download or parse for one player is
    reported and skipped; KeyboardInterrupt and SystemExit are no longer
    swallowed, so a long scrape can be interrupted.
    """
    frames = []
    for player_id, player in players.items():
        player_name = neutralize_name(player.data[0]['player'])
        if only_Salah and player_name != "Mohamed-Salah":
            continue
        print("Getting data for " + player_name)
        url = (
            'https://fbref.com/en/players/' + str(player_id)
            + '/matchlogs/' + season + '/summary/' + player_name + '-Match-Logs'
        )
        try:
            player_df = get_url(url)
            player_df["Name"] = player_name
            player_df["Season"] = get_shorter_season(season)

            # Drop rows without a date (NaN rows inside the scraped table).
            player_df = player_df[player_df['Date'].notna()]

            # Keep Premier League matches only.
            player_df = player_df[player_df.Comp == "Premier League"]
        except Exception as exc:
            # Best effort: report the player and the cause, then carry on.
            print("Not found", player_name, season, "-", repr(exc))
        else:
            if not player_df.empty:
                frames.append(player_df)

        # Pause between requests so the site does not time us out / block us.
        time.sleep(3.2)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)

In [7]:
# `players` is the mapping iterated (via .items()) by
# get_logs_for_players_in_season; `stats` is not used in the cells shown here.
# NOTE(review): get_epl_players presumably comes from `from vaastav.fbref import *` — confirm.
players, stats = get_epl_players()

In [8]:
# Number of players returned by get_epl_players().
len(players)

302

In [9]:
def modify_df(df):
    """Derive match-level feature columns from raw match logs.

    Works on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Match logs with at least the columns 'Result', 'Min', 'Gls', 'Ast',
        'Venue', 'Round', 'Name' and 'Date'. 'Result' must look like
        "W 3–0": an outcome token, one space, then two integers joined by an
        en dash.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'Name' then 'Date', without 'Result', and with
        these columns added:
        'WDL' and 'GoalsTeams' (the two halves of 'Result'),
        'Team Score' and 'Opp Score' (ints: first and second number of the score),
        'CS' (1.0 when 'Opp Score' is 0, else 0.0),
        'Was Home' (1.0 when 'Venue' is "Home", else 0.0),
        'GW' (second token of 'Round', left as a string).
    """
    # Copy first: the column assignments below would otherwise add columns to
    # the caller's frame (and raise SettingWithCopyWarning on filtered slices).
    df = df.copy()

    df[['WDL', 'GoalsTeams']] = df['Result'].str.split(' ', expand=True)
    # The score separator is an en dash, not an ASCII hyphen.
    df[['Team Score', 'Opp Score']] = df['GoalsTeams'].str.split('–', expand=True)
    df['Team Score'] = df['Team Score'].astype(int)
    df['Opp Score'] = df['Opp Score'].astype(int)
    df = df.drop(['Result'], axis=1)

    # Cells holding this placeholder text become 0 so the stat columns can be
    # cast to float.
    df = df.replace("On matchday squad, but did not play", 0)
    df["Min"] = df["Min"].astype(float)
    df["Gls"] = df["Gls"].astype(float)
    df["Ast"] = df["Ast"].astype(float)

    df["CS"] = (df["Opp Score"] == 0).astype(float)
    df["Was Home"] = (df["Venue"] == "Home").astype(float)
    df["GW"] = df['Round'].str.split(' ', expand=True)[1]

    df = df.sort_values(by=['Name', 'Date'], ascending=[True, True])

    return df

# 2021-22

In [10]:
%%time
# player_names = ["Mohamed-Salah", "Miguel-Almiron", "Bukayo-Saka", "Kevin-De-Bruyne", "Wilfried-Zaha", "Mason-Mount", "Bruno-Fernandes"]
logs_21_22 = get_logs_for_players_in_season(players, '2021-2022', None)

Getting data for Mohamed-Salah
CPU times: total: 109 ms
Wall time: 3.38 s


In [11]:
# (rows, columns) of the scraped 2021-22 logs, before modify_df is applied.
logs_21_22.shape

(35, 39)

In [12]:
# Add the derived columns and save the 2021-22 logs.
# Re-running this cell alone fails: modify_df needs the 'Result' column and
# drops it, so the scrape cell has to be run again first.
logs_21_22 = modify_df(df=logs_21_22)
logs_21_22.to_csv(path_or_buf="data/logs_21_22.csv")

# 2022-23

In [13]:
%%time
# player_names = ["Mohamed-Salah", "Miguel-Almiron", "Bukayo-Saka", "Kevin-De-Bruyne", "Wilfried-Zaha", "Mason-Mount", "Bruno-Fernandes"]
logs_22_23 = get_logs_for_players_in_season(players, '2022-2023', None)

Getting data for Mohamed-Salah
CPU times: total: 46.9 ms
Wall time: 3.34 s


In [14]:
# (rows, columns) of the scraped 2022-23 logs, before modify_df is applied.
logs_22_23.shape

(38, 39)

In [15]:
# Add the derived columns and save the 2022-23 logs.
# Re-running this cell alone fails: modify_df needs the 'Result' column and
# drops it, so the scrape cell has to be run again first.
logs_22_23 = modify_df(df=logs_22_23)
logs_22_23.to_csv(path_or_buf="data/logs_22_23.csv")

# Merged logs

In [16]:
# Stack both seasons into one frame. Row labels from each season are kept
# as-is, so the same label can appear once per season.
logs = pd.concat(objs=[logs_21_22, logs_22_23])

In [17]:
# (rows, columns) of the merged two-season frame.
logs.shape

(73, 45)

In [18]:
# Preview the first ten rows, identifying columns only.
preview_columns = ["Date", "Round", "Name"]
logs[preview_columns].head(10)

Unnamed: 0,Date,Round,Name
0,2021-08-14,Matchweek 1,Mohamed-Salah
1,2021-08-21,Matchweek 2,Mohamed-Salah
2,2021-08-28,Matchweek 3,Mohamed-Salah
4,2021-09-12,Matchweek 4,Mohamed-Salah
6,2021-09-18,Matchweek 5,Mohamed-Salah
8,2021-09-25,Matchweek 6,Mohamed-Salah
10,2021-10-03,Matchweek 7,Mohamed-Salah
13,2021-10-16,Matchweek 8,Mohamed-Salah
15,2021-10-24,Matchweek 9,Mohamed-Salah
17,2021-10-30,Matchweek 10,Mohamed-Salah


In [19]:
# List every column of the merged frame: the scraped ones plus those added by
# the scraper ('Name', 'Season') and by modify_df.
logs.columns

Index(['Date', 'Day', 'Comp', 'Round', 'Venue', 'Squad', 'Opponent', 'Start',
       'Pos', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att.1', 'Succ',
       'Match Report', 'Name', 'Season', 'WDL', 'GoalsTeams', 'Team Score',
       'Opp Score', 'CS', 'Was Home', 'GW'],
      dtype='object')

In [20]:
# Save the merged two-season frame. The row index is written as well
# (to_csv default), as an unnamed first column.
logs.to_csv("data/logs_all.csv")

# Features check

In [21]:
# Columns that identify a match.
info = [
    "Date",
    "Day",
    "GW",
    "Was Home",
]
# Per-match stat columns inspected as candidate features.
features = [
    "Min",
    "Gls",
    "Sh",
    "SoT",
    "xG",
    "npxG",
    "xAG",
    "CS",
]

In [22]:
# Show the identifying columns followed by the candidate feature columns.
logs[[*info, *features]]

Unnamed: 0,Date,Day,GW,Was Home,Min,Gls,Sh,SoT,xG,npxG,xAG,CS
0,2021-08-14,Sat,1,0.0,90.0,1.0,5.0,2.0,0.3,0.3,0.7,1.0
1,2021-08-21,Sat,2,1.0,90.0,0.0,5.0,1.0,0.4,0.4,0.1,1.0
2,2021-08-28,Sat,3,1.0,90.0,1.0,2.0,1.0,1.1,0.3,0.2,0.0
4,2021-09-12,Sun,4,0.0,90.0,1.0,3.0,1.0,0.9,0.9,0.8,1.0
6,2021-09-18,Sat,5,1.0,90.0,1.0,6.0,5.0,0.5,0.5,0.2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
51,2023-05-03,Wed,28,1.0,83.0,1.0,2.0,0.0,1.2,0.4,0.2,1.0
52,2023-05-06,Sat,35,1.0,90.0,1.0,3.0,2.0,1.0,1.0,0.2,1.0
53,2023-05-15,Mon,36,0.0,87.0,0.0,5.0,1.0,0.9,0.9,0.9,1.0
54,2023-05-20,Sat,37,1.0,90.0,0.0,2.0,1.0,0.1,0.1,0.4,0.0
