# Imports

In [1]:
import os
# Switch to the project root so the relative "data/..." paths and the
# project-local imports used below resolve.
# A raw string is used because "\P" and "\F" are not valid escape sequences;
# newer Python versions warn about them. The resulting path is unchanged.
# NOTE(review): this is a machine-specific absolute path.
os.chdir(r"D:\PulpitE\FPL_ML")

In [2]:
import pandas as pd
import time
from vaastav.fbref import *
from src.match_names import neutralize_name

# Getting raw data

In [3]:
# Debug switch read by get_logs_for_players_in_season: when True, every player
# except "Mohamed-Salah" is skipped; leave False to scrape all players.
only_Salah = False

In [4]:
def get_url(url):
    """Read the HTML tables found at ``url`` and return the first one.

    Row index 1 of the table is used as the column header
    (presumably to skip a grouping header row above it - confirm).
    """
    parsed_tables = pd.read_html(url, header=1)
    first_table = parsed_tables[0]
    return first_table

In [5]:
def get_shorter_season(season):
    """Convert a season label such as "2021-2022" into the short form "2021-22".

    A label that is already short ("2021-22") is returned unchanged instead of
    losing its end year.

    Parameters
    ----------
    season : str
        Two year parts separated by "-".

    Returns
    -------
    str
        "<start year>-<last two characters of the end year>".

    Raises
    ------
    IndexError
        If ``season`` contains no "-".
    """
    parts = season.split("-")
    start_year, end_year = parts[0], parts[1]
    # Last two characters: "2022" -> "22", and "22" stays "22".
    return start_year + "-" + end_year[-2:]

In [6]:
def get_logs_for_players_in_season(players, season, players_names=None, request_delay=3.2):
    """Download the FBref match logs of every player for one season.

    Parameters
    ----------
    players : iterable of (player_id, player) pairs
        ``player_id`` is inserted into the FBref URL; ``player`` is passed to
        ``neutralize_name`` to build the URL slug and the "Name" column.
    season : str
        Season as used in the FBref URL, e.g. "2021-2022".
    players_names : optional
        Unused; kept only so existing calls that pass it keep working.
    request_delay : float
        Seconds to wait after each request so the site does not throttle us.

    Returns
    -------
    pandas.DataFrame
        All non-empty player logs stacked together, with "Name" and "Season"
        columns added and rows without a "Date" removed. If nothing could be
        fetched, an empty placeholder frame with the single column "A" is
        returned.

    Notes
    -----
    When the module-level flag ``only_Salah`` is true, every player except
    "Mohamed-Salah" is skipped (without waiting).
    A failure for one player is reported and the loop continues.
    KeyboardInterrupt is deliberately not caught, so the scrape can be stopped.
    """
    frames = []
    for player_id, player in players:
        player_name = neutralize_name(player)
        if only_Salah and player_name != "Mohamed-Salah":
            continue
        print("Getting data for " + player_name)
        url = (
            'https://fbref.com/en/players/' + str(player_id)
            + '/matchlogs/' + season
            + '/summary/' + player_name + '-Match-Logs'
        )
        try:
            new_player_df = get_url(url)
            new_player_df["Name"] = player_name
            new_player_df["Season"] = get_shorter_season(season)

            # Drop rows that carry no date.
            new_player_df = new_player_df[new_player_df['Date'].notna()]

            if not new_player_df.empty:
                frames.append(new_player_df)
        except Exception as err:
            # Best effort: report the player and the cause, then carry on.
            print("Not found", player_name, season, player_id, url, repr(err))

        # Pause between requests so we do not get timed out.
        time.sleep(request_delay)

    if not frames:
        return pd.DataFrame({'A': []})
    # One concat at the end instead of re-copying the growing frame per player.
    return pd.concat(frames)

In [7]:
# Fetch the Premier League wages page through get_data
# (presumably provided by the `vaastav.fbref` star import - confirm).
# NOTE(review): `tables` is only referenced by a commented-out cell below.
tables = get_data("https://fbref.com/en/comps/9/wages/Premier-League-Wages")

In [8]:
# tables[1]

In [9]:
# Player collection that get_logs_for_players_in_season later iterates as
# (id, name) pairs. get_epl_players presumably comes from `vaastav.fbref` - confirm.
players = get_epl_players()

In [10]:
# Sanity check: how many players were returned.
len(players)

532

In [11]:
# players

In [12]:
# players['774cf58b'].data

In [13]:
def modify_df(df):
    """Clean raw match logs and derive the score-based columns.

    The input frame is left untouched; all work happens on a copy.

    Expected input columns: "Result" (e.g. "W 2–0"), "Min", "Gls", "Ast",
    "Venue", "Round", "Name" and "Date".

    Steps
    -----
    * split "Result" into "WDL" and "GoalsTeams", then "GoalsTeams" into
      integer "Team Score" / "Opp Score"; rows without a parsable score are
      dropped and "Result" is removed
    * replace the text "On matchday squad, but did not play" with 0 in every
      column, then cast "Min", "Gls" and "Ast" to float
    * add "CS" (1.0 when the opponent scored 0), "Was Home" (1.0 when
      "Venue" is "Home") and "GW" (second space-separated token of "Round")
    * sort by "Name" then "Date"

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh 0..n-1 index. The previous index is
        kept as a column named "index".
    """
    # Copy first so the new columns never leak into the caller's frame.
    df = df.copy()

    df[['WDL', 'GoalsTeams']] = df['Result'].str.split(' ', n=1, expand=True)
    # n=1 guarantees at most two parts (team score, opponent score).
    df[['Team Score', 'Opp Score']] = df['GoalsTeams'].str.split('–', n=1, expand=True)

    # Explicit copy: the filtered frame is assigned into below.
    df = df.dropna(subset=['Team Score', 'Opp Score']).copy()

    for score_col in ('Team Score', 'Opp Score'):
        # Keep only the token before the first space
        # (presumably drops a trailing annotation such as a shoot-out score - confirm).
        df[score_col] = df[score_col].str.split(' ').str[0].str.strip().astype(int)
    df = df.drop(['Result'], axis=1)

    # Applies to every column, not only the stat columns.
    df = df.replace("On matchday squad, but did not play", 0)
    for stat_col in ("Min", "Gls", "Ast"):
        df[stat_col] = df[stat_col].astype(float)

    df["CS"] = (df["Opp Score"] == 0).astype(float)
    df["Was Home"] = (df["Venue"] == "Home").astype(float)
    # NOTE(review): for rounds that are not "Matchweek N" this yields a word
    # (e.g. "round", "stage"), not a number.
    df["GW"] = df['Round'].str.split(' ', expand=True)[1]

    df = df.sort_values(by=['Name', 'Date'], ascending=[True, True])

    # reset_index() without drop=True: the merge cell drops the resulting
    # "index" column by name, so it must keep being produced here.
    return df.reset_index()

# 2021-22

In [14]:
%%time
# Scrape the 2021-22 match logs for every entry in `players` (slow: one request per player).
# The third argument (players_names) is not used by the function, hence None.
# Leftover example list, currently unused:
# player_names = ["Mohamed-Salah", "Miguel-Almiron", "Bukayo-Saka", "Kevin-De-Bruyne", "Wilfried-Zaha", "Mason-Mount", "Bruno-Fernandes"]
logs_21_22 = get_logs_for_players_in_season(players, '2021-2022', None)

Getting data for Kevin-De-Bruyne
Getting data for Erling-Haaland
Getting data for Casemiro
Getting data for Jadon-Sancho
Getting data for Mohamed-Salah
Getting data for Raphael-Varane
Getting data for Raheem-Sterling
Getting data for Romelu-Lukaku
Getting data for Marcus-Rashford
Getting data for Jack-Grealish
Getting data for Kai-Havertz
Getting data for Gabriel-Jesus
Getting data for Anthony-Martial
Getting data for Joao-Cancelo
Getting data for John-Stones
Getting data for Reece-James
Getting data for Declan-Rice
Getting data for Bruno-Fernandes
Getting data for Phil-Foden
Getting data for Virgil-van-Dijk
Getting data for Rodri
Getting data for Sandro-Tonali
Getting data for Thomas-Partey
Getting data for Antony
Getting data for Mason-Mount
Getting data for Tanguy-Ndombele
Getting data for Thiago-Alcantara
Getting data for Josko-Gvardiol
Getting data for Ben-Chilwell
Getting data for Wesley-Fofana
Getting data for Bukayo-Saka
Getting data for Christopher-Nkunku
Getting data for Will

In [15]:
# Clean the raw 2021-22 logs. The raw frame is overwritten, so this cell cannot
# be re-run on its own: modify_df drops the "Result" column it needs as input.
logs_21_22 = modify_df(logs_21_22)
logs_21_22.shape

(21787, 54)

In [16]:
# Persist the cleaned 2021-22 logs (all competitions); to_csv also writes the row index by default.
logs_21_22.to_csv("data/logs/logs_21_22.csv")

In [52]:
# Preview the first rows of the cleaned 2021-22 logs.
logs_21_22.head()

Unnamed: 0,index,Date,Day,Comp,Round,Venue,Squad,Opponent,Start,Pos,...,OG,PKwon,PKcon,WDL,GoalsTeams,Team Score,Opp Score,CS,Was Home,GW
0,0,2021-08-21,Sat,Premier League,Matchweek 2,Home,Brighton,Watford,N,FW,...,,,,W,2–0,2,0,1.0,1.0,2
1,2,2021-08-28,Sat,Premier League,Matchweek 3,Home,Brighton,Everton,N,0,...,,,,L,0–2,0,2,0.0,1.0,3
2,3,2021-09-01,Wed,WCQ,First round,Away,ie Rep. of Ireland,pt Portugal,Y,FW,...,,,,L,1–2,1,2,0.0,0.0,round
3,4,2021-09-04,Sat,WCQ,First round,Home,ie Rep. of Ireland,az Azerbaijan,Y,LM,...,,,,D,1–1,1,1,0.0,1.0,round
4,5,2021-09-19,Sun,Premier League,Matchweek 5,Home,Brighton,Leicester City,N,0,...,,,,W,2–1,2,1,0.0,1.0,5


In [65]:
# Keep only rows labelled "Premier League", excluding squads that are not
# English clubs (presumably other leagues share that competition label - confirm).
unwanted_clubs = ['Dynamo Kyiv', 'CSKA Moscow', 'Shakhtar']
logs_21_22_PL = logs_21_22.loc[
    logs_21_22["Comp"].eq("Premier League") & ~logs_21_22["Squad"].isin(unwanted_clubs)
]
logs_21_22_PL.to_csv("data/logs/logs_21_22_PL.csv")
# Show the remaining squads to check the filter.
logs_21_22_PL["Squad"].unique()

array(['Brighton', 'West Ham', 'Arsenal', 'Manchester Utd', 'Everton',
       'Liverpool', 'Newcastle Utd', 'Southampton', 'Manchester City',
       'Chelsea', 'Tottenham', 'Burnley', 'Aston Villa', 'Norwich City',
       'Wolves', 'Brentford', 'Crystal Palace', 'Watford',
       'Leicester City', 'Leeds United'], dtype=object)

# 2022-23

In [17]:
%%time
# Scrape the 2022-23 match logs for every entry in `players` (slow: one request per player).
# The third argument (players_names) is not used by the function, hence None.
# Leftover example list, currently unused:
# player_names = ["Mohamed-Salah", "Miguel-Almiron", "Bukayo-Saka", "Kevin-De-Bruyne", "Wilfried-Zaha", "Mason-Mount", "Bruno-Fernandes"]
logs_22_23 = get_logs_for_players_in_season(players, '2022-2023', None)

Getting data for Kevin-De-Bruyne
Getting data for Erling-Haaland
Getting data for Casemiro
Getting data for Jadon-Sancho
Getting data for Mohamed-Salah
Getting data for Raphael-Varane
Getting data for Raheem-Sterling
Getting data for Romelu-Lukaku
Getting data for Marcus-Rashford
Getting data for Jack-Grealish
Getting data for Kai-Havertz
Getting data for Gabriel-Jesus
Getting data for Anthony-Martial
Getting data for Joao-Cancelo
Getting data for John-Stones
Getting data for Reece-James
Getting data for Declan-Rice
Getting data for Bruno-Fernandes
Getting data for Phil-Foden
Getting data for Virgil-van-Dijk
Getting data for Rodri
Getting data for Sandro-Tonali
Getting data for Thomas-Partey
Getting data for Antony
Getting data for Mason-Mount
Getting data for Tanguy-Ndombele
Getting data for Thiago-Alcantara
Getting data for Josko-Gvardiol
Getting data for Ben-Chilwell
Getting data for Wesley-Fofana
Getting data for Bukayo-Saka
Getting data for Christopher-Nkunku
Getting data for Will

In [18]:
# Clean the raw 2022-23 logs. The raw frame is overwritten, so this cell cannot
# be re-run on its own: modify_df drops the "Result" column it needs as input.
logs_22_23 = modify_df(logs_22_23)
logs_22_23.shape

(21653, 54)

In [19]:
# Persist the cleaned 2022-23 logs (all competitions); to_csv also writes the row index by default.
logs_22_23.to_csv("data/logs/logs_22_23.csv")

In [64]:
# Keep only rows labelled "Premier League", excluding squads that are not
# English clubs (presumably other leagues share that competition label - confirm).
unwanted_clubs = ['Dynamo Kyiv', 'CSKA Moscow', 'Shakhtar']
logs_22_23_PL = logs_22_23.loc[
    logs_22_23["Comp"].eq("Premier League") & ~logs_22_23["Squad"].isin(unwanted_clubs)
]
logs_22_23_PL.to_csv("data/logs/logs_22_23_PL.csv")
# Show the remaining squads to check the filter.
logs_22_23_PL["Squad"].unique()

array(['West Ham', 'Brentford', 'Arsenal', 'Manchester Utd', 'Everton',
       'Brighton', 'Bournemouth', 'Liverpool', 'Crystal Palace', 'Fulham',
       'Aston Villa', 'Newcastle Utd', "Nott'ham Forest", 'Chelsea',
       'Tottenham', 'Manchester City', 'Wolves', 'Leicester City'],
      dtype=object)

# 2023-24

In [28]:
%%time
# Scrape the 2023-24 match logs for every entry in `players` (slow: one request per player).
# Players without a log for this season are reported as "Not found" and skipped.
# The third argument (players_names) is not used by the function, hence None.
# Leftover example list, currently unused:
# player_names = ["Mohamed-Salah", "Miguel-Almiron", "Bukayo-Saka", "Kevin-De-Bruyne", "Wilfried-Zaha", "Mason-Mount", "Bruno-Fernandes"]
logs_23_24 = get_logs_for_players_in_season(players, '2023-2024', None)

Getting data for Kevin-De-Bruyne
Getting data for Erling-Haaland
Getting data for Casemiro
Getting data for Jadon-Sancho
Getting data for Mohamed-Salah
Getting data for Raphael-Varane
Getting data for Raheem-Sterling
Getting data for Romelu-Lukaku
Not found Romelu-Lukaku 2023-2024 5eae500a https://fbref.com/en/players/5eae500a/matchlogs/2023-2024/summary/Romelu-Lukaku-Match-Logs
Getting data for Marcus-Rashford
Getting data for Jack-Grealish
Getting data for Kai-Havertz
Getting data for Gabriel-Jesus
Not found Gabriel-Jesus 2023-2024 b66315ae https://fbref.com/en/players/b66315ae/matchlogs/2023-2024/summary/Gabriel-Jesus-Match-Logs
Getting data for Anthony-Martial
Getting data for Joao-Cancelo
Not found Joao-Cancelo 2023-2024 bd6351cd https://fbref.com/en/players/bd6351cd/matchlogs/2023-2024/summary/Joao-Cancelo-Match-Logs
Getting data for John-Stones
Getting data for Reece-James
Getting data for Declan-Rice
Getting data for Bruno-Fernandes
Getting data for Phil-Foden
Getting data for 

In [29]:
# Clean the raw 2023-24 logs. The raw frame is overwritten, so this cell cannot
# be re-run on its own: modify_df drops the "Result" column it needs as input.
logs_23_24 = modify_df(logs_23_24)
logs_23_24.shape

(499, 54)

In [30]:
# Persist the cleaned 2023-24 logs (all competitions); to_csv also writes the row index by default.
# NOTE(review): the 2021-22 and 2022-23 cells write to "data/logs/..."; this one
# writes directly to "data/" - confirm which location is intended.
logs_23_24.to_csv("data/logs_23_24.csv")

In [67]:
# Keep only rows labelled "Premier League", excluding squads that are not
# English clubs (presumably other leagues share that competition label - confirm).
unwanted_clubs = ['Dynamo Kyiv', 'CSKA Moscow', 'Shakhtar']
logs_23_24_PL = logs_23_24.loc[
    logs_23_24["Comp"].eq("Premier League") & ~logs_23_24["Squad"].isin(unwanted_clubs)
]
# NOTE(review): the 2021-22 / 2022-23 cells write to "data/logs/";
# this path has a doubled "logs/logs/" - confirm it is intended.
logs_23_24_PL.to_csv("data/logs/logs/logs_23_24_PL.csv")
# Show the remaining squads to check the filter.
logs_23_24_PL["Squad"].unique()

array(['Brentford', 'Arsenal', 'Manchester Utd', 'Everton',
       'Sheffield Utd', 'Fulham', 'Newcastle Utd', 'Liverpool',
       'Luton Town', 'West Ham', 'Burnley', "Nott'ham Forest",
       'Bournemouth', 'Manchester City', 'Chelsea', 'Tottenham',
       'Brighton', 'Aston Villa', 'Wolves', 'Crystal Palace'],
      dtype=object)

# Merged logs

In [44]:
# Stack the three cleaned seasons into one frame with a fresh 0..n-1 index.
# Each per-season frame still carries the "index" column produced by
# modify_df's reset_index(); it is no longer meaningful here, so drop it.
logs = pd.concat([logs_21_22, logs_22_23, logs_23_24], ignore_index=True)
logs = logs.drop(columns=['index'])

In [45]:
# Sanity check: the row count should equal the sum of the three season frames.
logs.shape

(43939, 53)

In [46]:
# Preview the first ten rows of the merged logs.
logs.head(10)

Unnamed: 0,Date,Day,Comp,Round,Venue,Squad,Opponent,Start,Pos,Min,...,OG,PKwon,PKcon,WDL,GoalsTeams,Team Score,Opp Score,CS,Was Home,GW
0,2021-08-21,Sat,Premier League,Matchweek 2,Home,Brighton,Watford,N,FW,45.0,...,,,,W,2–0,2,0,1.0,1.0,2
1,2021-08-28,Sat,Premier League,Matchweek 3,Home,Brighton,Everton,N,0,0.0,...,,,,L,0–2,0,2,0.0,1.0,3
2,2021-09-01,Wed,WCQ,First round,Away,ie Rep. of Ireland,pt Portugal,Y,FW,71.0,...,,,,L,1–2,1,2,0.0,0.0,round
3,2021-09-04,Sat,WCQ,First round,Home,ie Rep. of Ireland,az Azerbaijan,Y,LM,45.0,...,,,,D,1–1,1,1,0.0,1.0,round
4,2021-09-19,Sun,Premier League,Matchweek 5,Home,Brighton,Leicester City,N,0,0.0,...,,,,W,2–1,2,1,0.0,1.0,5
5,2021-09-22,Wed,EFL Cup,Third round,Home,Brighton,Swansea City,Y,LM,75.0,...,,,,W,2–0,2,0,1.0,1.0,round
6,2021-09-27,Mon,Premier League,Matchweek 6,Away,Brighton,Crystal Palace,N,LM,15.0,...,,,,D,1–1,1,1,0.0,0.0,6
7,2021-10-02,Sat,Premier League,Matchweek 7,Home,Brighton,Arsenal,N,0,0.0,...,,,,D,0–0,0,0,1.0,1.0,7
8,2021-10-09,Sat,WCQ,First round,Away,ie Rep. of Ireland,az Azerbaijan,N,0,0.0,...,,,,W,3–0,3,0,1.0,0.0,round
9,2021-10-12,Tue,Friendlies (M),Friendlies (M),Home,ie Rep. of Ireland,qa Qatar,N,0,0.0,...,,,,W,4–0,4,0,1.0,1.0,(M)


In [47]:
# List every column available in the merged logs.
logs.columns

Index(['Date', 'Day', 'Comp', 'Round', 'Venue', 'Squad', 'Opponent', 'Start',
       'Pos', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
       'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG', 'xAG', 'SCA', 'GCA',
       'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries', 'PrgC', 'Att.1', 'Succ',
       'Match Report', 'Name', 'Season', 'Fls', 'Fld', 'Off', 'Crs', 'TklW',
       'OG', 'PKwon', 'PKcon', 'WDL', 'GoalsTeams', 'Team Score', 'Opp Score',
       'CS', 'Was Home', 'GW'],
      dtype='object')

In [48]:
# Persist the merged logs (all seasons, all competitions); to_csv also writes the row index by default.
logs.to_csv("data/logs_all.csv")

In [68]:
# Keep only rows labelled "Premier League", excluding squads that are not
# English clubs (presumably other leagues share that competition label - confirm).
unwanted_clubs = ['Dynamo Kyiv', 'CSKA Moscow', 'Shakhtar']
logs_PL = logs.loc[
    logs["Comp"].eq("Premier League") & ~logs["Squad"].isin(unwanted_clubs)
]
# NOTE(review): the 2021-22 / 2022-23 cells write to "data/logs/";
# this path has a doubled "logs/logs/" - confirm it is intended.
logs_PL.to_csv("data/logs/logs/logs_all_PL.csv")
# Show the remaining squads to check the filter.
logs_PL["Squad"].unique()

array(['Brighton', 'West Ham', 'Arsenal', 'Manchester Utd', 'Everton',
       'Liverpool', 'Newcastle Utd', 'Southampton', 'Manchester City',
       'Chelsea', 'Tottenham', 'Burnley', 'Aston Villa', 'Norwich City',
       'Wolves', 'Brentford', 'Crystal Palace', 'Watford',
       'Leicester City', 'Leeds United', 'Bournemouth', 'Fulham',
       "Nott'ham Forest", 'Sheffield Utd', 'Luton Town'], dtype=object)

# Features check

In [49]:
# Columns that identify / describe a match row.
info = ["Date", "Day", "GW", "Was Home"]
# Per-match stat columns inspected below (presumably the candidate model features - confirm).
features = ["Min", "Gls", "Sh", "SoT", "xG", "npxG", "xAG", "CS"]

In [50]:
# Show only the descriptive and stat columns selected above.
logs[info + features]

Unnamed: 0,Date,Day,GW,Was Home,Min,Gls,Sh,SoT,xG,npxG,xAG,CS
0,2021-08-21,Sat,2,1.0,45.0,0.0,1,0,0.3,0.3,0.0,1.0
1,2021-08-28,Sat,3,1.0,0.0,0.0,0,0,0,0,0,0.0
2,2021-09-01,Wed,round,0.0,71.0,0.0,4,0,,,,0.0
3,2021-09-04,Sat,round,1.0,45.0,0.0,3,1,,,,0.0
4,2021-09-19,Sun,5,1.0,0.0,0.0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
43934,2023-08-12,Sat,1,0.0,90.0,0.0,1,1,0.2,0.2,0.3,0.0
43935,2023-08-14,Mon,of,0.0,90.0,0.0,6,2,,,,0.0
43936,2023-08-13,Sun,1,1.0,71.0,1.0,3,2,0.3,0.3,0.1,0.0
43937,2023-08-12,Sat,1,0.0,45.0,0.0,0,0,0.0,0.0,0.1,0.0


In [51]:
# Spot check: all merged rows for a single player.
logs[logs["Name"] == "Mohamed-Salah"]

Unnamed: 0,Date,Day,Comp,Round,Venue,Squad,Opponent,Start,Pos,Min,...,OG,PKwon,PKcon,WDL,GoalsTeams,Team Score,Opp Score,CS,Was Home,GW
15383,2021-08-14,Sat,Premier League,Matchweek 1,Away,Liverpool,Norwich City,Y,RW,90.0,...,,,,W,3–0,3,0,1.0,0.0,1
15384,2021-08-21,Sat,Premier League,Matchweek 2,Home,Liverpool,Burnley,Y,RW,90.0,...,,,,W,2–0,2,0,1.0,1.0,2
15385,2021-08-28,Sat,Premier League,Matchweek 3,Home,Liverpool,Chelsea,Y,RW,90.0,...,,,,D,1–1,1,1,0.0,1.0,3
15386,2021-09-05,Sun,WCQ,Second round,Away,eg Egypt,ga Gabon,Y*,AM,90.0,...,,,,D,1–1,1,1,0.0,0.0,round
15387,2021-09-12,Sun,Premier League,Matchweek 4,Away,Liverpool,Leeds United,Y,RW,90.0,...,,,,W,3–0,3,0,1.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37326,2023-05-15,Mon,Premier League,Matchweek 36,Away,Liverpool,Leicester City,Y,RW,87.0,...,,,,W,3–0,3,0,1.0,0.0,36
37327,2023-05-20,Sat,Premier League,Matchweek 37,Home,Liverpool,Aston Villa,Y,RW,90.0,...,,,,D,1–1,1,1,0.0,1.0,37
37328,2023-05-28,Sun,Premier League,Matchweek 38,Away,Liverpool,Southampton,Y,"RW,RM",90.0,...,,,,D,4–4,4,4,0.0,0.0,38
37329,2023-06-14,Wed,Africa Cup of Nations qualification,Group stage,Away,eg Egypt,gn Guinea,Y*,FW,90.0,...,,,,W,2–1,2,1,0.0,0.0,stage
