# Scraping FBREF


### Import Required Libraries

In [None]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
import pandas as pd

## Fetching teams from top 5 leagues to include in our model

### Setting the league page URLs

In [5]:
# League page URLs on fbref.com (the /fr/ variant of the site), one per league.
# Names on the left line up positionally with the URLs on the right.
(
    PLurl,
    LaligaUrl,
    BundesligaUrl,
    SerieAUrl,
    Ligue1Url,
) = (
    "https://fbref.com/fr/comps/9/Statistiques-Premier-League",
    "https://fbref.com/fr/comps/12/Statistiques-La-Liga",
    "https://fbref.com/fr/comps/20/Statistiques-Bundesliga",
    "https://fbref.com/fr/comps/11/Statistiques-Serie-A",
    "https://fbref.com/fr/comps/13/Statistiques-Ligue-1",
)


In [9]:
# Empty table that collects one row per scraped team: its name and its league
teamsdf = pd.DataFrame(data=None, columns=['Team', 'league'])

# Location of the ChromeDriver executable passed to Selenium when a browser is started
path = "C:\\Users\\azedd\\Downloads\\chromedriver-win64\\chromedriver.exe"

# Develop a fetch function
def teams_fetch(url):
    """Scrape the team names shown on a league page and add them to ``teamsdf``.

    Opens ``url`` in a new Chrome session, reads the text of every
    ``<td data-stat="team">`` cell and appends one ``[team, league]`` row to
    the module-level ``teamsdf`` for each name that is not already present
    in its ``Team`` column.

    Args:
        url: League page URL. The league label is derived from the URL
            itself: everything after the first ``-``, with the remaining
            dashes turned into spaces (``.../Statistiques-Premier-League``
            gives ``"Premier League"``).

    Returns:
        The module-level ``teamsdf`` (mutated in place).
    """
    driver = webdriver.Chrome(service=webdriver.ChromeService(executable_path=path))
    try:
        driver.get(url)
        team_cells = driver.find_elements(By.XPATH, "//td[@data-stat = 'team']")

        # Drop the part before the first dash and join the rest with spaces
        league = " ".join(url.split('-')[1:])

        for cell in team_cells:
            # Read the cell text once and reuse it for both the check and the insert
            name = cell.text
            if name not in teamsdf['Team'].values:
                teamsdf.loc[len(teamsdf)] = [name, league]
    finally:
        # Always shut the browser down, even when navigation or scraping raises;
        # otherwise every failed call leaves a Chrome/chromedriver process behind.
        driver.quit()

    return teamsdf

In [10]:

# Scrape the league pages one after another, in the same order as before.
for league_url in (PLurl, LaligaUrl, BundesligaUrl, SerieAUrl):
    teams_fetch(league_url)
# Kept as the final bare expression so its return value remains the cell's result.
teams_fetch(Ligue1Url)

Unnamed: 0,Team,league
0,Liverpool,Premier League
1,Arsenal,Premier League
2,Manchester City,Premier League
3,Chelsea,Premier League
4,Newcastle Utd,Premier League
...,...,...
92,Angers,Ligue 1
93,Le Havre,Ligue 1
94,Reims,Ligue 1
95,Saint-Étienne,Ligue 1


## Now let's fetch players data

### Create players dataframe

In [89]:
# Empty per-player statistics table. The column order is significant: rows are
# assigned to it positionally as plain lists.
playersDF = pd.DataFrame(
    columns=(
        # identity
        ['Player', 'Nation', 'Position', 'Age']
        # playing time
        + ['Matches Played', 'Starts', 'Minutes', '90s Played']
        # goals, penalties and cards
        + [
            'Goals',
            'Assists',
            'Goals + Assists',
            'Non-Penalty Goals',
            'Penalty Goals Made',
            'Penalty Attempts',
            'Yellow Cards',
            'Red Cards',
        ]
        # expected-goal figures
        + ['xG', 'npxG', 'xAG', 'npxG + xAG']
        # progression
        + ['Progressive Carries', 'Progressive Passes', 'Progressive Receives']
        # per-90 figures
        + [
            'Goals Per 90',
            'Assists Per 90',
            'Goals + Assists Per 90',
            'Non-Penalty Goals Per 90',
            'Non-Penalty Goals + Assists Per 90',
            'xG Per 90',
            'xAG Per 90',
            'xG + xAG Per 90',
            'npxG Per 90',
            'npxG + xAG Per 90',
        ]
    )
)

In [88]:
# Bookkeeping lists: teams already scraped, and teams whose scrape failed
fetchedteams, notfetched = [], []

### Fetching function

In [11]:
# Create a function that scrapes all players' data for a given team name
def players_fetch(team_name):
    """Scrape one team's player rows from FBref into ``playersDF``.

    Opens ``https://fbref.com/fr`` in a new Chrome session, searches for
    ``team_name`` and opens the team page. It then reads the table rows
    ``data-row="0"`` through ``data-row="36"`` and appends one row to the
    module-level ``playersDF`` for every row it can parse.

    A row is parsed by splitting its text on single spaces. Rows with fewer
    than 33 tokens (after the last token is discarded) are skipped, and so
    are rows in which no known position code is found after a name of one,
    two or three tokens.

    Args:
        team_name: Text typed into the site search box; also used as the
            link text that is clicked to open the team page.

    Returns:
        A confirmation message containing ``team_name``.

    Raises:
        NoSuchElementException: If the fallback navigation elements cannot
            be located, or if a row is still missing after the
            ``button2 prev`` control has been clicked.
    """
    driver = webdriver.Chrome(service=webdriver.ChromeService(executable_path=path))
    try:
        driver.get("https://fbref.com/fr")

        # Search for the team and follow the link whose text is the team name
        try:
            search_box = driver.find_element(
                By.XPATH, "//input[@tabindex = '1'][@type = 'search']"
            )
            search_box.send_keys(team_name + Keys.ENTER)
            driver.find_element(By.LINK_TEXT, team_name).click()
        # A tuple is required to catch both types: `except A or B` evaluates
        # `A or B` first and therefore only ever caught the first exception.
        except (NoSuchElementException, ElementClickInterceptedException):
            # Fallback: open the element with id 'sh_squads-tab' and click the
            # first <strong> entry under 'sh_squads'
            # (presumably the squads tab of the search results — confirm).
            driver.find_element(By.XPATH, "//*[@id='sh_squads-tab']").click()
            driver.find_element(
                By.XPATH, "//*[@id='sh_squads']/div[1]/div[1]/strong"
            ).click()

        # Position codes used to locate where the player name ends in a row.
        # NOTE(review): other combined codes (e.g. 'AT,MT') are not listed, so
        # rows carrying them are skipped — confirm this is intended.
        positions = ('AT', 'DF', 'MT,DF', 'MT,AT', 'MT', 'GB')

        for row_index in range(37):
            row_xpath = f"//tr [@data-row = '{row_index}']"
            try:
                row = driver.find_element(By.XPATH, row_xpath)
            except NoSuchElementException:
                # Row not on the current page: click the 'button2 prev' control
                # (presumably the previous-season button) and retry once.
                driver.find_element(By.XPATH, "//div [@class = 'button2 prev']").click()
                row = driver.find_element(By.XPATH, row_xpath)

            tokens = row.text.split(' ')
            tokens.pop(-1)  # the trailing token is not a statistic column
            if len(tokens) < 33:
                continue

            # The name may span 1-3 tokens. It is followed by one token that is
            # discarded (presumably a short country code — confirm), then the
            # nation and the position. Try the shortest name first.
            for name_len in (1, 2, 3):
                if tokens[name_len + 2] in positions:
                    player = " ".join(tokens[:name_len])
                    playersDF.loc[len(playersDF)] = [player] + tokens[name_len + 1:]
                    break
    finally:
        # Always release the browser, even when scraping raises
        driver.quit()

    # Callers may also record the team themselves; avoid duplicate entries
    if team_name not in fetchedteams:
        fetchedteams.append(team_name)
    return f"{team_name} players data fetched successfully!"

### Testing

In [142]:
# Example usage: fetch the players of a single team (Liverpool).
# The call is the last expression, so its returned status message is the cell's result.
players_fetch("Liverpool")

'Liverpool players data fetched successfully!'

In [92]:
# See what the players dataframe looks like.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the summary.
playersDF.info()
playersDF

<class 'pandas.core.frame.DataFrame'>
Index: 24 entries, 0 to 23
Data columns (total 33 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Player                              24 non-null     object
 1   Nation                              24 non-null     object
 2   Position                            24 non-null     object
 3   Age                                 24 non-null     object
 4   Matches Played                      24 non-null     object
 5   Starts                              24 non-null     object
 6   Minutes                             24 non-null     object
 7   90s Played                          24 non-null     object
 8   Goals                               24 non-null     object
 9   Assists                             24 non-null     object
 10  Goals + Assists                     24 non-null     object
 11  Non-Penalty Goals                   24 non-null     object
 12  P

Unnamed: 0,Player,Nation,Position,Age,Matches Played,Starts,Minutes,90s Played,Goals,Assists,...,Goals Per 90,Assists Per 90,Goals + Assists Per 90,Non-Penalty Goals Per 90,Non-Penalty Goals + Assists Per 90,xG Per 90,xAG Per 90,xG + xAG Per 90,npxG Per 90,npxG + xAG Per 90
0,Mohamed Salah,EGY,AT,32,38,38,3371,37.5,29,18,...,0.77,0.48,1.25,0.53,1.01,0.67,0.38,1.05,0.49,0.87
1,Virgil van Dijk,NED,DF,33,37,37,3330,37.0,3,1,...,0.08,0.03,0.11,0.08,0.11,0.06,0.02,0.08,0.06,0.08
2,Ryan Gravenberch,NED,MT,22,37,37,3160,35.1,0,4,...,0.0,0.11,0.11,0.0,0.11,0.03,0.09,0.12,0.03,0.12
3,Alexis Mac Allister,ARG,MT,25,35,30,2599,28.9,5,5,...,0.17,0.17,0.35,0.17,0.35,0.1,0.16,0.26,0.1,0.26
4,Ibrahima Konaté,FRA,DF,25,31,30,2560,28.4,1,2,...,0.04,0.07,0.11,0.04,0.11,0.06,0.03,0.1,0.06,0.1
5,Dominik Szoboszlai,HUN,MT,23,36,29,2491,27.7,6,6,...,0.22,0.22,0.43,0.22,0.43,0.27,0.27,0.53,0.27,0.53
6,Andrew Robertson,SCO,DF,30,33,29,2482,27.6,0,1,...,0.0,0.04,0.04,0.0,0.04,0.04,0.17,0.21,0.04,0.21
7,Alisson,BRA,GB,31,28,28,2508,27.9,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.02
8,Luis Díaz,COL,AT,27,36,28,2399,26.7,13,5,...,0.49,0.19,0.68,0.49,0.68,0.45,0.19,0.64,0.45,0.64
9,Trent Alexander-Arnold,ENG,DF,25,33,28,2365,26.3,3,6,...,0.11,0.23,0.34,0.11,0.34,0.07,0.28,0.35,0.07,0.35


### Now, let's fetch

In [131]:

# Fetch every team that has not been scraped yet. Failures are recorded in
# `notfetched` so the team names can be corrected and the loop run again.
for team in teamsdf['Team'].values:
    if team in fetchedteams:
        continue
    try:
        # players_fetch() already appends the team to `fetchedteams` on
        # success, so it is not appended again here (that created duplicates).
        print(players_fetch(team))
    except Exception as e:
        # Top-level boundary: report the failure, remember the team, carry on
        print(f"Error fetching data for {team}: {e}")
        if team not in notfetched:
            notfetched.append(team)

### See the teams for which we lack player data

In [None]:
# Print the teams whose player data could not be fetched
print(notfetched)


['Newcastle Utd', "Nott'ham", 'Manchester Utd', 'Wolves', 'St. Pauli', ' ']


### Fixing unfetched teams

In [None]:
# Overwrite the team names at the given row labels with alternative spellings.
# NOTE(review): the row labels are hard-coded and depend on the row order of
# the current teamsdf — verify them after any re-scrape.
for row_label, replacement in {
    4: "Newcastle United",
    14: "Manchester United",
    6: "Nottingham Forest",
    54: "St pauli",
    15: "Wolverhampton Wanderers",
}.items():
    teamsdf.loc[row_label, 'Team'] = replacement

# Remove the row labelled 20 (presumably the blank team entry — confirm)
teamsdf = teamsdf.drop(index=20)



In [15]:
# Check that no team is left unfetched: evaluates to True when the list is empty
not notfetched

True

### Save the players data to a CSV file

In [None]:
# Write both tables to CSV files (relative paths, so relative to the working directory)
for frame, csv_name in ((playersDF, "players_data.csv"), (teamsdf, "teams_data.csv")):
    frame.to_csv(csv_name)