In [38]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Obtaining dependency information for fuzzywuzzy from https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import random



In [4]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [19]:
comprehensive_df = pd.read_csv('comprehensive_df.csv')
comprehensive_df['Appearances'] = pd.to_numeric(comprehensive_df['Appearances'])
comprehensive_df['Season'] = comprehensive_df['Season'].astype(str)

In [6]:
# Define the URL and headers
url = 'https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats'
headers = {'User-Agent': 'Mozilla/5.0'}

# Send the GET request
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Select the tbody element that contains all the rows
tbody = soup.select_one('#stats_defense > tbody')

# Initialize a list to store player data
players_data = []

# Check if tbody is found
if tbody:
    # Select all the rows within the tbody
    rows = tbody.select('tr')

        # Iterate through each row and extract the relevant data
    for row in rows:
        player_name_element = row.select_one('td:nth-child(2)')
        tackles_made_element = row.select_one('td:nth-child(11)')
        tackle_percentage_element = row.select_one('td:nth-child(17)')
        interceptions_element = row.select_one('td:nth-child(22)')
        age_element = row.select_one('td:nth-child(7)')
        nation_link = row.select_one('td.left.poptip > a')
    
        # Extract data, leaving fields blank if elements are missing
        player_name = player_name_element.text.strip() if player_name_element else ''
        tackles_made = tackles_made_element.text.strip() if tackles_made_element else ''
        tackle_percentage = tackle_percentage_element.text.strip() if tackle_percentage_element else ''
        interceptions = interceptions_element.text.strip() if interceptions_element else ''
        age = age_element.text.strip() if age_element else ''
         # Extract nationality and remove "Football" part
        nationality = ''
        if nation_link:
            nationality = nation_link['href'].split('/')[-1].replace('-', ' ').title()
            # Remove "Football" and everything after it
            nationality = nationality.split(' Football')[0]
    
        # Append the extracted data to the list
        players_data.append({
            'Player Name': player_name,
            'Tackles Made': tackles_made,
            'Tackle Percentage': tackle_percentage,
            'Interceptions': interceptions,
            'Age': age,
            'Nationality': nationality
        })

    # Convert the list of dictionaries to a DataFrame
    df_players = pd.DataFrame(players_data)

    # Display the DataFrame
    print(df_players)
else:
    print("Table body not found. Please check the selector or page structure.")


            Player Name Tackles Made Tackle Percentage Interceptions Age  \
0            Max Aarons           19              58.8             8  23   
1      Brenden Aaronson           18              50.0             2  22   
2       Paxten Aaronson            2             100.0             0  19   
3     Keyliane Abdallah            0                               0  17   
4      Yunis Abdelhamid           35              57.8            39  35   
...                 ...          ...               ...           ...  ..   
2961    Martin Ødegaard           17              29.3            15  24   
2962        Milan Đurić            0               0.0             1  33   
2963                                                                       
2964        Milan Đurić            5              50.0             0  33   
2965   Mateusz Łęgowski           12              42.3             7  20   

                 Nationality  
0                    England  
1              United Sta

In [7]:
# Function to normalize a string
def normalize_string(s):
    """
    Normalize a string by removing accents and converting to lowercase.
    """
    # Decompose the unicode string into its base characters and accents
    s = unicodedata.normalize('NFKD', s)
    
    # Encode to ASCII to remove accents, then decode back to string
    s = s.encode('ASCII', 'ignore').decode('ASCII')
    
    # Convert to lowercase
    s = s.lower()
    
    return s

# Apply the normalization function to the 'Player Name' column
df_players['Player Name'] = df_players['Player Name'].apply(normalize_string)

# Display the DataFrame to verify the changes
print(df_players.head())

         Player Name Tackles Made Tackle Percentage Interceptions Age  \
0         max aarons           19              58.8             8  23   
1   brenden aaronson           18              50.0             2  22   
2    paxten aaronson            2             100.0             0  19   
3  keyliane abdallah            0                               0  17   
4   yunis abdelhamid           35              57.8            39  35   

     Nationality  
0        England  
1  United States  
2  United States  
3         France  
4        Morocco  


In [8]:
# Filter comprehensive_df for the 2023/2024 season
comprehensive_copy = comprehensive_df[(comprehensive_df['Appearances'] > 15)]

# Perform the comparison
matched_players = df_players[df_players['Player Name'].isin(comprehensive_copy['Player'])]

# Display the matched players
print("Matched players in 2023/2024 season:")
print(matched_players)

Matched players in 2023/2024 season:
            Player Name Tackles Made Tackle Percentage Interceptions Age  \
0            max aarons           19              58.8             8  23   
1      brenden aaronson           18              50.0             2  22   
4      yunis abdelhamid           35              57.8            39  35   
5     salis abdul samed           14              44.4            12  23   
7       laurent abergel           52              39.6            61  30   
...                 ...          ...               ...           ...  ..   
2955       nadir zortea            4              25.0             1  24   
2956         kurt zouma           17              62.5            29  28   
2957      igor zubeldia           25              73.1            32  26   
2958   martin zubimendi           27              54.8            37  24   
2961     martin degaard           17              29.3            15  24   

        Nationality  
0           England  
1     

In [9]:
len(comprehensive_copy)

28300

In [82]:
comprehensive_copy.head()

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,...,Minutes Played,Club,Season,Team,Placement,League,Champions League,Domestic Cup,TOTY,TOTY_Nominee
10119,thibaut courtois,31,Goalkeeper,1,Belgium,8,5,0,0,0,...,423.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10120,andriy lunin,24,Goalkeeper,13,Ukraine,55,31,0,0,2,...,2850.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10121,kepa arrizabalaga,28,Goalkeeper,25,Spain,50,20,0,0,2,...,1767.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10122,lucas canizares,21,Goalkeeper,0,Spain,8,0,0,0,0,...,0.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10123,mario de luis,21,Goalkeeper,0,Spain,3,0,0,0,0,...,0.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0


In [10]:
# Find players who are in comprehensive_2023_2024 but not in df_players
missing_players = comprehensive_copy[comprehensive_copy['Player'].apply(lambda x: x not in final_df['Player Name'].values)]

# Display the missing players along with their other attributes
missing_players

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,...,Minutes Played,Club,Season,Team,Placement,League,Champions League,Domestic Cup,TOTY,TOTY_Nominee
1,victor valdes,26,Goalkeeper,1,Spain,52,49,0,0,3,...,4410.0,FC Barcelona,2008/2009,fc-barcelona,1,laliga,1,1,0,0
6,martin caceres,21,Centre-Back,2,Uruguay,51,23,0,0,3,...,1523.0,FC Barcelona,2008/2009,fc-barcelona,1,laliga,1,1,0,0
8,gerard pique,21,Centre-Back,3,Spain,50,45,3,1,7,...,3932.0,FC Barcelona,2008/2009,fc-barcelona,1,laliga,1,1,0,0
9,rafa marquez,29,Centre-Back,4,Mexico,41,37,3,3,7,...,3020.0,FC Barcelona,2008/2009,fc-barcelona,1,laliga,1,1,0,0
10,carles puyol,30,Centre-Back,5,Spain,49,45,1,3,8,...,3768.0,FC Barcelona,2008/2009,fc-barcelona,1,laliga,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57954,cheikh sabaly,24,Left Winger,14,Senegal,33,31,3,1,5,...,1472.0,FC Metz,2023/2024,fc-metz,16,ligue-1,0,0,0,0
57969,isaak toure,20,Centre-Back,95,France,26,24,1,1,6,...,2052.0,FC Lorient,2023/2024,fc-lorient,17,ligue-1,0,0,0,0
58002,aiyegun tosin,25,Centre-Forward,27,Benin,23,20,2,1,0,...,974.0,FC Lorient,2023/2024,fc-lorient,17,ligue-1,0,0,0,0
58019,cheick oumar konate,19,Right-Back,15,Mali,22,20,1,1,4,...,1139.0,Clermont Foot 63,2023/2024,clermont-foot-63,18,ligue-1,0,0,0,0


In [68]:
# Filter the DataFrame for the 2023/2024 season and players with more than 5 appearances
comprehensive_copy = comprehensive_df[(comprehensive_df['Appearances'] > 15) & (comprehensive_df['Season'] > '2017/2018')]

# Find players who are in comprehensive_2023_2024 but not in df_players
missing_players_full = comprehensive_copy[comprehensive_copy['Player'].apply(lambda x: x not in final_df['Player Name'].values)]

# Display the missing players along with their other attributes
missing_players_full

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,...,Minutes Played,Club,Season,Team,Placement,League,Champions League,Domestic Cup,TOTY,TOTY_Nominee
6473,,23,Centre-Back,15,France,57,45,2,1,11,...,3654.0,FC Barcelona,2018/2019,fc-barcelona,1,laliga,0,0,0,0
6521,juanfran torres,33,Right-Back,20,Spain,42,30,0,2,8,...,2336.0,Atlético de Madrid,2018/2019,atletico-madrid,2,laliga,0,0,0,0
6550,daniel carvajal,26,Right-Back,2,Spain,40,37,1,6,13,...,3140.0,Real Madrid,2018/2019,real-madrid,3,laliga,0,0,0,0
6553,nacho fernandez,28,Centre-Back,6,Spain,48,30,0,1,9,...,2493.0,Real Madrid,2018/2019,real-madrid,3,laliga,0,0,0,0
6576,mariano diaz,24,Centre-Forward,7,Dominican Republic,24,19,4,0,2,...,602.0,Real Madrid,2018/2019,real-madrid,3,laliga,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57998,mohamed bamba,21,Centre-Forward,9,Cote d'Ivoire,16,16,8,3,3,...,1149.0,FC Lorient,2023/2024,fc-lorient,17,ligue-1,0,0,0,0
58002,aiyegun tosin,25,Centre-Forward,27,Benin,23,20,2,1,0,...,974.0,FC Lorient,2023/2024,fc-lorient,17,ligue-1,0,0,0,0
58019,cheick oumar konate,19,Right-Back,15,Mali,22,20,1,1,4,...,1139.0,Clermont Foot 63,2023/2024,clermont-foot-63,18,ligue-1,0,0,0,0
58035,shamar nicholson,26,Centre-Forward,23,Jamaica,30,28,4,3,2,...,1657.0,Clermont Foot 63,2023/2024,clermont-foot-63,18,ligue-1,0,0,0,0


In [None]:
# Ensure the 'Player Name' column in df_players is of string type
final_df['Player Name'] = final_df['Player Name'].astype(str)

# Ensure the 'Player' column in missing_players is of string type
missing_players['Player'] = missing_players['Player'].astype(str)

# List to store matched results
matched_players = []

# Iterate over the missing players
for missing_name in missing_players['Player']:
    # Perform fuzzy matching
    best_match = process.extractOne(missing_name, final_df['Player Name'], scorer=fuzz.ratio)
    
    # If a match is found, unpack the result
    if best_match:
        match_name, match_score, match_index = best_match
        
        # Ensure the matched age is converted to an integer (in case it isn't already)
        matched_age = int(final_df.iloc[match_index]['Age'])
        missing_age = int(missing_players[missing_players['Player'] == missing_name]['Age'].values[0])
        
        # Calculate the age difference
        age_difference = abs(missing_age - matched_age)

        # Store the matched information including the age difference
        matched_players.append({
            'Missing Player': missing_name,
            'Matched Player': match_name,
            'Match Score': match_score,
            'Missing Age': missing_age,
            'Matched Age': matched_age,
            'Age Difference': age_difference,
            'Missing Nationality': missing_players[missing_players['Player'] == missing_name]['Nationality'].values[0],
            'Matched Nationality': final_df.iloc[match_index]['Nationality']
        })


# Convert the matched_players list to a DataFrame for easy viewing
matched_players_df = pd.DataFrame(matched_players)


In [85]:
matched_players_df.dtypes

Missing Player         object
Matched Player         object
Match Score             int64
Missing Age             int64
Matched Age             int64
Age Difference          int64
Missing Nationality    object
Matched Nationality    object
dtype: object

In [86]:
# Display the matched players
matched_players_df

Unnamed: 0,Missing Player,Matched Player,Match Score,Missing Age,Matched Age,Age Difference,Missing Nationality,Matched Nationality
0,daniel carvajal,dani carvajal,93,31,31,0,Spain,Spain
1,nacho fernandez,enzo fernandez,83,33,22,11,Spain,Argentina
2,,natan,75,33,22,11,Spain,Brazil
3,savinho,savio,83,19,19,0,Brazil,Brazil
4,viktor tsygankov,viktor tsyhankov,94,25,25,0,Ukraine,Ukraine
5,memphis depay,memphis,70,29,29,0,Netherlands,Netherlands
6,dani vivian,daniel vivian,92,23,24,1,Spain,Spain
7,,natan,75,33,22,11,Spain,Brazil
8,,natan,75,33,22,11,Spain,Brazil
9,rodri sanchez,robert sanchez,81,23,25,2,Spain,Spain


In [45]:
#Defensive stats
# Create a session
session = requests.Session()

# Define the retry strategy with a longer backoff time
retry = Retry(
    total=5,  # Total number of retries
    backoff_factor=5,  # Exponential backoff factor (5, 10, 20, 40 seconds)
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    allowed_methods=["HEAD", "GET", "OPTIONS"]  # Retry for these methods
)

# Attach the HTTPAdapter with the retry strategy to the session
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)

base_url = "https://fbref.com/en/comps/Big5/{}/defense/players/{}-Big-5-European-Leagues-Stats"
seasons = [
    "2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023",
    "2023-2024"
]

all_players_data = []

for season in seasons:
    if season == "2023-2024":
        url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
    else:
        url = base_url.format(season, season)
    
    # Make the request with retry logic
    try:
        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.select("#stats_defense > tbody > tr")
        
        for row in rows:
            player_name_element = row.select_one('td:nth-child(2)')
            tackles_made_element = row.select_one('td:nth-child(11)')
            tackle_percentage_element = row.select_one('td:nth-child(17)')
            interceptions_element = row.select_one('td:nth-child(22)')
            age_element = row.select_one('td:nth-child(7)')
            nation_link = row.select_one('td.left.poptip > a')
            
            if player_name_element and tackles_made_element and tackle_percentage_element and interceptions_element and age_element and nationality_element:
                player_name = player_name_element.text.strip()
                tackles_made = tackles_made_element.text.strip()
                tackle_percentage = tackle_percentage_element.text.strip()
                interceptions = interceptions_element.text.strip()
                         # Check if age_text is not empty and contains a dash
                age_text = age_element.text.strip()
                if age_text and '-' in age_text:
                    age = int(age_text.split('-')[0]) - 1
                elif age_text:  # Ensure age_text is not empty
                    age = int(age_text)
                else:
                    age = None  # or any default value if age is missing
                    
                 # Extract nationality and remove "Football" part
                nationality = ''
                if nation_link:
                    nationality = nation_link['href'].split('/')[-1].replace('-', ' ').title()
                    # Remove "Football" and everything after it
                    nationality = nationality.split(' Football')[0]
                
                all_players_data.append({
                    'Season': season,
                    'Player Name': player_name,
                    'Tackles Made': tackles_made,
                    'Tackle Percentage': tackle_percentage,
                    'Interceptions': interceptions,
                    'Age': age,
                    'Nationality': nationality
                })
        
        # Introduce a random delay between 5 and 10 seconds to avoid detection
        time.sleep(random.uniform(5, 10))

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data for season {season}: {e}")

In [46]:
all_players_df = pd.DataFrame(all_players_data)
all_players_df

Unnamed: 0,Season,Player Name,Tackles Made,Tackle Percentage,Interceptions,Age,Nationality
0,2017-2018,Patrick van Aanholt,32,47.1,47,26.0,Netherlands
1,2017-2018,Rolando Aarons,4,66.7,1,21.0,England
2,2017-2018,Rolando Aarons,8,14.3,2,21.0,England
3,2017-2018,Ignazio Abate,17,54.5,8,30.0,Italy
4,2017-2018,Aymen Abdennour,3,57.1,4,27.0,Tunisia
...,...,...,...,...,...,...,...
16768,2023-2024,Chrisantus Uche,0,100.0,0,20.0,Nigeria
16769,2023-2024,Mikel Vesga,1,50.0,1,30.0,Spain
16770,2023-2024,Daniel Vivian,0,,0,24.0,Spain
16771,2023-2024,Iñaki Williams,0,,0,29.0,Ghana


In [47]:
#Passing stats
# Create a session
session = requests.Session()

# Define the retry strategy with a longer backoff time
retry = Retry(
    total=5,  # Total number of retries
    backoff_factor=5,  # Exponential backoff factor (5, 10, 20, 40 seconds)
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    allowed_methods=["HEAD", "GET", "OPTIONS"]  # Retry for these methods
)

# Attach the HTTPAdapter with the retry strategy to the session
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)

base_url = "https://fbref.com/en/comps/Big5/{}/passing/players/{}-Big-5-European-Leagues-Stats"
seasons = [
    "2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023",
    "2023-2024"
]

all_passing_data = []

for season in seasons:
    if season == "2023-2024":
        url = "https://fbref.com/en/comps/Big5/passing/players/Big-5-European-Leagues-Stats"
    else:
        url = base_url.format(season, season)
    
    # Make the request with retry logic
    try:
        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.select("#stats_passing > tbody > tr")
        
        for row in rows:
            player_name_element = row.select_one('td:nth-child(2)')
            attempted_passes_element = row.select_one('td:nth-child(11)')
            completion_percentage_element = row.select_one('td:nth-child(12)')
            progressive_pass_distance_element = row.select_one('td:nth-child(14)')
            expected_assists_element = row.select_one('td:nth-child(25)')
            key_passes_element = row.select_one('td:nth-child(28)')
            age_element = row.select_one('td:nth-child(7)')
            nation_link = row.select_one('td.left.poptip > a')
            
            if player_name_element and attempted_passes_element and completion_percentage_element and progressive_pass_distance_element and expected_assists_element and key_passes_element and age_element and nationality_element:
                player_name = player_name_element.text.strip()
                attempted_passes = attempted_passes_element.text.strip()
                completion_percentage = completion_percentage_element.text.strip()
                progressive_pass_distance = progressive_pass_distance_element.text.strip()
                expected_assists = expected_assists_element.text.strip()
                key_passes = key_passes_element.text.strip()
                         # Get the raw age text
                         # Check if age_text is not empty and contains a dash
                age_text = age_element.text.strip()
                if age_text and '-' in age_text:
                    age = int(age_text.split('-')[0]) - 1
                elif age_text:  # Ensure age_text is not empty
                    age = int(age_text)
                else:
                    age = None  # or any default value if age is missing
                    
                 # Extract nationality and remove "Football" part
                nationality = ''
                if nation_link:
                    nationality = nation_link['href'].split('/')[-1].replace('-', ' ').title()
                    # Remove "Football" and everything after it
                    nationality = nationality.split(' Football')[0]
                
                all_passing_data.append({
                    'Season': season,
                    'Player Name': player_name,
                    'Attempted Passes': attempted_passes,
                    'Completion Percentage': completion_percentage,
                    'Progressive Pass Distance': progressive_pass_distance,
                    'Expected Assists': expected_assists,
                    'Key Passes': key_passes,
                    'Age': age,
                    'Nationality': nationality
                })
        
        # Introduce a random delay between 5 and 10 seconds to avoid detection
        time.sleep(random.uniform(5, 10))

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data for season {season}: {e}")

# Convert the scraped data into a DataFrame
df_passing = pd.DataFrame(all_passing_data)


In [48]:
df_passing

Unnamed: 0,Season,Player Name,Attempted Passes,Completion Percentage,Progressive Pass Distance,Expected Assists,Key Passes,Age,Nationality
0,2017-2018,Patrick van Aanholt,1176,75.2,6422,2.1,18,26.0,Netherlands
1,2017-2018,Rolando Aarons,44,65.9,77,0.0,0,21.0,England
2,2017-2018,Rolando Aarons,120,72.5,325,0.2,3,21.0,England
3,2017-2018,Ignazio Abate,776,80.5,4535,0.5,10,30.0,Italy
4,2017-2018,Aymen Abdennour,333,93.1,1557,0.0,0,27.0,Tunisia
...,...,...,...,...,...,...,...,...,...
16768,2023-2024,Chrisantus Uche,11,63.6,7,0.0,1,20.0,Nigeria
16769,2023-2024,Mikel Vesga,38,71.1,118,0.0,0,30.0,Spain
16770,2023-2024,Daniel Vivian,19,94.7,137,0.0,0,24.0,Spain
16771,2023-2024,Iñaki Williams,26,61.5,42,0.0,0,29.0,Ghana


In [49]:
#Goalkeeping stats
# Create a session
session = requests.Session()

# Define the retry strategy with a longer backoff time
retry = Retry(
    total=5,  # Total number of retries
    backoff_factor=5,  # Exponential backoff factor (5, 10, 20, 40 seconds)
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    allowed_methods=["HEAD", "GET", "OPTIONS"]  # Retry for these methods
)

# Attach the HTTPAdapter with the retry strategy to the session
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)

base_url = "https://fbref.com/en/comps/Big5/{}/keepers/players/{}-Big-5-European-Leagues-Stats"
seasons = [
    "2017-2018", "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023",
    "2023-2024"
]

all_goalkeeper_data = []

for season in seasons:
    if season == "2023-2024":
        url = "https://fbref.com/en/comps/Big5/keepers/players/Big-5-European-Leagues-Stats"
    else:
        url = base_url.format(season, season)
    
    # Make the request with retry logic
    try:
        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.select("#stats_keeper > tbody > tr")
        
        for row in rows:
            player_name_element = row.select_one('td:nth-child(2)')
            position_element = row.select_one('td:nth-child(4)')
            goals_against_element = row.select_one('td:nth-child(13)')
            shots_on_target_against_element = row.select_one('td:nth-child(15)')
            saves_element = row.select_one('td:nth-child(16)')
            save_percentage_element = row.select_one('td:nth-child(17)')
            clean_sheets_element = row.select_one('td:nth-child(21)')
            clean_sheet_percentage_element = row.select_one('td:nth-child(22)')
            age_element = row.select_one('td:nth-child(7)')
            nation_link = row.select_one('td.left.poptip > a')
            
            if player_name_element and position_element and goals_against_element and shots_on_target_against_element and saves_element and save_percentage_element and clean_sheets_element and clean_sheet_percentage_element and age_element and nationality_element:
                player_name = player_name_element.text.strip()
                position = position_element.text.strip()
                goals_against = goals_against_element.text.strip() or '0'
                shots_on_target_against = shots_on_target_against_element.text.strip() or '0'
                saves = saves_element.text.strip() or '0'
                save_percentage = save_percentage_element.text.strip() or '0'
                clean_sheets = clean_sheets_element.text.strip() or '0'
                clean_sheet_percentage = clean_sheet_percentage_element.text.strip() or '0'
                         # Get the raw age text
                 # Process age to handle dashes
                                  # Check if age_text is not empty and contains a dash
                age_text = age_element.text.strip()
                if age_text and '-' in age_text:
                    age = int(age_text.split('-')[0]) - 1
                elif age_text:  # Ensure age_text is not empty
                    age = int(age_text)
                else:
                    age = None  # or any default value if age is missing
                    
                 # Extract nationality and remove "Football" part
                nationality = ''
                if nation_link:
                    nationality = nation_link['href'].split('/')[-1].replace('-', ' ').title()
                    # Remove "Football" and everything after it
                    nationality = nationality.split(' Football')[0]
                
                all_goalkeeper_data.append({
                    'Season': season,
                    'Player Name': player_name,
                    'Position': position,
                    'Goals Against': goals_against,
                    'Shots on Target Against': shots_on_target_against,
                    'Saves': saves,
                    'Save Percentage': save_percentage,
                    'Clean Sheets': clean_sheets,
                    'Clean Sheet Percentage': clean_sheet_percentage,
                    'Age': age,
                    'Nationality': nationality
                })
        
        # Introduce a random delay between 5 and 10 seconds to avoid detection
        time.sleep(random.uniform(5, 10))

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data for season {season}: {e}")

# Convert the scraped data into a DataFrame
df_goalkeepers = pd.DataFrame(all_goalkeeper_data)


In [50]:
df_goalkeepers

Unnamed: 0,Season,Player Name,Position,Goals Against,Shots on Target Against,Saves,Save Percentage,Clean Sheets,Clean Sheet Percentage,Age,Nationality
0,2017-2018,Antonio Adán,GK,53,132,82,62.1,9,30.0,30,Spain
1,2017-2018,René Adler,GK,18,53,35,67.9,4,28.6,32,Germany
2,2017-2018,Adrián,GK,29,96,68,70.8,6,31.6,30,Spain
3,2017-2018,Alisson,GK,28,135,105,81.5,17,45.9,24,Brazil
4,2017-2018,Sergio Álvarez,GK,29,75,47,62.7,3,18.8,30,Spain
...,...,...,...,...,...,...,...,...,...,...,...
1246,2022-2023,Petar Zovko,GK,2,3,1,33.3,0,0,20,Bosnia And Herzegovina
1247,2023-2024,Paulo Gazzaniga,GK,1,4,3,75.0,0,0.0,31,Argentina
1248,2023-2024,Alex Padilla,GK,1,2,1,50.0,0,0.0,19,Spain
1249,2023-2024,Rui Silva,GK,1,2,1,50.0,0,0.0,29,Portugal


In [64]:
# Step 1: Merge all_players_df with df_passing
# Step 1: Merge all_players_df with df_passing on 'Name', 'Season', and 'Age'
combined_df = pd.merge(all_players_df, df_passing, on=['Player Name', 'Season', 'Age'], how='outer')

#combined_df

combined_df_check = combined_df[(combined_df['Season'] > '2022/2023')]
combined_df_check

# Step 2: Merge the result with df_goalkeepers on 'Name', 'Season', and 'Age'
final_df = pd.merge(combined_df, df_goalkeepers, on=['Player Name', 'Season', 'Age'], how='outer')
final_df

Unnamed: 0,Season,Player Name,Tackles Made,Tackle Percentage,Interceptions,Age,Nationality_x,Attempted Passes,Completion Percentage,Progressive Pass Distance,...,Key Passes,Nationality_y,Position,Goals Against,Shots on Target Against,Saves,Save Percentage,Clean Sheets,Clean Sheet Percentage,Nationality
0,2019-2020,Aaron Connolly,6,30.0,0,19.0,Republic Of Ireland,155,72.9,183,...,6,Republic Of Ireland,,,,,,,,
1,2020-2021,Aaron Connolly,3,0.0,0,20.0,Republic Of Ireland,94,76.6,176,...,6,Republic Of Ireland,,,,,,,,
2,2021-2022,Aaron Connolly,0,0.0,0,21.0,Republic Of Ireland,19,73.7,16,...,1,Republic Of Ireland,,,,,,,,
3,2017-2018,Aaron Cresswell,21,63.3,44,27.0,England,1723,69.9,9345,...,34,England,,,,,,,,
4,2018-2019,Aaron Cresswell,16,52.0,23,28.0,England,1082,78.4,5043,...,17,England,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18366,2018-2019,Ștefan Radu,27,65.8,42,31.0,Romania,1329,84.0,7405,...,9,Romania,,,,,,,,
18367,2019-2020,Ștefan Radu,25,65.1,34,32.0,Romania,1378,86.2,7370,...,13,Romania,,,,,,,,
18368,2020-2021,Ștefan Radu,24,62.5,45,33.0,Romania,1635,82.9,9253,...,19,Romania,,,,,,,,
18369,2021-2022,Ștefan Radu,6,55.6,5,34.0,Romania,330,81.2,1809,...,0,Romania,,,,,,,,


In [65]:
# Apply the normalization function to the 'Player Name' column
final_df['Player Name'] = final_df['Player Name'].apply(normalize_string)

# Display the DataFrame to verify the changes
print(final_df.head())

      Season      Player Name Tackles Made Tackle Percentage Interceptions  \
0  2019-2020   aaron connolly            6              30.0             0   
1  2020-2021   aaron connolly            3               0.0             0   
2  2021-2022   aaron connolly            0               0.0             0   
3  2017-2018  aaron cresswell           21              63.3            44   
4  2018-2019  aaron cresswell           16              52.0            23   

    Age        Nationality_x Attempted Passes Completion Percentage  \
0  19.0  Republic Of Ireland              155                  72.9   
1  20.0  Republic Of Ireland               94                  76.6   
2  21.0  Republic Of Ireland               19                  73.7   
3  27.0              England             1723                  69.9   
4  28.0              England             1082                  78.4   

  Progressive Pass Distance  ... Key Passes        Nationality_y Position  \
0                       183

In [67]:
# Ensure the 'Player Name' column in df_players is of string type
final_df['Player Name'] = final_df['Player Name'].astype(str)

# Ensure the 'Player' column in missing_players is of string type
missing_players['Player'] = missing_players['Player'].astype(str)

# List to store matched results
matched_players = []

# Iterate over the missing players
for missing_name in missing_players['Player']:
    # Perform fuzzy matching
    best_match = process.extractOne(missing_name, final_df['Player Name'], scorer=fuzz.ratio)
    
    # If a match is found, unpack the result
    if best_match:
        match_name, match_score, match_index = best_match
        
        # Ensure the matched age is converted to an integer (in case it isn't already)
        matched_age = int(final_df.iloc[match_index]['Age'])
        missing_age = int(missing_players[missing_players['Player'] == missing_name]['Age'].values[0])
        
        # Calculate the age difference
        age_difference = abs(missing_age - matched_age)

        # Store the matched information including the age difference
        matched_players.append({
            'Missing Player': missing_name,
            'Matched Player': match_name,
            'Match Score': match_score,
            'Missing Age': missing_age,
            'Matched Age': matched_age,
            'Age Difference': age_difference,
            'Missing Nationality': missing_players[missing_players['Player'] == missing_name]['Nationality'].values[0],
            'Matched Nationality': final_df.iloc[match_index]['Nationality']
        })


# Convert the matched_players list to a DataFrame for easy viewing
matched_players_df = pd.DataFrame(matched_players)


KeyboardInterrupt: 

In [None]:
matched_players_df