In [38]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Obtaining dependency information for fuzzywuzzy from https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [90]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time
from bs4 import BeautifulSoup
import pandas as pd
import re
import unicodedata
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [77]:
comprehensive_df = pd.read_csv('comprehensive_df.csv')

In [78]:
# Define the URL and headers
url = 'https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats'
headers = {'User-Agent': 'Mozilla/5.0'}

# Send the GET request
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Select the tbody element that contains all the rows
tbody = soup.select_one('#stats_defense > tbody')

# Initialize a list to store player data
players_data = []

# Check if tbody is found
if tbody:
    # Select all the rows within the tbody
    rows = tbody.select('tr')

        # Iterate through each row and extract the relevant data
    for row in rows:
        player_name_element = row.select_one('td:nth-child(2)')
        tackles_made_element = row.select_one('td:nth-child(11)')
        tackle_percentage_element = row.select_one('td:nth-child(17)')
        interceptions_element = row.select_one('td:nth-child(22)')
        age_element = row.select_one('td:nth-child(7)')
        nation_link = row.select_one('td.left.poptip > a')
    
        # Extract data, leaving fields blank if elements are missing
        player_name = player_name_element.text.strip() if player_name_element else ''
        tackles_made = tackles_made_element.text.strip() if tackles_made_element else ''
        tackle_percentage = tackle_percentage_element.text.strip() if tackle_percentage_element else ''
        interceptions = interceptions_element.text.strip() if interceptions_element else ''
        age = age_element.text.strip() if age_element else ''
         # Extract nationality and remove "Football" part
        nationality = ''
        if nation_link:
            nationality = nation_link['href'].split('/')[-1].replace('-', ' ').title()
            # Remove "Football" and everything after it
            nationality = nationality.split(' Football')[0]
    
        # Append the extracted data to the list
        players_data.append({
            'Player Name': player_name,
            'Tackles Made': tackles_made,
            'Tackle Percentage': tackle_percentage,
            'Interceptions': interceptions,
            'Age': age,
            'Nationality': nationality
        })

    # Convert the list of dictionaries to a DataFrame
    df_players = pd.DataFrame(players_data)

    # Display the DataFrame
    print(df_players)
else:
    print("Table body not found. Please check the selector or page structure.")


            Player Name Tackles Made Tackle Percentage Interceptions Age  \
0            Max Aarons           19              58.8             8  23   
1      Brenden Aaronson           18              50.0             2  22   
2       Paxten Aaronson            2             100.0             0  19   
3     Keyliane Abdallah            0                               0  17   
4      Yunis Abdelhamid           35              57.8            39  35   
...                 ...          ...               ...           ...  ..   
2961    Martin Ødegaard           17              29.3            15  24   
2962        Milan Đurić            0               0.0             1  33   
2963                                                                       
2964        Milan Đurić            5              50.0             0  33   
2965   Mateusz Łęgowski           12              42.3             7  20   

                 Nationality  
0                    England  
1              United Sta

In [79]:
# Function to normalize a string
def normalize_string(s):
    """
    Normalize a string by removing accents and converting to lowercase.
    """
    # Decompose the unicode string into its base characters and accents
    s = unicodedata.normalize('NFKD', s)
    
    # Encode to ASCII to remove accents, then decode back to string
    s = s.encode('ASCII', 'ignore').decode('ASCII')
    
    # Convert to lowercase
    s = s.lower()
    
    return s

# Apply the normalization function to the 'Player Name' column
df_players['Player Name'] = df_players['Player Name'].apply(normalize_string)

# Display the DataFrame to verify the changes
print(df_players.head())

         Player Name Tackles Made Tackle Percentage Interceptions Age  \
0         max aarons           19              58.8             8  23   
1   brenden aaronson           18              50.0             2  22   
2    paxten aaronson            2             100.0             0  19   
3  keyliane abdallah            0                               0  17   
4   yunis abdelhamid           35              57.8            39  35   

     Nationality  
0        England  
1  United States  
2  United States  
3         France  
4        Morocco  


In [80]:
len(comprehensive_2023_2024)

1592

In [81]:
# Filter comprehensive_df for the 2023/2024 season
comprehensive_2023_2024 = comprehensive_df[comprehensive_df['Season'] == '2023/2024']

# Perform the comparison
matched_players = df_players[df_players['Player Name'].isin(comprehensive_2023_2024['Player'])]

# Display the matched players
print("Matched players in 2023/2024 season:")
print(matched_players)

Matched players in 2023/2024 season:
            Player Name Tackles Made Tackle Percentage Interceptions Age  \
0            max aarons           19              58.8             8  23   
1      brenden aaronson           18              50.0             2  22   
2       paxten aaronson            2             100.0             0  19   
3     keyliane abdallah            0                               0  17   
4      yunis abdelhamid           35              57.8            39  35   
...                 ...          ...               ...           ...  ..   
2956         kurt zouma           17              62.5            29  28   
2957      igor zubeldia           25              73.1            32  26   
2958   martin zubimendi           27              54.8            37  24   
2960     lovro zvonarek            0               0.0             0  18   
2961     martin degaard           17              29.3            15  24   

        Nationality  
0           England  
1     

In [82]:
comprehensive_2023_2024.head()

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,...,Minutes Played,Club,Season,Team,Placement,League,Champions League,Domestic Cup,TOTY,TOTY_Nominee
10119,thibaut courtois,31,Goalkeeper,1,Belgium,8,5,0,0,0,...,423.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10120,andriy lunin,24,Goalkeeper,13,Ukraine,55,31,0,0,2,...,2850.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10121,kepa arrizabalaga,28,Goalkeeper,25,Spain,50,20,0,0,2,...,1767.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10122,lucas canizares,21,Goalkeeper,0,Spain,8,0,0,0,0,...,0.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10123,mario de luis,21,Goalkeeper,0,Spain,3,0,0,0,0,...,0.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0


In [83]:
# Filter the DataFrame for the 2023/2024 season and players with more than 5 appearances
comprehensive_2023_2024 = comprehensive_df[(comprehensive_df['Season'] == '2023/2024') & (comprehensive_df['Appearances'] > 20)]

# Find players who are in comprehensive_2023_2024 but not in df_players
missing_players = comprehensive_2023_2024[comprehensive_2023_2024['Player'].apply(lambda x: x not in df_players['Player Name'].values)]

# Display the missing players along with their other attributes
missing_players

Unnamed: 0,Player,Age,Position,Kit Number,Nationality,In Squad,Appearances,Goals,Assists,Yellow Cards,...,Minutes Played,Club,Season,Team,Placement,League,Champions League,Domestic Cup,TOTY,TOTY_Nominee
10131,daniel carvajal,31,Right-Back,2,Spain,47,41,6,5,7,...,3351.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10134,nacho fernandez,33,Centre-Back,6,Spain,51,45,0,1,5,...,3126.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10153,,33,Centre-Forward,14,Spain,52,49,17,3,1,...,2097.0,Real Madrid,2023/2024,real-madrid,1,laliga,1,0,0,0
10218,savinho,19,Right Winger,16,Brazil,42,41,11,10,6,...,3253.0,Girona FC,2023/2024,fc-girona,3,laliga,0,0,0,0
10220,viktor tsygankov,25,Right Winger,8,Ukraine,34,34,8,7,0,...,2275.0,Girona FC,2023/2024,fc-girona,3,laliga,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57943,danley jean jacques,23,Defensive Midfield,27,Haiti,36,36,0,0,4,...,2841.0,FC Metz,2023/2024,fc-metz,16,ligue-1,0,0,0,0
57952,,22,Centre-Forward,0,Georgia,22,22,14,4,2,...,1965.0,FC Metz,2023/2024,fc-metz,16,ligue-1,0,0,0,0
57954,cheikh sabaly,24,Left Winger,14,Senegal,33,31,3,1,5,...,1472.0,FC Metz,2023/2024,fc-metz,16,ligue-1,0,0,0,0
57969,isaak toure,20,Centre-Back,95,France,26,24,1,1,6,...,2052.0,FC Lorient,2023/2024,fc-lorient,17,ligue-1,0,0,0,0


In [84]:
# Ensure the 'Player Name' column in df_players is of string type
df_players['Player Name'] = df_players['Player Name'].astype(str)

# Ensure the 'Player' column in missing_players is of string type
missing_players['Player'] = missing_players['Player'].astype(str)

# List to store matched results
matched_players = []

# Iterate over the missing players
for missing_name in missing_players['Player']:
    # Perform fuzzy matching
    best_match = process.extractOne(missing_name, df_players['Player Name'], scorer=fuzz.ratio)
    
    # If a match is found, unpack the result
    if best_match:
        match_name, match_score, match_index = best_match
        
        # Ensure the matched age is converted to an integer (in case it isn't already)
        matched_age = int(df_players.iloc[match_index]['Age'])
        missing_age = int(missing_players[missing_players['Player'] == missing_name]['Age'].values[0])
        
        # Calculate the age difference
        age_difference = abs(missing_age - matched_age)

        # Store the matched information including the age difference
        matched_players.append({
            'Missing Player': missing_name,
            'Matched Player': match_name,
            'Match Score': match_score,
            'Missing Age': missing_age,
            'Matched Age': matched_age,
            'Age Difference': age_difference,
            'Missing Nationality': missing_players[missing_players['Player'] == missing_name]['Nationality'].values[0],
            'Matched Nationality': df_players.iloc[match_index]['Nationality']
        })


# Convert the matched_players list to a DataFrame for easy viewing
matched_players_df = pd.DataFrame(matched_players)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_players['Player'] = missing_players['Player'].astype(str)


In [85]:
matched_players_df.dtypes

Missing Player         object
Matched Player         object
Match Score             int64
Missing Age             int64
Matched Age             int64
Age Difference          int64
Missing Nationality    object
Matched Nationality    object
dtype: object

In [86]:
# Display the matched players
matched_players_df.head(50)

Unnamed: 0,Missing Player,Matched Player,Match Score,Missing Age,Matched Age,Age Difference,Missing Nationality,Matched Nationality
0,daniel carvajal,dani carvajal,93,31,31,0,Spain,Spain
1,nacho fernandez,enzo fernandez,83,33,22,11,Spain,Argentina
2,,natan,75,33,22,11,Spain,Brazil
3,savinho,savio,83,19,19,0,Brazil,Brazil
4,viktor tsygankov,viktor tsyhankov,94,25,25,0,Ukraine,Ukraine
5,memphis depay,memphis,70,29,29,0,Netherlands,Netherlands
6,dani vivian,daniel vivian,92,23,24,1,Spain,Spain
7,,natan,75,33,22,11,Spain,Brazil
8,,natan,75,33,22,11,Spain,Brazil
9,rodri sanchez,robert sanchez,81,23,25,2,Spain,Spain


In [92]:
# Create a session
session = requests.Session()

# Define the retry strategy with a longer backoff time
retry = Retry(
    total=5,  # Total number of retries
    backoff_factor=5,  # Exponential backoff factor (5, 10, 20, 40 seconds)
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    allowed_methods=["HEAD", "GET", "OPTIONS"]  # Retry for these methods
)

# Attach the HTTPAdapter with the retry strategy to the session
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)

base_url = "https://fbref.com/en/comps/Big5/{}/defense/players/{}-Big-5-European-Leagues-Stats"
seasons = [
    "2008-2009", "2009-2010", "2010-2011", "2011-2012", "2012-2013",
    "2013-2014", "2014-2015", "2015-2016", "2016-2017", "2017-2018",
    "2018-2019", "2019-2020", "2020-2021", "2021-2022", "2022-2023",
    "2023-2024"
]

all_players_data = []

for season in seasons:
    if season == "2023-2024":
        url = "https://fbref.com/en/comps/Big5/defense/players/Big-5-European-Leagues-Stats"
    else:
        url = base_url.format(season, season)
    
    # Make the request with retry logic
    try:
        response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  # Raise an error for bad status codes
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.select("#stats_defense > tbody > tr")
        
        for row in rows:
            player_name_element = row.select_one('td:nth-child(2)')
            tackles_made_element = row.select_one('td:nth-child(11)')
            tackle_percentage_element = row.select_one('td:nth-child(17)')
            interceptions_element = row.select_one('td:nth-child(22)')
            age_element = row.select_one('td:nth-child(7)')
            nationality_element = row.select_one('td.left.poptip > a')
            
            if player_name_element and tackles_made_element and tackle_percentage_element and interceptions_element and age_element and nationality_element:
                player_name = player_name_element.text.strip()
                tackles_made = tackles_made_element.text.strip()
                tackle_percentage = tackle_percentage_element.text.strip()
                interceptions = interceptions_element.text.strip()
                age = age_element.text.strip()
                nationality = nationality_element.text.strip().split(' ')[0]  # Get the part before '-football'
                
                all_players_data.append({
                    'Season': season,
                    'Player Name': player_name,
                    'Tackles Made': tackles_made,
                    'Tackle Percentage': tackle_percentage,
                    'Interceptions': interceptions,
                    'Age': age,
                    'Nationality': nationality
                })
        
        # Introduce a random delay between 5 and 10 seconds to avoid detection
        time.sleep(random.uniform(5, 10))

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data for season {season}: {e}")

NameError: name 'random' is not defined

In [93]:
all_players_data

[{'Season': '2008-2009',
  'Player Name': 'Jacques Abardonado',
  'Tackles Made': '',
  'Tackle Percentage': '',
  'Interceptions': '',
  'Age': '30',
  'Nationality': 'fr'},
 {'Season': '2008-2009',
  'Player Name': 'Ignazio Abate',
  'Tackles Made': '',
  'Tackle Percentage': '',
  'Interceptions': '',
  'Age': '21',
  'Nationality': 'it'},
 {'Season': '2008-2009',
  'Player Name': 'Christian Abbiati',
  'Tackles Made': '',
  'Tackle Percentage': '',
  'Interceptions': '',
  'Age': '31',
  'Nationality': 'it'},
 {'Season': '2008-2009',
  'Player Name': 'Pato Abbondanzieri',
  'Tackles Made': '',
  'Tackle Percentage': '',
  'Interceptions': '',
  'Age': '35',
  'Nationality': 'ar'},
 {'Season': '2008-2009',
  'Player Name': 'Elvis Abbruscato',
  'Tackles Made': '',
  'Tackle Percentage': '',
  'Interceptions': '',
  'Age': '27',
  'Nationality': 'it'},
 {'Season': '2008-2009',
  'Player Name': 'Djamel Abdoun',
  'Tackles Made': '',
  'Tackle Percentage': '',
  'Interceptions': '',
  