# SCRAPE THE STATUS OF SERIE A PLAYERS FROM TRANSFERMARKT.IT

In [None]:
# import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import unicodedata
import pandas as pd
import numpy as np
import time
import re

In [None]:
# API dataframes: players and teams, each read with its first CSV column as the index
players_csv = 'raw_data/all_players_2024_25.csv'
teams_csv = 'raw_data/df_serie_a_teams_2024_25.csv'

df_players = pd.read_csv(players_csv, index_col=0)
df_teams = pd.read_csv(teams_csv, index_col=0)

# DATA PREPARATION

In [None]:
# Both dataframes have a column called 'name'; give each one a distinct,
# self-describing name so they stay distinguishable after the join.
df_players = df_players.rename({'name': 'player_name'}, axis='columns')
df_teams = df_teams.rename({'name': 'team_name'}, axis='columns')

In [None]:
# name transformation for the search
def process_player_name(row):
    """Expand an abbreviated ``'X. Surname'`` player name into a full search name.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'player_name'``, ``'first_name'`` and
        ``'last_name'`` (for example a DataFrame row).

    Returns
    -------
    str
        * ``player_name`` unchanged when it does not contain ``'. '`` or when
          no usable first name is available;
        * ``'<first given name> <text after the initial>'`` when the initial
          matches the first letter of ``first_name``;
        * ``'<first_name> <last_name>'`` otherwise.
    """
    player_name = row['player_name']
    if not isinstance(player_name, str) or '. ' not in player_name:
        return player_name

    # Split only once so everything after the first initial is kept
    # ("J. P. Smith" -> "J" + "P. Smith" instead of dropping "Smith").
    initial_char, last_name = re.split(r'\.\s', player_name, maxsplit=1)

    first_name = row['first_name']
    # A missing first name (empty string, NaN, ...) gives no way to expand the
    # initial, so the abbreviated name is the best value available.
    given_names = first_name.split() if isinstance(first_name, str) else []
    if not given_names:
        return player_name

    if initial_char.lower() == given_names[0][0].lower():
        return f'{given_names[0]} {last_name}'

    # The abbreviation is not built from the first name: use the full names.
    return f'{first_name} {row["last_name"]}'

In [None]:
# Compute the search name for every player row, then preview the first 20 values
df_players['processed_player_name'] = df_players.apply(process_player_name, axis=1)

df_players['processed_player_name'].head(20).to_list()

['Fodé Ballo-Touré',
 'Divock Origi',
 'Antonio Mirante',
 'Simon Kjær',
 'Rade Krunić',
 'Emil Roback',
 'Marko Lazetić',
 'Marco Pellegrino',
 'Silvano Vos',
 'Andrei Coubiș',
 'Victor Eletu',
 'Adam Bakoune',
 'Diego Sia',
 'Andrea Bartoccioni',
 'Mattia Liberali',
 'Alessandro Bonomi',
 'Alessandro Longoni',
 'Mattia Caldara',
 'Samuel Chukwueze',
 'Luka Jović']

In [None]:
# prefix removal
def clean_team_name(team_name):
    """Drop a leading block of 1-3 capital letters followed by whitespace.

    For example ``'AC Milan'`` becomes ``'Milan'``; names without such a
    prefix are returned unchanged.
    """
    prefix = re.match(r'^[A-Z]{1,3}\s', team_name)
    if prefix is None:
        return team_name
    return team_name[prefix.end():]

In [None]:
# Strip the prefix from every team name and keep the cleaned names as a plain list
df_teams['team_name_cleaned'] = df_teams['team_name'].map(clean_team_name)

serie_A_teams_2024_25 = df_teams['team_name_cleaned'].to_list()
serie_A_teams_2024_25

['Lazio',
 'Milan',
 'Cagliari',
 'Napoli',
 'Udinese',
 'Genoa',
 'Juventus',
 'Roma',
 'Atalanta',
 'Bologna',
 'Fiorentina',
 'Torino',
 'Verona',
 'Inter',
 'Empoli',
 'Venezia',
 'Parma',
 'Lecce',
 'Como',
 'Monza']

In [None]:
# join operation: a left join on team_id keeps every player row
player_columns = ['team_id', 'player_id', 'processed_player_name', 'player_name', 'first_name', 'last_name']
team_columns = ['team_id', 'team_name', 'team_name_cleaned']
output_columns = ['player_id', 'processed_player_name', 'player_name', 'first_name', 'last_name',
                  'team_id', 'team_name', 'team_name_cleaned']

df = df_players[player_columns].merge(df_teams[team_columns], how='left', on='team_id')

# Reorder the columns: player fields first, team fields last
df = df[output_columns]

df.head()

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned
0,105,Fodé Ballo-Touré,F. Ballo-Touré,Fodé,Ballo-Touré,489,AC Milan,Milan
1,305,Divock Origi,D. Origi,Divock,Okoth Origi,489,AC Milan,Milan
2,765,Antonio Mirante,A. Mirante,Antonio,Mirante,489,AC Milan,Milan
3,2045,Simon Kjær,S. Kjær,Simon,Thorup Kjær,489,AC Milan,Milan
4,31054,Rade Krunić,R. Krunić,Rade,Krunić,489,AC Milan,Milan


In [None]:
# conversion of special letters into their plain equivalents
def normalize_special_letters(text):
    """Return *text* with special letters replaced and diacritics removed.

    Letters listed in the mapping below are replaced explicitly; every other
    accented character is decomposed with NFD and its combining marks are
    dropped.
    """
    special_letter_map = {
        'æ': 'ae', 'Æ': 'Ae',
        'œ': 'oe', 'Œ': 'Oe',
        'ß': 'ss',
        'Ĳ': 'IJ', 'ĳ': 'ij',
        'Đ': 'D', 'đ': 'd',
        'Ł': 'L', 'ł': 'l',
        'Ø': 'O', 'ø': 'o',
        'Ŋ': 'N', 'ŋ': 'n',
    }

    # Single pass over the text for all explicit replacements
    replaced = text.translate(str.maketrans(special_letter_map))

    # diacritic removal: decompose, then keep only non-combining characters
    decomposed = unicodedata.normalize('NFD', replaced)
    return ''.join(char for char in decomposed if unicodedata.combining(char) == 0)

# SETTING UP SCRAPING FUNCTIONS

In [None]:
# function to close the opened iframe
def iframe_closer():
    """Dismiss the overlay iframe if it is present; do nothing otherwise.

    Uses the module-level ``driver``. Waits up to 2 seconds for the iframe
    with id ``sp_message_iframe_953827``, clicks the element with class
    ``accept-all`` inside it and waits for the document to finish loading.
    A ``TimeoutException`` (iframe absent or load wait expired) is ignored.

    NOTE(review): the iframe id is hard-coded; confirm it is stable across
    sessions, otherwise the overlay is silently never closed.
    """
    try:
        iframe = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.ID, 'sp_message_iframe_953827'))
            )
        driver.switch_to.frame(iframe)
        try:
            driver.find_element(By.CLASS_NAME, 'accept-all').click()
            WebDriverWait(driver, 300).until(lambda d: d.execute_script('return document.readyState') == 'complete')
        finally:
            # Always return to the main document: without this, any failure
            # inside the iframe leaves every later lookup scoped to the frame.
            driver.switch_to.default_content()
    except TimeoutException:
        pass

In [None]:
# function to search for a player
def search_player(name):
    """Type *name* into the header search field and submit the search.

    Uses the module-level ``driver``. Each element is awaited for up to
    2 seconds until clickable; the submit button is clicked through
    JavaScript rather than a native click.
    """
    wait = WebDriverWait(driver, 2)
    field_locator = (By.CLASS_NAME, 'tm-header__input--search-field')
    button_locator = (By.CLASS_NAME, 'tm-header__input--search-send')

    field = wait.until(EC.element_to_be_clickable(field_locator))
    field.clear()
    field.send_keys(name)

    send_button = wait.until(EC.element_to_be_clickable(button_locator))
    driver.execute_script("arguments[0].click();", send_button)

In [None]:
# function that finds which name format makes the search return results
def find_player(player, first_name, last_name):
    """Search for a player, falling back to shorter name formats.

    The queries are tried in this order:

    1. ``'<first_name> <last_name>'``
    2. ``player`` (the standardized name)
    3. ``last_name`` alone

    After each search the function waits for the page to load, closes the
    overlay iframe and waits again. It stops as soon as a results page
    contains at least one element with class ``tiny_wappen``. The last query
    is not checked: whatever page it produces is left for the caller.

    Uses the module-level ``driver``; returns ``None``.
    """
    def page_loaded(d):
        return d.execute_script('return document.readyState') == 'complete'

    candidates = (f'{first_name} {last_name}', player, last_name)
    last_position = len(candidates) - 1

    for position, query in enumerate(candidates):
        search_player(query)
        WebDriverWait(driver, 300).until(page_loaded)
        iframe_closer()
        WebDriverWait(driver, 300).until(page_loaded)

        if position == last_position:
            # Final fallback: no further format to try, so no check is made.
            return

        try:
            WebDriverWait(driver, 2).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'tiny_wappen'))
            )
            return
        except TimeoutException:
            # No results for this format: try the next one.
            continue

In [None]:
# function to match the searched player against the search results (own team only)
def player_matching_1(team):
    """Look for *team* among the statuses listed on the current results page.

    The ``title`` attribute of every ``tiny_wappen`` element inside the first
    ``items`` element is compared, case-insensitively, with *team*. The first
    title containing the team name decides the result.

    Returns
    -------
    str
        * ``'<team> second team'`` for Milan "Futuro", Juventus "Next Gen"
          or Atalanta "U23";
        * ``'<team> youth team'`` when the title mentions "primavera",
          " u<number>" or " under<number>";
        * ``team`` for any other title containing the team name;
        * ``'continue'`` when no title on this page contains the team name
          (including a page with no ``tiny_wappen`` element at all);
        * ``'player not found'`` when no ``items`` element appears within
          2 seconds.
    """
    team_lower = team.lower()
    try:
        box = WebDriverWait(driver, 2).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'items'))
            )[0]

        for item in box.find_elements(By.CLASS_NAME, 'tiny_wappen'):
            # get_attribute returns None when the attribute is missing
            status_lower = (item.get_attribute('title') or '').lower()

            if team_lower not in status_lower:
                continue

            # The second-team check must come first: " u23" would otherwise
            # be caught by the youth-team pattern below.
            if ((team_lower == 'milan' and 'futuro' in status_lower) or
                    (team_lower == 'juventus' and 'next gen' in status_lower) or
                    (team_lower == 'atalanta' and 'u23' in status_lower)):
                return f'{team} second team'

            if ('primavera' in status_lower or
                    re.search(r' u\s*[0-9]+', status_lower) or
                    re.search(r' under\s*[0-9]+', status_lower)):
                return f'{team} youth team'

            return team

        # Returned after the loop so that an empty page also yields
        # 'continue' instead of an implicit None.
        return 'continue'

    except TimeoutException:
        return 'player not found'

In [None]:
# function to match the searched player against the search results (other Serie A teams only)
def player_matching_2(team):
    """Look for any Serie A team other than *team* on the current results page.

    The ``title`` attribute of every ``tiny_wappen`` element inside the first
    ``items`` element is compared, case-insensitively, with each name in
    ``serie_A_teams_2024_25`` except *team*. The first title containing one
    of those names decides the result; within a title the teams are tried in
    list order.

    Returns
    -------
    str
        * ``'<club> second team'`` for Milan "Futuro", Juventus "Next Gen"
          or Atalanta "U23";
        * ``'<club> youth team'`` when the title mentions "primavera",
          " u<number>" or " under<number>";
        * the club name for any other title containing it;
        * ``'continue'`` when no title on this page names another Serie A
          team (including a page with no ``tiny_wappen`` element at all);
        * ``'player not found'`` when no ``items`` element appears within
          2 seconds.
    """
    # Filtering instead of list.remove() avoids a ValueError when *team* is
    # not one of the known Serie A names.
    other_teams = [name for name in serie_A_teams_2024_25 if name != team]
    try:
        box = WebDriverWait(driver, 2).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'items'))
            )[0]

        for item in box.find_elements(By.CLASS_NAME, 'tiny_wappen'):
            # get_attribute returns None when the attribute is missing
            status_lower = (item.get_attribute('title') or '').lower()

            for serie_A_team in other_teams:
                club_lower = serie_A_team.lower()
                if club_lower not in status_lower:
                    continue

                # Second-team check first: " u23" also matches the youth pattern.
                if ((club_lower == 'milan' and 'futuro' in status_lower) or
                        (club_lower == 'juventus' and 'next gen' in status_lower) or
                        (club_lower == 'atalanta' and 'u23' in status_lower)):
                    return f'{serie_A_team} second team'

                if ('primavera' in status_lower or
                        re.search(r' u\s*[0-9]+', status_lower) or
                        re.search(r' under\s*[0-9]+', status_lower)):
                    return f'{serie_A_team} youth team'

                return serie_A_team

        # Returned after the loop so that an empty page also yields
        # 'continue' instead of an implicit None.
        return 'continue'

    except TimeoutException:
        return 'player not found'

In [None]:
# function to match the searched player against the search results (own team first, then other Serie A teams)
def _team_status_label(club, status_lower):
    """Return the status label for *club* if *status_lower* names it, else None."""
    club_lower = club.lower()
    if club_lower not in status_lower:
        return None

    # Second-team check first: " u23" also matches the youth pattern.
    if ((club_lower == 'milan' and 'futuro' in status_lower) or
            (club_lower == 'juventus' and 'next gen' in status_lower) or
            (club_lower == 'atalanta' and 'u23' in status_lower)):
        return f'{club} second team'

    if ('primavera' in status_lower or
            re.search(r' u\s*[0-9]+', status_lower) or
            re.search(r' under\s*[0-9]+', status_lower)):
        return f'{club} youth team'

    return club


def player_matching_3(team):
    """Classify the player from the current results page.

    For every ``tiny_wappen`` element inside the first ``items`` element, the
    ``title`` attribute is checked first against *team* and then against the
    other names in ``serie_A_teams_2024_25`` (in list order). The first match
    decides the result.

    Returns
    -------
    str
        * ``'<club> second team'``, ``'<club> youth team'`` or the club name,
          for the first club named in a title;
        * ``'Team of another league or released or retired'`` when no title
          names a Serie A team;
        * ``'player not found'`` when no ``items`` element appears within
          2 seconds.
    """
    # Filtering instead of list.remove() avoids a ValueError when *team* is
    # not one of the known Serie A names.
    other_teams = [name for name in serie_A_teams_2024_25 if name != team]
    try:
        box = WebDriverWait(driver, 2).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'items'))
            )[0]

        for item in box.find_elements(By.CLASS_NAME, 'tiny_wappen'):
            # get_attribute returns None when the attribute is missing
            status_lower = (item.get_attribute('title') or '').lower()

            for club in [team, *other_teams]:
                label = _team_status_label(club, status_lower)
                if label is not None:
                    return label

        return 'Team of another league or released or retired'

    except TimeoutException:
        return 'player not found'

In [None]:
# function to find the data we need: the player status
def status_research(team):
    """Walk the paginated search results and return the player's status.

    Uses the module-level ``driver``; *team* is passed on to the matching
    functions.

    Flow:

    * If no "next page" pagination element is found within 2 seconds, the
      result is whatever ``player_matching_3(team)`` returns.
    * Otherwise the pages are scanned with ``player_matching_1``; as long as
      it returns ``'continue'`` and a "next page" element exists, the next
      page is opened.
    * Once the "next page" element can no longer be found, the current page
      is checked with ``player_matching_1`` one more time. If that is still
      ``'continue'``, the "first page" element is clicked and the pages are
      scanned again with ``player_matching_2``.
    * If the second pass also ends with ``'continue'``, the result is
      ``'player not found'``.

    Any ``TimeoutException`` that escapes the inner handlers (for example
    while waiting for the "first page" element) is caught by the outer
    handler, which falls back to ``player_matching_3(team)``.
    """
    try:
        iframe_closer()
        # Existence probe only: a timeout here jumps to the outer handler.
        next_page = WebDriverWait(driver, 2).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'tm-pagination__list-item--icon-next-page'))
        )[0]
        # First pass: look for the player's own team, page by page.
        while True:
            try:
                next_page = WebDriverWait(driver, 2).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, 'tm-pagination__list-item--icon-next-page'))
                )[0]

                final_status = player_matching_1(team)

                if final_status == 'continue':
                    # No match on this page: open the next one and wait for it to load.
                    next_page_link = next_page.find_element(By.CLASS_NAME, 'tm-pagination__link')
                    driver.execute_script("arguments[0].click();", next_page_link)
                    WebDriverWait(driver, 300).until(lambda d: d.execute_script('return document.readyState') == 'complete')
                    iframe_closer()
                    WebDriverWait(driver, 300).until(lambda d: d.execute_script('return document.readyState') == 'complete')
                    continue
                else:
                    return final_status

            except TimeoutException:
                # No "next page" element was found: check the current page once more.
                final_status = player_matching_1(team)

                if final_status == 'continue':
                    # Own team not found anywhere: go back to the first page
                    # and start the second pass.
                    first_page = WebDriverWait(driver, 2).until(
                        EC.presence_of_all_elements_located((By.CLASS_NAME, 'tm-pagination__list-item--icon-first-page'))
                    )[0]
                    driver.execute_script("arguments[0].click();", first_page)
                    WebDriverWait(driver, 300).until(lambda d: d.execute_script('return document.readyState') == 'complete')
                    iframe_closer()
                    WebDriverWait(driver, 300).until(lambda d: d.execute_script('return document.readyState') == 'complete')

                    # Second pass: look for any other Serie A team, page by page.
                    while True:
                        try:
                            next_page = WebDriverWait(driver, 2).until(
                                EC.presence_of_all_elements_located((By.CLASS_NAME, 'tm-pagination__list-item--icon-next-page'))
                            )[0]

                            final_status = player_matching_2(team)

                            if final_status == 'continue':
                                next_page_link = next_page.find_element(By.CLASS_NAME, 'tm-pagination__link')
                                driver.execute_script("arguments[0].click();", next_page_link)
                                WebDriverWait(driver, 300).until(lambda d: d.execute_script('return document.readyState') == 'complete')
                                iframe_closer()
                                WebDriverWait(driver, 300).until(lambda d: d.execute_script('return document.readyState') == 'complete')
                                continue
                            else:
                                return final_status

                        except TimeoutException:
                            # No "next page" element in the second pass either.
                            final_status = player_matching_2(team)

                            if final_status == 'continue':
                                return 'player not found'

                            else:
                                return final_status
                else:
                    return final_status

    except TimeoutException:
        # No pagination found (or a wait above timed out): classify from the
        # page that is currently shown.
        return player_matching_3(team)

In [None]:
# Status column to be filled in by the scraping cells
df['player_status'] = None

# Split the row positions rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame goes through DataFrame.swapaxes,
# which recent pandas versions deprecate. Chunk sizes are the same.
subsets = [df.iloc[positions] for positions in np.array_split(np.arange(len(df)), 3)]

# SCRAPING PHASE

THE PLAYERS LIST WAS DIVIDED INTO THREE PARTS TO REDUCE COMPUTATION TIME AND PREVENT POTENTIAL PERFORMANCE ISSUES

In [None]:
# Open a fresh browser session on the site home page.
# NOTE(review): an implicit wait is set here while the helper functions use
# explicit WebDriverWait calls; Selenium's documentation advises against
# mixing the two, so confirm the resulting timeouts are intended.
url = 'https://www.transfermarkt.it'
driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.maximize_window()
driver.get(url)

try:
    # Fixed pause before the overlay iframe is dismissed
    time.sleep(3)
    iframe_closer()

    name_columns = ('processed_player_name', 'first_name', 'last_name')
    for index, row in subsets[0].iterrows():
        # Search names with special letters and diacritics removed
        player_name, player_first_name, player_last_name = (
            normalize_special_letters(row[column]) for column in name_columns
        )

        find_player(player_name, player_first_name, player_last_name)

        status = status_research(team=row['team_name_cleaned'])
        df.loc[index, 'player_status'] = status

        print(f"{df.loc[index, 'processed_player_name']}: {df.loc[index, 'player_status']}")

finally:
    # Close the browser even if scraping stops with an error
    driver.quit()

Fodé Ballo-Touré: Milan second team
Divock Origi: Milan second team
Antonio Mirante: Team of another league or released or retired
Simon Kjær: Team of another league or released or retired
Rade Krunić: Team of another league or released or retired
Emil Roback: Team of another league or released or retired
Marko Lazetić: Team of another league or released or retired
Marco Pellegrino: Team of another league or released or retired
Silvano Vos: Milan second team
Andrei Coubiș: Milan second team
Victor Eletu: Milan youth team
Adam Bakoune: Milan youth team
Diego Sia: Milan second team
Andrea Bartoccioni: Team of another league or released or retired
Mattia Liberali: Milan second team
Alessandro Bonomi: Milan youth team
Alessandro Longoni: Milan youth team
Mattia Caldara: Team of another league or released or retired
Samuel Chukwueze: Milan
Luka Jović: Milan
Olivier Giroud: Team of another league or released or retired
Tammy Abraham: Milan
Rafael Leão: Milan
Youssouf Fofana: Milan
Noah Okafo

Kingstone Mutandwa: Cagliari
Gabriele Zappa: Cagliari
Gianluca Gaetano: Cagliari
Răzvan Marin: Cagliari
Yerry Mina: Cagliari
José Palomino: Cagliari
Jakub Jankto: Cagliari
Michel Adopo: Cagliari
Alessandro Deiola: Cagliari
Benito Nicolas Viola: Cagliari
Tommaso Augello: Cagliari
Mateusz Wieteska: Cagliari
Paulo Azzi: Cagliari
Nadir Zortea: Cagliari
Antoine Makoumbou: Cagliari
Matteo Prati: Cagliari
Adam Obert: Cagliari
Velizar-Iliya Iliev: Cagliari youth team
Andra Cogoni: Cagliari youth team
Carlo Soldati: Cagliari youth team
Simone Scuffet: Cagliari
Giuseppe Ciocci: Cagliari
Marco Tremolada: Team of another league or released or retired
Simone Ghidotti: Team of another league or released or retired
Matteo Piombino: Team of another league or released or retired
Raphaël Varane: Team of another league or released or retired
Fabio Rispoli: Verona
Nicholas Ioannou: Team of another league or released or retired
Nicholas Gioacchini: Team of another league or released or retired
Liam Kerriga

In [None]:
# Open a fresh browser session on the site home page.
# NOTE(review): an implicit wait is set here while the helper functions use
# explicit WebDriverWait calls; Selenium's documentation advises against
# mixing the two, so confirm the resulting timeouts are intended.
url = 'https://www.transfermarkt.it'
driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.maximize_window()
driver.get(url)

try:
    # Fixed pause before the overlay iframe is dismissed
    time.sleep(3)
    iframe_closer()

    name_columns = ('processed_player_name', 'first_name', 'last_name')
    for index, row in subsets[1].iterrows():
        # Search names with special letters and diacritics removed
        player_name, player_first_name, player_last_name = (
            normalize_special_letters(row[column]) for column in name_columns
        )

        find_player(player_name, player_first_name, player_last_name)

        status = status_research(team=row['team_name_cleaned'])
        df.loc[index, 'player_status'] = status

        print(f"{df.loc[index, 'processed_player_name']}: {df.loc[index, 'player_status']}")

finally:
    # Close the browser even if scraping stops with an error
    driver.quit()

Herculano Nabian: Team of another league or released or retired
Samuele Angori: Team of another league or released or retired
Stiven Shpendi: Team of another league or released or retired
Duccio Degli Innocenti: Team of another league or released or retired
Gabriele Guarino: Team of another league or released or retired
Alessandro Renzi: Team of another league or released or retired
Andrea Sodero: Atalanta second team
Ismael Konate: Empoli youth team
Hemsley Akpa-Chukwu: Empoli youth team
Bohdan Popov: Empoli youth team
Pietro Pellegri: Empoli
Tyronne Ebuehi: Empoli
Mattia De Sciglio: Empoli
Szymon Żurkowski: Empoli
Liberato Cacace: Empoli
Ardian Ismajli: Empoli
Emanuel Gyasi: Empoli
Liam Henderson: Empoli
Alberto Grassi: Empoli
Luca Belardinelli: Empoli
Nicolas Haas: Empoli
Ola Solbakken: Empoli
Youssef Maleh: Empoli
Faustino Adebola Rasheed Anjorin: Empoli
Mattia Viti: Empoli
Saba Sazonov: Empoli
Emmanuel Ekong: Empoli
Jacopo Fazzini: Empoli
Luca Marianucci: Empoli
Lorenzo Tosto: Emp

Valentín Castellanos: Lazio
Tijjani Noslin: Lazio
Gustav Isaksen: Lazio
Loum Tchaouna: Lazio
Marco Bertini: Team of another league or released or retired
Diego González: Lazio
Cristo Muñoz: Lazio
Matías Vecino: Lazio
Elseid Hysaj: Lazio
Toma Bašić: Lazio
Mattéo Guendouzi: Lazio
Alessio Romagnoli: Lazio
Samuel Gigot: Lazio
Patric: Lazio
Adam Marušić: Lazio
Christos Mandas: Lazio
Luca Pellegrini: Lazio
Nicolò Rovella: Lazio
Mattia Zaccagni: Lazio
Ivan Provedel: Lazio
Gaetano Castrovilli: Lazio
Nuno Tavares: Lazio
Alessio Furlanetto: Lazio
Oluwafisayo Faruq Dele-Bashiru: Lazio
Mario Gila: Lazio
Filipe Bordon: Lazio
Davide Renzetti: Lazio
Saná Fernandes: player not found
Santiago Pierotti: Lecce
Alexis Blin: Team of another league or released or retired
Nicola Sansone: Lecce
Kastriot Dermaku: Team of another league or released or retired
Marco Bleve: Team of another league or released or retired
Lorenzo Venuti: Team of another league or released or retired
Marcin Listkowski: Team of anothe

In [None]:
# Open a fresh browser session on the site home page.
# NOTE(review): an implicit wait is set here while the helper functions use
# explicit WebDriverWait calls; Selenium's documentation advises against
# mixing the two, so confirm the resulting timeouts are intended.
url = 'https://www.transfermarkt.it'
driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.maximize_window()
driver.get(url)

try:
    # Fixed pause before the overlay iframe is dismissed
    time.sleep(3)
    iframe_closer()

    name_columns = ('processed_player_name', 'first_name', 'last_name')
    for index, row in subsets[2].iterrows():
        # Search names with special letters and diacritics removed
        player_name, player_first_name, player_last_name = (
            normalize_special_letters(row[column]) for column in name_columns
        )

        find_player(player_name, player_first_name, player_last_name)

        status = status_research(team=row['team_name_cleaned'])
        df.loc[index, 'player_status'] = status

        print(f"{df.loc[index, 'processed_player_name']}: {df.loc[index, 'player_status']}")

finally:
    # Close the browser even if scraping stops with an error
    driver.quit()

Marco D'Alessandro: Team of another league or released or retired
José Machín: Team of another league or released or retired
Davide Bettella: Team of another league or released or retired
Patrick Ciurria: Monza
Samuele Vignato: Monza
Kevin Maussi Martins: Monza youth team
Dany Mota: Monza
Andrea Petagna: Monza
Mirko Marić: Monza
Omari Forson: Monza
Davide Diaw: Monza
Daniel Maldini: Monza
Milan Đurić: Team of another league or released or retired
Gianluca Caprari: Monza
Alessandro Berretta: Monza youth team
Endris Scaramelli: Monza youth team
Alessandro Bianco: Monza
Warren Bondo: Monza
Matteo Pessina: Monza
Mattia Valoti: Monza
Stefano Sensi: Monza
Roberto Gagliardini: Monza
Giorgos Kyriakopoulos: Monza
Niccolò Postiglione: Monza youth team
Andrea Carboni: Monza
Danilo D'Ambrosio: Monza
Pablo Marí: Monza
Samuele Birindelli: Monza
Pedro Pereira: Monza
Luca Caldirola: Monza
Armando Izzo: Monza
Andrea Mazza: Monza
Davide Bifulco: Monza youth team
Stefano Turati: Monza
Alessio Cragno: Mon

Lorenzo Busato: Team of another league or released or retired
Saad El Haddad: Venezia
Simone Ascione: Team of another league or released or retired
Hans Nicolussi: Venezia
Michael Svoboda: Venezia
Magnus Kofod Andersen: Venezia
John Yeboah: Venezia
Marin Šverko: Venezia
Bjarki Bjarkason: Venezia
Joseph Alfred Duncan: Venezia
Antonio Candela: Venezia
Francesco Zampano: Venezia
Antonio Luca Fiordilino: Venezia
Giorgio Altare: Venezia
Ridgeciano Haps: Venezia
Jay Idzes: Venezia
Domen Črnigoj: Venezia
Gianluca Busio: Venezia
Filip Stanković: Venezia
Franco Carboni: Venezia
Joël Schingtienne: Venezia
Richie Sagrado: Venezia
Zaccaria Rioda: Venezia youth team
Jesse Joronen: Venezia
Bruno: Venezia
Matteo Grandi: Venezia
Thomas Henry: player not found
Ondrej Duda: Verona
Federico Ceccherini: Team of another league or released or retired
Kevin Lasagna: Team of another league or released or retired
Mattia Chiesa: Team of another league or released or retired
Ajdin Hrustić: Team of another league

# SOME ANALYSIS AND CORRECTION OF THE ERRORS

In [None]:
# (rows, columns) of df, including the player_status column
df.shape

(935, 9)

In [None]:
# Number of players (non-null processed names) per scraped status
df.groupby('player_status')['processed_player_name'].count()

player_status
Atalanta                                          27
Atalanta second team                              13
Atalanta youth team                                1
Bologna                                           31
Bologna youth team                                 2
Cagliari                                          26
Cagliari youth team                                3
Como                                              33
Como youth team                                    2
Empoli                                            28
Empoli youth team                                  4
Fiorentina                                        26
Fiorentina youth team                              3
Genoa                                             32
Genoa youth team                                   1
Inter                                             26
Inter youth team                                   8
Juventus                                          27
Juventus second team            

In [None]:
# NOTE(review): user-specific absolute path; the other files in this notebook
# use the relative 'raw_data/' folder — confirm this location is still wanted.
full_status_csv = 'C:/Users/dinge/Desktop/UNI/Data Management And Visualization/Data Management/Project/df_all_status_players_2024_25.csv'
df.to_csv(full_status_csv, index=False)

In [None]:
# Rows the scraper could not resolve
df.loc[df['player_status'].eq('player not found')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
76,658,Houssem-Eddine Aouar,H. Aouar,Houssem-Eddine,Chaâbane Aouar,497,AS Roma,Roma,player not found
103,47302,Diego Llorente,Diego Llorente,Diego Javier,Llorente Ríos,497,AS Roma,Roma,player not found
182,346923,Mihajlo Ilić,M. Ilić,Mihajlo,Ilić,500,Bologna,Bologna,player not found
210,2055,Marko Rog,M. Rog,Marko,Rog,490,Cagliari,Cagliari,player not found
359,196843,Christian Dalle Mura,C. Dalle Mura,Christian,Dalle Mura,502,Fiorentina,Fiorentina,player not found
406,348533,Seydou Fini,S. Fini,Seydou,Fini,495,Genoa,Genoa,player not found
407,437091,Christos Papadopoulos,C. Papadopoulos,Christos,Papadopoulos,495,Genoa,Genoa,player not found
457,220,Eddie Salcedo,E. Salcedo,Eddie Anthony,Salcedo Mora,505,Inter,Inter,player not found
472,195512,Martín Satriano,M. Satriano,Martín Adrián,Satriano Costa,505,Inter,Inter,player not found
495,129687,Marley Aké,M. Aké,Marley Martin Mickaël Justin,Aké,496,Juventus,Juventus,player not found


Houssem-Eddine Aouar ---> Team of another league or released or retired

Diego Llorente ---> Team of another league or released or retired

Mihajlo Ilić ---> Team of another league or released or retired

Marko Rog ---> Team of another league or released or retired

Christian Dalle Mura ---> Team of another league or released or retired

Seydou Fini ---> Team of another league or released or retired

Eddie Salcedo ---> Team of another league or released or retired

Martín Satriano ---> Team of another league or released or retired

Marley Aké ---> Team of another league or released or retired

Marcos Antônio ---> Team of another league or released or retired

Saná Fernandes ---> Team of another league or released or retired

Natan ---> Team of another league or released or retired

Martin Turk ---> Team of another league or released or retired

Caumenan Ange Emmanu N'Guessan ---> Team of another league or released or retired

Patricio Nehuén Pérez ---> Team of another league or released or retired

Walace ---> Team of another league or released or retired

João Ferreira ---> Team of another league or released or retired

Matheus Martins ---> Team of another league or released or retired

Vivaldo Semedo ---> Team of another league or released or retired

Thomas Henry ---> Team of another league or released or retired
<br>
----------------------------------------------------------
<br>

Christos Papadopoulos ---> Juventus second team

In [None]:
# players not found correction
# Every 'player not found' row except Christos Papadopoulos (who gets his own
# correction) is relabelled. A boolean mask updates exactly the not-found rows,
# where a lookup by name could hit another row with the same name, and it does
# not fail when the cell is run a second time.
not_found_mask = df['player_status'] == 'player not found'
relabel_mask = not_found_mask & (df['processed_player_name'] != 'Christos Papadopoulos')

player_not_found_of_another_league = df.loc[relabel_mask, 'processed_player_name'].to_list()

df.loc[relabel_mask, 'player_status'] = 'Team of another league or released or retired'

In [None]:
# Rows still marked as not found after the bulk correction
df.loc[df['player_status'].eq('player not found')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
407,437091,Christos Papadopoulos,C. Papadopoulos,Christos,Papadopoulos,495,Genoa,Genoa,player not found


In [None]:
# Manual correction for the row labelled 407 (Christos Papadopoulos)
df.at[407, 'player_status'] = 'Juventus second team'
df.loc[df['processed_player_name'].eq('Christos Papadopoulos')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
407,437091,Christos Papadopoulos,C. Papadopoulos,Christos,Papadopoulos,495,Genoa,Genoa,Juventus second team


In [None]:
# Balotelli is missing
df.loc[df['player_status'].eq('Genoa')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
209,237,Gastón Pereiro,G. Pereiro,Gastón Rodrigo,Pereiro López,490,Cagliari,Cagliari,Genoa
405,310109,David Ankeye,D. Ankeye,David,Akpan Ankeye,495,Genoa,Genoa,Genoa
410,1850,Milan Badelj,M. Badelj,Milan,Badelj,495,Genoa,Genoa,Genoa
411,1938,Ruslan Malinovskyi,R. Malinovskyi,Ruslan,Malinovskyi,495,Genoa,Genoa,Genoa
412,3430,Caleb Ekuban,C. Ekuban,Caleb,Ansah Ekuban,495,Genoa,Genoa,Genoa
413,15881,Morten Frendrup,M. Frendrup,Morten,Wetche Frendrup,495,Genoa,Genoa,Genoa
414,31094,Andrea Pinamonti,A. Pinamonti,Andrea,Pinamonti,495,Genoa,Genoa,Genoa
415,31493,Filippo Melegoni,F. Melegoni,Filippo,Melegoni,495,Genoa,Genoa,Genoa
416,35544,Johan Vásquez,J. Vásquez,Johan Felipe,Vásquez Ibarra,495,Genoa,Genoa,Genoa
417,36980,Morten Thorsby,M. Thorsby,Morten,Thorsby,495,Genoa,Genoa,Genoa


In [None]:
# Check whether the API player table contains Balotelli at all
df_players.loc[df_players['last_name'].eq('Balotelli')]

Unnamed: 0,team_id,player_id,player_name,first_name,last_name,birth_date,country,nationality,height,weight,position,processed_player_name


In [None]:
# one is missing (I don't know who he is)
df.loc[df['player_status'].eq('Monza')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
627,31532,Patrick Ciurria,P. Ciurria,Patrick,Ciurria,1579,Monza,Monza,Monza
628,312985,Samuele Vignato,S. Vignato,Samuele,Vignato,1579,Monza,Monza,Monza
630,30603,Dany Mota,Dany Mota,Dany,Mota Carvalho,1579,Monza,Monza,Monza
631,30879,Andrea Petagna,A. Petagna,Andrea,Petagna,1579,Monza,Monza,Monza
632,9072,Mirko Marić,M. Marić,Mirko,Marić,1579,Monza,Monza,Monza
633,284242,Omari Forson,O. Forson,Omari Nathan,Forson,1579,Monza,Monza,Monza
634,30663,Davide Diaw,D. Diaw,Davide Djily,Diaw,1579,Monza,Monza,Monza
635,134926,Daniel Maldini,D. Maldini,Daniel,Maldini Fossa,1579,Monza,Monza,Monza
637,30460,Gianluca Caprari,G. Caprari,Gianluca,Caprari,1579,Monza,Monza,Monza
640,302432,Alessandro Bianco,A. Bianco,Alessandro,Bianco,1579,Monza,Monza,Monza


In [None]:
# Look up Đurić in the API player table (the stray trailing "[]" was a syntax error)
df_players[df_players['last_name']=='Đurić']

Unnamed: 0,team_id,player_id,player_name,first_name,last_name,birth_date,country,nationality,height,weight,position,processed_player_name
636,1579,31692,M. Đurić,Milan,Đurić,1990-05-22,Bosnia and Herzegovina,Bosnia and Herzegovina,198 cm,94 kg,Attacker,Milan Đurić


In [None]:
# correction: the row labelled 636 (M. Đurić) is set to Monza
df.at[636, 'player_status'] = 'Monza'
df.loc[df['player_name'].eq('M. Đurić')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
636,31692,Milan Đurić,M. Đurić,Milan,Đurić,1579,Monza,Monza,Monza


In [None]:
# there's one extra (I don't know who he is)
df.loc[df['player_status'].eq('Verona')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
254,451211,Fabio Rispoli,F. Rispoli,Fabio,Rispoli,895,Como,Como,Verona
888,25349,Ondrej Duda,O. Duda,Ondrej,Duda,504,Verona,Verona,Verona
893,59421,Daniel Mosquera,D. Mosquera,Daniel Fernando,Mosquera Bonilla,504,Verona,Verona,Verona
897,129892,Juan Cruz,J. Cruz,Juan Manuel,Cruz,504,Verona,Verona,Verona
899,177745,Faride Alidou,F. Alidou,Faride,Alidou,504,Verona,Verona,Verona
901,236955,Amin Sarr,A. Sarr,Amin,Sarr,504,Verona,Verona,Verona
903,343287,Dailon Livramento,Dailon Livramento,Dailon,Rocha Livramento do Rosario,504,Verona,Verona,Verona
904,359100,Alphadjo Cissè,A. Cissè,Alphadjo,Cissè,504,Verona,Verona,Verona
905,408634,Junior Ajayi,J. Ajayi,Junior,Ajayi,504,Verona,Verona,Verona
907,418,Suat Serdar,S. Serdar,Suat,Serdar,504,Verona,Verona,Verona


In [None]:
# correction: the row labelled 254 (Fabio Rispoli) was matched to Verona by mistake
df.at[254, 'player_status'] = 'Team of another league or released or retired'
df.loc[df['processed_player_name'].eq('Fabio Rispoli')]

Unnamed: 0,player_id,processed_player_name,player_name,first_name,last_name,team_id,team_name,team_name_cleaned,player_status
254,451211,Fabio Rispoli,F. Rispoli,Fabio,Rispoli,895,Como,Como,Team of another league or released or retired


In [None]:
# Number of players (non-null ids) per status after the manual corrections
df.groupby('player_status')['player_id'].count()

player_status
Atalanta                                          27
Atalanta second team                              13
Atalanta youth team                                1
Bologna                                           31
Bologna youth team                                 2
Cagliari                                          26
Cagliari youth team                                3
Como                                              33
Como youth team                                    2
Empoli                                            28
Empoli youth team                                  4
Fiorentina                                        26
Fiorentina youth team                              3
Genoa                                             32
Genoa youth team                                   1
Inter                                             26
Inter youth team                                   8
Juventus                                          27
Juventus second team            

In [None]:
# Keep only the two output columns. A list selector is required here:
# df['player_id', 'player_status'] looks up a single tuple key and raises KeyError.
df = df[['player_id', 'player_status']]

In [None]:
# Write the final table next to the other raw data files, without the index column
status_csv = 'raw_data/df_all_status_players_2024_25.csv'
df.to_csv(status_csv, index=False)