In [16]:
# Import libraries
import pandas as pd
import json
import numpy as np
import requests
import datetime
import time
import os
import glob

# Collect Serie A teams and their composition

In [2]:
# API-Football endpoint that lists the leagues known to the provider
url = "https://v3.football.api-sports.io/leagues"

# Authentication headers (also reused by the teams request in the next cell).
# The key is read from the API_FOOTBALL_KEY environment variable so that the
# credential is never stored in the notebook; a missing variable raises KeyError.
headers = {
    'x-rapidapi-key': os.environ['API_FOOTBALL_KEY'],
    'x-rapidapi-host': 'v3.football.api-sports.io'
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error body.
response = requests.get(url, headers = headers, timeout = 30)
response.raise_for_status()
leagues = response.json()

# Find Serie A's ID: the league name alone is not treated as sufficient,
# the country must be Italy as well. Stays None when nothing matches.
serie_a_id = next(
    (
        league['league']['id']
        for league in leagues['response']
        if league['league']['name'] == 'Serie A' and league['country']['name'] == 'Italy'
    ),
    None,
)

print(f"Serie A ID: {serie_a_id}")

Serie A ID: 135


In [3]:
# Season to download and the endpoint returning the teams of a league
season = "2024"
url = "https://v3.football.api-sports.io/teams"

params = {
    'league': serie_a_id,
    'season': season
}

response = requests.get(url, headers = headers, params = params, timeout = 30)

# Fail loudly on an HTTP error. Previously a non-200 answer silently skipped the
# creation of df_teams, which only surfaced later as a NameError in another cell.
response.raise_for_status()
teams = response.json()

# Keep only the team and venue fields used in the rest of the notebook
teams_data = [
    {
        'team_id': team['team']['id'],
        'name': team['team']['name'],
        'country': team['team']['country'],
        'founded': team['team']['founded'],
        'venue_name': team['venue']['name'],
        'venue_city': team['venue']['city'],
        'venue_capacity': team['venue']['capacity']
    }
    for team in teams['response']
]

# Create DataFrame (one row per team)
df_teams = pd.DataFrame(teams_data)

In [4]:
# Display the teams table as a visual sanity check
df_teams

Unnamed: 0,team_id,name,country,founded,venue_name,venue_city,venue_capacity
0,487,Lazio,Italy,1900,Stadio Olimpico,Roma,68530
1,489,AC Milan,Italy,1899,Stadio Giuseppe Meazza,Milano,80018
2,490,Cagliari,Italy,1920,Unipol Domus,Cagliari,16416
3,492,Napoli,Italy,1904,Stadio Diego Armando Maradona,Napoli,60240
4,494,Udinese,Italy,1896,Bluenergy Stadium,Udine,25952
5,495,Genoa,Italy,1893,Stadio Comunale Luigi Ferraris,Genova,36703
6,496,Juventus,Italy,1897,Allianz Stadium,Torino,45666
7,497,AS Roma,Italy,1927,Stadio Olimpico,Roma,68530
8,499,Atalanta,Italy,1907,Gewiss Stadium,Bergamo,21300
9,500,Bologna,Italy,1909,Stadio Renato Dall'Ara,Bologna,39279


In [9]:
# Save team data
# Folder that receives one players CSV per team
path = 'raw_data/teams'

# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell can be re-run safely.
os.makedirs(path, exist_ok = True)

# The league-wide teams table is deliberately written to raw_data/ and not to
# raw_data/teams/: every CSV inside raw_data/teams/ is later read as a players file.
df_teams.to_csv('raw_data/serie_a_teams_2024_25.csv')

In [10]:
def get_team_players(team_id, season="2024"):
    """Download the players of one team for a season from the /players endpoint.

    Pages are requested one after another (page 1, 2, ...) until the API answers
    with an empty ``response`` list.

    Parameters
    ----------
    team_id : int
        Team identifier sent as the ``team`` query parameter and copied into
        every output row.
    season : str, optional
        Value of the ``season`` query parameter. Defaults to "2024", the value
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame or None
        One row per player entry returned by the API with the columns team_id,
        player_id, name, first_name, last_name, birth_date, country,
        nationality, height, weight and position. ``None`` is returned as soon
        as a request answers with a status code other than 200.

    Raises
    ------
    KeyError
        If the API_FOOTBALL_KEY environment variable is not set.
    """
    # URL of the endpoint to obtain the players of a team
    url = "https://v3.football.api-sports.io/players"

    # Headers: the API key comes from the environment, never from the source code
    headers = {
        'x-rapidapi-key': os.environ['API_FOOTBALL_KEY'],
        'x-rapidapi-host': 'v3.football.api-sports.io'
    }

    players_data = []
    page = 1

    while True:
        params = {
            'team': team_id,
            'season': season,
            'page': page
        }

        # Request (the timeout prevents a stalled connection from blocking forever)
        response = requests.get(url, headers = headers, params = params, timeout = 30)

        # Check the status BEFORE decoding: an error page is not guaranteed to be JSON
        if response.status_code != 200:
            print(f"Errore: {response.status_code}")
            return None

        players = response.json()

        # An empty page means there is no more data
        if not players['response']:
            break

        # Extract info about players
        for player_info in players['response']:
            player = player_info['player']

            # Statistics are divided by competition; the first entry is used.
            # A player without any statistics entry gets position None instead
            # of aborting the whole download with an IndexError.
            statistics = player_info.get('statistics') or []
            games = (statistics[0].get('games') or {}) if statistics else {}

            # Data of interest
            players_data.append({
                'team_id': team_id,
                'player_id': player['id'],
                'name': player['name'],
                'first_name': player['firstname'],
                'last_name': player['lastname'],
                'birth_date': player['birth']['date'],
                'country': player['birth']['country'],
                'nationality': player['nationality'],
                'height': player['height'],
                'weight': player['weight'],
                'position': games.get('position')
            })

        # Go to next page
        page += 1

    # Create DF with all acquired data
    return pd.DataFrame(players_data)

In [11]:
# Download the squad of every team in df_teams and store it as one CSV per team
for team_id, team_name in zip(df_teams['team_id'], df_teams['name']):

    # Fetch the players of the current team
    df_players = get_team_players(team_id)
    nothing_found = df_players is None or df_players.empty

    if nothing_found:
        # The request failed or returned no player
        print(f"Nessun dato trovato per {team_name}")
    else:
        # The file name is derived from the team name
        csv_filename = os.path.join(path, f"{team_name}_players.csv")
        df_players.to_csv(csv_filename, index = False)

        # Confirmation
        print(f"Dati dei giocatori di {team_name} salvati in {csv_filename}")

    # Pause 0.3s between teams so as not to exceed the API rate limit
    time.sleep(0.3)

Dati dei giocatori di Lazio salvati in raw_data/teams\Lazio_players.csv
Dati dei giocatori di AC Milan salvati in raw_data/teams\AC Milan_players.csv
Dati dei giocatori di Cagliari salvati in raw_data/teams\Cagliari_players.csv
Dati dei giocatori di Napoli salvati in raw_data/teams\Napoli_players.csv
Dati dei giocatori di Udinese salvati in raw_data/teams\Udinese_players.csv
Dati dei giocatori di Genoa salvati in raw_data/teams\Genoa_players.csv
Dati dei giocatori di Juventus salvati in raw_data/teams\Juventus_players.csv
Dati dei giocatori di AS Roma salvati in raw_data/teams\AS Roma_players.csv
Dati dei giocatori di Atalanta salvati in raw_data/teams\Atalanta_players.csv
Dati dei giocatori di Bologna salvati in raw_data/teams\Bologna_players.csv
Dati dei giocatori di Fiorentina salvati in raw_data/teams\Fiorentina_players.csv
Dati dei giocatori di Torino salvati in raw_data/teams\Torino_players.csv
Dati dei giocatori di Verona salvati in raw_data/teams\Verona_players.csv
Dati dei gio

# HANDLE DUPLICATES  

In [12]:
# Every per-team players CSV written to raw_data/teams
file_paths = glob.glob("raw_data/teams/*.csv")

# Load each file into its own DataFrame
dataframes = [pd.read_csv(csv_path) for csv_path in file_paths]

# Stack all teams into a single table with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index = True)

In [13]:
# Flag every row whose player_id occurs more than once (keep = False marks all occurrences)
appears_more_than_once = combined_df.duplicated(subset = 'player_id', keep = False)
duplicate_players = combined_df.loc[appears_more_than_once]

# Order by player_id so the rows of the same player sit next to each other
duplicate_players_sorted = duplicate_players.sort_values(by = 'player_id')

# Show the identifying columns of the repeated players
columns_to_show = ['player_id','name', 'first_name', 'last_name', 'team_id']
print(duplicate_players_sorted[columns_to_show])

     player_id          name  first_name        last_name  team_id
802        123   P. Pellegri      Pietro         Pellegri      503
351        123   P. Pellegri      Pietro         Pellegri      511
331        215   S. Esposito  Sebastiano         Esposito      511
478        215   S. Esposito  Sebastiano         Esposito      505
257        319    S. Luperto  Sebastiano          Luperto      490
..         ...           ...         ...              ...      ...
657     335102       L. Hasa        Luis             Hasa      867
190     353609   I. Sulemana     Ibrahim  Sulemana Kakari      499
278     353609   I. Sulemana     Ibrahim  Sulemana Kakari      490
120     361497  Dean Huijsen  Dean Donny          Huijsen      497
546     361497  Dean Huijsen  Dean Donny          Huijsen      496

[126 rows x 5 columns]


In [14]:
# Folder holding the per-team players CSV files
directory_path = 'raw_data/teams'

# Clean every CSV file of the folder in place
for file_name in os.listdir(directory_path):

    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, file_name)
    df = pd.read_csv(file_path)

    # Remove fully duplicated rows, then decode the '&apos;' HTML entity in names.
    # assign() builds a new frame, so no column is written into the result of
    # drop_duplicates() (which can trigger SettingWithCopyWarning); regex = False
    # makes the literal replacement explicit.
    df_cleaned = df.drop_duplicates().assign(
        name = lambda frame: frame['name'].str.replace('&apos;', "'", regex = False)
    )

    # Overwrite the original file with the cleaned DataFrame
    df_cleaned.to_csv(file_path, index = False)

    print(f"Duplicated removed and CSV overwritten: {file_name}")

Duplicated removed and CSV overwritten: AC Milan_players.csv
Duplicated removed and CSV overwritten: AS Roma_players.csv
Duplicated removed and CSV overwritten: Atalanta_players.csv
Duplicated removed and CSV overwritten: Bologna_players.csv
Duplicated removed and CSV overwritten: Cagliari_players.csv
Duplicated removed and CSV overwritten: Como_players.csv
Duplicated removed and CSV overwritten: Empoli_players.csv
Duplicated removed and CSV overwritten: Fiorentina_players.csv
Duplicated removed and CSV overwritten: Genoa_players.csv
Duplicated removed and CSV overwritten: Inter_players.csv
Duplicated removed and CSV overwritten: Juventus_players.csv
Duplicated removed and CSV overwritten: Lazio_players.csv
Duplicated removed and CSV overwritten: Lecce_players.csv
Duplicated removed and CSV overwritten: Monza_players.csv
Duplicated removed and CSV overwritten: Napoli_players.csv
Duplicated removed and CSV overwritten: Parma_players.csv
Duplicated removed and CSV overwritten: Torino_pla

In [15]:
# Attach the team name to every duplicated-player row. Both frames have a 'name'
# column, so the player name becomes name_x and the team name becomes name_y.
team_names = df_teams[['team_id', 'name']]
kept_columns = ['team_id', 'player_id', 'name_x', 'first_name', 'last_name', 'name_y']

df_duplicated = duplicate_players_sorted.merge(team_names, on = 'team_id', how = 'left')[kept_columns]
df_duplicated.to_csv('raw_data/duplicated_players.csv', index = False)

# HANDLE DUPLICATES 2

In [18]:
# One row per duplicated player (the same player was listed once per team)
identity_columns = ['player_id', 'first_name', 'last_name']
players_unique = df_duplicated.loc[:, identity_columns].drop_duplicates()

# Plain list of the player ids whose transfers must be looked up
players_list = players_unique['player_id'].tolist()

In [19]:
# Define function performing the API request for transfers of a player given an ID
def get_player_transfers(player_id):
    """Return the decoded JSON answer of the /transfers endpoint for one player.

    Parameters
    ----------
    player_id : int
        Player identifier sent as the ``player`` query parameter.

    Returns
    -------
    dict
        The JSON body of the answer, decoded with ``response.json()``.

    Raises
    ------
    KeyError
        If the API_FOOTBALL_KEY environment variable is not set.
    Exception
        Whatever ``response.raise_for_status()`` raises on an HTTP error status.
    """
    # The API key comes from the environment, never from the source code
    headers = {
        'x-rapidapi-key': os.environ['API_FOOTBALL_KEY'],
        'x-rapidapi-host': 'v3.football.api-sports.io'
    }

    url = "https://v3.football.api-sports.io/transfers"

    querystring = {"player": player_id}

    # Request (the timeout prevents a stalled connection from blocking forever)
    response = requests.get(url, headers = headers, params = querystring, timeout = 30)

    # Surface HTTP errors here instead of handing an error payload to the caller
    response.raise_for_status()

    # Get data as JSON
    return response.json()

# Try the helper on a single player (id 123) and display the raw JSON answer
player_id = 123
transfers = get_player_transfers(player_id = player_id)
transfers

{'get': 'transfers',
 'parameters': {'player': '123'},
 'errors': [],
 'results': 1,
 'paging': {'current': 1, 'total': 1},
 'response': [{'player': {'id': 123, 'name': 'P. Pellegri'},
   'update': '2024-10-14T10:26:17+00:00',
   'transfers': [{'date': '2025-07-01',
     'type': 'N/A',
     'teams': {'in': {'id': 503,
       'name': 'Torino',
       'logo': 'https://media.api-sports.io/football/teams/503.png'},
      'out': {'id': 511,
       'name': 'Empoli',
       'logo': 'https://media.api-sports.io/football/teams/511.png'}}},
    {'date': '2024-08-30',
     'type': 'Loan',
     'teams': {'in': {'id': 511,
       'name': 'Empoli',
       'logo': 'https://media.api-sports.io/football/teams/511.png'},
      'out': {'id': 503,
       'name': 'Torino',
       'logo': 'https://media.api-sports.io/football/teams/503.png'}}},
    {'date': '2022-07-01',
     'type': 'N/A',
     'teams': {'in': {'id': 503,
       'name': 'Torino',
       'logo': 'https://media.api-sports.io/football/teams/5

In [20]:
# First entry of the transfers list of the example player
last_transfer = transfers['response'][0]['transfers'][0]
teams_involved = last_transfer['teams']

# One-row table summarising that transfer
df_transfer = pd.DataFrame({
    'From Team': [teams_involved['out']['name']],
    'To Team': [teams_involved['in']['name']],
    'Transfer Type': [last_transfer['type']]
})

df_transfer

Unnamed: 0,From Team,To Team,Transfer Type
0,Empoli,Torino,


In [21]:
def extract_last_transfer(transfers, player):
    """Summarise the most recent relevant transfer of a player as a one-row table.

    The transfers of the first ``response`` entry are sorted from newest to
    oldest. The newest one is used, unless its type is "N/A" and at least one
    other transfer exists, in which case the second newest one is used.

    Parameters
    ----------
    transfers : dict
        Decoded answer of the transfers endpoint; expected to contain
        ``transfers['response'][0]['transfers']``, a list of dicts with the
        keys ``date`` ('YYYY-MM-DD'), ``type`` and ``teams``.
    player : int
        Player identifier copied into the ``player_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns player_id, from_team, to_team, transfer_type. One row when a
        transfer is available; an empty frame with the same columns when the
        answer holds no transfer at all (previously an IndexError).
    """
    columns = ['player_id', 'from_team', 'to_team', 'transfer_type']

    # Guard against players for which the API returns nothing
    response_items = transfers.get('response') or []
    transfers_list = (response_items[0].get('transfers') or []) if response_items else []
    if not transfers_list:
        return pd.DataFrame(columns = columns)

    def _transfer_date(transfer):
        # A missing or malformed date is ranked as the oldest entry so that a
        # single bad record cannot abort the sort.
        try:
            return datetime.datetime.strptime(transfer['date'], '%Y-%m-%d')
        except (KeyError, TypeError, ValueError):
            return datetime.datetime.min

    # Sort transfers by date (newest to oldest)
    transfers_sorted = sorted(transfers_list, key = _transfer_date, reverse = True)

    # If the newest entry is 'N/A' and another one exists, take the second newest
    if transfers_sorted[0]['type'] == "N/A" and len(transfers_sorted) > 1:
        last_transfer = transfers_sorted[1]
    else:
        last_transfer = transfers_sorted[0]

    # Create the DataFrame with the transfer information
    return pd.DataFrame([{
        'player_id': player,
        'from_team': last_transfer['teams']['out']['name'],
        'to_team': last_transfer['teams']['in']['name'],
        'transfer_type': last_transfer['type']
    }], columns = columns)

In [22]:
# Columns of the table collecting the last transfer of every duplicated player
transfer_columns = ['player_id', 'from_team', 'to_team', 'transfer_type']

# Collect the per-player frames in a list and concatenate once at the end:
# growing a DataFrame with pd.concat inside the loop copies all previous rows at
# every iteration, and seeding it with an empty frame forces object dtypes.
last_transfer_frames = []

for player in players_list:

    transfers = get_player_transfers(player)
    last_transfer_frames.append(extract_last_transfer(transfers, player))

    # 0.3s pause between API calls
    time.sleep(0.3)

if last_transfer_frames:
    df_last_transfers = pd.concat(last_transfer_frames, ignore_index = True)
else:
    # No player to look up: keep an empty table with the expected columns
    df_last_transfers = pd.DataFrame(columns = transfer_columns)

df_last_transfers

Unnamed: 0,player_id,from_team,to_team,transfer_type
0,123,Torino,Empoli,Loan
1,215,Inter,Empoli,Loan
2,319,Napoli,Empoli,€ 2.5M
3,325,Napoli,Cagliari,Loan
4,858,Juventus,Empoli,Loan
...,...,...,...,...
58,314254,Bologna,Venezia,Loan
59,323936,Juventus,AS Roma,€ 25.6M
60,335102,Juventus U23,Lecce,
61,353609,Cagliari,Atalanta,€ 7.5M


In [24]:
# Add the player's first/last name, then look up the id of the destination team
# by its name; destinations that are not in df_teams get a missing actual_team_id.
team_lookup = df_teams[['name', 'team_id']]

df_last_transfers = (
    df_last_transfers
    .merge(players_unique, on = 'player_id', how = 'left')
    .merge(team_lookup, left_on = 'to_team', right_on = 'name', how = 'left')
    .drop('name', axis = 1)
    .rename(columns = {"team_id": "actual_team_id"})
)

df_last_transfers

Unnamed: 0,player_id,from_team,to_team,transfer_type,first_name,last_name,actual_team_id
0,123,Torino,Empoli,Loan,Pietro,Pellegri,511.0
1,215,Inter,Empoli,Loan,Sebastiano,Esposito,511.0
2,319,Napoli,Empoli,€ 2.5M,Sebastiano,Luperto,511.0
3,325,Napoli,Cagliari,Loan,Gianluca,Gaetano,490.0
4,858,Juventus,Empoli,Loan,Mattia,De Sciglio,511.0
...,...,...,...,...,...,...,...
58,314254,Bologna,Venezia,Loan,Antonio,Raimondo,517.0
59,323936,Juventus,AS Roma,€ 25.6M,Matías,Soulé Malvano,497.0
60,335102,Juventus U23,Lecce,,Luis,Hasa,867.0
61,353609,Cagliari,Atalanta,€ 7.5M,Ibrahim,Sulemana Kakari,499.0


In [25]:
# Store the team id as a nullable integer (missing destinations stay <NA>).
# The former .fillna(np.nan) was a no-op, and bracket assignment is used because
# attribute-style assignment silently creates a plain attribute when the column
# does not exist.
df_last_transfers['actual_team_id'] = pd.to_numeric(
    df_last_transfers['actual_team_id'], errors = 'coerce'
).astype('Int64')

df_last_transfers.to_csv('raw_data/last_transfers.csv')

In [28]:
# Load every per-team players CSV, keyed by its bare file name.
# os.path.basename is used instead of split('/') because glob may return
# OS-specific separators (e.g. 'raw_data/teams\Lazio_players.csv' on Windows),
# in which case splitting on '/' leaves the folder name inside the key.
files = glob.glob("raw_data/teams/*.csv")
team_dataframes = {os.path.basename(file): pd.read_csv(file) for file in files}

In [47]:
# Define a function to be used to update team composition
def delete_fill_row(df, current_team_id):
    """Resolve the rows of a team table whose ``actual_team_id`` is missing.

    Two cases are handled:

    * ``actual_team_id`` missing but ``to_team`` present: the row is dropped.
    * ``actual_team_id`` and ``to_team`` both missing: the row is kept and its
      ``actual_team_id`` is set to ``current_team_id``.

    Rows with a known ``actual_team_id`` are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``actual_team_id`` and ``to_team``.
    current_team_id : int
        Identifier written into the rows that have no transfer information.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) that keeps the original index
        labels of the surviving rows.
    """
    team_unknown = df['actual_team_id'].isna()
    has_destination = df['to_team'].notna()

    # Vectorised masks replace the former row-by-row iterrows() scan
    rows_to_drop = team_unknown & has_destination
    rows_no_transfer = team_unknown & ~has_destination

    # copy() so that the assignment below never writes into a view of the input
    result = df.loc[~rows_to_drop].copy()

    # The mask is passed positionally (NumPy array), restricted to the kept rows
    fill_mask = rows_no_transfer[~rows_to_drop].to_numpy()
    result.loc[fill_mask, 'actual_team_id'] = current_team_id

    return result

In [50]:
# List of Italian team IDs
italian_team_ids = df_teams.team_id.tolist()

# Create the output directory once, before the loop, instead of re-checking it
# for every team; exist_ok = True makes the cell safe to re-run.
output_directory = 'raw_data/updated_teams'
os.makedirs(output_directory, exist_ok = True)

# For each team in dict 'team_dataframes', merge with the last known transfers
for team_file, df_team in team_dataframes.items():

    # A file without rows has no team_id to read; skip it instead of crashing on .iloc[0]
    if df_team.empty:
        print(f"Skipped empty team file: {team_file}")
        continue

    # Merge team and transfers
    df_merged = pd.merge(
        df_team,
        df_last_transfers[['player_id', 'actual_team_id', 'to_team']],
        on = 'player_id',
        how = 'left'
    )

    # Get current team id (every row of a team file carries the same team_id)
    current_team_id = df_team['team_id'].iloc[0]

    # Drop the players whose destination team is unknown, assign the current team
    # to the players without transfer data, then keep only the current team's players
    df_merged = delete_fill_row(df_merged, current_team_id)
    df_updated = df_merged[df_merged['actual_team_id'] == current_team_id]

    # Remove the helper columns 'actual_team_id' and 'to_team'
    df_updated = df_updated.drop(columns = ['actual_team_id', 'to_team'])

    # Write the result as a NEW file in 'updated_teams'; the original CSV is left untouched
    file_name = os.path.basename(team_file)
    df_updated.to_csv(os.path.join(output_directory, f'updated_{file_name}'), index = False)