In [1]:
import pandas as pd
import os

def process_player_stats():
    """
    Reads individual player stat CSVs from the /data folder, cleans them,
    combines them, and saves a new unified player dataset.
    """
    print("Processing local player stat files...")
    
    files_to_process = {
        "Premier League": "player_stats_epl_2024.csv",
        "La Liga": "player_stats_laliga_2024.csv",
        "Serie A": "player_stats_seriea_2024.csv",
        "Bundesliga": "player_stats_bundesliga_2024.csv",
        "Ligue 1": "player_stats_ligue1_2024.csv"
    }
    
    all_player_stats = []
    
    for league, filename in files_to_process.items():
        try:
            file_path = os.path.join("../data", filename) # Use ../data from the notebooks folder
            player_df = pd.read_csv(file_path)
            
            # --- Data Cleaning ---
            # Remove the repeating header rows that get copied over
            player_df = player_df[player_df['Player'] != 'Player'].copy()
            
            # Add a league column
            player_df['League'] = league
            
            all_player_stats.append(player_df)
            print(f"Successfully processed: {filename}")

        except FileNotFoundError:
            print(f"Error: Could not find '{filename}'. Please check the file name.")
        except Exception as e:
            print(f"An error occurred with {filename}: {e}")

    if all_player_stats:
        # Combine all dataframes and reset the index
        combined_df = pd.concat(all_player_stats, ignore_index=True)
        # Convert relevant columns to numeric types, errors='coerce' will turn non-numbers into NaN
        numeric_cols = ['Age', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG']
        for col in numeric_cols:
            if col in combined_df.columns:
                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
        
        # Drop rows where player name is null (often artifacts of copying)
        combined_df.dropna(subset=['Player'], inplace=True)
        
        return combined_df
    else:
        return pd.DataFrame()

# --- Main Execution ---
player_dataset = process_player_stats()

if not player_dataset.empty:
    output_path = os.path.join("../data", "combined_player_stats_2024.csv")
    player_dataset.to_csv(output_path, index=False)
    
    print("\nProcessing complete!")
    print(f"Combined player dataset saved successfully to: {output_path}")
    print(f"Total players processed: {len(player_dataset)}")
    print("\nFirst 5 rows of the new dataset:")
    display(player_dataset.head())
else:
    print("\nProcessing failed. No data was saved.")

Processing local player stat files...
Successfully processed: player_stats_epl_2024.csv
Successfully processed: player_stats_laliga_2024.csv
Successfully processed: player_stats_seriea_2024.csv
Successfully processed: player_stats_bundesliga_2024.csv
Successfully processed: player_stats_ligue1_2024.csv

Processing complete!
Combined player dataset saved successfully to: ../data\combined_player_stats_2024.csv
Total players processed: 2852

First 5 rows of the new dataset:


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,G+A.1,G-PK.1,G+A-PK,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,Matches,League
0,1,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20,13,,...,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,Matches,Premier League
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,17,2006,1,0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches,Premier League
2,3,Tyler Adams,us USA,MF,Bournemouth,24,1999,3,1,121.0,...,0.0,0.0,0.0,0.0,0.06,0.06,0.0,0.06,Matches,Premier League
3,4,Tosin Adarabioyo,eng ENG,DF,Fulham,25,1997,20,18,,...,0.11,0.11,0.11,0.04,0.01,0.05,0.04,0.05,Matches,Premier League
4,5,Elijah Adebayo,eng ENG,FW,Luton Town,25,1998,27,16,,...,0.63,0.63,0.63,0.37,0.04,0.42,0.37,0.42,Matches,Premier League


In [2]:
import pandas as pd
import numpy as np # Import numpy for handling potential infinite values
import matplotlib.pyplot as plt
import seaborn as sns

# Load your new combined player dataset
file_path = '../data/combined_player_stats_2024.csv'
df_players = pd.read_csv(file_path)

# Check the first few rows and the data types
display(df_players.head())
df_players.info()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,G+A.1,G-PK.1,G+A-PK,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,Matches,League
0,1,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20,13,,...,0.07,0.0,0.07,0.0,0.06,0.06,0.0,0.06,Matches,Premier League
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,17,2006,1,0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches,Premier League
2,3,Tyler Adams,us USA,MF,Bournemouth,24,1999,3,1,121.0,...,0.0,0.0,0.0,0.0,0.06,0.06,0.0,0.06,Matches,Premier League
3,4,Tosin Adarabioyo,eng ENG,DF,Fulham,25,1997,20,18,,...,0.11,0.11,0.11,0.04,0.01,0.05,0.04,0.05,Matches,Premier League
4,5,Elijah Adebayo,eng ENG,FW,Luton Town,25,1998,27,16,,...,0.63,0.63,0.63,0.37,0.04,0.42,0.37,0.42,Matches,Premier League


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2852 entries, 0 to 2851
Data columns (total 38 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rk          2852 non-null   int64  
 1   Player      2852 non-null   object 
 2   Nation      2850 non-null   object 
 3   Pos         2852 non-null   object 
 4   Squad       2852 non-null   object 
 5   Age         2852 non-null   int64  
 6   Born        2852 non-null   int64  
 7   MP          2852 non-null   int64  
 8   Starts      2852 non-null   int64  
 9   Min         1325 non-null   float64
 10  90s         2852 non-null   float64
 11  Gls         2852 non-null   int64  
 12  Ast         2852 non-null   int64  
 13  G+A         2852 non-null   int64  
 14  G-PK        2852 non-null   int64  
 15  PK          2852 non-null   int64  
 16  PKatt       2852 non-null   int64  
 17  CrdY        2852 non-null   int64  
 18  CrdR        2852 non-null   int64  
 19  xG          2852 non-null  

In [3]:
# Sort by 'Gls' (Goals) in descending order and show the top 10
top_scorers = df_players.sort_values(by='Gls', ascending=False).head(10)
display(top_scorers[['Player', 'Squad', 'League', 'Gls']])

Unnamed: 0,Player,Squad,League,Gls
2023,Harry Kane,Bayern Munich,Bundesliga,36
1965,Serhou Guirassy,Stuttgart,Bundesliga,28
223,Erling Haaland,Manchester City,Premier League,27
2654,Kylian Mbappé,Paris S-G,Ligue 1,27
724,Artem Dovbyk,Girona,La Liga,24
2146,Loïs Openda,RB Leipzig,Bundesliga,24
1548,Lautaro Martínez,Inter,Serie A,24
1125,Alexander Sørloth,Villarreal,La Liga,23
414,Cole Palmer,Chelsea,Premier League,22
247,Alexander Isak,Newcastle Utd,Premier League,21


In [4]:
# Sort by 'Ast' (Assists) in descending order
top_playmakers = df_players.sort_values(by='Ast', ascending=False).head(10)
display(top_playmakers[['Player', 'Squad', 'League', 'Ast']])

Unnamed: 0,Player,Squad,League,Ast
627,Alex Baena,Villarreal,La Liga,14
1959,Álex Grimaldo,Leverkusen,Bundesliga,13
554,Ollie Watkins,Aston Villa,Premier League,13
1864,Julian Brandt,Dortmund,Bundesliga,11
2207,Leroy Sané,Bayern Munich,Bundesliga,11
2297,Florian Wirtz,Leverkusen,Bundesliga,11
2233,Xavi Simons,RB Leipzig,Bundesliga,11
1178,Nico Williams,Athletic Club,La Liga,11
414,Cole Palmer,Chelsea,Premier League,11
1852,Jan-Niklas Beste,Heidenheim,Bundesliga,11


In [5]:
# Create a 'G+A_per_90' column. We filter for players with significant minutes.
df_players_min = df_players[df_players['Min'] > 900].copy() # At least 10 full matches

# Calculate Goals + Assists per 90 minutes
df_players_min['G+A_per_90'] = (df_players_min['Gls'] + df_players_min['Ast']) / df_players_min['90s']

# Sort by this new metric
most_efficient = df_players_min.sort_values(by='G+A_per_90', ascending=False).head(10)
display(most_efficient[['Player', 'Squad', 'Min', 'Gls', 'Ast', 'G+A_per_90']])

Unnamed: 0,Player,Squad,Min,Gls,Ast,G+A_per_90
565,Callum Wilson,Newcastle Utd,991.0,9,1,0.909091
1399,Davide Frattesi,Inter,955.0,6,3,0.849057
723,Anastasios Douvikas,Celta Vigo,905.0,7,1,0.792079
835,Luis Javier Suárez,Almería,917.0,6,0,0.588235
1709,Eldor Shomurodov,Cagliari,936.0,3,3,0.576923
2547,Ablie Jallow,Metz,918.0,3,2,0.490196
1883,Václav Černý,Wolfsburg,921.0,4,1,0.490196
1564,Arkadiusz Milik,Juventus,919.0,4,1,0.490196
2832,Vitinha,Marseille,923.0,3,2,0.485437
801,Gonçalo Guedes,Villarreal,951.0,3,2,0.471698


In [6]:
# Calculate non-penalty xG overperformance
df_players_min['xG_Overperformance'] = df_players_min['G-PK'] - df_players_min['npxG']

# Sort to find the most clinical finishers
clinical_finishers = df_players_min.sort_values(by='xG_Overperformance', ascending=False).head(10)
display(clinical_finishers[['Player', 'Squad', 'G-PK', 'npxG', 'xG_Overperformance']])

Unnamed: 0,Player,Squad,G-PK,npxG,xG_Overperformance
723,Anastasios Douvikas,Celta Vigo,7,4.6,2.4
2336,Michael Amir Murillo,Marseille,3,0.8,2.2
1883,Václav Černý,Wolfsburg,4,2.3,1.7
1863,Rafael Borré,Werder Bremen,4,2.4,1.6
598,Domingos André Ribeiro Almeida,Valencia,2,0.6,1.4
2206,Jadon Sancho,Dortmund,2,0.7,1.3
1212,Yann Aurel Bisseck,Inter,2,0.9,1.1
2160,Felix Passlack,Bochum,2,0.9,1.1
97,Trevoh Chalobah,Chelsea,1,0.2,0.8
2547,Ablie Jallow,Metz,2,1.2,0.8


In [7]:
# Filter for players whose position ('Pos') contains 'MF' for Midfielder
midfielders = df_players[df_players['Pos'].str.contains('MF', na=False)].copy()

# Sort midfielders by non-penalty goals
top_scoring_midfielders = midfielders.sort_values(by='G-PK', ascending=False).head(10)

display(top_scoring_midfielders[['Player', 'Squad', 'Pos', 'Gls', 'G-PK']])

Unnamed: 0,Player,Squad,Pos,Gls,G-PK
186,Phil Foden,Manchester City,"FW,MF",19,19
642,Jude Bellingham,Real Madrid,MF,19,18
2274,Deniz Undav,Stuttgart,"FW,MF",18,18
1845,Maximilian Beier,Hoffenheim,"FW,MF",16,16
414,Cole Palmer,Chelsea,"FW,MF",22,13
2087,Donyell Malen,Dortmund,"FW,MF",13,13
1902,Ermedin Demirović,Augsburg,"FW,MF",15,12
1656,Christian Pulisic,Milan,"FW,MF",12,12
530,Leandro Trossard,Arsenal,"FW,MF",12,12
230,Kai Havertz,Arsenal,"MF,FW",13,12


In [8]:
# Filter for players aged 21 or younger with significant minutes
young_players = df_players[(df_players['Age'] <= 21) & (df_players['Min'] > 900)].copy()

# Sort them by total Goals + Assists
top_young_talents = young_players.sort_values(by='G+A', ascending=False).head(10)

display(top_young_talents[['Player', 'Age', 'Squad', 'Gls', 'Ast', 'G+A']])

Unnamed: 0,Player,Age,Squad,Gls,Ast,G+A
1809,Karim Adeyemi,21,Dortmund,3,1,4
2349,Arthur Atta,20,Metz,1,2,3
286,Luca Koleosho,18,Burnley,1,1,2
2493,Maxime Estève,21,Montpellier,1,1,2
1781,Kenan Yıldız,18,Juventus,2,0,2
779,Gavi,18,Barcelona,1,1,2
2831,Alan Virginius,20,Clermont Foot,0,2,2
2674,Lucas Mincarelli,19,Montpellier,1,1,2
1757,Kacper Urbanski,18,Bologna,0,1,1
1783,Nicola Zalewski,21,Roma,0,1,1


In [9]:
# We'll reuse the df_players_min from the previous step which has the xG_Overperformance column
# Or you can recalculate it:
df_players_min['xG_Overperformance'] = df_players_min['G-PK'] - df_players_min['npxG']

# Sort in ascending order to find the biggest underperformers
most_wasteful = df_players_min.sort_values(by='xG_Overperformance', ascending=True).head(10)

display(most_wasteful[['Player', 'Squad', 'G-PK', 'npxG', 'xG_Overperformance']])

Unnamed: 0,Player,Squad,G-PK,npxG,xG_Overperformance
59,Beto,Everton,3,6.0,-3.0
2832,Vitinha,Marseille,3,5.1,-2.1
1443,Emil Holm,Atalanta,1,2.7,-1.7
1183,Roman Yaremchuk,Valencia,3,4.6,-1.6
1380,Kingsley Ehizibue,Udinese,0,1.3,-1.3
1169,Jonathan Viera,Almería,0,1.3,-1.3
2831,Alan Virginius,Clermont Foot,0,1.3,-1.3
1214,Paulo Azzi,Cagliari,0,1.2,-1.2
1757,Kacper Urbanski,Bologna,0,1.1,-1.1
1564,Arkadiusz Milik,Juventus,4,5.1,-1.1


In [10]:
# We'll use players with significant minutes again
df_players_min['Creative_Overperformance'] = df_players_min['Ast'] - df_players_min['xAG']

# Sort by xAG to find the most creative passers, regardless of actual assists
most_creative_passers = df_players_min.sort_values(by='xAG', ascending=False).head(10)

display(most_creative_passers[['Player', 'Squad', 'Ast', 'xAG', 'Creative_Overperformance']])

Unnamed: 0,Player,Squad,Ast,xAG,Creative_Overperformance
2206,Jadon Sancho,Dortmund,2,3.0,-1.0
2832,Vitinha,Marseille,2,2.9,-0.9
1169,Jonathan Viera,Almería,3,2.7,0.3
1564,Arkadiusz Milik,Juventus,1,2.6,-1.6
1064,Óscar Rodríguez Arnaiz,Getafe,1,2.1,-1.1
1809,Karim Adeyemi,Dortmund,1,2.0,-1.0
1399,Davide Frattesi,Inter,3,1.9,1.1
617,José Arnaiz,Osasuna,1,1.8,-0.8
1785,Alessandro Zanoli,Salernitana,2,1.8,0.2
1863,Rafael Borré,Werder Bremen,0,1.8,-1.8


In [11]:
# Filter for defenders and defensive midfielders
defenders = df_players[df_players['Pos'].str.contains('DF|MF,DF', na=False)].copy()

# Create a total 'Defensive Actions' column
if 'Tkl' in defenders.columns and 'Int' in defenders.columns:
    defenders['Defensive_Actions'] = defenders['Tkl'] + defenders['Int']

    # Sort to find the busiest defenders
    busiest_defenders = defenders.sort_values(by='Defensive_Actions', ascending=False).head(10)
    display(busiest_defenders[['Player', 'Squad', 'Pos', 'Tkl', 'Int', 'Defensive_Actions']])
else:
    print("Defensive stats ('Tkl', 'Int') not found in the dataset.")

Defensive stats ('Tkl', 'Int') not found in the dataset.
