In [2]:
import pandas as pd
import os
import glob # Library to find files matching a pattern

def combine_all_player_stats():
    """
    Finds all individual player stat CSVs from the 'player_stats' subfolder,
    cleans the data, and combines it into a single master CSV.
    """
    print("Starting the process to combine all historical player stats...")
    
    # --- UPDATED: Pointing to the 'player_stats' subfolder ---
    data_path = '../data/player_stats/'
    
    # This pattern will find all 75 files inside the subfolder
    file_pattern = os.path.join(data_path, 'player_stats_*.csv')
    all_files = glob.glob(file_pattern)
    
    if not all_files:
        print("ERROR: No player stat files found. Please check your filenames and folder path.")
        return pd.DataFrame()

    print(f"Found {len(all_files)} files to process.")
    
    all_player_stats = []
    
    league_map = {
        'epl': 'Premier League', 'laliga': 'La Liga', 'serieA': 'Serie A',
        'bundesliga': 'Bundesliga', 'ligue1': 'Ligue 1'
    }

    for file in all_files:
        try:
            filename = os.path.basename(file)
            parts = filename.replace('.csv', '').split('_')
            league_abbr = parts[2]
            year = int(parts[3])
            
            league_name = league_map.get(league_abbr, 'Unknown League')
            season = f"{year-1}-{year}"

            player_df = pd.read_csv(file)
            player_df = player_df[player_df['Player'] != 'Player'].copy()
            player_df['League'] = league_name
            player_df['Season'] = season
            
            all_player_stats.append(player_df)

        except Exception as e:
            print(f"Could not process {file}. Error: {e}")

    if all_player_stats:
        combined_df = pd.concat(all_player_stats, ignore_index=True)
        
        numeric_cols = [
            'Age', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 
            'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG'
        ]
        for col in numeric_cols:
            if col in combined_df.columns:
                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
        
        combined_df.dropna(subset=['Player'], inplace=True)
        return combined_df
    else:
        return pd.DataFrame()

# --- Main Execution ---
player_dataset = combine_all_player_stats()

if not player_dataset.empty:
    # Save the final combined file to the main 'data' folder
    output_path = os.path.join("../data", "combined_player_stats_2011-2025.csv")
    player_dataset.to_csv(output_path, index=False)
    
    print("\n--- SUCCESS! ---")
    print(f"All {len(player_dataset)} player records have been combined and saved to: {output_path}")
    
    # Display a sample of the final data
    display(player_dataset.head())
    display(player_dataset.tail())
else:
    print("\nProcessing failed. No data was saved.")

Starting the process to combine all historical player stats...
Found 75 files to process.

--- SUCCESS! ---
All 41542 player records have been combined and saved to: ../data\combined_player_stats_2011-2025.csv


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,PrgC,PrgP,PrgR,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,S,Sesaon
0,1,Mohammed Abdellaoue,no NOR,FW,Hannover 96,24.0,1985,26.0,26.0,,...,,,,,,,,,,
1,2,Yacine Abdessadki,ma MAR,MF,Freiburg,29.0,1981,21.0,20.0,,...,,,,,,,,,,
2,3,Mathias Abel,de GER,DF,Kaiserslautern,29.0,1981,19.0,19.0,,...,,,,,,,,,,
3,4,René Adler,de GER,GK,Leverkusen,25.0,1985,32.0,32.0,,...,,,,,,,,,,
4,5,David Alaba,at AUT,"DF,MF",Hoffenheim,18.0,1992,17.0,17.0,,...,,,,,,,,,,


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,PrgC,PrgP,PrgR,xG.1,xAG.1,xG+xAG,npxG.1,npxG+xAG.1,S,Sesaon
41538,630,Piotr Zieliński,pl POL,MF,Inter,30.0,1994,26.0,8.0,980.0,...,26,65,24,0.24,0.08,0.32,0.1,0.18,,
41539,631,Nadir Zortea,it ITA,"DF,MF",Cagliari,25.0,1999,35.0,33.0,,...,69,63,144,0.1,0.08,0.18,0.1,0.18,,
41540,632,Szymon Żurkowski,pl POL,MF,Empoli,26.0,1997,5.0,0.0,106.0,...,1,1,3,0.07,0.0,0.07,0.07,0.07,,
41541,633,Milan Đurić,ba BIH,FW,Monza,34.0,1990,18.0,13.0,,...,2,21,30,0.21,0.08,0.28,0.21,0.28,,
41542,634,Milan Đurić,ba BIH,FW,Parma,34.0,1990,9.0,3.0,268.0,...,0,7,6,0.08,0.0,0.08,0.08,0.08,,


In [None]:
import pandas as pd
import os

def clean_historical_player_data():
    """
    Loads the raw combined player stats file, cleans it thoroughly, 
    and saves a new version ready for analysis.
    """
    print("Starting the data cleaning process...")
    
    # --- Load Your Raw Combined File ---
    file_path = '../data/combined_player_stats_2011-2025.csv'
    try:
        # We use decimal=',' to handle numbers like '23,2' from your previous screenshot
        df = pd.read_csv(file_path, decimal=',')
        print("Raw dataset loaded successfully.")
    except FileNotFoundError:
        print(f"ERROR: The file '{file_path}' was not found.")
        return

    # --- 1. Fix Column Names ---
    # Rename duplicate '.1' columns to '_per_90' for clarity
    cols_to_rename = {
        'Gls.1': 'Gls_per_90', 'Ast.1': 'Ast_per_90', 'G+A.1': 'G+A_per_90',
        'G-PK.1': 'G-PK_per_90', 'G+A-PK': 'G+A-PK_per_90', 'xG.1': 'xG_per_90',
        'xAG.1': 'xAG_per_90', 'xG+xAG': 'xG+xAG_per_90', 'npxG.1': 'npxG_per_90',
        'npxG+xAG.1': 'npxG+xAG_per_90'
    }
    df.rename(columns=cols_to_rename, inplace=True)
    
    # Fix the 'Sesaon' typo if it exists
    if 'Sesaon' in df.columns:
        df.rename(columns={'Sesaon': 'Season'}, inplace=True)
    print("Step 1/4: Column names cleaned and renamed.")
    
    # --- 2. Clean the 'Nation' Column ---
    # Keep only the 3-letter country code (e.g., 'no NOR' -> 'NOR')
    if 'Nation' in df.columns:
        df['Nation'] = df['Nation'].astype(str).str.split(' ').str[1]
    print("Step 2/4: 'Nation' column cleaned.")

    # --- 3. Drop Useless Columns ---
    cols_to_drop = ['Matches', 'S']
    for col in cols_to_drop:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)
    print("Step 3/4: Useless columns dropped.")
    
    # --- 4. Convert All Columns to Correct Data Types ---
    # This is the most important step!
    # Identify all columns that should be numbers
    numeric_cols = [
        'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK',
        'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR',
        'Gls_per_90', 'Ast_per_90', 'G+A_per_90', 'G-PK_per_90', 'G+A-PK_per_90',
        'xG_per_90', 'xAG_per_90', 'xG+xAG_per_90', 'npxG_per_90', 'npxG+xAG_per_90'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Fill all missing numeric values (like old xG stats) with 0
    df.fillna(0, inplace=True)
    print("Step 4/4: Data types converted and missing values filled with 0.")

    return df

# --- Main Execution ---
df_clean = clean_historical_player_data()

if df_clean is not None:
    # Save the final, clean dataset to a new file
    output_path = os.path.join("../data", "combined_player_stats_2011-2025_CLEAN.csv")
    df_clean.to_csv(output_path, index=False)
    
    print("\n--- SUCCESS! ---")
    print(f"Clean dataset saved to: {output_path}")
    
    # Display a sample to confirm the cleaning was successful
    display(df_clean.head())

Starting the data cleaning process...
Raw dataset loaded successfully.
Step 1/4: Column names cleaned and renamed.
Step 2/4: 'Nation' column cleaned.


  df = pd.read_csv(file_path, decimal=',')


Step 3/4: Useless columns dropped.
Step 4/4: Data types converted and missing values filled with 0.

--- SUCCESS! ---
Clean dataset saved to: ../data\combined_player_stats_2011-2025_CLEAN.csv


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,npxG+xAG,PrgC,PrgP,PrgR,xG_per_90,xAG_per_90,xG+xAG_per_90,npxG_per_90,npxG+xAG_per_90,Season
0,1.0,Mohammed Abdellaoue,NOR,FW,Hannover 96,24.0,1985.0,26.0,26.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2.0,Yacine Abdessadki,MAR,MF,Freiburg,29.0,1981.0,21.0,20.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3.0,Mathias Abel,GER,DF,Kaiserslautern,29.0,1981.0,19.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,4.0,René Adler,GER,GK,Leverkusen,25.0,1985.0,32.0,32.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,5.0,David Alaba,AUT,"DF,MF",Hoffenheim,18.0,1992.0,17.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [6]:
import pandas as pd
import os

def merge_and_clean_ucl_stats():
    """
    Loads the two separate UCL player stat files, combines them, cleans the data,
    and saves a single, unified master file.
    """
    print("Starting the process to merge UCL player stats...")
    
    # Define the paths to your two files
    file_part1 = '../data/UCL_player_stats_2011-2017.csv'
    file_part2 = '../data/UCL_player_stats_2018-2025.csv'
    
    try:
        # Load both CSV files into pandas DataFrames
        df1 = pd.read_csv(file_part1, decimal=',')
        df2 = pd.read_csv(file_part2, decimal=',')
        print("Both UCL player stat files loaded successfully.")
    except FileNotFoundError as e:
        print(f"ERROR: A file was not found. Please check your filenames in the 'data' folder.")
        print(f"Missing file: {e.filename}")
        return
    except Exception as e:
        print(f"An error occurred while reading the files: {e}")
        return

    # --- Combine the two DataFrames ---
    # pandas.concat is smart and will automatically add new columns from df2.
    # For rows from df1, the new columns will be filled with NaN (Not a Number).
    combined_df = pd.concat([df1, df2], ignore_index=True)
    print("Step 1/4: Files successfully combined.")

    # --- Clean the Combined Data (same logic as before) ---
    
    # 1. Fix Column Names (rename .1 columns to _per_90)
    cols_to_rename = {
        'Gls.1': 'Gls_per_90', 'Ast.1': 'Ast_per_90', 'G+A.1': 'G+A_per_90',
        'G-PK.1': 'G-PK_per_90', 'G+A-PK': 'G+A-PK_per_90', 'xG.1': 'xG_per_90',
        'xAG.1': 'xAG_per_90', 'xG+xAG': 'xG+xAG_per_90', 'npxG.1': 'npxG_per_90',
        'npxG+xAG.1': 'npxG+xAG_per_90'
    }
    combined_df.rename(columns=cols_to_rename, inplace=True)
    print("Step 2/4: Column names cleaned.")

    # 2. Remove Repeating Headers from the copy-paste process
    combined_df = combined_df[combined_df['Player'] != 'Player'].copy()
    print("Step 3/4: Repeating header rows removed.")
    
    # 3. Fill Missing Values
    # The most important step: this finds all the empty cells for the new columns
    # in the old data (the NaNs) and fills them with 0.
    # It also converts all appropriate columns to numbers.
    
    # First, identify all potential numeric columns
    numeric_cols = [
        'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK',
        'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR',
        'Gls_per_90', 'Ast_per_90', 'G+A_per_90', 'G-PK_per_90', 'G+A-PK_per_90',
        'xG_per_90', 'xAG_per_90', 'xG+xAG_per_90', 'npxG_per_90', 'npxG+xAG_per_90'
    ]
    
    # Convert only the columns that actually exist in the combined DataFrame
    for col in numeric_cols:
        if col in combined_df.columns:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

    # Now, fill all resulting NaN values in the entire DataFrame with 0
    combined_df.fillna(0, inplace=True)
    print("Step 4/4: Data types converted and missing values filled with 0.")

    return combined_df

# --- Main Execution ---
ucl_player_dataset = merge_and_clean_ucl_stats()

if ucl_player_dataset is not None:
    # Save the final, unified file
    output_path = os.path.join("../data", "all_ucl_player_stats_2011-2025_CLEAN.csv")
    ucl_player_dataset.to_csv(output_path, index=False)
    
    print("\n--- SUCCESS! ---")
    print(f"Unified UCL player dataset saved to: {output_path}")
    
    # Display samples to confirm everything worked
    print("\nSample from the start of the file (old data):")
    display(ucl_player_dataset.head())
    
    print("\nSample from the end of the file (new data with xG):")
    display(ucl_player_dataset.tail())
else:
    print("\nProcessing failed.")

Starting the process to merge UCL player stats...
Both UCL player stat files loaded successfully.
Step 1/4: Files successfully combined.
Step 2/4: Column names cleaned.
Step 3/4: Repeating header rows removed.
Step 4/4: Data types converted and missing values filled with 0.

--- SUCCESS! ---
Unified UCL player dataset saved to: ../data\all_ucl_player_stats_2011-2025_CLEAN.csv

Sample from the start of the file (old data):


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,xAG,npxG+xAG,PrgC,PrgP,PrgR,xG_per_90,xAG_per_90,xG+xAG_per_90,npxG_per_90,npxG+xAG_per_90
0,1,Patrick van Aanholt,nl NED,DF,eng Chelsea,19.0,1990.0,4,1,105.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Ignazio Abate,it ITA,"DF,MF",it Milan,23.0,1986.0,6,4,384.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Christian Abbiati,it ITA,GK,it Milan,33.0,1977.0,6,6,468.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Éric Abidal,fr FRA,DF,es Barcelona,30.0,1979.0,8,6,560.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,David Abraham,ar ARG,DF,ch Basel,24.0,1986.0,6,6,540.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Sample from the end of the file (new data with xG):


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,xAG,npxG+xAG,PrgC,PrgP,PrgR,xG_per_90,xAG_per_90,xG+xAG_per_90,npxG_per_90,npxG+xAG_per_90
10912,882,Tanguy Zoukrou,fr FRA,DF,ch Young Boys,21.0,2003.0,3,3,270.0,...,0.0,0.0,3.0,3.0,0.0,0.0,0.01,0.01,0.0,0.01
10913,883,Oleksandr Zubkov,ua UKR,"FW,MF",ua Shakhtar,27.0,1996.0,8,7,532.0,...,1.0,2.3,11.0,20.0,31.0,0.22,0.17,0.4,0.22,0.4
10914,884,Lovro Zvonarek,hr CRO,"MF,FW",at Sturm Graz,19.0,2005.0,6,0,140.0,...,0.1,0.5,3.0,5.0,9.0,0.22,0.07,0.29,0.22,0.29
10915,885,Martin Ødegaard,no NOR,MF,eng Arsenal,25.0,1998.0,11,9,797.0,...,1.5,3.4,31.0,58.0,39.0,0.21,0.17,0.38,0.21,0.38
10916,886,Łukasz Łakomy,pl POL,MF,ch Young Boys,23.0,2001.0,7,5,444.0,...,0.2,0.5,5.0,20.0,9.0,0.06,0.04,0.1,0.06,0.1
