In [1]:
import pandas as pd
import os

print("Starting the data merging process...")

try:
    # 1. Load all the clean datasets
    # Make sure these files exist in your ../data/ folder
    df_teams = pd.read_csv('../data/combined_standings_2024.csv')
    df_players_league = pd.read_csv('../data/combined_player_stats_2024.csv')
    df_players_ucl = pd.read_csv('../data/combined_ucl_player_stats_2024.csv')
    df_ucl_progress = pd.read_csv('../data/ucl_team_progress_2024.csv')

    print("All datasets loaded successfully.")

    # 2. MERGE 1: Add team league performance to each league player
    # We merge on the 'Squad' and 'League' columns
    master_df = pd.merge(df_players_league, df_teams, on=['Squad', 'League'], how='left')
    print("Step 1/3: Merged team league stats onto players.")

    # 3. MERGE 2: Add UCL player stats to the master dataframe
    # We select only the key player stats from the UCL data to avoid duplicate columns
    ucl_stats_to_add = df_players_ucl[['Player', 'Squad', 'Gls', 'Ast', 'Min', 'xG', 'xAG']]
    
    # We add '_ucl' as a suffix to these columns to distinguish them from league stats
    master_df = pd.merge(master_df, ucl_stats_to_add, on=['Player', 'Squad'], how='left', suffixes=('_league', '_ucl'))
    print("Step 2/3: Merged UCL player stats.")

    # 4. MERGE 3: Add UCL team progress to the master dataframe
    master_df = pd.merge(master_df, df_ucl_progress, on='Squad', how='left')
    print("Step 3/3: Merged UCL team progress.")

    # 5. FINAL CLEANUP: Handle missing values (NaNs)
    # Players who didn't play in the UCL will have NaN for UCL stats. We'll fill these with 0.
    ucl_cols = ['Gls_ucl', 'Ast_ucl', 'Min_ucl', 'xG_ucl', 'xAG_ucl']
    for col in ucl_cols:
        if col in master_df.columns:
            master_df[col] = master_df[col].fillna(0)
    
    # For UCL progress, we can fill NaN with 'Did Not Qualify'
    master_df['UCL_Progress'] = master_df['UCL_Progress'].fillna('Did Not Qualify')
    print("Final cleanup complete.")

    # 6. SAVE THE MASTER DATASET
    output_path = os.path.join("../data", "master_dataset_2024.csv")
    master_df.to_csv(output_path, index=False)

    print("\n--- SUCCESS! ---")
    print(f"Master dataset created successfully and saved to: {output_path}")
    print(f"The final dataset has {master_df.shape[0]} rows and {master_df.shape[1]} columns.")
    display(master_df.head())

except FileNotFoundError as e:
    print(f"\nERROR: A file was not found. Please make sure all four source CSVs are in the 'data' folder.")
    print(f"Missing file: {e.filename}")
except Exception as e:
    print(f"\nAn error occurred: {e}")

Starting the data merging process...
All datasets loaded successfully.
Step 1/3: Merged team league stats onto players.
Step 2/3: Merged UCL player stats.
Step 3/3: Merged UCL team progress.
Final cleanup complete.

--- SUCCESS! ---
Master dataset created successfully and saved to: ../data\master_dataset_2024.csv
The final dataset has 2852 rows and 63 columns.


Unnamed: 0,Rk_x,Player,Nation,Pos,Squad,Age,Born,MP_x,Starts,Min_league,...,Top Team Scorer,Goalkeeper,Notes,Last 5,Gls_ucl,Ast_ucl,Min_ucl,xG,xAG_ucl,UCL_Progress
0,1,Max Aarons,eng ENG,DF,Bournemouth,23,2000,20,13,,...,Dominic Solanke - 19,Neto,,,0.0,0.0,0.0,,0.0,Did Not Qualify
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,17,2006,1,0,6.0,...,Cole Palmer - 22,Đorđe Petrović,→ Conference League via league finish,,0.0,0.0,0.0,,0.0,Did Not Qualify
2,3,Tyler Adams,us USA,MF,Bournemouth,24,1999,3,1,121.0,...,Dominic Solanke - 19,Neto,,,0.0,0.0,0.0,,0.0,Did Not Qualify
3,4,Tosin Adarabioyo,eng ENG,DF,Fulham,25,1997,20,18,,...,Rodrigo Muniz - 9,Bernd Leno,,,0.0,0.0,0.0,,0.0,Did Not Qualify
4,5,Elijah Adebayo,eng ENG,FW,Luton Town,25,1998,27,16,,...,Carlton Morris - 11,Thomas Kaminski,Relegated,,0.0,0.0,0.0,,0.0,Did Not Qualify
