In [20]:
import pandas as pd
import os

def create_definitive_master_dataset():
    """
    Loads all historical data, cleans and standardizes key columns first,
    then merges them into a single, reliable master DataFrame.
    """
    print("Starting the definitive data merging process...")
    data_path = '../data/'

    try:
        # 1. Load Your Four Historical Datasets
        df_league_standings = pd.read_csv(os.path.join(data_path, 'combined_league_standings_2011-2025.csv'))
        df_league_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2011-2025_CLEAN.csv'))
        df_ucl_players = pd.read_csv(os.path.join(data_path, 'all_ucl_player_stats_2011-2025_CLEAN.csv'))
        df_ucl_teams = pd.read_csv(os.path.join(data_path, 'all_ucl_team_performance_and_progress.csv'))
        
        print("All four historical datasets loaded successfully.")

    except FileNotFoundError as e:
        print(f"\nERROR: A file was not found. Please check your filenames. Missing file: {e.filename}")
        return

    # 2. PRE-MERGE CLEANING AND STANDARDIZATION
    print("\nCleaning and standardizing names for a reliable merge...")
    
    name_replacements = {
        'Paris S-G': 'Paris Saint-Germain', 'Inter': 'Internazionale', 'Manchester Utd': 'Manchester United'
    }
    
    all_dfs = [df_league_standings, df_league_players, df_ucl_players, df_ucl_teams]
    for df in all_dfs:
        df.columns = df.columns.str.strip()
        if 'Player' in df.columns:
            df['Player'] = df['Player'].str.strip()
        if 'Squad' in df.columns:
            df['Squad'] = df['Squad'].str.strip()
            df['Squad'] = df['Squad'].replace(name_replacements)
        if 'Season' in df.columns:
            df['Season'] = df['Season'].astype(str).str.strip()

    # 3. PERFORM THE MERGES
    master_df = pd.merge(df_league_players, df_league_standings, on=['Squad', 'League', 'Season'], how='left', suffixes=('_player', '_team'))
    
    ucl_stats_to_add = df_ucl_players[['Player', 'Squad', 'Gls', 'Ast', 'Min', 'Season']]
    master_df = pd.merge(master_df, ucl_stats_to_add, on=['Player', 'Squad', 'Season'], how='left', suffixes=('_league', '_ucl'))
    
    # This specifically selects ONLY the 'UCL_progress' column (and keys) to add
    ucl_progress_to_add = df_ucl_teams[['Squad', 'Season', 'UCL_progress']]
    master_df = pd.merge(master_df, ucl_progress_to_add, on=['Squad', 'Season'], how='left')
    
    # 4. FINAL CLEANUP
    ucl_cols = ['Gls_ucl', 'Ast_ucl', 'Min_ucl']
    for col in ucl_cols:
        if col in master_df.columns:
            master_df[col] = master_df[col].fillna(0)
    
    master_df['UCL_progress'] = master_df['UCL_progress'].fillna('Did Not Qualify')
    
    master_df = master_df.loc[:,~master_df.columns.duplicated()]

    print("\nAll merges and cleanup complete.")
    return master_df

# --- Main Execution ---
final_master_dataset = create_definitive_master_dataset()

if final_master_dataset is not None:
    output_path = os.path.join("../data", "master_dataset_2011-2025.csv")
    final_master_dataset.to_csv(output_path, index=False)

    print(f"\n--- SUCCESS! ---")
    print(f"Final master dataset created and saved to: {output_path}")
    display(final_master_dataset.head())
else:
    print("\nMerging process failed.")

Starting the definitive data merging process...
All four historical datasets loaded successfully.

Cleaning and standardizing names for a reliable merge...


  df_league_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2011-2025_CLEAN.csv'))



All merges and cleanup complete.

--- SUCCESS! ---
Final master dataset created and saved to: ../data\master_dataset_2011-2025.csv


Unnamed: 0,Rk_player,Player,Nation,Pos,Squad,Age,Born,MP_player,Starts,Min_league,...,Notes,xG_team,xGA,xGD,xGD/90,Last 5,Gls_ucl,Ast_ucl,Min_ucl,UCL_progress
0,1.0,Mohammed Abdellaoue,NOR,FW,Hannover 96,24.0,1985.0,26.0,26.0,0.0,...,‚Üí Europa League via league finish,,,,,,0.0,0.0,0.0,Did Not Qualify
1,2.0,Yacine Abdessadki,MAR,MF,Freiburg,29.0,1981.0,21.0,20.0,0.0,...,,,,,,,0.0,0.0,0.0,Did Not Qualify
2,3.0,Mathias Abel,GER,DF,Kaiserslautern,29.0,1981.0,19.0,19.0,0.0,...,,,,,,,0.0,0.0,0.0,Did Not Qualify
3,4.0,Ren√© Adler,GER,GK,Leverkusen,25.0,1985.0,32.0,32.0,0.0,...,‚Üí Champions League via league finish,,,,,,0.0,0.0,0.0,Did Not Qualify
4,5.0,David Alaba,AUT,"DF,MF",Hoffenheim,18.0,1992.0,17.0,17.0,0.0,...,,,,,,,0.0,0.0,0.0,Did Not Qualify


In [23]:
import pandas as pd
import os

def display_column_names():
    """
    Loads each of the four main data files and prints their column names.
    """
    print("--- Displaying Column Names for Each CSV File ---")
    
    # Path to your data folder, relative to the notebook's location
    data_path = '../data/'
    
    # List of the four files you want to inspect
    files_to_check = [
        'combined_league_standings_2011-2025.csv',
        'combined_player_stats_2011-2025_CLEAN.csv',
        'all_ucl_player_stats_2011-2025_CLEAN.csv',
        'all_ucl_team_performance_and_progress.csv'
    ]
    
    for filename in files_to_check:
        try:
            file_path = os.path.join(data_path, filename)
            
            # Read the CSV file
            df = pd.read_csv(file_path)
            
            # Print the filename and its list of columns
            print(f"\n--- Columns in: {filename} ---")
            print(df.columns.tolist())
            
        except FileNotFoundError:
            print(f"\n--- ERROR: Could not find file: {filename} ---")
        except Exception as e:
            print(f"\n--- An error occurred with {filename}: {e} ---")

# --- Main Execution ---
display_column_names()

--- Displaying Column Names for Each CSV File ---

--- Columns in: combined_league_standings_2011-2025.csv ---
['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Pts/MP', 'Top Team Scorer', 'Goalkeeper', 'League', 'Season', 'xG', 'xGA', 'xGD', 'xGD/90', 'Last 5']

--- Columns in: combined_player_stats_2011-2025_CLEAN.csv ---
['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'Gls_per_90', 'Ast_per_90', 'G+A_per_90', 'G-PK_per_90', 'G+A-PK_per_90', 'Season', 'League', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'xG_per_90', 'xAG_per_90', 'xG+xAG_per_90', 'npxG_per_90', 'npxG+xAG_per_90']

--- Columns in: all_ucl_player_stats_2011-2025_CLEAN.csv ---
['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'Gls_per_90', 'Ast_per_90', 'G+A_per_90', 'G-PK_per_90', 'G+A-PK_per_90', 

In [27]:
import pandas as pd
import os

def clean_ucl_teams_file():
    """
    Loads the UCL teams file, finds and removes all 'Unnamed' columns,
    and saves the clean version, overwriting the original.
    """
    print("Starting the cleaning process for 'all_ucl_team_performance_and_progress.csv'...")
    
    # Define the path to the file
    file_path = '../data/all_ucl_team_performance_and_progress.csv'
    
    try:
        # Load the file
        df = pd.read_csv(file_path, low_memory=False)
        print("File loaded successfully.")

    except FileNotFoundError:
        print(f"ERROR: The file was not found at {file_path}")
        return

    # --- Find and Remove 'Unnamed' columns from the entire DataFrame ---
    # 1. Get a list of all columns that contain 'Unnamed'
    unnamed_cols = [col for col in df.columns if 'Unnamed' in col]
    
    if unnamed_cols:
        # 2. Drop these columns
        df.drop(columns=unnamed_cols, inplace=True)
        print(f"Successfully found and removed the following useless columns: {unnamed_cols}")
    else:
        print("No 'Unnamed' columns were found to remove.")

    # --- Save the Cleaned File ---
    # This overwrites your old file with the corrected version
    df.to_csv(file_path, index=False)
    
    print(f"\n--- SUCCESS! ---")
    print(f"The file '{os.path.basename(file_path)}' has been permanently cleaned and saved.")
    
    # Display the new, clean column names to confirm
    print("\nHere are the final, clean column names:")
    print(df.columns.tolist())

# --- Run the cleaning process ---
clean_ucl_teams_file()

Starting the cleaning process for 'all_ucl_team_performance_and_progress.csv'...
File loaded successfully.
No 'Unnamed' columns were found to remove.

--- SUCCESS! ---
The file 'all_ucl_team_performance_and_progress.csv' has been permanently cleaned and saved.

Here are the final, clean column names:
['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'Attendance', 'Top Team Scorer', 'Goalkeeper', 'Notes', 'Season']


In [7]:
import pandas as pd
import os

def create_and_test_master_dataset():
    """
    Loads the three main stat files and the dedicated UCL progress file,
    performs a robust merge, and saves the final master dataset.
    """
    print("--- Starting Definitive Data Merge (Test Run) ---")
    data_path = '../data/'

    try:
        # --- 1. Load All Datasets ---
        df_league_standings = pd.read_csv(os.path.join(data_path, 'combined_league_standings_2011-2025.csv'))
        df_league_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2011-2025_CLEAN.csv'))
        df_ucl_players = pd.read_csv(os.path.join(data_path, 'all_ucl_player_stats_2011-2025_CLEAN.csv'))
        # --- Loading your new, dedicated progress file ---
        df_ucl_progress = pd.read_csv(os.path.join(data_path, 'UCL_team_progress_2011-2025.csv'))
        
        print("\n‚úÖ All datasets loaded successfully for the test run.")

    except FileNotFoundError as e:
        print(f"\n--- FATAL ERROR --- \nCould not find a required file. Missing file: {e.filename}")
        return

    # --- 2. PRE-MERGE CLEANING AND STANDARDIZATION ---
    print("\nCleaning and standardizing all data for a perfect merge...")
    all_dfs = [df_league_standings, df_league_players, df_ucl_players, df_ucl_progress]
    for df in all_dfs:
        df.columns = df.columns.str.strip()
        if 'Player' in df.columns: df['Player'] = df['Player'].str.strip()
        if 'Squad' in df.columns: df['Squad'] = df['Squad'].str.strip()
        if 'Season' in df.columns: df['Season'] = df['Season'].astype(str).str.strip()
    print("Standardization complete.")

    # --- 3. PERFORM THE MERGES ---
    print("\nMerging all datasets...")
    
    # Merge 1: Team league performance onto league players
    master_df = pd.merge(df_league_players, df_league_standings, on=['Squad', 'League', 'Season'], how='left', suffixes=('_player', '_team'))
    
    # Merge 2: UCL player stats
    ucl_stats_to_add = df_ucl_players[['Player', 'Squad', 'Gls', 'Ast', 'Min', 'Season']]
    master_df = pd.merge(master_df, ucl_stats_to_add, on=['Player', 'Squad', 'Season'], how='left', suffixes=('_league', '_ucl'))
    
    # Merge 3: Your dedicated UCL team progress file
    master_df = pd.merge(master_df, df_ucl_progress, on=['Squad', 'Season'], how='left')
    print("All merges complete.")

    # --- 4. FINAL CLEANUP ---
    print("\nPerforming final cleanup...")
    ucl_cols = ['Gls_ucl', 'Ast_ucl', 'Min_ucl']
    for col in ucl_cols:
        if col in master_df.columns:
            master_df[col] = master_df[col].fillna(0)
    
    master_df['UCL_Progress'] = master_df['UCL_Progress'].fillna('Did Not Qualify')
    
    master_df = master_df.loc[:,~master_df.columns.duplicated()]
    print("Cleanup complete.")
    
    return master_df

# --- Main Execution ---
final_master_dataset = create_and_test_master_dataset()

if final_master_dataset is not None:
    output_path = os.path.join("../data", "master_dataset_2011-2025.csv")
    final_master_dataset.to_csv(output_path, index=False)

    print(f"\n\n--- SUCCESS! ---")
    print(f"The final master dataset has been created successfully.")
    print(f"Saved as: {output_path}")
    
    # --- VERIFICATION for your TEST DATA ---
    print("\nVerification for 2010-2011 Season:")
    print("Checking Barcelona players (UCL Winner in 2010-2011)...")
    verification_sample = final_master_dataset[
        (final_master_dataset['Squad'] == 'Barcelona') & 
        (final_master_dataset['Season'] == '2010-2011')
    ]
    
    if not verification_sample.empty and verification_sample['UCL_Progress'].iloc[0] == 'W':
        print("‚úÖ TEST PASSED: 'UCL_Progress' for Barcelona is correctly set to 'W'.")
        display(verification_sample.head())
    else:
        print("‚ùå TEST FAILED: The 'UCL_Progress' for Barcelona was not correctly merged. Please check for name mismatches.")

else:
    print("\n--- Process Halted ---")

--- Starting Definitive Data Merge (Test Run) ---

‚úÖ All datasets loaded successfully for the test run.

Cleaning and standardizing all data for a perfect merge...
Standardization complete.

Merging all datasets...
All merges complete.

Performing final cleanup...
Cleanup complete.


--- SUCCESS! ---
The final master dataset has been created successfully.
Saved as: ../data\master_dataset_2011-2025.csv

Verification for 2010-2011 Season:
Checking Barcelona players (UCL Winner in 2010-2011)...
‚úÖ TEST PASSED: 'UCL_Progress' for Barcelona is correctly set to 'W'.


Unnamed: 0,Rk_player,Player,Nation,Pos,Squad,Age,Born,MP_player,Starts,Min_league,...,GA,GD,Pts,Pts/MP,Top Team Scorer,Goalkeeper,Gls_ucl,Ast_ucl,Min_ucl,UCL_Progress
15555,3,√âric Abidal,FRA,DF,Barcelona,30,1979,26,23,0,...,21.0,74.0,96.0,2.53,Lionel Messi - 31,V√≠ctor Vald√©s,0.0,0.0,0.0,W
15560,8,Adriano,BRA,"DF,MF",Barcelona,25,1984,14,11,937,...,21.0,74.0,96.0,2.53,Lionel Messi - 31,V√≠ctor Vald√©s,0.0,0.0,0.0,W
15562,10,Ibrahim Afellay,NED,"FW,MF",Barcelona,24,1986,16,7,669,...,21.0,74.0,96.0,2.53,Lionel Messi - 31,V√≠ctor Vald√©s,0.0,0.0,0.0,W
15572,20,Thiago Alc√°ntara,ESP,MF,Barcelona,19,1991,12,6,666,...,21.0,74.0,96.0,2.53,Lionel Messi - 31,V√≠ctor Vald√©s,0.0,0.0,0.0,W
15582,30,Dani Alves,BRA,"DF,MF",Barcelona,27,1983,35,31,0,...,21.0,74.0,96.0,2.53,Lionel Messi - 31,V√≠ctor Vald√©s,0.0,0.0,0.0,W


In [30]:
import pandas as pd
import os

def run_merge_audit():
    """
    Performs a diagnostic merge to identify why UCL progress is not matching.
    """
    print("--- Starting Merge Audit ---")
    data_path = '../data/'

    try:
        # Load the two key files for this specific problem
        df_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2011-2025_CLEAN.csv'))
        df_ucl_teams = pd.read_csv(os.path.join(data_path, 'all_ucl_team_performance_and_progress.csv'))
        print("Files loaded successfully for audit.")

    except FileNotFoundError as e:
        print(f"ERROR: A file was not found. Missing file: {e.filename}")
        return

    # --- Clean the key columns in both DataFrames ---
    for df in [df_players, df_ucl_teams]:
        df.columns = df.columns.str.strip()
        if 'Squad' in df.columns:
            df['Squad'] = df['Squad'].str.strip()
        if 'Season' in df.columns:
            df['Season'] = df['Season'].astype(str).str.strip()

    # --- Perform an Indicator Merge ---
    # This special merge adds a '_merge' column that tells us the source of each row
    audit_df = pd.merge(
        df_players[['Player', 'Squad', 'Season']], # We only need keys from the player file
        df_ucl_teams[['Squad', 'Season', 'UCL_progress']], # And keys from the UCL file
        on=['Squad', 'Season'],
        how='left', # A 'left' merge keeps every player from the left file
        indicator=True # This is the magic part!
    )

    # --- Find and Display the Mismatches ---
    # Filter for rows that were only in the 'left' file (the player stats)
    # This means they failed to find a match in the UCL teams file
    failed_matches = audit_df[audit_df['_merge'] == 'left_only']

    if not failed_matches.empty:
        print("\n--- AUDIT RESULTS: Found Mismatches! ---")
        print("The following Squads and Seasons from your player file could NOT find a match in your UCL teams file.")
        print("This is why 'UCL_progress' is being filled with 'Did Not Qualify'.\n")
        
        # Get a unique list of the failing squad names
        failing_squads = failed_matches[['Squad', 'Season']].drop_duplicates().sort_values(by='Squad')
        
        print("Unique Squad/Season combinations that are failing to merge:")
        display(failing_squads.head(20)) # Display the first 20 mismatches
    else:
        print("\n--- AUDIT RESULTS: No Mismatches Found ---")
        print("The merge keys ('Squad', 'Season') appear to be perfectly aligned.")

# --- Run the Audit ---
run_merge_audit()

--- Starting Merge Audit ---
Files loaded successfully for audit.

--- AUDIT RESULTS: Found Mismatches! ---
The following Squads and Seasons from your player file could NOT find a match in your UCL teams file.
This is why 'UCL_progress' is being filled with 'Did Not Qualify'.

Unique Squad/Season combinations that are failing to merge:


Unnamed: 0,Squad,Season
30865,Ajaccio,2022-2023
25749,Ajaccio,2013-2014
25189,Ajaccio,2012-2013
24634,Ajaccio,2011-2012
19957,Alav√©s,2018-2019
18838,Alav√©s,2016-2017
19384,Alav√©s,2017-2018
23474,Alav√©s,2024-2025
20500,Alav√©s,2019-2020
21070,Alav√©s,2020-2021


In [11]:
import pandas as pd
import os

def compare_squad_names():
    """
    Directly compares the unique squad names from the league and UCL player files
    to find systematic naming differences.
    """
    print("--- Starting Head-to-Head Squad Name Comparison ---")
    data_path = '../data/'

    try:
        # Load the two player stat files
        df_league_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2011-2025_CLEAN.csv'))
        df_ucl_players = pd.read_csv(os.path.join(data_path, 'all_ucl_player_stats_2011-2025_CLEAN.csv'))
        print("Files loaded successfully for comparison.")

    except FileNotFoundError as e:
        print(f"ERROR: A file was not found. Missing file: {e.filename}")
        return

    # --- Clean the 'Squad' columns ---
    for df in [df_league_players, df_ucl_players]:
        if 'Squad' in df.columns:
            df['Squad'] = df['Squad'].str.strip()

    # --- Create unique sets of squad names ---
    league_squad_names = set(df_league_players['Squad'].unique())
    ucl_squad_names = set(df_ucl_players['Squad'].unique())

    # --- Find the differences ---
    # Find names that are in the UCL file but NOT in the league file
    ucl_only_names = sorted(list(ucl_squad_names - league_squad_names))
    
    # Find names that are in the league file but NOT in the UCL file (for comparison)
    league_only_names = sorted(list(league_squad_names - ucl_squad_names))

    if ucl_only_names:
        print("\n\n--- üö® DIAGNOSTIC RESULTS: Systematic Mismatches Found! ---")
        print("\nThe following squad names exist in your UCL file but could NOT be found in your League file.")
        print("This is the root cause of the merge failure. We need to create a 'translation' for these names.")
        
        print("\n--- Squad Names ONLY in UCL File ---")
        print(ucl_only_names)
        
        print("\n--- For reference, here are some names ONLY in the League file ---")
        print(league_only_names[:20]) # Show first 20 for comparison
    else:
        print("\n\n--- DIAGNOSTIC: No squad name differences found. ---")
        print("This would indicate a very unusual issue, possibly with the 'Player' or 'Season' columns.")

# --- Run the comparison ---
compare_squad_names()

--- Starting Head-to-Head Squad Name Comparison ---
Files loaded successfully for comparison.


--- üö® DIAGNOSTIC RESULTS: Systematic Mismatches Found! ---

The following squad names exist in your UCL file but could NOT be found in your League file.
This is the root cause of the merge failure. We need to create a 'translation' for these names.

--- Squad Names ONLY in UCL File ---
['at Austria Wien', 'at RB Salzburg', 'at Sturm Graz', 'az Qarabaƒü', 'be Anderlecht', 'be Antwerp', 'be Club Brugge', 'be Genk', 'be Gent', 'bg Ludogorets', 'by BATE Borisov', 'ch Basel', 'ch Young Boys', 'cy APOEL FC', 'cz Slavia Prague', 'cz Sparta Prague', 'cz Viktoria Plze≈à', 'de Bayern Munich', 'de Dortmund', 'de Eint Frankfurt', 'de Gladbach', 'de Hoffenheim', 'de Leverkusen', 'de RB Leipzig', 'de Schalke 04', 'de Stuttgart', 'de Union Berlin', 'de Werder Bremen', 'de Wolfsburg', 'dk FC Copenhagen', 'dk Midtjylland', 'dk Nordsj√¶lland', 'eng Arsenal', 'eng Aston Villa', 'eng Chelsea', 'eng Leicester

In [None]:
import pandas as pd
import os

def create_definitive_master_dataset():
    """
    Loads all historical data, removes country-code prefixes from UCL squad names,
    standardizes all names, and performs a final, robust merge.
    """
    print("--- Starting Definitive Data Merge with Prefix Removal ---")
    data_path = '../data/'

    try:
        # 1. Load All Four Historical Datasets
        df_league_standings = pd.read_csv(os.path.join(data_path, 'combined_league_standings_2011-2025.csv'))
        df_league_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2011-2025_CLEAN.csv'))
        df_ucl_players = pd.read_csv(os.path.join(data_path, 'all_ucl_player_stats_2011-2025_CLEAN.csv'))
        df_ucl_teams = pd.read_csv(os.path.join(data_path, 'all_ucl_team_performance_and_progress.csv'))
        print("\n‚úÖ Step 1/4: All four historical datasets loaded successfully.")

    except FileNotFoundError as e:
        print(f"\n--- FATAL ERROR --- \nCould not find a required file. Missing file: {e.filename}")
        return

    # --- 2. PRE-MERGE CLEANING AND STANDARDIZATION ---
    print("\nStep 2/4: Cleaning and standardizing all data for a perfect merge...")
    
    # --- THIS IS THE KEY FIX ---
    # Loop through the two UCL files and remove the country code prefix from the 'Squad' column.
    for df in [df_ucl_players, df_ucl_teams]:
        if 'Squad' in df.columns:
            # Splits 'eng Arsenal' into ['eng', 'Arsenal'] and joins everything from the second element onwards.
            df['Squad'] = df['Squad'].str.split(' ').str[1:].str.join(' ')
    print("Country-code prefixes removed from UCL squad names.")
            
    # Now, perform the standard cleaning on all files
    all_dfs = [df_league_standings, df_league_players, df_ucl_players, df_ucl_teams]
    for df in all_dfs:
        df.columns = df.columns.str.strip()
        if 'Player' in df.columns: df['Player'] = df['Player'].str.strip()
        if 'Squad' in df.columns: df['Squad'] = df['Squad'].str.strip()
        if 'Season' in df.columns: df['Season'] = df['Season'].astype(str).str.strip()
    print("Standard cleaning complete.")

    # --- 3. PERFORM THE MERGES ---
    print("\nStep 3/4: Merging all datasets...")
    
    master_df = pd.merge(df_league_players, df_league_standings, on=['Squad', 'League', 'Season'], how='left', suffixes=('_player', '_team'))
    
    ucl_stats_to_add = df_ucl_players[['Player', 'Squad', 'Gls', 'Ast', 'Min', 'Season']]
    master_df = pd.merge(master_df, ucl_stats_to_add, on=['Player', 'Squad', 'Season'], how='left', suffixes=('_league', '_ucl'))
    
    ucl_progress_to_add = df_ucl_teams[['Squad', 'Season', 'UCL_progress']]
    master_df = pd.merge(master_df, ucl_progress_to_add, on=['Squad', 'Season'], how='left')
    print("All merges complete.")

    # --- 4. FINAL CLEANUP ---
    print("\nStep 4/4: Performing final cleanup...")
    ucl_cols = ['Gls_ucl', 'Ast_ucl', 'Min_ucl']
    for col in ucl_cols:
        if col in master_df.columns:
            master_df[col] = master_df[col].fillna(0)
    
    master_df['UCL_progress'] = master_df['UCL_progress'].fillna('Did Not Qualify')
    
    master_df = master_df.loc[:,~master_df.columns.duplicated()]
    print("Cleanup complete.")
    
    return master_df

# --- Main Execution ---
final_master_dataset = create_definitive_master_dataset()

if final_master_dataset is not None:
    output_path = os.path.join("../data", "master_dataset_2011-2025.csv")
    final_master_dataset.to_csv(output_path, index=False)

    print(f"\n\n--- SUCCESS! ---")
    print(f"The final master dataset has been created successfully.")
    print(f"Saved as: {output_path}")
    
    # Verification: Check a team that was previously failing, like Arsenal from 2010-2011
    print("\nVerification: Checking Arsenal players from the 2010-2011 season:")
    display(final_master_dataset[
        (final_master_dataset['Squad'] == 'Arsenal') & 
        (final_master_dataset['Season'] == '2010-2011')
    ].head())
else:
    print("\n--- Process Halted ---")

--- Starting Definitive Data Merge with Prefix Removal ---

‚úÖ Step 1/4: All four historical datasets loaded successfully.

Step 2/4: Cleaning and standardizing all data for a perfect merge...
Country-code prefixes removed from UCL squad names.
Standard cleaning complete.

Step 3/4: Merging all datasets...
All merges complete.

Step 4/4: Performing final cleanup...
Cleanup complete.


--- SUCCESS! ---
The final master dataset has been created successfully.
Saved as: ../data\master_dataset_2011-2025.csv

Verification: Checking Arsenal players from the 2010-2011 season:


Unnamed: 0,Rk_player,Player,Nation,Pos,Squad,Age,Born,MP_player,Starts,Min_league,...,GA,GD,Pts,Pts/MP,Top Team Scorer,Goalkeeper,Gls_ucl,Ast_ucl,Min_ucl,UCL_progress
7326,9,Manuel Almunia,ESP,GK,Arsenal,33,1977,8,8,720,...,43.0,29.0,68.0,1.79,Robin van Persie - 18,Wojciech Szczƒôsny,0.0,0.0,161.0,R16
7335,18,Andrey Arshavin,RUS,"FW,MF",Arsenal,29,1981,37,25,0,...,43.0,29.0,68.0,1.79,Robin van Persie - 18,Wojciech Szczƒôsny,3.0,2.0,275.0,R16
7366,49,Nicklas Bendtner,DEN,FW,Arsenal,22,1988,17,3,511,...,43.0,29.0,68.0,1.79,Robin van Persie - 18,Wojciech Szczƒôsny,0.0,0.0,185.0,R16
7419,102,Marouane Chamakh,MAR,FW,Arsenal,26,1984,29,18,0,...,43.0,29.0,68.0,1.79,Robin van Persie - 18,Wojciech Szczƒôsny,3.0,0.0,319.0,R16
7424,107,Ga√´l Clichy,FRA,DF,Arsenal,25,1985,33,33,0,...,43.0,29.0,68.0,1.79,Robin van Persie - 18,Wojciech Szczƒôsny,0.0,1.0,457.0,R16


In [2]:
import pandas as pd
import os

def merge_2026_data():
    """
    Loads all five 2026 data files, performs a robust pre-merge cleaning,
    merges them sequentially, and saves the final master dataset.
    """
    print("--- Starting 2026 Data Merging Process ---")
    data_path = '../data/'

    try:
        # --- 1. Load All Five 2026 Datasets ---
        df_league_standings = pd.read_csv(os.path.join(data_path, 'combined_league_standings_2026.csv'))
        df_league_players = pd.read_csv(os.path.join(data_path, 'combined_player_stats_2026.csv'))
        df_ucl_players = pd.read_csv(os.path.join(data_path, 'ucl_player_stats_2026.csv'))
        # Using both ucl_league_table (for team stats) and ucl_team_progress (for progress)
        df_ucl_teams = pd.read_csv(os.path.join(data_path, 'ucl_league_table_2026.csv'))
        df_ucl_progress = pd.read_csv(os.path.join(data_path, 'ucl_team_progress_2026.csv'))
        
        print("\n‚úÖ All five 2026 datasets loaded successfully.")

    except FileNotFoundError as e:
        print(f"\n--- FATAL ERROR --- \nCould not find a required file. Missing file: {e.filename}")
        return

    # --- 2. PRE-MERGE CLEANING AND STANDARDIZATION ---
    print("\nCleaning and standardizing all data for a perfect merge...")
    
    # This dictionary fixes common team name mismatches
    name_replacements = {
        'Paris S-G': 'Paris Saint-Germain',
        'Inter': 'Internazionale',
        'Manchester Utd': 'Manchester United'
    }

    all_dfs = [df_league_standings, df_league_players, df_ucl_players, df_ucl_teams, df_ucl_progress]
    current_season = '2025-2026'
    for df in all_dfs:
        df['Season'] = current_season
        df.columns = df.columns.str.strip()
        if 'Player' in df.columns: df['Player'] = df['Player'].str.strip()
        if 'Squad' in df.columns:
            df['Squad'] = df['Squad'].str.strip()
            df['Squad'] = df['Squad'].replace(name_replacements)
    
    # --- 3. EXPLICITLY RENAME COLUMNS FOR 100% UNIQUENESS ---
    df_league_players.rename(columns={'Min': 'Min_league', 'Gls': 'Gls_league', 'Ast': 'Ast_league', 'xG': 'xG_player', 'xAG': 'xAG_player'}, inplace=True)
    df_league_standings.rename(columns={'Rk': 'League_Rk', 'Pts': 'League_Pts', 'MP': 'MP_team'}, inplace=True)
    df_ucl_players.rename(columns={'Min': 'Min_ucl', 'Gls': 'Gls_ucl', 'Ast': 'Ast_ucl'}, inplace=True)
    
    print("Standardization and renaming complete.")

    # --- 4. PERFORM THE MERGES SEQUENTIALLY ---
    print("\nMerging all datasets...")
    
    # Start with league players as the base
    master_df = df_league_players.copy()
    
    # Merge 1: Team league performance
    master_df = pd.merge(master_df, df_league_standings[['Squad', 'Season', 'League_Rk', 'League_Pts']], on=['Squad', 'Season'], how='left')
    
    # Merge 2: UCL player stats
    master_df = pd.merge(master_df, df_ucl_players[['Player', 'Squad', 'Season', 'Min_ucl', 'Gls_ucl', 'Ast_ucl']], on=['Player', 'Squad', 'Season'], how='left')
    
    # Merge 3: UCL team progress
    master_df = pd.merge(master_df, df_ucl_progress[['Squad', 'Season', 'UCL_progress']], on=['Squad', 'Season'], how='left')
    
    print("All merges complete.")

    # --- 5. FINAL CLEANUP ---
    print("\nPerforming final cleanup...")
    ucl_cols = ['Min_ucl', 'Gls_ucl', 'Ast_ucl']
    for col in ucl_cols:
        master_df[col] = master_df[col].fillna(0)
    
    master_df['UCL_progress'] = master_df['UCL_progress'].fillna('Did Not Qualify')
    
    # Remove any fully duplicated columns
    master_df = master_df.loc[:,~master_df.columns.duplicated()]
    print("Cleanup complete.")
    
    return master_df

# --- Main Execution ---
master_df_2026 = merge_2026_data()

if master_df_2026 is not None:
    # Save the final, unified master dataset for the 2026 season
    output_path = os.path.join("../data", "master_dataset_2026.csv")
    master_df_2026.to_csv(output_path, index=False)

    print(f"\n\n--- SUCCESS! ---")
    print(f"The final master dataset for the 2026 season has been created successfully.")
    print(f"Saved as: {output_path}")
    
    print("\nVerification: Here is a sample of the final merged data:")
    display(master_df_2026.head())
else:
    print("\n--- Process Halted --- Please review the error message above.")

--- Starting 2026 Data Merging Process ---

‚úÖ All five 2026 datasets loaded successfully.

Cleaning and standardizing all data for a perfect merge...
Standardization and renaming complete.

Merging all datasets...
All merges complete.

Performing final cleanup...
Cleanup complete.


--- SUCCESS! ---
The final master dataset for the 2026 season has been created successfully.
Saved as: ../data\master_dataset_2026.csv

Verification: Here is a sample of the final merged data:


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min_league,...,npxG.1,npxG+xAG.1,Matches,Season,League_Rk,League_Pts,Min_ucl,Gls_ucl,Ast_ucl,UCL_progress
0,1,Brenden Aaronson,us USA,FW,Leeds United,24-361,2000.0,7,4,397.0,...,0.17,0.33,Matches,2025-2026,15,8,0.0,0.0,0.0,Did Not Qualify
1,2,Joshua Acheampong,eng ENG,DF,Chelsea,19-166,2006.0,3,2,194.0,...,0.04,0.04,Matches,2025-2026,7,11,0.0,0.0,0.0,League
2,3,Tyler Adams,us USA,MF,Bournemouth,26-246,1999.0,7,7,603.0,...,0.02,0.03,Matches,2025-2026,4,14,0.0,0.0,0.0,Did Not Qualify
3,4,Tosin Adarabioyo,eng ENG,DF,Chelsea,28-024,1997.0,4,3,354.0,...,0.01,0.01,Matches,2025-2026,7,11,0.0,0.0,0.0,League
4,5,Simon Adingra,ci CIV,FW,Sunderland,23-290,2002.0,6,4,273.0,...,0.05,0.24,Matches,2025-2026,9,11,0.0,0.0,0.0,Did Not Qualify
