In [1]:
import pandas as pd
import os
import glob

def _parse_standings_filename(filename):
    """Split a '<league-key>_<year>.csv' file name into ``(league_key, year)``.

    The year is the text after the last underscore; everything before that
    underscore is the league key. Raises ValueError when the trailing text
    is not an integer.
    """
    stem, _ext = os.path.splitext(filename)
    league_key, _sep, year_text = stem.rpartition('_')
    return league_key, int(year_text)


def combine_all_league_standings(data_path='../data/standings/'):
    """
    Combine every league-standings CSV found in ``data_path`` into one DataFrame.

    Each file is expected to be named '<league-key>_<year>.csv'
    (e.g. 'premier_league_2011.csv'). For every file, rows that merely repeat
    the header (``Squad == 'Squad'``) are removed and two columns are added:
    'League' (display name looked up from the league key, or 'Unknown League')
    and 'Season' (formatted as '<year-1>-<year>').

    Files are processed in sorted path order so the row order of the result
    does not depend on the operating system's directory listing order.

    Parameters
    ----------
    data_path : str, optional
        Folder that contains the per-league, per-year CSV files.
        Defaults to '../data/standings/'.

    Returns
    -------
    pandas.DataFrame
        All standings concatenated with a fresh integer index before rows with
        a missing 'Squad' are dropped. The known numeric columns are converted
        with ``pd.to_numeric(errors='coerce')``, so unparsable values become
        NaN. An empty DataFrame is returned when no file was found or none
        could be processed.

    Notes
    -----
    A file that cannot be parsed or read is reported on stdout and skipped;
    it does not abort the whole run.
    """
    print("Starting the process to combine all historical league standings...")

    # Escape the directory part so glob metacharacters in the path are literal,
    # and sort because glob.glob() gives no ordering guarantee.
    file_pattern = os.path.join(glob.escape(data_path), '*.csv')
    all_files = sorted(glob.glob(file_pattern))

    if not all_files:
        print(f"ERROR: No CSV files found in '{data_path}'. Please check the folder.")
        return pd.DataFrame()

    print(f"Found {len(all_files)} files to process.")

    # Keys are lower-case; the lookup lower-cases the file's league key so that
    # 'serieA', 'SerieA' and 'seriea' all resolve to the same league.
    league_map = {
        'premier_league': 'Premier League',
        'laliga': 'La Liga',
        'seriea': 'Serie A',
        'bundesliga': 'Bundesliga',
        'ligue1': 'Ligue 1',
    }
    numeric_cols = ['Rk', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'xG', 'xGA', 'xGD']

    all_standings = []

    for file in all_files:
        try:
            filename = os.path.basename(file)
            league_key, year = _parse_standings_filename(filename)

            league_name = league_map.get(league_key.lower())
            if league_name is None:
                # Keep the row but make the fallback visible instead of silent.
                print(f"WARNING: Unrecognised league key '{league_key}' in {filename}; "
                      "labelling as 'Unknown League'.")
                league_name = 'Unknown League'

            df = pd.read_csv(file)
            # Drop rows where the header line is repeated inside the data.
            df = df[df['Squad'] != 'Squad'].copy()
            df['League'] = league_name
            df['Season'] = f"{year - 1}-{year}"

            all_standings.append(df)

        except Exception as e:
            # Best-effort: one bad file must not stop the remaining files.
            print(f"Could not process {file}. Error: {e}")

    if not all_standings:
        return pd.DataFrame()

    combined_df = pd.concat(all_standings, ignore_index=True)

    # Columns can arrive as strings (e.g. when a file contained repeated header
    # rows); coerce them to numbers, turning unparsable cells into NaN.
    for col in numeric_cols:
        if col in combined_df.columns:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

    combined_df = combined_df.dropna(subset=['Squad'])
    return combined_df

# --- Main Execution ---
# Build the combined frame, then either persist it or report the failure.
league_dataset = combine_all_league_standings()

if league_dataset.empty:
    print("\nProcessing failed. No data was saved.")
else:
    # The combined file is written to the main 'data' folder, without the index.
    output_path = os.path.join("../data", "combined_league_standings_2011-2025.csv")
    league_dataset.to_csv(output_path, index=False)

    print("\n--- SUCCESS! ---")
    print(f"All records have been combined and saved to: {output_path}")

    # Preview the first and the last rows of the combined frame.
    display(league_dataset.head(), league_dataset.tail())

Starting the process to combine all historical league standings...
Found 75 files to process.

--- SUCCESS! ---
All records have been combined and saved to: ../data\combined_league_standings_2011-2025.csv


Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,...,Top Team Scorer,Goalkeeper,Notes,League,Season,xG,xGA,xGD,xGD/90,Last 5
0,1,Dortmund,34,23,6,5,67,22,45,75,...,Lucas Barrios - 16,Roman Weidenfeller,→ Champions League via league finish,Bundesliga,2010-2011,,,,,
1,2,Leverkusen,34,20,8,6,64,44,20,68,...,Arturo Vidal - 10,René Adler,→ Champions League via league finish,Bundesliga,2010-2011,,,,,
2,3,Bayern Munich,34,19,8,7,81,40,41,65,...,Mario Gómez - 28,Hans-Jörg Butt,→ Champions League via league finish,Bundesliga,2010-2011,,,,,
3,4,Hannover 96,34,19,3,12,49,45,4,60,...,Didier Ya Konan - 14,Florian Fromlowitz,→ Europa League via league finish,Bundesliga,2010-2011,,,,,
4,5,Mainz 05,34,18,4,12,52,39,13,58,...,André Schürrle - 14,Christian Wetklo,→ Europa League via league finish,Bundesliga,2010-2011,,,,,


Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,...,Top Team Scorer,Goalkeeper,Notes,League,Season,xG,xGA,xGD,xGD/90,Last 5
1461,16,Parma,38,7,15,16,44,58,-14,36,...,Ange-Yoan Bonny - 6,Zion Suzuki,,Serie A,2024-2025,43.0,56.4,-13.3,-0.35,
1462,17,Lecce,38,8,10,20,27,58,-31,34,...,Nikola Krstović - 11,Wladimiro Falcone,,Serie A,2024-2025,35.0,57.1,-22.5,-0.59,
1463,18,Empoli,38,6,13,19,33,59,-26,31,...,Sebastiano Esposito - 8,Devis Vásquez,Relegated,Serie A,2024-2025,33.0,50.1,-17.6,-0.46,
1464,19,Venezia,38,5,14,19,32,56,-24,29,...,Joel Pohjanpalo - 6,Filip Stankovic,Relegated,Serie A,2024-2025,35.8,57.5,-21.7,-0.57,
1465,20,Monza,38,3,9,26,28,69,-41,18,...,Dany Mota - 5,Stefano Turati,Relegated,Serie A,2024-2025,29.9,54.7,-24.8,-0.65,
