In [1]:
import pandas as pd
import os

In [2]:
# First let's get each player position and id for the season
def process_season_data(data_directory, seasons):
    """Write a ``processed_players.csv`` (player id + position) for each season.

    For every season folder under ``data_directory`` the function joins
    ``player_idlist.csv`` (``id``, ``first_name``, ``second_name``) with
    ``cleaned_players.csv`` (``first_name``, ``second_name``, ``element_type``)
    on the two name columns, renames ``element_type`` to ``position`` and
    saves the result as ``processed_players.csv`` inside the season folder.

    Problems (missing folder, missing input file, read/merge errors) are
    reported with ``print`` and the season is skipped; nothing is raised.

    Args:
        data_directory: Folder that contains one sub-folder per season.
        seasons: Iterable of season folder names, e.g. ``"2024-25"``.

    Returns:
        None. The result is the CSV file written for each processed season.
    """
    for season in seasons:
        season_path = os.path.join(data_directory, season)

        # Report a missing season folder instead of skipping it silently.
        if not os.path.isdir(season_path):
            print(f"Season directory not found: {season_path}")
            continue

        player_idlist_path = os.path.join(season_path, "player_idlist.csv")
        cleaned_players_path = os.path.join(season_path, "cleaned_players.csv")
        output_file_path = os.path.join(season_path, "processed_players.csv")

        if not (os.path.exists(player_idlist_path) and os.path.exists(cleaned_players_path)):
            print(f"Missing required files in {season_path}: player_idlist.csv or cleaned_players.csv")
            continue

        try:
            player_idlist_df = pd.read_csv(player_idlist_path)
            cleaned_players_df = pd.read_csv(cleaned_players_path)

            merged_df = (
                pd.merge(
                    player_idlist_df[['id', 'first_name', 'second_name']],
                    cleaned_players_df[['first_name', 'second_name', 'element_type']],
                    on=['first_name', 'second_name'],
                    how='inner'
                )
                # Rename element_type to position
                .rename(columns={'element_type': 'position'})
                # The join key is the name only, so players sharing a full name
                # are paired with every matching row; drop the exact repeats.
                .drop_duplicates()
            )

            # Save the DataFrame to a new file
            merged_df.to_csv(output_file_path, index=False)
            print(f"Processed and saved: {output_file_path}")
        except Exception as e:
            # Best effort: one broken season must not stop the remaining ones.
            print(f"Error processing season {season}: {e}")

In [3]:
# Location of the per-season data folders and the seasons to process.
data_directory = "Fantasy-Premier-League/data"
seasons = [
    "2020-21",
    "2021-22",
    "2022-23",
    "2023-24",
    "2024-25",
]
process_season_data(data_directory=data_directory, seasons=seasons)

Processed and saved: Fantasy-Premier-League/data/2020-21/processed_players.csv
Processed and saved: Fantasy-Premier-League/data/2021-22/processed_players.csv
Processed and saved: Fantasy-Premier-League/data/2022-23/processed_players.csv
Processed and saved: Fantasy-Premier-League/data/2023-24/processed_players.csv
Processed and saved: Fantasy-Premier-League/data/2024-25/processed_players.csv


In [9]:
def _season_id_column(season):
    """Return the per-season id column name, e.g. ``"2023-24"`` -> ``"23_id"``."""
    # Last two digits of the season's four-character starting year.
    return f"{season[:4][-2:]}_id"


def merge_player_ids(
    data_directory,
    seasons,
    output_file="master_player_list.csv",
    main_season="2024-25",
):
    """Consolidate per-season player ids into one master list and save it as CSV.

    Players are matched across seasons by their full name
    (``"<first_name> <second_name>"``) as found in each season's
    ``processed_players.csv``. Players present in ``main_season`` keep that
    season's id as ``Unique_ID``; players seen only in other seasons receive
    new ids starting above the largest main-season id. Each season contributes
    a ``<yy>_id`` column (``yy`` = last two digits of the season's starting
    year); seasons in which a player does not appear are stored as ``-1``.

    Missing files and read errors are reported with ``print`` and that season
    is skipped; nothing is raised for them.

    Args:
        data_directory: Folder that contains one sub-folder per season.
        seasons: Iterable of season folder names to merge.
        output_file: Name of the CSV written inside ``data_directory``.
        main_season: Season whose ids are used as the basis for ``Unique_ID``.
            It is always processed first, whether or not it is in ``seasons``.

    Returns:
        The consolidated ``pandas.DataFrame`` that was written to disk.
    """
    player_data = {}
    next_unique_id = 1

    # We want to base ids on the main season
    main_season_path = os.path.join(data_directory, main_season, "processed_players.csv")
    # Derived from main_season so the column name cannot drift from the season.
    main_id_column = _season_id_column(main_season)

    if os.path.exists(main_season_path):
        try:
            main_processed_df = pd.read_csv(main_season_path)

            for first_name, second_name, player_id in zip(
                main_processed_df['first_name'],
                main_processed_df['second_name'],
                main_processed_df['id'],
            ):
                full_name = f"{first_name} {second_name}"
                if full_name not in player_data:
                    player_data[full_name] = {
                        "First_Name": first_name,
                        "Last_Name": second_name,
                        "Unique_ID": player_id
                    }
                # Add the id for the main season
                player_data[full_name][main_id_column] = player_id
                # Ensure greater ids to avoid reusing
                next_unique_id = max(next_unique_id, player_id + 1)

        except Exception as e:
            print(f"Error processing {main_season_path}: {e}")
    else:
        print(f"Missing processed_players.csv for season: {main_season}")

    # Process other seasons
    for season in seasons:
        if season == main_season:
            continue

        season_path = os.path.join(data_directory, season, "processed_players.csv")
        if not os.path.exists(season_path):
            # Same report as for the main season instead of a silent skip.
            print(f"Missing processed_players.csv for season: {season}")
            continue

        try:
            processed_df = pd.read_csv(season_path)
            season_id_column = _season_id_column(season)

            for first_name, second_name, player_id in zip(
                processed_df['first_name'],
                processed_df['second_name'],
                processed_df['id'],
            ):
                full_name = f"{first_name} {second_name}"
                if full_name not in player_data:
                    # Player never seen before: hand out a fresh id.
                    player_data[full_name] = {
                        "First_Name": first_name,
                        "Last_Name": second_name,
                        "Unique_ID": next_unique_id
                    }
                    next_unique_id += 1
                # Add the season ID
                player_data[full_name][season_id_column] = player_id
        except Exception as e:
            print(f"Error processing {season_path}: {e}")

    consolidated_df = pd.DataFrame.from_dict(player_data, orient='index').reset_index(drop=True)

    # Ensure all ID columns are integers (-1 marks "not present that season")
    id_columns = [col for col in consolidated_df.columns if col.endswith("_id")]
    if id_columns:
        consolidated_df[id_columns] = consolidated_df[id_columns].fillna(-1).astype(int)

    # Save the consolidated DataFrame to a CSV
    output_path = os.path.join(data_directory, output_file)
    consolidated_df.to_csv(output_path, index=False)
    print(f"Consolidated player data saved to {output_path}")

    return consolidated_df

In [10]:
# Build the cross-season master list; the returned frame is shown as the cell output.
merge_player_ids(data_directory=data_directory, seasons=seasons)

Consolidated player data saved to Fantasy-Premier-League/data/master_player_list.csv


Unnamed: 0,First_Name,Last_Name,Unique_ID,24_id,22_id,23_id,20_id,21_id
0,Fábio,Ferreira Vieira,1,1,25,4,-1,-1
1,Gabriel,Fernando de Jesus,2,2,28,8,282,263
2,Gabriel,dos Santos Magalhães,3,3,16,5,-1,-1
3,Kai,Havertz,4,4,145,6,500,141
4,Karl,Hein,5,5,655,646,-1,532
...,...,...,...,...,...,...,...,...
1748,Ty,Barnett,1749,-1,-1,770,-1,-1
1749,Noha,Lemina,1750,-1,-1,800,-1,-1
1750,Wesley,Okoduwa,1751,-1,-1,832,-1,-1
1751,Fletcher,Holman,1752,-1,-1,840,-1,-1


In [14]:
# Look for players that may have been renamed between seasons: collect every
# row of the master list whose surname also appears on another row.
consolidated_file = "master_player_list.csv"
consolidated_path = os.path.join(data_directory, consolidated_file)
consolidated_df = pd.read_csv(consolidated_path)

# Surnames that occur more than once (rows without a surname are not counted).
surname_counts = consolidated_df["Last_Name"].value_counts()
shared_surnames = surname_counts.index[surname_counts > 1]
surname_groups = consolidated_df[consolidated_df["Last_Name"].isin(shared_surnames)]

if surname_groups.empty:
    print("No players with the same surnames found.")
else:
    # Written sorted by surname so candidates sit next to each other for manual review.
    output_path = os.path.join(data_directory, "players_with_same_surnames.csv")
    surname_groups.sort_values(by="Last_Name").to_csv(output_path, index=False)

In [None]:
# After some manual checking, we have some players to rename (this will have to include
# their folders for each season). We will use the newest names.