In [15]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from getpass import getuser

# Resolve the project's data directory under the current user's profile
user = getuser()
data_dir = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data'

# Input workbooks
fpl_data_path = rf'{data_dir}\fpl_data.xlsx'
players_agg_path = rf'{data_dir}\players_agg.xlsx'
team_stats_path = rf'{data_dir}\teams_stats.xlsx'

# Load each workbook into a DataFrame
fpl_data = pd.read_excel(fpl_data_path)
players_agg = pd.read_excel(players_agg_path)
team_stats = pd.read_excel(team_stats_path)

# Season labels '2016-2017' through '2022-2023'
seasons = [f'{start}-{start + 1}' for start in range(2016, 2023)]

# Per-season results are accumulated here: the merged goalkeeper rows and
# the fuzzy-match table that produced them
final_merged_data, all_matched_goalkeepers = [], []

# Function to get the best fuzzy match for a player name, with a threshold of 83
def get_best_match(player_name, player_list, threshold=83):
    """Return the entry of ``player_list`` that best matches ``player_name``.

    Candidates are scored with ``fuzz.token_sort_ratio`` through
    ``process.extractOne``.

    Args:
        player_name: Name to look up.
        player_list: Candidate names to search.
        threshold: Minimum score (inclusive) the best candidate must reach.

    Returns:
        ``(match, score)`` when the best candidate scores at least
        ``threshold``; otherwise ``(None, None)``.
    """
    # Non-string names (e.g. NaN read from a blank spreadsheet cell) cannot be
    # scored; treat them as "no match" instead of letting the string
    # preprocessing inside extractOne fail on them.
    if not isinstance(player_name, str):
        return None, None

    result = process.extractOne(player_name, player_list, scorer=fuzz.token_sort_ratio)
    if result is None or result[1] < threshold:
        return None, None

    # extractOne adds the key as a third element when the choices are
    # dict-like; always hand back exactly (match, score) so callers can
    # unpack two values on both the hit and the miss path.
    return result[0], result[1]

# Iterate through each season
for season in seasons:
    # FPL rows for this season, and players_agg rows for this season limited
    # to Pos == 'GK' with 'MP' between 10 and 38 inclusive
    # (NOTE(review): 'MP' is presumably matches played - confirm).
    fpl_season_data = fpl_data[fpl_data['Season'] == season]
    players_agg_season_data = players_agg[(players_agg['Season'] == season) &
                                          (players_agg['Pos'] == 'GK') &
                                          (players_agg['MP'].between(10, 38))]

    # Drop FPL rows in which every column other than 'Player', 'Season' and
    # 'selected_by_percent' equals zero
    fpl_stat_columns = fpl_season_data.drop(columns=['Player', 'Season', 'selected_by_percent'])
    fpl_season_data = fpl_season_data[(fpl_stat_columns != 0).any(axis=1)]

    # Keep only rows whose 'total_points' is at or above the season's
    # 60th percentile
    percentile_60 = fpl_season_data['total_points'].quantile(0.6)
    fpl_season_data = fpl_season_data[fpl_season_data['total_points'] >= percentile_60]

    # The candidate list is the same for every lookup in this season, so
    # build it once instead of once per goalkeeper
    fpl_player_names = fpl_season_data['Player'].tolist()

    # Fuzzy-match every players_agg goalkeeper name against the FPL names
    matched_players = []
    for player in players_agg_season_data['Player']:
        best_match, score = get_best_match(player, fpl_player_names)
        if best_match and score:  # Ensure we have a valid match and score
            matched_players.append((player, best_match, score))

    # Convert the matched players list to a DataFrame
    matched_players_df = pd.DataFrame(matched_players, columns=['Player_Agg', 'Player_FPL', 'Score'])

    # Store the matched goalkeepers for the season
    matched_players_df['Season'] = season
    all_matched_goalkeepers.append(matched_players_df)

    # Attach the match table to the FPL rows. The FPL name column is renamed
    # to 'Player_FPL' up front and the match table's 'Season' is left out, so
    # the join needs no suffixes and creates no 'Season_x'/'Season_y' columns.
    merged_fpl = fpl_season_data.rename(columns={'Player': 'Player_FPL'}).merge(
        matched_players_df.drop(columns='Season'), on='Player_FPL')

    # Attach the players_agg statistics. Renaming 'Player' to 'Player_Agg'
    # before merging keeps the '_FPL'/'_Agg' suffixes from being applied to a
    # 'Player' column and colliding with the existing 'Player_FPL' /
    # 'Player_Agg' labels (pandas 2.x raises MergeError on such a collision).
    # 'Season' is dropped on this side because both frames are already
    # filtered to the same season; the copy from the FPL rows stays
    # unsuffixed for later joins on 'Season'.
    agg_side = players_agg_season_data.drop(columns='Season').rename(columns={'Player': 'Player_Agg'})
    merged_data = merged_fpl.merge(agg_side, on='Player_Agg', suffixes=('_FPL', '_Agg'))

    # Append the season's merged data to the final list
    final_merged_data.append(merged_data)

# Concatenate all the season data into one final DataFrame
final_merged_data_df = pd.concat(final_merged_data, ignore_index=True)

# Concatenate all the matched goalkeepers data into one DataFrame
all_matched_goalkeepers_df = pd.concat(all_matched_goalkeepers, ignore_index=True)

# Merge the final goalkeeper dataset with team_stats; the left join keeps
# goalkeeper rows that have no matching (Season, Squad) row in team_stats
final_dataset = final_merged_data_df.merge(team_stats, on=['Season', 'Squad'], how='left')

# Remove the duplicate columns pandas marks with a '_x'/'_y' suffix, e.g.
# Season_x, Season_y. The pattern is anchored to the end of the name so that
# columns which merely contain '_x' or '_y' somewhere inside are kept.
# NOTE(review): both copies of a suffixed pair are dropped, as before -
# confirm that neither copy is needed downstream.
has_merge_suffix = final_dataset.columns.str.contains(r'_[xy]$', regex=True)
final_dataset = final_dataset.loc[:, ~has_merge_suffix]

# Save the final merged dataset to an Excel file
output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\goalkeeper_dataset.xlsx'
final_dataset.to_excel(output_path, index=False)

# Save the list of all matched goalkeepers to a separate Excel file
matched_goalkeepers_output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\matched_goalkeepers.xlsx'
all_matched_goalkeepers_df.to_excel(matched_goalkeepers_output_path, index=False)

print("Final goalkeeper dataset saved as 'goalkeeper_dataset.xlsx'.")
print("List of all matched goalkeepers saved as 'matched_goalkeepers.xlsx'.")


Final goalkeeper dataset saved as 'goalkeeper_dataset.xlsx'.
List of all matched goalkeepers saved as 'matched_goalkeepers.xlsx'.
