In [6]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from getpass import getuser

# Get the current user's name
user = getuser()

# Paths to the datasets
fpl_data_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.xlsx'
players_agg_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'

# Read the datasets
fpl_data = pd.read_excel(fpl_data_path)
players_agg = pd.read_excel(players_agg_path)

# Filter for the 2016-2017 season
fpl_2016_2017 = fpl_data[fpl_data['Season'] == '2016-2017']
players_agg_2016_2017 = players_agg[players_agg['Season'] == '2016-2017']

# Filter out players from fpl_data where all columns except 'Player' and 'Season' are zero
fpl_2016_2017 = fpl_2016_2017[(fpl_2016_2017.drop(columns=['Player', 'Season']) != 0).any(axis=1)]

# Function to get the best fuzzy match for a player name
def get_best_match(player_name, player_list):
    result = process.extractOne(player_name, player_list, scorer=fuzz.token_sort_ratio)
    if result:
        match, score = result
        return match, score
    else:
        return None, None

# List to store the results
matched_players = []

# Copy the player list from players_agg_2016_2017 to keep track of unmatched players
unmatched_players = players_agg_2016_2017['Player'].tolist()

# Iterate through each player in the fpl_2016_2017 dataset
for player in fpl_2016_2017['Player']:
    best_match, score = get_best_match(player, unmatched_players)
    if best_match and score:  # Ensure we have a valid match and score
        matched_players.append((player, best_match, score))
        unmatched_players.remove(best_match)  # Remove matched player from the list

# Convert the matched players list to a DataFrame
matched_players_df = pd.DataFrame(matched_players, columns=['Player_FPL', 'Player_Agg', 'Score'])

# Save the matched players DataFrame to an Excel file
match_output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\matched_players_2016_2017.xlsx'
matched_players_df.to_excel(match_output_path, index=False)

# Merge the matched players with the original data to get a complete dataset
merged_fpl = fpl_2016_2017.merge(matched_players_df, left_on='Player', right_on='Player_FPL')
merged_data = merged_fpl.merge(players_agg_2016_2017, left_on='Player_Agg', right_on='Player', suffixes=('_FPL', '_Agg'))

# Drop unnecessary columns
merged_data.drop(columns=['Player_FPL', 'Player_Agg'], inplace=True)

# Save the merged dataset to an Excel file
output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\merged_fpl_data_2016_2017.xlsx'
merged_data.to_excel(output_path, index=False)

# Print out the matched player names for review
for index, row in matched_players_df.iterrows():
    print(f"FPL Player: {row['Player_FPL']} - Aggregated Player: {row['Player_Agg']} - Match Score: {row['Score']}")

print(f"Merged dataset saved as 'merged_fpl_data_2016_2017.xlsx'.")
print(f"Matched players dataset saved as 'matched_players_2016_2017.xlsx'.")


FPL Player: David Ospina - Aggregated Player: David Ospina - Match Score: 100
FPL Player: Petr Cech - Aggregated Player: Peter Crouch - Match Score: 76
FPL Player: Laurent Koscielny - Aggregated Player: Laurent Koscielny - Match Score: 100
FPL Player: Per Mertesacker - Aggregated Player: Per Mertesacker - Match Score: 100
FPL Player: Gabriel Armando de Abreu - Aggregated Player: Evandro Goebel - Match Score: 53
FPL Player: Héctor Bellerín - Aggregated Player: Héctor Bellerín - Match Score: 100
FPL Player: Kieran Gibbs - Aggregated Player: Kieran Gibbs - Match Score: 100
FPL Player: Mathieu Debuchy - Aggregated Player: Mathieu Debuchy - Match Score: 100
FPL Player: Carl Jenkinson - Aggregated Player: Carl Jenkinson - Match Score: 100
FPL Player: Nacho Monreal - Aggregated Player: Nacho Monreal - Match Score: 100
FPL Player: Calum Chambers - Aggregated Player: Calum Chambers - Match Score: 100
FPL Player: Alexis Sánchez - Aggregated Player: Alexis Sánchez - Match Score: 100
FPL Player: T