In [1]:
import os
import pandas as pd
from unidecode import unidecode
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Load the players_agg.xlsx file
players_agg_path = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
#players_agg_path = r'C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
players_agg_df = pd.read_excel(players_agg_path)

# Load the fpl_data.csv file
fpl_data_path = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.xlsx'
#fpl_data_path = r'C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.csv'
fpl_data_df = pd.read_excel(fpl_data_path)

# Clean the 'Player' column in players_agg_df
players_agg_df['Player'] = players_agg_df['Player'].apply(unidecode)

# Clean the 'Player' column in fpl_data_df
fpl_data_df['Player'] = fpl_data_df['Player'].apply(unidecode)

# Filter the players_agg_df for season 2016-2017 onwards
players_agg_df = players_agg_df[players_agg_df['Season'] >= '2016-2017']

# Extract surname and first_name from 'Player' column in both datasets
players_agg_df['surname'] = players_agg_df['Player'].apply(lambda name: name.split()[-1])
players_agg_df['first_name'] = players_agg_df['Player'].apply(lambda name: ' '.join(name.split()[:-1]))
fpl_data_df['surname'] = fpl_data_df['Player'].apply(lambda name: name.split()[-1])
fpl_data_df['first_name'] = fpl_data_df['Player'].apply(lambda name: ' '.join(name.split()[:-1]))

# Perform an outer join on 'Player' and 'Season' variables
merged_df = pd.merge(players_agg_df, fpl_data_df, on=['Player', 'Season'], how='outer')

# Unmatched observations based on surname_x and surname_y
unmatched_left = merged_df[merged_df['surname_x'].isnull()]
unmatched_right = merged_df[merged_df['surname_y'].isnull()]
matched = merged_df[(merged_df['surname_x'].notnull()) & (merged_df['surname_y'].notnull())]

# Print the number of observations in each DataFrame
print("Number of observations in Unmatched Left DataFrame:", unmatched_left.shape[0])
print("Number of observations in Unmatched Right DataFrame:", unmatched_right.shape[0])
print("Number of observations in Matched DataFrame:", matched.shape[0])

# Total number of observations in the merged DataFrame
total_observations = len(unmatched_left) + len(unmatched_right) + len(matched)
print("Total number of observations in the merged DataFrame:", total_observations)

# Export the merged DataFrame to an xlsx file
output_folder = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data'
output_path = os.path.join(output_folder, 'prematch_df.xlsx')
merged_df.to_excel(output_path, index=False)

Number of observations in Unmatched Left DataFrame: 1931
Number of observations in Unmatched Right DataFrame: 764
Number of observations in Matched DataFrame: 2928
Total number of observations in the merged DataFrame: 5623


In [None]:
# Create an empty DataFrame to store matched rows
matched_rows = pd.DataFrame()

# Create an empty list to keep track of matched players
matched_players = []

# Iterate over each row in unmatched_left
for index_left, row_left in unmatched_left.iterrows():
    max_sim_score = 0
    best_match_index = None

    # Compare the surname of the current row with surnames in unmatched_right
    for index_right, row_right in unmatched_right.iterrows():
        sim_score_surname = fuzz.token_set_ratio(row_left['surname_x'], row_right['surname_y'])

        # Check if the Season variable also matches
        if sim_score_surname >= 70 and row_left['Season'] == row_right['Season']:
            if sim_score_surname > max_sim_score and row_right['Player'] not in matched_players:
                max_sim_score = sim_score_surname
                best_match_index = index_right

    # If no match found with surname threshold 70, try with 50 for both surname and first_name
    if best_match_index is None:
        for index_right, row_right in unmatched_right.iterrows():
            sim_score_surname = fuzz.token_set_ratio(row_left['surname_x'], row_right['surname_y'])
            sim_score_first_name = fuzz.token_set_ratio(row_left['first_name_x'], row_right['first_name_y'])

            # Check if the Season variable also matches
            if (
                sim_score_surname >= 50
                and sim_score_first_name >= 50
                and row_left['Season'] == row_right['Season']
                and row_right['Player'] not in matched_players
            ):
                if (sim_score_surname + sim_score_first_name) / 2 > max_sim_score:
                    max_sim_score = (sim_score_surname + sim_score_first_name) / 2
                    best_match_index = index_right

    # If a match is found, replace the 'Player' column with the matched player's name
    if best_match_index is not None:
        matched_player_left = unmatched_left.at[index_left, 'Player']
        matched_player_right = unmatched_right.at[best_match_index, 'Player']
        print(f"Matched players: '{matched_player_left}' with '{matched_player_right}'")
        
        # Update the list of matched players
        matched_players.append(matched_player_right)

        # Replace the 'Player' column in unmatched_left with the matched player's name
        unmatched_left.at[index_left, 'Player'] = matched_player_right

        # Also, append the matched rows to the matched_rows DataFrame
        matched_row = pd.concat([pd.DataFrame(row_left).T, pd.DataFrame(unmatched_right.loc[best_match_index]).T], ignore_index=True)
        matched_rows = pd.concat([matched_rows, matched_row], ignore_index=True)

# Print the number of matched and unmatched observations after the second round of merging
print("Number of matched observations after the second round of merging:", len(matched_rows))
print("Number of unmatched observations after the second round of merging:", len(merged_df) - len(matched_rows))


Matched players: 'Gabriel Armando de Abreu' with 'Adrian'
Matched players: 'Santiago Cazorla' with 'Dan Agyei'
Matched players: 'Serge Gnabry' with 'Dele Alli'
Matched players: 'Joel Campbell' with 'Joel Castro Pereira'
Matched players: 'Yaya Sanogo' with 'Santi Cazorla'
Matched players: 'Takuma Asano' with 'Ben Chilwell'
Matched players: 'Eunan O'Kane' with 'Lee Chung-yong'
Matched players: 'Shaun MacDonald' with 'Diego Costa'
Matched players: 'Emerson Hyndman' with 'Danny Drinkwater'
Matched players: 'Tokelo Rantie' with 'Ovie Ejaria'
Matched players: 'Tendayi Darikwa' with 'Bernardo Espinosa'
Matched players: 'Fredrik Ulvestad' with 'Fabio'
Matched players: 'Rouwen Hennings' with 'Fernandinho'
Matched players: 'Chris Long' with 'Fernando'
Matched players: 'Abdul Rahman Baba' with 'Zeki Fryers'
Matched players: 'Matt Miazga' with 'Idrissa Gana Gueye'
Matched players: 'Oscar dos Santos Emboaba Junior' with 'Aleix Garcia'
Matched players: 'Juan Cuadrado' with 'Brad Guzan'
Matched playe

Matched players: 'Jayson Molumby' with 'Jazz Richards'
Matched players: 'Nick Pope' with 'Romain Saiss'
Matched players: 'Jonathan Walters' with 'Carlos Sanchez'
Matched players: 'Nahki Wells' with 'Fabian Schar'
Matched players: 'Anders Lindegaard' with 'Andre Schurrle'
Matched players: 'Abd-Al-Ali Morakinyo Olaposi Koiki' with 'Jean Seri'
Matched players: 'Josh Benson' with 'Xande Silva'
Matched players: 'Anthony Driscoll-Glennon' with 'Caglar Soyuncu'
Matched players: 'Alex Smithies' with 'Denis Suarez'
Matched players: 'Matthew Connolly' with 'Isaac Success'
Matched players: 'Ashley Darel Jazz Richards' with 'Adama Traore'
Matched players: 'Greg Halford' with 'Ruben Vinagre'
Matched players: 'David Junior Hoilett' with 'Andre-Frank Zambo Anguissa'
Matched players: 'HA(c)ctor BellerAn' with 'Max Aarons'
Matched players: 'Laurent Koscielny' with 'Angelino'
Matched players: 'Konstantinos Mavropanos' with 'Borja Baston'
Matched players: 'Carl Jenkinson' with 'Muhamed Besic'
Matched pla