In [1]:
import os
import pandas as pd
from unidecode import unidecode
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:


# Load the players_agg.xlsx file
players_agg_path = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
#players_agg_path = r'C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
players_agg_df = pd.read_excel(players_agg_path)

# Load the fpl_data.csv file
fpl_data_path = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.xlsx'
#fpl_data_path = r'C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.csv'
fpl_data_df = pd.read_excel(fpl_data_path)

# Clean the 'Player' column in players_agg_df
players_agg_df['Player'] = players_agg_df['Player'].apply(unidecode)

# Clean the 'Player' column in fpl_data_df
fpl_data_df['Player'] = fpl_data_df['Player'].apply(unidecode)

# Filter the players_agg_df for season 2016-2017 onwards
players_agg_df = players_agg_df[players_agg_df['Season'] >= '2016-2017']

# Extract surname and first_name from 'Player' column in both datasets
players_agg_df['surname'] = players_agg_df['Player'].apply(lambda name: name.split()[-1])
players_agg_df['first_name'] = players_agg_df['Player'].apply(lambda name: ' '.join(name.split()[:-1]))
fpl_data_df['surname'] = fpl_data_df['Player'].apply(lambda name: name.split()[-1])
fpl_data_df['first_name'] = fpl_data_df['Player'].apply(lambda name: ' '.join(name.split()[:-1]))

# Perform an outer join on 'Player' and 'Season' variables
merged_df = pd.merge(players_agg_df, fpl_data_df, on=['Player', 'Season'], how='outer')


In [3]:
# Filter the merged_df for the season 2016-2017
filtered_df = merged_df[merged_df['Season'] >= '2016-2017'].copy()

In [4]:
# Clean some dirty characters
filtered_df['Player'] = filtered_df['Player'].str.replace('A\(c\)','e')
filtered_df['Player'] = filtered_df['Player'].str.replace('A\(r\)','i')
filtered_df['Player'] = filtered_df['Player'].str.replace('A 1/4 ','u')
filtered_df['Player'] = filtered_df['Player'].str.replace('A-','i')
filtered_df['Player'] = filtered_df['Player'].str.replace('A\+-','n')
filtered_df['Player'] = filtered_df['Player'].str.replace('A,','o')
filtered_df['Player'] = filtered_df['Player'].str.replace('A!','a')

In [5]:
# All unique player names
names = list(filtered_df['Player'].unique())

In [6]:
# Build a dictionary with all names matched by the algorithm for each name in the list
THRESHOLD = 0.7
matches = dict()
for name in names:
    results = process.extractBests(name, names, scorer=fuzz.token_set_ratio,
                                   score_cutoff=THRESHOLD)
    matches[name] = []
    for result in results:
        if result[0] == name:
            continue
        if result[1] > 80:
            matches[name].append(result)

In [7]:
# Empty dict that will store for each name:
# - The name to which it maps to (e.g. 'Aleix Garcia Serrano' maps to 'Aleix Garcia')
# - The name(s) that maps to it (e.g. 'Aleix Garcia' is mapped by 'Aleix Garcia')
# - The name(s) that are a false match (e.g. 'Alexis Sanchez' is ignored by 'Aleix Garcia')
matches_final = dict()

In [8]:
# For every unique name build an empty dict that will store the data
for name in matches.keys():
    
    # If the algorithm didn't find any possible match is skipped
    if len(matches[name]) == 0:
        continue
    
    # Empty dict for the name
    matches_final[name] = {
        'Mapped_by': set(),  # If exists, a set of name(s) that refer to this name
        'Maps_to': '',       # If exists, a name to which this one should refer to
        'Ignores': set()     # If exists, a set of bad matches that should be ignored
        }

In [None]:
### For every name that had at least a match by the algorithm...
for name in matches_final.keys(): 
    
    # -------------------- MULTI-NAME CHECK SECTION --------------------
    # This part of the code checks wheter all the names that have a possible relationship refer
    # to the same player or not
    
    # Starting from a name (name) gathers all the name that could be related and stores them in the
    # names_checked list
    names_to_check = [name]
    names_checked = []
    while len(names_to_check) > 0:
        for el in matches[names_to_check[0]]:
            if el[0] not in names_checked:
                if el[0] not in names_to_check:
                    names_to_check.append(el[0])
        names_checked.append(names_to_check[0])
        names_to_check.pop(0)
    
    # Asks wheter all those names refer to the same player; if so, it asks to point which name is
    # the correct one and the data will be updated accordingly
    print('\nDo the following names all refer to the same player?')
    for el in names_checked:
        print(f' {names_checked.index(el)+1}. {el}')
    idx = input('If so, enter the number of the proper name, otherwise enter nothing: ')
    
    if idx.isdigit():
        
        idx = int(idx)-1
        
        if idx > (len(names_checked)-1) or idx < 0:
            raise IndexError('The entered number is not one of the choices')
        
        for name_checked in names_checked:
            
            if name_checked == names_checked[idx]:
                continue
                
            else:
                matches_final[name_checked]['Maps_to'] = names_checked[idx]
                matches_final[names_checked[idx]]['Mapped_by'].add(name_checked)
                
                for name_to_ignore in names_checked:
                    if name_to_ignore != names_checked[idx] and name_to_ignore != name_checked:
                        matches_final[name_checked]['Ignores'].add(name_to_ignore)
        
    elif len(idx) == 0:
        
        # If the names were just two and they do not refer to the same player, an ignore
        # relationship is set between the two so they will not be analized in the name-to-name
        # section
        if len(names_checked) == 2:
            matches_final[names_checked[0]]['Ignores'].add(names_checked[1])
            matches_final[names_checked[1]]['Ignores'].add(names_checked[0])
        continue
    
    else:
        raise ValueError('The entered value is invalid')
        
    # -------------------- NAME-TO-NAME MATCH SECTION --------------------
    # For names that still have unresolved matches, check them
            
    for el in matches[name]:
        
        # Skip the matched name if it has already a relationship with the key name ('name' variable)
        if el[0] in matches_final[name]['Mapped_by'] or el[0] == matches_final[name]['Maps_to']\
            or el[0] in matches_final[name]['Ignores']:
                continue
        
        # Asks if the two names matches
        check = input(f'Is {el[0]} a good match for {name} ([y]/n)? ' )
        
        # If so, it asks to choos which of the two should be considered the proper one and update
        # the data accordingly
        if (len(check) == 0) or (check == 'y'):
            check = input(f'-> {el[0]} or {name} should be the proper name (1/2)? ' )
            
            if check == '1':
                matches_final[name]['Maps_to'] = el[0]
                matches_final[el[0]]['Mapped_by'].add(name)
                
            elif check == '2':
                matches_final[el[0]]['Maps_to'] = name
                matches_final[name]['Mapped_by'].add(el[0])
                
            else:
                raise ValueError(f'{check} is an invalid answer')
                
        # If they do not match an ignore relationship is set between the two
        elif check == 'n':
            matches_final[name]['Ignores'].add(el[0])
            matches_final[el[0]]['Ignores'].add(name)
        
        else:
            raise ValueError(f'{check} is an invalid answer')


Do the following names all refer to the same player?
 1. Adrian
 2. Adrian Mariappa
 3. Adrian San Miguel del Castillo
 4. Adrian Bernabe
 5. Adrian Blake
If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Dan Agyei
 2. Daniel Agyei
If so, enter the number of the proper name, otherwise enter nothing: 1

Do the following names all refer to the same player?
 1. Dele Alli
 2. Bamidele Alli
If so, enter the number of the proper name, otherwise enter nothing: 1

Do the following names all refer to the same player?
 1. Harry Arter
 2. Harry Tyrer
If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Charlie Austin
 2. Charlie Patino
If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Antonio Barragan
 2. Antonio Barreca
If so, enter the number of the proper name, othe

If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Glen Johnson
 2. Ben Johnson
 3. Brennan Johnson
If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Kenedy
 2. Robert Kenedy Nunes do Nascimento
If so, enter the number of the proper name, otherwise enter nothing: 1

Do the following names all refer to the same player?
 1. Bojan Krkic
 2. Bojan Krkic Perez
If so, enter the number of the proper name, otherwise enter nothing: 1

Do the following names all refer to the same player?
 1. Lucas Leiva
 2. Leiva Lucas
If so, enter the number of the proper name, otherwise enter nothing: 1

Do the following names all refer to the same player?
 1. Fernando Llorente
 2. Fernando
 3. Fernando Marcal
 4. Fernando Luiz Rosa
 5. Fernando Francisco Reges
 6. Fernando MarASSal
If so, enter the number of the proper name, otherwise enter nothing: 

Do th


Do the following names all refer to the same player?
 1. Jonathan Walters
 2. Jonathan Calleri
If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Willian
 2. Willian Jose
 3. Willian Borges Da Silva
 4. Willian Jose Da Silva
 5. Willian Borges da Silva
 6. Joe Williams
If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Harry Winks
 2. Harry Lewis
If so, enter the number of the proper name, otherwise enter nothing: 

Do the following names all refer to the same player?
 1. Matt Worthington
 2. Matthew Worthington
If so, enter the number of the proper name, otherwise enter nothing: 1

Do the following names all refer to the same player?
 1. Muhamed Besic
 2. Muhamed BeaiA
If so, enter the number of the proper name, otherwise enter nothing: 1

Do the following names all refer to the same player?
 1. Gaetan Bong
 2. GaA<<tan Bong
If so, 