# Path

For players:
* Find player names that are not already in the database
* Try to match names across data sources using fuzzy matching
* For any that do not match, manually fix names

For clubs:
* Find club names that are not already in the database
* Manually match club names and add correct ids
    * This step could be sped up with fuzzy matching, as is already done for players

In [32]:
import numpy as np
import pandas as pd

# Season exports from the two data sources (futbin and understat).
futbin, ustat = (
    pd.read_csv(path)
    for path in ("csv files/futbin_2020.csv", "csv files/ustat_2020.csv")
)

# Repair tables that are read here and appended to further down.
fix_club_ids, fix_name_ids = (
    pd.read_csv(path)
    for path in (
        "data repair csvs/fix_club_ids.csv",
        "data repair csvs/fix_name_ids.csv",
    )
)

In [33]:
# Give the futbin columns the same names that the understat data uses
# (player_name / team_title).
futbin = futbin.rename(
    {"name": "player_name", "club": "team_title"},
    axis="columns",
)

In [34]:
# team_title can hold several comma-separated clubs in one cell: split it into
# a list and give every club its own row, then renumber the rows from 0.
ustat["team_title"] = ustat["team_title"].str.split(",")
ustat = ustat.explode("team_title").reset_index(drop=True)

## Match players to id
* Valid from FIFA 23 -> FIFA 22

In [18]:
# Players with several FIFA cards in a given year (i.e. transfers or
# upgrades/downgrades) appear more than once; keep only the first card per name.
futbin = futbin.drop_duplicates(subset=['player_name']).reset_index()

new_names = futbin['player_name']
new_ids = futbin['player_id'].tolist()
old_ids = fix_name_ids['id'].tolist()

# Set of the ids already on file, so each membership test is O(1) instead of
# a scan over the whole list.
known_ids = set(old_ids)

# Keep the (name, id) pairs from futbin whose id is not already in the database
unmatched = [
    (name, player_id)
    for name, player_id in zip(new_names, new_ids)
    if player_id not in known_ids
]
new_fb_names = [name for name, _ in unmatched]
new_fb_ids = [player_id for _, player_id in unmatched]

# Create dataframe with new/unmatched players (rows are numbered 0..n-1)
new_players = pd.DataFrame({
    'player_name': new_fb_names,
    'player_id': new_fb_ids,
})

In [19]:
from rapidfuzz import process, fuzz, utils

def getMatchScore(futbin_df, understat_df, player_ids):
    """Find the best fuzzy match among the understat names for each futbin name.

    Args:
        futbin_df: Iterable of futbin player names (for example a Series).
        understat_df: Collection of candidate understat names, passed to
            rapidfuzz as the choices to search.
        player_ids: Iterable of futbin player ids, paired with ``futbin_df``
            by position.

    Returns:
        A list with one ``[player_id, futbin_name, ustat_name, score]`` row per
        futbin name. When rapidfuzz returns no match (for example because the
        candidate collection is empty) ``ustat_name`` is ``None`` and
        ``score`` is ``0``.
    """
    res_strings = []
    # Pair ids and names by position rather than by label, so Series that do
    # not carry a 0..n-1 index (e.g. a filtered frame) are handled as well.
    for player_id, name in zip(player_ids, futbin_df):
        # Finds best possible match for each player name
        cur = process.extract(
            name,
            understat_df,
            scorer=fuzz.token_set_ratio,
            limit=1,
            processor=utils.default_process,
        )
        if cur:
            # Each hit starts with (matched choice, score).
            best_name, score = cur[0][0], cur[0][1]
        else:
            # Nothing to match against: emit a placeholder row instead of
            # failing on cur[0], so the player still shows up for manual review.
            best_name, score = None, 0
        res_strings.append([player_id, name, best_name, score])

    return res_strings

In [20]:
# Candidate pool for matching: every understat name already on file plus every
# name in the current understat export, without duplicates.
names_on_file = set(fix_name_ids.ustat_name)
names_this_season = set(ustat.player_name)
tot_ustat = list(names_on_file | names_this_season)

# Score each new futbin player against the candidate pool
match_rows = getMatchScore(
    new_players.player_name,
    tot_ustat,
    new_players.player_id,
)
finds = pd.DataFrame(
    match_rows,
    columns=['id', 'futbin_name', 'ustat_name', 'score'],
)

In [21]:
# Export to a temporary csv for manual data cleaning. Rows are sorted by score
# in ascending order, so the lowest-scoring matches come first.
finds_by_score = finds.sort_values(by='score')
finds_by_score.to_csv("data repair csvs/check_names.csv", index=False)

In [22]:
# Read the check file back in (presumably after it has been corrected by hand)
# and append its rows to the existing list of players.
temp = pd.read_csv("data repair csvs/check_names.csv")
unpack_nam = pd.concat((fix_name_ids, temp), sort=False, ignore_index=True)

# Overwrite the player repair file with the combined table
unpack_nam.to_csv("data repair csvs/fix_name_ids.csv", index=False)

## Match club to id
* Valid from FIFA 23 -> FIFA 22

In [23]:
# Display the club id table loaded from "data repair csvs/fix_club_ids.csv"
fix_club_ids

Unnamed: 0,fb_team,futbin_id,us_team,db_id
0,Toulouse FC,1809.0,Toulouse,1.0
1,1. FC Köln,31.0,FC Cologne,2.0
2,1. FSV Mainz 05,169.0,Mainz 05,3.0
3,AC Ajaccio,614.0,Ajaccio,4.0
4,AC Monza,111811.0,Monza,5.0
...,...,...,...,...
125,Roma,52.0,Roma,73.0
126,Brescia,190.0,Brescia,122.0
127,SPAL,112791.0,SPAL 2013,123.0
128,Amiens SC,1816.0,Amiens,124.0


In [28]:
# Map every futbin club name to a club id. When a club appears on several
# rows, the id from the first of those rows is the one that is kept.

db_id_dict = {}

fb_t = futbin.team_title
idlist = futbin.club_id

for team_name, team_id in zip(fb_t.to_numpy(), idlist.to_numpy()):
    # setdefault only stores the id the first time a club name is seen
    db_id_dict.setdefault(team_name, team_id)

db_id_dict

{'Southampton': 17,
 'Aston Villa': 2,
 'Everton': 7,
 'Manchester City': 10,
 'Watford': 1795,
 'Manchester United': 11,
 'Liverpool': 9,
 'Brighton & Hove Albion': 1808,
 'Chelsea': 5,
 'Sheffield United': 1794,
 'Newcastle United': 13,
 'West Ham United': 19,
 'Norwich City': 1792,
 'Leicester City': 95,
 'Arsenal': 1,
 'Tottenham Hotspur': 18,
 'Burnley': 1796,
 'Bournemouth': 1943,
 'Crystal Palace': 1799,
 'Wolverhampton Wanderers': 110,
 'Granada CF': 110832,
 'R. Valladolid CF': 462,
 'Sevilla FC': 481,
 'FC Barcelona': 241,
 'CD Leganés': 100888,
 'Getafe CF': 1860,
 'Real Betis': 449,
 'SD Eibar': 467,
 'D. Alavés': 463,
 'Valencia CF': 461,
 'CA Osasuna': 479,
 'RC Celta': 450,
 'Levante UD': 1853,
 'Real Sociedad': 457,
 'RCD Espanyol': 452,
 'RCD Mallorca': 453,
 'Villarreal CF': 483,
 'Real Madrid': 243,
 'Athletic Club': 448,
 'Atlético Madrid': 240,
 'Bologna': 189,
 'Lazio': 46,
 'Roma': 52,
 'Napoli': 48,
 'Hellas Verona': 206,
 'Parma': 50,
 'Fiorentina': 110374,
 'C

In [31]:
# Get all of the names and ids from futbin along with the names and ids already in the database
clubs = list(db_id_dict)
new_ids = list(db_id_dict.values())
old_ids = fix_club_ids['futbin_id'].tolist()
old_ustat = fix_club_ids['us_team']
old_fbin = fix_club_ids['fb_team']

# `x in some_series` tests the Series *index labels*, not its values, so the
# membership checks below go through sets built from the column values.
known_fb_ids = set(old_ids)
known_fb_names = set(old_fbin)

# Find only the new names fetched from understat
new_ustat_name = list(set(ustat.team_title.unique()).difference(old_ustat))
new_fb_name = []
new_fb_id = []

# A futbin club counts as new when either its id or its name is not on file
# yet (the name check also catches clubs that were renamed).
for club_name, club_id in db_id_dict.items():
    if int(club_id) not in known_fb_ids or club_name not in known_fb_names:
        new_fb_name.append(club_name)
        new_fb_id.append(club_id)

# The futbin and understat lists are independent and can differ in length in
# either direction. Pad the shorter side so all columns have the same number of
# rows: 'None' for a missing name, NaN for a missing futbin id. The rows are
# not paired up here; that is left to the manual repair step.
row_count = max(len(new_fb_name), len(new_ustat_name))
new_ustat_name.extend(['None'] * (row_count - len(new_ustat_name)))
new_fb_name.extend(['None'] * (row_count - len(new_fb_name)))
new_fb_id.extend([np.nan] * (row_count - len(new_fb_id)))

new_clubs = pd.DataFrame({
    'fb_team': new_fb_name,
    'futbin_id': new_fb_id,
    'us_team': new_ustat_name,
    # 0 is written for every new row as the db_id
    'db_id': [0] * row_count,
})

# Combine the old club ids csv and the new clubs, then export for manual repair
fix_club_ids = pd.concat([fix_club_ids, new_clubs], ignore_index=True, sort=False)
fix_club_ids.to_csv('data repair csvs/fix_club_ids.csv', index=False)

97
0             Toulouse FC
1              1. FC Köln
2         1. FSV Mainz 05
3              AC Ajaccio
4                AC Monza
              ...        
125                  Roma
126               Brescia
127                  SPAL
128             Amiens SC
129    Fortuna Düsseldorf
Name: fb_team, Length: 130, dtype: object
Southampton
Aston Villa
Everton
Manchester City
Watford
Manchester United
Liverpool
Brighton & Hove Albion
Chelsea
Sheffield United
Newcastle United
West Ham United
Norwich City
Leicester City
Arsenal
Tottenham Hotspur
Burnley
Bournemouth
Crystal Palace
Wolverhampton Wanderers
Granada CF
R. Valladolid CF
Sevilla FC
FC Barcelona
CD Leganés
Getafe CF
Real Betis
SD Eibar
D. Alavés
Valencia CF
CA Osasuna
RC Celta
Levante UD
Real Sociedad
RCD Espanyol
RCD Mallorca
Villarreal CF
Real Madrid
Athletic Club
Atlético Madrid
Bologna
Lazio
Roma
Napoli
Hellas Verona
Parma
Fiorentina
Cagliari
Atalanta
Genoa
Lecce
Brescia
Piemonte Calcio
Sampdoria
Milan
Udinese
Torino
Inter
S