# Path

For players:
* Find player names that are not already in the database
* Try to match names across data sources using fuzzy matching
* For any names that still do not match, fix them manually

For clubs:
* Find club names that are not already in the database
* Manually match club names and add correct ids
    * Fuzzy matching (as already used for players) could save time here

In [17]:
import numpy as np
import pandas as pd

# Raw 2022 exports: futbin (FIFA card data) and ustat (understat data)
futbin = pd.read_csv("csv files/futbin_2022.csv")
ustat = pd.read_csv("csv files/ustat_2022.csv")

# Lookup tables maintained by this notebook; cells further down append to
# them and write them back to these same paths
fix_club_ids = pd.read_csv("data repair csvs/fix_club_ids.csv")
fix_name_ids = pd.read_csv("data repair csvs/fix_name_ids.csv")

In [18]:
# Use the same column names as the ustat frame (player_name, team_title)
futbin = futbin.rename(columns={"name": "player_name", "club": "team_title"})

In [19]:
# team_title may list several comma-separated clubs; give each club its own row
ustat = (
    ustat.assign(team_title=ustat['team_title'].str.split(','))
    .explode('team_title')
    .reset_index(drop=True)
)

## Match players to id
* Valid from FIFA 23 -> FIFA 22

In [28]:
# Remove players that have multiple FIFA cards in a given year (i.e. transfers or upgrades/downgrades)
# drop=True stops the old row labels from being inserted as a stray 'index'
# column, which also keeps this cell safe to run more than once.
futbin = futbin.drop_duplicates(subset=['player_name']).reset_index(drop=True)

# Flag futbin players whose id is not already in the database
is_new_player = ~futbin['player_id'].isin(fix_name_ids['id'])

# Create dataframe with new/unmatched players (fresh 0..n-1 index)
new_players = (
    futbin.loc[is_new_player, ['player_name', 'player_id']]
    .reset_index(drop=True)
)

Unnamed: 0,player_name,player_id
0,Matt Turner,233267
1,Brandon Williams,243414
2,Imran Louza,242794
3,Max Aarons,240060
4,Frank Onyeka,239529
...,...,...
326,Kevin-Prince Boateng,173909
327,Dan-Axel Zagadou,238476
328,Marwin Hitz,190243
329,Martin Hinteregger,201922


In [21]:
from rapidfuzz import process, fuzz, utils

def getMatchScore(futbin_df: pd.Series, understat_df: list, player_ids: pd.Series):
    """Find the closest understat name for every futbin player name.

    Parameters
    ----------
    futbin_df
        Futbin player names to look up (any iterable of strings; the notebook
        passes a Series).
    understat_df
        Candidate understat names to match against.
    player_ids
        Futbin player ids, aligned position-by-position with ``futbin_df``.

    Returns
    -------
    list of [player_id, futbin_name, understat_name, score]
        One entry per futbin name. ``understat_name`` is ``None`` and
        ``score`` is ``0.0`` when there are no candidates to match against.
    """
    res_strings = []
    # Pair ids and names by position so the result does not depend on the
    # inputs carrying a 0..n-1 index.
    for player_id, futbin_name in zip(player_ids, futbin_df):
        # Finds best possible match for each player name
        cur = process.extract(
            futbin_name,
            understat_df,
            scorer=fuzz.token_set_ratio,
            limit=1,
            processor=utils.default_process,
        )
        if cur:
            best_name, best_score = cur[0][0], cur[0][1]
        else:
            # No candidates were supplied, so there is nothing to match
            best_name, best_score = None, 0.0
        # Create result containing futbin_id, futbin player name, understat player name, and score generated from function
        res_strings.append([player_id, futbin_name, best_name, best_score])

    return res_strings

In [22]:
# Pool every understat name seen so far (already-fixed names plus the current
# ustat data) as the candidate list for the new players
previously_matched = set(fix_name_ids['ustat_name'])
current_ustat = set(ustat['player_name'])
tot_ustat = list(previously_matched | current_ustat)

# Score each new futbin name against that pool and tabulate the best matches
match_rows = getMatchScore(new_players['player_name'], tot_ustat, new_players['player_id'])
finds = pd.DataFrame(match_rows, columns=['id', 'futbin_name', 'ustat_name', 'score'])

Unnamed: 0,id,futbin_name,ustat_name,score
317,104389,Rune Almenning Jarstein,Kevin-Prince Boateng,55.813953
27,169697,Darren Randolph,Dan Ndoye,58.333333
277,230043,Faitout Maouassa,Moussa Doumbia,60.000000
267,230727,Gautier Larsonneur,Serge Aurier,60.000000
32,204529,Michy Batshuayi,Joshua Kimmich,62.068966
...,...,...,...,...
133,227508,Melero,Gonzalo Melero,100.000000
132,228302,Alfonso Pedraza Sag,Alfonso,100.000000
131,224574,Orbelín Pineda,Orbelín Pineda,100.000000
138,204876,Carlos Clerc Martínez,Carlos Clerc,100.000000


In [8]:
# Export to a temporary csv for manual data cleaning, lowest scores first
finds_by_score = finds.sort_values(by='score')
finds_by_score.to_csv("data repair csvs/check_names.csv", index=False)

In [27]:
# Append the cleaned data to the existing list of players
temp = pd.read_csv("data repair csvs/check_names.csv")
unpack_nam = pd.concat([fix_name_ids, temp], ignore_index=True, sort=False)
# If fix_name_ids already holds these rows (e.g. this cell is run again after
# the file was reloaded), drop the exact repeats instead of saving them twice
unpack_nam = unpack_nam.drop_duplicates(ignore_index=True)
unpack_nam.to_csv("data repair csvs/fix_name_ids.csv", index=False)

## Match club to id
* Valid from FIFA 23 -> FIFA 22

In [101]:
# Display the current club id mapping table for inspection
fix_club_ids

Unnamed: 0,fb_team,futbin_id,us_team,db_id
0,Toulouse FC,1809,Toulouse,1
1,1. FC Köln,31,FC Cologne,2
2,1. FSV Mainz 05,169,Mainz 05,3
3,AC Ajaccio,614,Ajaccio,4
4,AC Monza,111811,Monza,5
...,...,...,...,...
93,VfL Wolfsburg,175,Wolfsburg,94
94,Villarreal CF,483,Villarreal,95
95,Werder Bremen,38,Werder Bremen,96
96,West Ham,19,West Ham,97


In [130]:
# Creates a dictionary mapping each futbin club name to its futbin club id

db_id_dict = {}

fb_t = futbin.team_title
idlist = futbin.club_id

# Walk names and ids together by position; the first id seen for a club wins
for club_name, club_id in zip(fb_t, idlist):
    db_id_dict.setdefault(club_name, club_id)

In [29]:
# Get all of the names and ids from futbin along with the names and ids already in the database
clubs = list(db_id_dict)
new_ids = list(db_id_dict.values())
old_ids = set(fix_club_ids['futbin_id'].tolist())  # set for fast membership tests
old_ustat = fix_club_ids['us_team']

# Find only the new names fetched from understat
# (sorted so the exported rows come out in the same order on every run)
new_ustat_name = sorted(set(ustat.team_title.unique()).difference(old_ustat), key=str)
new_fb_name = []
new_fb_id = []

# If the team is new, add their name and id to the club id file
for club_name, club_id in db_id_dict.items():
    if int(club_id) not in old_ids:
        new_fb_name.append(club_name)
        new_fb_id.append(club_id)

# The futbin and understat lists rarely have the same length. Pad whichever
# side is shorter with placeholders ('None' for names, 0 for ids) so every
# column has one value per row; the placeholders are replaced during the
# manual repair step.
n_rows = max(len(new_fb_name), len(new_ustat_name))
new_fb_id += [0] * (n_rows - len(new_fb_id))
new_fb_name += ['None'] * (n_rows - len(new_fb_name))
new_ustat_name += ['None'] * (n_rows - len(new_ustat_name))

new_clubs = pd.DataFrame({
    'fb_team': new_fb_name,
    'futbin_id': new_fb_id,
    'us_team': new_ustat_name,
    'db_id': [0] * n_rows,  # database ids are filled in by hand afterwards
})

# Combine the old club ids csv and the new clubs, then export for manual repair
fix_club_ids = pd.concat([fix_club_ids, new_clubs], ignore_index=True, sort=False)
fix_club_ids.to_csv('data repair csvs/fix_club_ids.csv', index=False)

NameError: name 'db_id_dict' is not defined