Use the [Faker](https://faker.readthedocs.io/en/master/) package to build a dataframe <code>namesWithInfo</code> of names annotated with gender, first-vs-last-name (<code>firstVsLast</code>) and locality information.

In [None]:
!pip install Faker

In [None]:
import os

import pandas as pd
from faker import Faker #we use the package faker to generate names with gender, firstVsLast and locality information
#set the seed for reproducibility; the seed is set on the Faker class itself, not on one instance
Faker.seed(123)

In [None]:
# Correspondence between language codes and language names.
# (Taken from the googletrans package; "dk" was added for Danish and "tw" for Twi, a Ghanaian dialect.)
LANGUAGES = {
    "af": "afrikaans", "am": "amharic", "ar": "arabic", "az": "azerbaijani",
    "be": "belarusian", "bg": "bulgarian", "bn": "bengali", "bs": "bosnian",
    "ca": "catalan", "ceb": "cebuano", "co": "corsican", "cs": "czech",
    "cy": "welsh", "dk": "danish", "da": "danish", "de": "german",
    "el": "greek", "en": "english", "eo": "esperanto", "es": "spanish",
    "et": "estonian", "eu": "basque", "fa": "persian", "fi": "finnish",
    "fr": "french", "fy": "frisian", "ga": "irish", "gd": "scots gaelic",
    "gl": "galician", "gu": "gujarati", "ha": "hausa", "haw": "hawaiian",
    "he": "hebrew", "hi": "hindi", "hmn": "hmong", "hr": "croatian",
    "ht": "haitian creole", "hu": "hungarian", "hy": "armenian", "id": "indonesian",
    "ig": "igbo", "is": "icelandic", "it": "italian", "iw": "hebrew",
    "ja": "japanese", "jw": "javanese", "ka": "georgian", "kk": "kazakh",
    "km": "khmer", "kn": "kannada", "ko": "korean", "ku": "kurdish (kurmanji)",
    "ky": "kyrgyz", "la": "latin", "lb": "luxembourgish", "lo": "lao",
    "lt": "lithuanian", "lv": "latvian", "mg": "malagasy", "mi": "maori",
    "mk": "macedonian", "ml": "malayalam", "mn": "mongolian", "mr": "marathi",
    "ms": "malay", "mt": "maltese", "my": "myanmar (burmese)", "ne": "nepali",
    "nl": "dutch", "no": "norwegian", "ny": "chichewa", "or": "odia",
    "pa": "punjabi", "pl": "polish", "ps": "pashto", "pt": "portuguese",
    "ro": "romanian", "ru": "russian", "sd": "sindhi", "si": "sinhala",
    "sk": "slovak", "sl": "slovenian", "sm": "samoan", "sn": "shona",
    "so": "somali", "sq": "albanian", "sr": "serbian", "st": "sesotho",
    "su": "sundanese", "sv": "swedish", "sw": "swahili", "ta": "tamil",
    "te": "telugu", "tg": "tajik", "th": "thai", "tw": "twi (ghana dialect)",
    "tl": "filipino", "tr": "turkish", "ug": "uyghur", "uk": "ukrainian",
    "ur": "urdu", "uz": "uzbek", "vi": "vietnamese", "xh": "xhosa",
    "yi": "yiddish", "yo": "yoruba", "zh-cn": "chinese (simplified)",
    "zh-tw": "chinese (traditional)", "zu": "zulu",
}

# Hard-coded list of valid Faker locale codes used to produce names from different localities.
# It was obtained after some side preprocessing in Excel (removing inactive locales, ...).
# (tw_GH stands for Twi, Ghanaian.)
FAKERLOCALS = {
    "ar_AA": "Arabic (Egypt)", "ar_PS": "Arabic (Palestine)", "ar_SA": "Arabic (Saudi Arabia)",
    "bg_BG": "Bulgarian", "cs_CZ": "Czech", "de_AT": "German (Austria)",
    "de_CH": "German (Switzerland)", "de_DE": "German", "dk_DK": "Danish",
    "el_GR": "Greek", "en_AU": "English (Australia)", "en_CA": "English (Canada)",
    "en_GB": "English (Great Britain)", "en_IN": "English (India)", "en_NZ": "English (New Zealand)",
    "en_US": "English (United State)", "es_ES": "Spanish (Spain)", "es_MX": "Spanish (Mexico)",
    "et_EE": "Estonian", "fa_IR": "Persian (Iran)", "fi_FI": "Finnish",
    "fr_CH": "French (Switzerland)", "fr_FR": "French", "fr_QC": "French (Quebec)",
    "he_IL": "Hebrew (Israel)", "hi_IN": "Hindi", "hr_HR": "Croatian",
    "hu_HU": "Hungarian", "hy_AM": "Armenian", "id_ID": "Indonesia",
    "it_IT": "Italian", "ja_JP": "Japanese", "ka_GE": "Georgian (Georgia)",
    "ko_KR": "Korean", "lt_LT": "Lithuanian", "lv_LV": "Latvian",
    "ne_NP": "Nepali", "nl_NL": "Dutch (Netherlands)", "no_NO": "Norwegian",
    "pl_PL": "Polish", "pt_BR": "Portuguese (Brazil)", "pt_PT": "Portuguese (Portugal)",
    "ro_RO": "Romanian", "ru_RU": "Russian", "sl_SI": "Slovene",
    "sv_SE": "Swedish", "ta_IN": "Tamil (India)", "th_TH": "Thai (Thailand)",
    "tr_TR": "Turkish", "tw_GH": "Twi (Ghana)", "uk_UA": "Ukrainian",
    "zh_CN": "Chinese (China)", "zh_TW": "Chinese (Taiwan)",
}

In [None]:
#we use Faker to generate fake names, and stop when all possible names have been generated.
#Unfortunately it was not possible to get a full list of names for every locale programmatically,
#so we construct a dictionary of names using different Faker generators for locality, gender, and firstVsLast

#note that this step is packaged in the function produceNamesWithInfoDataset in utils.py

ITER_MAX = 2000000   #hard cap on the number of draws per generator
PATIENCE = 4         #number of checks without any new name after which we stop drawing
CHECK_EVERY = 5000   #number of draws between two novelty checks


def collectUniqueNames(generator, iterMax=ITER_MAX, patience=PATIENCE, checkEvery=CHECK_EVERY):
    """Draw names from `generator` until no new ones show up and return them sorted.

    Parameters
    ----------
    generator : callable taking no argument and returning a name as a string.
    iterMax : maximum number of calls to `generator`.
    patience : number of novelty checks that may find no new name before stopping.
        The counter is cumulative: it is never reset when a later check finds new names.
    checkEvery : number of draws between two novelty checks.

    Returns
    -------
    list of str : the distinct names seen, sorted so that the result does not
        depend on the iteration order of a set.
    """
    uniqueNames = set()
    nuniquePrev = 0
    countSame = 0
    iterNb = 0

    while countSame < patience and iterNb < iterMax:
        iterNb += 1
        #Faker will generate more names by making composite names (e.g. Smith-Doe), so split them
        uniqueNames.update(generator().split("-"))
        #check at regular intervals how many new names we are getting
        if iterNb % checkEvery == 0:
            nunique = len(uniqueNames)
            if nunique == nuniquePrev:
                countSame += 1
            nuniquePrev = nunique

    return sorted(uniqueNames)


namesDict = dict()

#loop over the different localities in Faker
for code in FAKERLOCALS:
    fake = Faker(code)
    #construct a dict of generators for different kind of names (female first, male first, last name)
    nameGenerators = dict(FNFemale=fake.first_name_female, FNMale=fake.first_name_male, LN=fake.last_name)

    #could not get a full list of the names Faker uses for every locality,
    #so for each kind of name we generate names until we do not get any novelty
    namesDict[code] = {kind: collectUniqueNames(generator) for kind, generator in nameGenerators.items()}

In [None]:
#correspondence between the keys of namesDict and the feature values stored for each name
correspGender = {"FNFemale": "Female", "FNMale": "Male", "LN": "Last"}
correspFirstVsLast = {"FNFemale": "Firstname", "FNMale": "Firstname", "LN": "Lastname"}

In [None]:
#flatten namesDict into parallel lists, one per future dataframe column
namesList, firstVsLast, gender = [], [], []
locality1List, locality2List = [], []

#the two Chinese locales map to hyphenated language codes; every other locale uses the part before "_"
chineseCodes = {"zh_TW": "zh-tw", "zh_CN": "zh-cn"}

for locality2, namesLocalDict in namesDict.items():
    locality1 = chineseCodes.get(locality2, locality2.split("_")[0])

    for genderFstvsLst, names in namesLocalDict.items():
        nbNames = len(names)

        namesList.extend(names)
        #repeat the feature values once per name so that all lists stay aligned
        firstVsLast.extend([correspFirstVsLast[genderFstvsLst]] * nbNames)
        gender.extend([correspGender[genderFstvsLst]] * nbNames)
        locality1List.extend([locality1] * nbNames)
        locality2List.extend([locality2] * nbNames)

In [None]:
#lowercase every name so that capitalization cannot be used as a cue to recognize names
namesList = list(map(str.lower, namesList))

In [None]:
#assemble the previously constructed lists into a dataframe, one column per list
namesWithInfo = pd.DataFrame({
    "names": namesList,
    "firstVsLast": firstVsLast,
    "gender": gender,
    "locality1": locality1List,
    "locality2": locality2List,
})

In [None]:
#add columns holding the human-readable names of the locality codes
namesWithInfo["locality1Names"] = list(map(LANGUAGES.__getitem__, namesWithInfo["locality1"]))
namesWithInfo["locality2Names"] = list(map(FAKERLOCALS.__getitem__, namesWithInfo["locality2"]))

In [None]:
#put each locality code next to its readable name
columnOrder = [
    "names",
    "firstVsLast",
    "gender",
    "locality1",
    "locality1Names",
    "locality2",
    "locality2Names",
]
namesWithInfo = namesWithInfo[columnOrder]

In [None]:
#drop the names containing at least one unwanted character, for example composed names with "/"
excludeChars = list("().:;&/08[")
hasNoExcludedChar = [not any(exChar in name for exChar in excludeChars) for name in namesWithInfo.names]
namesWithInfo = namesWithInfo.loc[hasNoExcludedChar, :]

In [None]:
#keep only the rows whose name has at least one character
isNonEmpty = [len(name) > 0 for name in namesWithInfo.names]
namesWithInfo = namesWithInfo.loc[isNonEmpty, :]

In [None]:
#the row removals above left gaps in the index: renumber the rows and discard the old index
namesWithInfo = namesWithInfo.reset_index(drop=True)

In [None]:
#persist the dataframe (with its index) so that it can be read back with pd.read_csv("namesWithInfo.csv", index_col=0);
#an existing file is left untouched: delete it to regenerate the dataset
if not os.path.exists("namesWithInfo.csv"):
    namesWithInfo.to_csv(path_or_buf="namesWithInfo.csv")

In [None]:
#number of rows vs. number of distinct names: some names are repeated between the different categories
len(namesWithInfo.index), namesWithInfo["names"].unique().size

(53869, 34580)

In [None]:
#preview the first five rows of the dataset
namesWithInfo.head(n=5)

Unnamed: 0,names,firstVsLast,gender,locality1,locality1Names,locality2,locality2Names
0,ابتهاج,Firstname,Female,ar,arabic,ar_AA,Arabic (Egypt)
1,ريناد,Firstname,Female,ar,arabic,ar_AA,Arabic (Egypt)
2,يسرى,Firstname,Female,ar,arabic,ar_AA,Arabic (Egypt)
3,دارين,Firstname,Female,ar,arabic,ar_AA,Arabic (Egypt)
4,لتين,Firstname,Female,ar,arabic,ar_AA,Arabic (Egypt)


Build a list of names and, for each name, the list of all its features (locality, gender, first vs. last name), to be used in a multi-label classification task.

In [None]:
import pandas as pd
import numpy as np

In [None]:
#after some experiments we exclude names whose locality is perfectly inferred from their alphabet. This reduces the vocabulary.
easyLanguages = ["arabic","armenian","nepali","hebrew","bulgarian",
                 "georgian","hindi","thai","tamil","greek","korean",
                 "chinese (simplified)","chinese (traditional)",
                 "japanese","persian","ukrainian"]

#keep_default_na=False: every column holds plain strings, and pandas' default NA parsing would otherwise
#turn a lowercased name such as "nan" or "null" into a missing value, which groupby then drops silently
namesData = pd.read_csv(filepath_or_buffer="namesWithInfo.csv",index_col=0,keep_default_na=False)
namesData = namesData.loc[~namesData.locality1Names.isin(easyLanguages)]

In [None]:
#group by names, mapping a name to all its possible features (locality,gender,firstVsLast)
#the feature columns are selected explicitly so that apply never operates on the grouping column,
#and each group of labels is sorted so that the saved features do not depend on set iteration order
featureColumns = ["locality1Names", "gender", "firstVsLast"]
namesVsFeatures = namesData.groupby("names")[featureColumns].apply(
    lambda g: sorted(set(g.locality1Names)) + sorted(set(g.gender)) + sorted(set(g.firstVsLast))
)
features = namesVsFeatures.values
names = namesVsFeatures.index.values
names = [name.strip().lower() for name in names]

In [None]:
#save the names and features data
#the names contain non-ASCII characters, so the encoding is fixed to UTF-8 instead of the platform default
with open("ressources/names.txt", "w", encoding="utf-8") as f:
    #one name per line, in the same order as the rows of features
    f.writelines(name + "\n" for name in names)

np.save("ressources/features.npy", features)