### Page is divided as follows:
### 1. [Preprocessing](#I.-PREPROCESSING)
### 2. [Fuzzy Matching](#II.-FUZZY-MATCHING)


### Imports / Load Data

In [1]:
import pandas as pd 
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from collections import Counter
# from tqdm.notebook import trange, tqdm
from tqdm.notebook import tqdm_notebook
import re

In [2]:
df = pd.read_csv("00_02_01_more_disambiguation_requied.csv", names=['raw_name'], encoding='utf-8')
df.shape

(5470, 1)

In [3]:
pd.set_option('display.max_colwidth', -1)
df

Unnamed: 0,raw_name
0,a chris heath
1,aaron adalja
2,aaron bernstein
3,aaron c davis
4,aaron carroll
...,...
5465,zuo feng zhang
5466,zweli mkhize
5467,ángela hernández
5468,—


# I. PREPROCESSING
---------

## 1. Special characters


Need to address special characters before fuzzy matching.

#### a) Fix rows with "—" before addressing other special characters.

In [4]:
df[df.raw_name.str.contains('—', regex=False)]

Unnamed: 0,raw_name
1784,gordon —
2716,karen c carroll—director
5468,—
5469,—vicky mckeever


In [5]:
df.at[1784, 'raw_name'] = 'gordon'
df.at[2716, 'raw_name'] = 'karen c carroll'
df.drop(5468, inplace=True)
df.at[5469, 'raw_name'] = 'vicky mckeever'

In [6]:
print(df.shape)
df.loc[[1784, 2716, 5469]]

(5469, 1)


Unnamed: 0,raw_name
1784,gordon
2716,karen c carroll
5469,vicky mckeever


#### b) Get names with any non-english alphabet characters

In [7]:
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'

In [8]:
idx_specialchars = []
for idx, name in df['raw_name'].iteritems():
    for char in name:
        if char != ' ':
            if char not in ascii_lowercase:
                print(char)
                idx_specialchars.append(idx)
                break

ñ
é
é
ö
í
á
ó
í
ñ
í
ó
ù
ö
é
ç
ó
ú
ö
é
é
é
é
é
é
é
á
é
ô
á
í
é
í
ó
ó
ö
é
ñ
ü
í
é
á
ü
é
é
á
á
ł
í
ç
ø
ï
6
é
ö
á
á
é
á
é
ü
á
á


In [9]:
pd.set_option('display.max_rows', 100)
df.loc[idx_specialchars]

Unnamed: 0,raw_name
111,aldo muñoz armenta
143,alexander kekulé
327,andrés manuel lópez obrador
797,cecilia söderberg nauclér
798,cecília müller
997,colm o moráin
1091,daniel lópez regalado
1480,emílio ribas
1487,eric cioe peña
1580,fernando rodríguez artalejo


In [10]:
# replace special characters with regular characters
# https://stackoverflow.com/questions/50253753/how-to-replace-accents-in-a-column-of-a-pandas-dataframe
df.raw_name.loc[idx_specialchars] = df.raw_name.loc[idx_specialchars].str.normalize('NFKD')\
       .str.encode('ascii', errors='ignore')\
       .str.decode('utf-8')

In [11]:
df.loc[idx_specialchars]

Unnamed: 0,raw_name
111,aldo munoz armenta
143,alexander kekule
327,andres manuel lopez obrador
797,cecilia soderberg naucler
798,cecilia muller
997,colm o morain
1091,daniel lopez regalado
1480,emilio ribas
1487,eric cioe pena
1580,fernando rodriguez artalejo


In [12]:
# 5040	thomas 6 25
df.at[5040, 'raw_name']

'thomas 6 25'

In [13]:
df.at[5040, 'raw_name'] = 'thomas'
df.at[5040, 'raw_name']

'thomas'

## 2. Some More General Processing 

### a) How many words in each name?

In [14]:
df['num_words'] = df['raw_name'].apply(lambda x: len(x.split()))
df.num_words.value_counts()

2    4469
3    626 
1    321 
4    46  
6    3   
5    3   
7    1   
Name: num_words, dtype: int64

In [15]:
df[df.num_words == 7]

Unnamed: 0,raw_name,num_words
3170,lucy jones center for science and seismology,7


In [16]:
df[df.num_words == 6]

Unnamed: 0,raw_name,num_words
2297,jennifer vines multnomah county health officer,6
4483,ross mcdonald the new york times,6
4641,scott pelley fact checks mike bloomberg,6


In [17]:
df[df.num_words == 5]

Unnamed: 0,raw_name,num_words
1166,david geffen school of medicine,5
1525,ernest n morial convention center,5
4465,ronald reagan ucla medical center,5


In [18]:
df[df.num_words == 4]

Unnamed: 0,raw_name,num_words
141,alexander huang chieh cheng,4
162,alfonso xi el cid,4
248,ana maria henao restrepo,4
249,ana silvia gonzalez reiche,4
275,andres manuel lopez obrador,4
327,andres manuel lopez obrador,4
677,brice de le vingne,4
726,candy gunther brown johnny,4
752,carol chiung hui peng,4
832,charles j ogletree jr,4


### b) Lets address some of the cases above:


#### 1. Drop names with 5 or 7 words (names of medical centers/institutions):

In [19]:
df.drop(df[df.num_words == 7].index, inplace=True)
df.drop(df[df.num_words == 5].index, inplace=True)

In [20]:
print(df[df.num_words == 5])
print(df[df.num_words == 7])

Empty DataFrame
Columns: [raw_name, num_words]
Index: []
Empty DataFrame
Columns: [raw_name, num_words]
Index: []


#### 2. fixing names with 6 words

In [21]:
df.at[2297, 'raw_name']

'jennifer vines multnomah county health officer'

In [22]:
df.at[2297, 'raw_name'] = 'jennifer vines'
df.at[2297, 'raw_name']

'jennifer vines'

In [23]:
df.at[4483, 'raw_name']

'ross mcdonald the new york times'

In [24]:
df.at[4483, 'raw_name'] = 'ross mcdonald'
df.at[4483, 'raw_name']

'ross mcdonald'

In [25]:
df.at[4641, 'raw_name']

'scott pelley fact checks mike bloomberg'

In [26]:
df.at[4641, 'raw_name'] = 'scott pelley'
df.at[4641, 'raw_name']
# df.drop(4641, inplace=True)

'scott pelley'

In [27]:
df[df.num_words == 6]

Unnamed: 0,raw_name,num_words
2297,jennifer vines,6
4483,ross mcdonald,6
4641,scott pelley,6


#### 3. fixing names with 4 words

In [28]:
# https://www.bbc.com/news/uk-england-london-52064450
df.at[1012, 'raw_name']

'cousin hisham el khider'

In [29]:
df.at[1012, 'raw_name'] = 'hisham el khider'
df.at[1012, 'raw_name']

'hisham el khider'

In [30]:
# 1185 david hume kennerlycourtesy gerald
# referring to library - https://www.seattletimes.com/nation-world/the-last-time-the-government-sought-a-warp-speed-vaccine-it-was-a-fiasco/
print(df.at[1185, 'raw_name'])
df.drop(1185, inplace=True)

david hume kennerlycourtesy gerald


In [31]:
# 1690	geert de clercq paris
df.at[1690, 'raw_name']

'geert de clercq paris'

In [32]:
df.at[1690, 'raw_name'] = 'geert de clercq'
df.at[1690, 'raw_name']

'geert de clercq'

In [33]:
df[df['raw_name'].str.contains(' cbs news')]

Unnamed: 0,raw_name,num_words
2395,joaquin morante cbs news,4
5434,zaheer shah cbs news,4


In [34]:
df['raw_name'] = df['raw_name'].str.replace(' cbs news', '', regex=False)

In [35]:
print(df[df['raw_name'].str.contains(' cbs news')])
print(df.at[2395, 'raw_name'])
print(df.at[5434, 'raw_name'])

Empty DataFrame
Columns: [raw_name, num_words]
Index: []
joaquin morante
zaheer shah


In [36]:
# 2818	kellyanne conway bashes biden
df.at[2818, 'raw_name']

'kellyanne conway bashes biden'

In [37]:
df.at[2818, 'raw_name'] = 'kellyanne conway'
df.at[2818, 'raw_name']

'kellyanne conway'

In [38]:
# 3374	martin luther king jr
print(df.at[3374, 'raw_name'])
df.drop(3374, inplace=True)

martin luther king jr


In [39]:
# 3684	ms von der leyen
df.at[3684, 'raw_name']

'ms von der leyen'

In [40]:
df.at[3684, 'raw_name'] = 'von der leyen'
df.at[3684, 'raw_name']

'von der leyen'

In [41]:
# 4098	ping an good doctor
# Ping An Good Doctor (01833.HK) is the world leading one-stop healthcare ecosystem platform in China. 
print(df.at[4098, 'raw_name'])
df.drop(4098, inplace=True)

ping an good doctor


In [42]:
# 4208	ray of grady hospital
# likely referring to Susan Ray - 'Dr. Ray of Grady Hospital... ' (link below)
# https://www.nytimes.com/2020/03/09/health/coronavirus-n95-face-masks.html
df[df['raw_name'].str.contains('ray')]

Unnamed: 0,raw_name,num_words
211,amanda spray,2
456,arvind narayanan,2
892,chris murray,2
942,christopher murray,2
1293,dena grayson,2
1792,gray,1
1810,gregory c gray,3
1812,gregory gray,2
2085,james grayson,2
2663,judy murray,2


In [43]:
df.at[4208, 'raw_name']

'ray of grady hospital'

In [44]:
df.at[4208, 'raw_name'] = 'susan ray'
df.at[4208, 'raw_name']

'susan ray'

In [45]:
# 4473	roselle chen new york
df.at[4473, 'raw_name']

'roselle chen new york'

In [46]:
df.at[4473, 'raw_name'] = 'roselle chen'
df.at[4473, 'raw_name']

'roselle chen'

In [47]:
# https://www.google.com/search?client=safari&rls=en&q=steven+a+cohen+military&ie=UTF-8&oe=UTF-8
print(df.at[4883, 'raw_name'])
df.drop(4883, inplace=True)

steven a cohen military


In [48]:
# 4221 - redros
print(df.at[4221, 'raw_name'])
df.at[4221, 'raw_name'] = 'tedros adhanom ghebreyesus'
print(df.at[4221, 'raw_name'])

redros adhanom ghebreyesus
tedros adhanom ghebreyesus


### c) Drop duplicate names

In [49]:
df[df.duplicated(['raw_name'])]

Unnamed: 0,raw_name,num_words
327,andres manuel lopez obrador,4
1487,eric cioe pena,3
2201,jean francois delfraissy,3
2297,jennifer vines,6
2348,jesus silva herzog,3
2395,joaquin morante,4
2818,kellyanne conway,4
3174,luis diaz izquierdo,3
3199,magnus gisslen,2
4750,sofia mendonca,2


In [50]:
df.drop_duplicates(subset='raw_name', inplace=True)
df.shape

(5447, 2)

In [51]:
# df.reset_index(drop=True, inplace=True)
df.reset_index(inplace=True)
df.rename(columns={'index':'orig_index'}, inplace=True)
df

Unnamed: 0,orig_index,raw_name,num_words
0,0,a chris heath,3
1,1,aaron adalja,2
2,2,aaron bernstein,2
3,3,aaron c davis,3
4,4,aaron carroll,2
...,...,...,...
5442,5463,zsolt katona,2
5443,5464,zubin damania,2
5444,5465,zuo feng zhang,3
5445,5466,zweli mkhize,2


In [52]:
df.to_csv('disambiguation_preprocessed.csv', index=False)

# II. FUZZY MATCHING
---------

#### Notes from Syed - 
1. We only keep the names that has multiple match against them
In this way around 5000 candidates can go down to possibly ~1000 candidates.
2. Against each name we have three columns, e.g.
raw_name, all_candidates_with_score, possible_candidate
tedros, "(tedros,100), ('tedros adhanom ghebreyesus',90), ('tedros ghebreyesus',90), ('tedros ahanom ghebreyesus',90),('tidres',67)", tedros adhanom ghebreyesus
The `possible_candidate` being the one with highest score. If more than one candidate has the highest score, break the tie by keeping the longest string.
3. We keep the similar names together in nearby rows so that when we divide the file between us for hand checking, we get the similar names in a single sight.

#### Fuzzy Matching Pseudocode Notes
1. Get list to compare to fuzzy matches: get all names that matches by first letter of first/middle/last name etc (excluding self)
    * this is to reduce the number of comparisons.
2. do fuzzy match w/ generated list to get top-20 hits: 
3. Accepatance criteria:
    * get matches above threshold 90 (this is pretty reliable, anything under 90 is almost never a good match).
        * if no matches > 90 score, return self.
        * if 1 match > 90, return match if longer than self.
        * if multiple matches > 90: (bit complicated)
            * get the top matches above 90 (ex, 95,95)
                * if only 1 highest score, return match if longer than self.
                * if multiple high scores (ex, 90, 90, 90):
                    * return longest match if longer than self.
                    * For single names: * try to return possible matches that have the single name as a word in match as opposed to substring. 

### 1. Get Fuzzy Matches

In [53]:
# alphabet_index = {}
# for i in ascii_lowercase:
#     alphabet_index[str(i)] = []
#     for j in ascii_lowercase:
#         alphabet_index[str(i + j)] = []
        
# for idx, name in df['raw_name'].iteritems():
#     for word in name.split():
#         alphabet_index[word[0:2]].append(idx)


name_index = {}
for i in ascii_lowercase:
    name_index[str(i)] = []
#     for j in ascii_lowercase:
#         name_index[str(i + j)] = []
        
for idx, name in df['raw_name'].iteritems():
    for word in name.split():
        name_index[word[0]].append(name)

In [54]:
def get_fuzzy_matches(name, match_indexer):
    # get first two letters of each word in name
    # letters_to_match = [s[0:2] for s in name.split()]
    letters_to_match = [s[0] for s in name.split()]

    
    # get all names that match the letter groups
    get_names_to_match = []
    for i in letters_to_match:
        get_names_to_match.extend(match_indexer[i])
        
    # remove duplicate names
    names_list = list(set(get_names_to_match))
    # remove self
    choices = [i for i in names_list if i != name]
    matches = process.extract(name, choices, limit=20)
    return matches

In [55]:
tqdm_notebook.pandas()

In [56]:
df['top20_candidates_with_score'] = df['raw_name'].progress_apply(get_fuzzy_matches, match_indexer=name_index)

HBox(children=(FloatProgress(value=0.0, max=5447.0), HTML(value='')))




In [57]:
df

Unnamed: 0,orig_index,raw_name,num_words,top20_candidates_with_score
0,0,a chris heath,3,"[(sharon a mcgrath morrow, 86), (christ, 75), (charissa cheah, 74), (chris chen, 70), (head, 68), (chris craft, 67), (chris hayes, 67), (cara christ, 67), (chris sale, 66), (chris luke, 64), (chris whitty, 64), (chris dede, 64), (christopher zahn, 62), (chris kratochvil, 62), (jean christophe lagier, 62), (rachel thorn heathcock, 62), (ashish jha, 61), (heard garris, 61), (chris braden, 61), (andrea andrea crisanti, 60)]"
1,1,aaron adalja,2,"[(aaron pallas, 75), (aaron glatt, 70), (amesh a adalja, 69), (adam, 68), (azar, 68), (alvaro alban, 67), (aaron carroll, 64), (aaron c davis, 64), (abelardo avila, 62), (jason adler, 61), (diana adama, 61), (arun sanyal, 61), (aaron rupar, 61), (ali, 60), (ala, 60), (aaron naeger, 58), (aaron shakow, 58), (kamiar alaei, 58), (anthony almojera, 57), (aaron milstone, 56)]"
2,2,aaron bernstein,2,"[(ronald brownstein, 75), (john brownstein, 73), (anna bershteyn, 69), (azar, 68), (adam rubinstein, 67), (penny bornstein, 67), (penny borenstein, 65), (adam bernheim, 64), (alan lerner, 62), (alan berger, 62), (aaron milstone, 62), (burton bentley ii, 62), (abe, 60), (ari brown, 60), (ala, 60), (robert benjamin, 60), (austin, 60), (steven b abramson, 59), (aaron reeves, 59), (aaron naeger, 59)]"
3,3,aaron c davis,3,"[(davis, 90), (marybeth davis baggett, 86), (azar, 68), (brian davis, 67), (ron daniels, 67), (aaron adalja, 64), (david abramson, 64), (david asman, 64), (aaron pallas, 64), (aaron reeves, 64), (rob davidson, 64), (anissa davis, 64), (paul davis, 64), (ron desantis, 64), (david canon, 64), (aaron carroll, 62), (david archer, 61), (ali, 60), (ala, 60), (carson, 60)]"
4,4,aaron carroll,2,"[(karen c carroll, 79), (aaron pallas, 72), (carre, 72), (azar, 68), (robin carre, 67), (aaron rupar, 67), (aaron adalja, 64), (daron cowley, 64), (aaron naeger, 64), (aaron shakow, 64), (mario alparone, 64), (martin carballo, 64), (clare farrell, 62), (anthony cardillo, 62), (alan campbell, 62), (damian caraballo, 62), (aaron c davis, 62), (cao bin, 61), (alta charo, 61), (ala, 60)]"
...,...,...,...,...
5442,5463,zsolt katona,2,"[(peter katona, 67), (ali khan, 56), (joe kalt, 56), (kevan shokat, 55), (kathryn snow, 55), (karen, 54), (tal zaks, 53), (ruth karron, 52), (zoltan kiss, 52), (joel kaplan, 52), (rowland kao, 52), (kathryn weston, 51), (kaletra, 51), (soo kim, 51), (zhou tao, 50), (aris katzourakis, 50), (adam kamradt scott, 50), (kathryn stephenson, 50), (karol sikora, 50), (kathryn seawant, 49)]"
5443,5464,zubin damania,2,"[(yu dan, 60), (zhu, 60), (diana adama, 58), (brian davis, 58), (daniela trezzi, 56), (dana perino, 55), (daniel fein, 55), (dana garfin, 55), (zeinab badawi, 54), (zarir udwadia, 54), (o day, 54), (zhou min, 53), (daniel klein, 53), (david hui, 52), (lisa delamaine, 52), (anne zink, 52), (pia daniel, 52), (daniel mcquillen, 52), (david putrino, 51), (daniel sumner, 51)]"
5444,5465,zuo feng zhang,3,"[(jie feng, 86), (mi feng, 86), (zhong, 72), (ho fung hung, 69), (zeng guang, 67), (zhang dingyu, 66), (elaine shuo feng, 64), (zhang wenhong, 64), (fu meng khaw, 62), (zhu, 60), (ai fen, 60), (zhong nanshan, 59), (taisu zhang, 59), (zheng gangtie, 59), (jinfeng zhou, 59), (fengyong liu, 59), (zhang xinmin, 56), (zeng qun, 56), (xu zhangrun, 56), (kean feng lim, 56)]"
5445,5466,zweli mkhize,2,"[(shi zhengli, 58), (joel miller, 52), (moriel zelikowsky, 52), (philip zelikow, 51), (helena maier, 50), (munzer al khalil, 50), (carl minzner, 50), (mehdi veisi, 49), (muze hair, 48), (melissa michelson, 48), (shane paul mcghie, 48), (angela merkel, 48), (elon musk, 48), (howie mandel, 48), (i min lee, 48), (wendy mariner, 48), (william mckoy, 48), (dennis mileti, 48), (elizabeth mitchell, 47), (mehmet oz, 46)]"


In [58]:
df.to_csv("disambiguation_fuzzymatched.csv", index=False)

### 2. Get Possible Name Matches

In [59]:
def contains_word(text, word):
    return bool(re.search(r'\b' + re.escape(word) + r'\b', text))

def get_possible_candidate(row):
#     print("", row['raw_name'])
    name = row['raw_name']
    number_words = row['num_words']
    
    # convert list of tuples into two lists - list[0] = names, list[1] = scores
    all_matches = list(map(list, zip(*row['top20_candidates_with_score'])))
    all_names = all_matches[0]
    all_scores = all_matches[1]
    
    # get index of matches above >= 90 (score)
    top_matches_idx = [idx for idx, element in enumerate(all_scores) if element >= 90]
    
    # if no matches >= 90: 
    if len(top_matches_idx) < 1: 
        return 'SELF'
    
    # if 1 match >= 90:
    elif len(top_matches_idx) == 1:
        if len(name) > len(all_names[top_matches_idx[0]]):
            return 'SELF'            
        else:
            return all_names[top_matches_idx[0]]
        
    # if multiple matches >= 90:
    elif len(top_matches_idx) > 1:
        
        max_val = max(all_scores) # max score
        top_scores_idx = [i for i, j in enumerate(all_scores) if j == max_val] # index of max scores
        top_names = [all_names[i] for i in top_scores_idx] # names matching max scores
        
        # if only 1 highest value
        if len(top_scores_idx) == 1: 
            if len(name) > len(top_names[0]):
                return 'SELF'            
            else:
                return top_names[0]
        # if only multiple highest value
        else:  
            if number_words == 1: # make it so that we ignore substring matches -> ai = ai fen, not dalai lama
                whole_world_checker = []
                for idx, possib_name in enumerate(top_names):
                    if contains_word(possib_name, name) == True:
                        whole_world_checker.append(idx)
                    
                if len(whole_world_checker) == 0:
                    return 'SELF'
                elif len(whole_world_checker) == 1:
                    return top_names[whole_world_checker[0]]
                else:
                    top_whole_word = [top_names[i] for i in whole_world_checker]
                    return max(top_whole_word, key = len)
            else: 
                if len(name) > len(max(top_names, key = len)):
                    return 'SELF'
                else:
                    return max(top_names, key = len)           

In [61]:
df['possible_candidate'] = df.progress_apply(get_possible_candidate, axis=1)

HBox(children=(FloatProgress(value=0.0, max=5447.0), HTML(value='')))




In [62]:
# How many matches??
df[df['possible_candidate'] != 'SELF'].shape

(446, 5)

In [63]:
# Rows where new possible_candidate
df[df['possible_candidate'] != 'SELF'].head(50)

Unnamed: 0,orig_index,raw_name,num_words,top20_candidates_with_score,possible_candidate
14,14,abbott,1,"[(sabra m abbott, 90), (april abbott, 90), (greg abbott, 90), (abbott labs, 90), (andrea gambotto, 75), (lilian abbo, 72), (abe, 60), (bowale abimbola, 60), (alex scott, 50), (ann arbor, 50), (arabia mollette, 48), (asaf bitton, 47), (aaron glatt, 47), (ann bostrom, 47), (andrew rambaut, 45), (jenny abthorpe, 45), (atul malhotra, 45), (ai, 45), (jim acosta, 45), (andrew potter, 45)]",sabra m abbott
19,19,abdullah abdullah,2,"[(noor hisham abdullah, 95), (abdul el sayed, 61), (abe, 60), (ala, 60), (nadia abuelezam, 56), (adam douglas, 55), (rami abdulrahman, 55), (alfa saadu, 54), (sayed attaullah sayedzai, 54), (alan campbell, 53), (amadou alpha sall, 53), (abdul mabud chowdhury, 53), (adam blake, 52), (abelardo avila, 52), (aliyah cardoza, 52), (alfa sa, 51), (lilian abbo, 50), (danny avula, 50), (bruce a elleman, 49), (kathie allen, 48)]",noor hisham abdullah
20,20,abe,1,"[(abelardo avila, 90), (shinzo abe, 90), (isabelle amigues, 90), (rabi abeyasinghe, 86), (allison chamberlain, 60), (david a nace, 60), (jenny abthorpe, 60), (naji abumrad, 60), (adam bernheim, 60), (abigail carlson, 60), (alexander l greninger, 60), (sue anne bell, 60), (florence ader, 60), (henry albrecht, 60), (robert atmar, 60), (fawziya abikar, 60), (abhijit duggal, 60), (alex brandonap, 60), (jane ruth aceng, 60), (anna goldfarb, 60)]",shinzo abe
26,26,abraham,1,"[(jonathan abraham, 90), (abraham l newman, 90), (jonatas abrahao, 77), (adam, 68), (allison brashear, 64), (sabra m abbott, 64), (steven b abramson, 64), (marina abramovic, 64), (david clara amit, 64), (abraar karan, 64), (andrea graham, 64), (david abramson, 64), (mohamed ali ibrahim, 64), (sarah aitken, 64), (jonathan abramowitz, 64), (josh archambault, 64), (abe, 60), (amy, 60), (azra ghani, 59), (adhanom, 57)]",jonathan abraham
28,28,adam,1,"[(adam bernheim, 90), (adam levitin, 90), (diana adama, 90), (adam j levitin, 90), (adam wolanski, 90), (adam blake, 90), (adam kucharski, 90), (adam schiff, 90), (adam marshall, 90), (adam rubinstein, 90), (jerome adams, 90), (beverley adams groom, 90), (adam jarrett, 90), (adam douglas, 90), (adam collins, 90), (adam schefter, 90), (adam rosh, 90), (adams dudley, 90), (adam finn, 90), (jerome m adams, 90)]",adam rubinstein
40,40,adam levitin,2,"[(adam j levitin, 95), (adam, 90), (adam kamradt scott, 86), (daniel levitin, 77), (levitan, 77), (levin, 72), (amler, 72), (adams, 72), (david levin, 70), (david levine, 67), (adam rubinstein, 67), (hagai levine, 67), (adam collins, 67), (adam finn, 67), (andrew levin, 67), (sam lessin, 64), (adam blake, 64), (mark levin, 64), (adam green, 64), (marissa levine, 62)]",adam j levitin
49,49,adams,1,"[(jerome adams, 90), (beverley adams groom, 90), (adams dudley, 90), (jerome m adams, 90), (adam, 89), (adarsh pratap singh, 72), (adam bernheim, 72), (adam levitin, 72), (diana adama, 72), (adam j levitin, 72), (adam wolanski, 72), (adam blake, 72), (adam kucharski, 72), (adam schiff, 72), (adam marshall, 72), (adam rubinstein, 72), (adam jarrett, 72), (adam douglas, 72), (amiyatosh purnanandam, 72), (adam collins, 72)]",beverley adams groom
54,54,adhanom,1,"[(tedros adhanom ghebreyesus, 90), (tedros adhanom ghebreyesu, 90), (tedros adhanom, 90), (adhanom ghebreyesus, 90), (tedros ahanom ghebreyesus, 77), (adam, 73), (adams, 67), (adnan munkarah, 64), (adriano decarli, 64), (adrian, 62), (abraham, 57), (andrew chan, 56), (jawad al bidhani, 56), (ali khan, 53), (ahmed rahman, 53), (adam bernheim, 51), (kranthi achanta, 51), (adam levitin, 51), (aruna ravichandran, 51), (annika sridharan, 51)]",tedros adhanom ghebreyesus
55,55,adhanom ghebreyesus,2,"[(tedros adhanom ghebreyesus, 95), (adhanom, 90), (ghebreyesus, 90), (tedros adhanom ghebreyesu, 82), (tedros ahanom ghebreyesus, 82), (tedros ghebreyesus, 76), (adam, 68), (tedros adhanom, 64), (adam green, 62), (abe, 60), (adrian, 60), (ann wagner, 54), (jane greer, 54), (adams, 54), (mark green, 54), (aaron reeves, 52), (jordan asher, 52), (abraham, 51), (adrian bangerter, 51), (al gore, 51)]",tedros adhanom ghebreyesus
60,60,adrian,1,"[(adrian bangerter, 90), (adrian hyzler, 90), (adriano decarli, 90), (aruna ravichandran, 75), (dorian alexander, 75), (armand dorian, 75), (mihran aroian, 75), (diana adama, 71), (aralen, 67), (andrei lankov, 63), (adhanom, 62), (addison, 62), (andrea crisanti, 60), (mary anastasia o grady, 60), (anita kurian, 60), (alfredo garzino demo, 60), (arvind kumar, 60), (adam, 60), (adnan munkarah, 60), (katrina armstrong, 60)]",adrian bangerter


In [68]:
# Rows where SELF = best candidate
df[df['possible_candidate'] == 'SELF'].head(50)

Unnamed: 0,orig_index,raw_name,num_words,top20_candidates_with_score,possible_candidate
0,0,a chris heath,3,"[(sharon a mcgrath morrow, 86), (christ, 75), (charissa cheah, 74), (chris chen, 70), (head, 68), (chris craft, 67), (chris hayes, 67), (cara christ, 67), (chris sale, 66), (chris luke, 64), (chris whitty, 64), (chris dede, 64), (christopher zahn, 62), (chris kratochvil, 62), (jean christophe lagier, 62), (rachel thorn heathcock, 62), (ashish jha, 61), (heard garris, 61), (chris braden, 61), (andrea andrea crisanti, 60)]",SELF
1,1,aaron adalja,2,"[(aaron pallas, 75), (aaron glatt, 70), (amesh a adalja, 69), (adam, 68), (azar, 68), (alvaro alban, 67), (aaron carroll, 64), (aaron c davis, 64), (abelardo avila, 62), (jason adler, 61), (diana adama, 61), (arun sanyal, 61), (aaron rupar, 61), (ali, 60), (ala, 60), (aaron naeger, 58), (aaron shakow, 58), (kamiar alaei, 58), (anthony almojera, 57), (aaron milstone, 56)]",SELF
2,2,aaron bernstein,2,"[(ronald brownstein, 75), (john brownstein, 73), (anna bershteyn, 69), (azar, 68), (adam rubinstein, 67), (penny bornstein, 67), (penny borenstein, 65), (adam bernheim, 64), (alan lerner, 62), (alan berger, 62), (aaron milstone, 62), (burton bentley ii, 62), (abe, 60), (ari brown, 60), (ala, 60), (robert benjamin, 60), (austin, 60), (steven b abramson, 59), (aaron reeves, 59), (aaron naeger, 59)]",SELF
3,3,aaron c davis,3,"[(davis, 90), (marybeth davis baggett, 86), (azar, 68), (brian davis, 67), (ron daniels, 67), (aaron adalja, 64), (david abramson, 64), (david asman, 64), (aaron pallas, 64), (aaron reeves, 64), (rob davidson, 64), (anissa davis, 64), (paul davis, 64), (ron desantis, 64), (david canon, 64), (aaron carroll, 62), (david archer, 61), (ali, 60), (ala, 60), (carson, 60)]",SELF
4,4,aaron carroll,2,"[(karen c carroll, 79), (aaron pallas, 72), (carre, 72), (azar, 68), (robin carre, 67), (aaron rupar, 67), (aaron adalja, 64), (daron cowley, 64), (aaron naeger, 64), (aaron shakow, 64), (mario alparone, 64), (martin carballo, 64), (clare farrell, 62), (anthony cardillo, 62), (alan campbell, 62), (damian caraballo, 62), (aaron c davis, 62), (cao bin, 61), (alta charo, 61), (ala, 60)]",SELF
5,5,aaron glatt,2,"[(aaron steckelberg, 86), (aaron adalja, 70), (aaron pallas, 70), (azar, 68), (aaron milstone, 64), (robert glatter, 64), (aaron rupar, 64), (sharon goldfarb, 62), (aaron naeger, 61), (aaron shakow, 61), (andy slavitt, 61), (alvaro alban, 61), (dario gil, 60), (ala, 60), (aaron reeves, 59), (aaron carroll, 59), (aaron bernstein, 59), (aaron c davis, 59), (itamar grotto, 58), (alan kraut, 57)]",SELF
6,6,aaron milstone,2,"[(don milton, 75), (carolyn maloney, 69), (azar, 68), (donald milton, 67), (marci hamilton, 64), (mark wilson, 64), (moulton, 64), (aaron glatt, 64), (mariana chilton, 62), (robin armstrong, 62), (aaron naeger, 62), (aaron bernstein, 62), (aaron pallas, 62), (aaron shakow, 62), (rohan miller, 62), (amy stone, 61), (armstrong, 61), (mario alparone, 61), (ala, 60), (marion nestle, 59)]",SELF
7,7,aaron naeger,2,"[(alan berger, 70), (aaron rupar, 70), (azar, 68), (aaron reeves, 67), (ann wagner, 64), (aaron carroll, 64), (anas nader, 64), (alan erera, 64), (marion nestle, 64), (aaron milstone, 62), (aaron steckelberg, 62), (isaac ngere, 61), (alan lerner, 61), (jason adler, 61), (peter navarro, 61), (archon fung, 61), (aaron glatt, 61), (abe, 60), (sharon a mcgrath morrow, 60), (ala, 60)]",SELF
8,8,aaron pallas,2,"[(ron paul, 79), (aaron adalja, 75), (aaron carroll, 72), (aaron rupar, 70), (aaron glatt, 70), (azar, 68), (paul, 68), (alvaro alban, 67), (paula cannon, 64), (aaron c davis, 64), (jared a ellias, 62), (aaron milstone, 62), (abelardo avila, 62), (robin patel, 61), (ali, 60), (ala, 60), (amanda williams, 59), (aaron reeves, 58), (aaron naeger, 58), (aaron shakow, 58)]",SELF
9,9,aaron reeves,2,"[(ron yee, 77), (tate reeves, 70), (reed, 68), (azar, 68), (aaron naeger, 67), (jason reed, 64), (aaron c davis, 64), (walberto reyes, 62), (aaron steckelberg, 62), (aaron rupar, 61), (aurore, 60), (ala, 60), (aaron bernstein, 59), (aaron glatt, 59), (aaron shakow, 58), (stuart reyes, 58), (aaron pallas, 58), (richard l revesz, 57), (john reed, 57), (margot roosevelt, 57)]",SELF


In [65]:
df.to_csv("disambiguation_final.csv", index=False)

In [66]:
df_possible = df[df['possible_candidate'] != 'SELF'].copy()
df_possible.reset_index(drop=True, inplace=True)
print(df_possible.shape)
df_possible.to_csv("disambiguation_final_possible_candidates.csv", index=False)

(446, 5)


### END.
### Just looking at some examples in the final dataframe below. 

In [69]:
df[df['possible_candidate'] == 'SELF'].tail(50)

Unnamed: 0,orig_index,raw_name,num_words,top20_candidates_with_score,possible_candidate
5391,5411,yossi sheffi,2,"[(ms shi, 60), (saira sheikh, 58), (shupe, 54), (singh, 54), (yasutoshi nishimura, 52), (shi zhengli, 52), (josiah solis, 50), (steffie woolhandler, 50), (younas elahi, 50), (sania ashraf, 50), (pei yong shi, 50), (jeanne sheffield, 50), (shi yinhong, 49), (seema yasmin, 48), (douglas senderoff, 48), (asim shah, 48), (hiroshige seko, 46), (jeffrey smith, 46), (jeffrey sachs, 46), (mahmoud hashemi shahroudi, 45)]",SELF
5392,5412,younas elahi,2,"[(young, 72), (yuan, 68), (young pak, 57), (cai yi, 57), (lisa r young, 55), (pei yong shi, 55), (yulia, 54), (ali yusuf, 54), (elaine larson, 53), (neal elattrache, 52), (yorba linda, 52), (yuval neria, 52), (yogesh jain, 52), (donna e shalala, 52), (amir yaron, 52), (ron yee, 51), (lawrence young, 51), (jeanette young, 51), (elanah uretsky, 51), (cheng yung chi, 51)]",SELF
5394,5414,young pak,2,"[(young, 90), (lawrence young, 86), (jeanette young, 86), (dannagal young, 86), (jeannette young, 86), (pampee young, 72), (ho pak leung, 72), (yuan, 68), (brad younggren, 67), (lisa r young, 67), (duncan young, 67), (ron paul, 59), (pan hwai tzong, 57), (the koret playground, 57), (yu dan, 57), (younas elahi, 57), (yang wang, 56), (patrick vogt, 54), (meng aw yong, 54), (pei yong shi, 54)]",SELF
5396,5416,yu dan,2,"[(yu, 90), (dan wallach, 86), (dan doyle, 86), (dan crenshaw, 86), (dan patrick, 86), (howard yu, 86), (dan wakeford, 86), (dan ariely, 86), (dan tehan, 86), (dan mccarthy, 86), (koh yu hwan, 86), (dan varga, 86), (dan schnur, 86), (yu hongyan, 86), (yuan, 80), (wang yu, 73), (yazdan yazdanpaneh, 71), (yazdan yazdanpanah, 71), (duncan young, 71), (dan suan, 67)]",SELF
5397,5417,yu hongyan,2,"[(yu, 90), (yu dan, 86), (yuan, 68), (yang, 68), (larry hogan, 67), (yuan herong, 67), (hogan, 67), (jin dong yan, 64), (leung yiu hong, 64), (sara hogan, 60), (howard yu, 60), (han, 60), (yi guan, 59), (yanzhong huang, 58), (yan jirong, 57), (lu hongzhou, 57), (koh yu hwan, 57), (guan yi, 56), (wang yu, 56), (yang hengjun, 55)]",SELF
5399,5419,yuan herong,2,"[(yu, 90), (yuan, 90), (meng yuan, 76), (yan jirong, 76), (yan chen, 74), (yang hengjun, 70), (yang, 68), (yu hongyan, 67), (alan herman, 64), (yangyang cheng, 64), (anna yeung cheung, 62), (ron yee, 61), (anna hemming, 61), (yang seung ham, 61), (wang yu, 61), (cheng yung chi, 61), (duncan young, 61), (yang wang, 60), (shuhan he, 60), (yu dan, 60)]",SELF
5400,5420,yuan po tu,3,"[(yu, 90), (yuan, 90), (yang, 68), (yuna rapoport, 61), (yu dan, 60), (otto yang, 60), (meng yuan, 59), (yuen kwok yung, 58), (yuan herong, 57), (susan turney, 55), (duncan young, 55), (romano paolucci, 54), (yulia, 54), (young, 54), (george yancopoulos, 54), (george d yancopoulos, 54), (young pak, 53), (rand paul, 53), (jo potuto, 53), (andrew potter, 52)]",SELF
5401,5421,yuen kwok yung,3,"[(yu, 90), (young, 72), (yuan, 68), (yang, 68), (meng aw yong, 62), (meng yuan, 61), (roy kwong, 61), (jeff kwong, 58), (yuan po tu, 58), (yu dan, 57), (yuan herong, 56), (ying ying goh, 56), (kin k leung, 56), (lawrence young, 54), (kevin ouyang, 54), (duncan young, 54), (samson yuen, 53), (yan chen, 53), (yang wang, 52), (nadia kounang, 52)]",SELF
5402,5422,yulia,1,"[(yu, 90), (yuan, 67), (yuval neria, 62), (meng yuan, 60), (yuan herong, 57), (yuna rapoport, 57), (yi cui, 55), (yu dan, 55), (hsu li yang, 54), (younas elahi, 54), (yuan po tu, 54), (jamie yutzie, 54), (yorba linda, 54), (yuval baruch, 54), (dali yang, 54), (yasuyuki sahara, 54), (yifang zhu, 54), (yi guan, 50), (yang, 44), (young pak, 43)]",SELF
5403,5423,yuna rapoport,2,"[(yu, 90), (david rapoport, 74), (yuan, 68), (yuan po tu, 61), (yulia, 57), (robert yoon, 55), (young, 54), (angela rippon, 54), (amy r sapkota, 54), (roberto rona, 53), (ratula chakraborty, 52), (raja krishnamoorthi, 50), (rosina racioppi, 50), (yuan herong, 50), (yuval neria, 50), (nishant rao, 50), (samson yuen, 48), (roger porter, 48), (yonatan grad, 48), (cynthia rohrbeck, 48)]",SELF


In [71]:
df[df['possible_candidate'] != 'SELF'].tail(100)

Unnamed: 0,orig_index,raw_name,num_words,top20_candidates_with_score,possible_candidate
4304,4319,rita,1,"[(rita wilson, 90), (maria rita gismondo, 90), (fernando rodriguez artalejo, 68), (ritu thamman, 68), (greg rigano, 68), (robert atmar, 68), (ricardo azziz, 68), (risa m mish, 68), (kristopher richardson, 68), (ralph northam, 68), (k srinath reddy, 68), (robert anolik, 68), (robert amler, 68), (ricardo ramirez, 68), (gregory rigano, 68), (ana maria henao restrepo, 68), (krista reitberg, 68), (christopher r braden, 68), (emilio ribas, 68), (christian rose, 68)]",maria rita gismondo
4319,4334,robert amler,2,"[(robert w amler, 95), (amler, 90), (robert hendershott, 86), (lionel p robert jr, 86), (robert j kim farley, 86), (robert david siegel, 86), (robert atmar, 83), (robert wagner, 80), (robert kadlec, 80), (robert gary, 78), (robert shiller, 77), (robert glatter, 77), (robert talisse, 77), (robert wachter, 77), (robert a neimeyer, 76), (robert gallo, 75), (roger farmer, 75), (robert faris, 75), (robert levin, 75), (robert schlager, 74)]",robert w amler
4332,4347,robert cohen,2,"[(robert l cohen, 95), (cohen, 90), (robert hendershott, 86), (lionel p robert jr, 86), (robert j kim farley, 86), (roxane cohen silver, 86), (daniel aldana cohen, 86), (robert david siegel, 86), (robert kelchen, 85), (robert chesney, 85), (robert cherry, 80), (john roberts, 79), (robert yoon, 78), (robert o brien, 77), (robert wachter, 77), (robertson, 76), (roberto cosentini, 76), (roberto rona, 75), (robert reich, 75), (robert levin, 75)]",robert l cohen
4355,4370,robert kim farley,3,"[(robert j kim farley, 95), (kim farley, 90), (kim, 90), (jesi kim, 86), (kim jong un, 86), (robert west, 86), (soo kim, 86), (daniel kim, 86), (alfred kim, 86), (thomas kim, 86), (robert gary, 86), (david kim, 86), (kim binsted, 86), (pauline kim, 86), (robert uzzo, 86), (robert yoon, 86), (robert nied, 86), (robert kadlec, 73), (robert faris, 72), (riley, 72)]",robert j kim farley
4368,4383,robert salata,2,"[(robert a salata, 95), (robert atmar, 80), (robert kaplan, 77), (robert west, 75), (sarah t roberts, 75), (roberto stella, 74), (roberto posada, 74), (robert glatter, 74), (robert gallo, 72), (robert amler, 72), (robert schlager, 71), (robert anolik, 69), (robert blanchard, 69), (robert kadlec, 69), (robert siegel, 69), (robert peston, 69), (robert caudill, 67), (robert gary, 67), (robert stevens, 67), (robert epstein, 67)]",robert a salata
4372,4387,robert siegel,2,"[(robert david siegel, 95), (siegel, 90), (robert nied, 75), (robert quigley, 74), (roberto stella, 74), (robert stevens, 74), (robert shiller, 74), (robert gallo, 72), (robert dingwall, 71), (robert schlager, 71), (robert summerer, 71), (robert wagner, 69), (jeremy siegel, 69), (robert salata, 69), (robert bollinger, 69), (robert west, 67), (robert hacking, 67), (robert caudill, 67), (robert bickers, 67), (robert gary, 67)]",robert david siegel
4392,4407,robertson,1,"[(andrew robertson, 90), (robertson davenport, 90), (charles robertson, 90), (johnson roberson, 85), (robert peston, 82), (nichola roberston, 80), (robert yoon, 80), (andrew roberston, 80), (sarah t roberts, 79), (mysheika roberts, 79), (roberto rona, 76), (robert cohen, 76), (roberto burioni, 75), (robert harrison, 75), (roberto perotti, 75), (roberto bernabei, 75), (robert thompson, 75), (roberto cosentini, 75), (roberto posada, 75), (rob burton, 74)]",robertson davenport
4413,4428,rodney ho,2,"[(rodney howard browne, 90), (rodney ramcharan, 86), (rodney rohde, 76), (tyrone howard, 64), (robin howe, 63), (rod hochman, 60), (roxane cohen silver, 60), (robin thompson, 60), (arnold hopland, 60), (ebony hilton buchholz, 60), (robert hockett, 60), (saheli roy choudhury, 60), (ron hira, 59), (jennifer horney, 57), (robert hendershott, 57), (mitt romney, 57), (nichola roberston, 57), (tony holohan, 57), (ebony hilton, 57), (ryde hornsby, 57)]",rodney howard browne
4417,4432,rodriguez,1,"[(fernando rodriguez artalejo, 90), (mauricio rodriguez, 90), (segundo rodriguez, 90), (jose javier rodriguez, 90), (rodriguez diaz, 90), (jean paul rodrigue, 85), (tom ridge, 56), (roy fried, 56), (robert siegel, 55), (rieux, 54), (riley, 54), (ron wright, 53), (rod pearce, 53), (robert quigley, 52), (edward r utley, 50), (richard ebright, 50), (rosemarie truglio, 50), (richard v riggs, 50), (roland more, 50), (roderick seamster, 50)]",fernando rodriguez artalejo
4467,4483,ross mcdonald,6,"[(ross macdonald, 96), (rice mcdonald, 77), (eric mcdonald, 77), (glenn rice mcdonald, 72), (donald marks, 68), (david rosser, 61), (mrs ali, 61), (medina, 60), (donald milton, 59), (gary ross, 59), (ronna mcdaniel, 59), (darcy ross, 58), (josh michaud, 56), (rory medcalf, 56), (mary hunter mcdonnell, 56), (bonnie maldonado, 55), (amira roess, 55), (yvonne maldonado, 55), (rob mccoy, 55), (royal s copeland, 55)]",ross macdonald


In [80]:
df[df['possible_candidate'] != 'SELF'].head(135)

Unnamed: 0,orig_index,raw_name,num_words,top20_candidates_with_score,possible_candidate
14,14,abbott,1,"[(sabra m abbott, 90), (april abbott, 90), (greg abbott, 90), (abbott labs, 90), (andrea gambotto, 75), (lilian abbo, 72), (abe, 60), (bowale abimbola, 60), (alex scott, 50), (ann arbor, 50), (arabia mollette, 48), (asaf bitton, 47), (aaron glatt, 47), (ann bostrom, 47), (andrew rambaut, 45), (jenny abthorpe, 45), (atul malhotra, 45), (ai, 45), (jim acosta, 45), (andrew potter, 45)]",sabra m abbott
19,19,abdullah abdullah,2,"[(noor hisham abdullah, 95), (abdul el sayed, 61), (abe, 60), (ala, 60), (nadia abuelezam, 56), (adam douglas, 55), (rami abdulrahman, 55), (alfa saadu, 54), (sayed attaullah sayedzai, 54), (alan campbell, 53), (amadou alpha sall, 53), (abdul mabud chowdhury, 53), (adam blake, 52), (abelardo avila, 52), (aliyah cardoza, 52), (alfa sa, 51), (lilian abbo, 50), (danny avula, 50), (bruce a elleman, 49), (kathie allen, 48)]",noor hisham abdullah
20,20,abe,1,"[(abelardo avila, 90), (shinzo abe, 90), (isabelle amigues, 90), (rabi abeyasinghe, 86), (allison chamberlain, 60), (david a nace, 60), (jenny abthorpe, 60), (naji abumrad, 60), (adam bernheim, 60), (abigail carlson, 60), (alexander l greninger, 60), (sue anne bell, 60), (florence ader, 60), (henry albrecht, 60), (robert atmar, 60), (fawziya abikar, 60), (abhijit duggal, 60), (alex brandonap, 60), (jane ruth aceng, 60), (anna goldfarb, 60)]",shinzo abe
26,26,abraham,1,"[(jonathan abraham, 90), (abraham l newman, 90), (jonatas abrahao, 77), (adam, 68), (allison brashear, 64), (sabra m abbott, 64), (steven b abramson, 64), (marina abramovic, 64), (david clara amit, 64), (abraar karan, 64), (andrea graham, 64), (david abramson, 64), (mohamed ali ibrahim, 64), (sarah aitken, 64), (jonathan abramowitz, 64), (josh archambault, 64), (abe, 60), (amy, 60), (azra ghani, 59), (adhanom, 57)]",jonathan abraham
28,28,adam,1,"[(adam bernheim, 90), (adam levitin, 90), (diana adama, 90), (adam j levitin, 90), (adam wolanski, 90), (adam blake, 90), (adam kucharski, 90), (adam schiff, 90), (adam marshall, 90), (adam rubinstein, 90), (jerome adams, 90), (beverley adams groom, 90), (adam jarrett, 90), (adam douglas, 90), (adam collins, 90), (adam schefter, 90), (adam rosh, 90), (adams dudley, 90), (adam finn, 90), (jerome m adams, 90)]",adam rubinstein
...,...,...,...,...,...
1512,1516,erik wilke,2,"[(eric wilke, 90), (knut erik hovda, 86), (erika lee, 74), (william cherniak, 68), (eric wei, 67), (patrick walker, 67), (henry walke, 67), (kerri wizner, 64), (keith willett, 61), (nick white, 60), (eric blank, 60), (william petri, 58), (kristen welker, 58), (mark wilson, 57), (nick wilson, 57), (peter white, 57), (eric talley, 57), (rita wilson, 57), (warwick knowles, 56), (erin staples, 55)]",eric wilke
1546,1551,exekiel emanuel,2,"[(ezekiel emanuel, 93), (emanuel, 90), (zeke emanuel, 81), (rahm emanuel, 70), (elaine kinsella, 57), (eike steinmann, 55), (jan emmanuel de neve, 54), (kelly evans, 54), (emil verner, 54), (emily benfer, 52), (eli fenichel, 52), (ernie guzman, 52), (elon musk, 50), (eva larue, 50), (elaine ganley, 50), (emmanuel macron, 50), (jeffrey engel, 50), (emmanuel andre, 48), (osagie ehanire, 48), (eugene tan, 48)]",ezekiel emanuel
1548,1553,ezekiel emanuel,2,"[(exekiel emanuel, 93), (emanuel, 90), (zeke emanuel, 89), (rahm emanuel, 70), (elaine kinsella, 57), (eike steinmann, 55), (jan emmanuel de neve, 54), (kelly evans, 54), (emil verner, 54), (emily benfer, 52), (eli fenichel, 52), (ernie guzman, 52), (elon musk, 50), (eva larue, 50), (elaine ganley, 50), (emmanuel macron, 50), (jeffrey engel, 50), (emmanuel andre, 48), (osagie ehanire, 48), (eugene tan, 48)]",exekiel emanuel
1567,1572,feigl ding,2,"[(eric feigl ding, 90), (jonathan fielding, 63), (michael dowling, 63), (don, 60), (tinglong dai, 55), (zhang dingyu, 55), (jake dunning, 55), (neil fishman, 55), (brice de le vingne, 54), (michael donnino, 54), (daniel fein, 54), (robert dingwall, 54), (jih fei cheng, 52), (eli fenichel, 52), (richard fording, 51), (rafael felix espinoza, 51), (elaine shuo feng, 51), (benjamin f miller, 51), (zuo feng zhang, 50), (dario gil, 50)]",eric feigl ding


# SANDBOX

In [None]:
choices = ["tedros", "tedros adhanom ghebreyesus", "homer", "tedros ghebreyesus", "tedros ahanom ghebreyesus", "talia", "tidres"]
process.extract("tedros", choices, limit=20)

In [None]:
choices = ["ai fen", "aileen marty", "alaa swilam dubai", "al gore", "alain gauthier", "tidres"]
process.extract("ai", choices, limit=5)

### How names start with each letter?

In [None]:
Counter(s[0] for s in df.Name.tolist())

In [None]:
all_names = []
for idx, name in df['raw_name'].iteritems():
    for s in name.split():
        all_names.append(s)

In [None]:
dict_count.most_common()