# Matching Emails to Researchers

## Python Setup

In [1]:
import pandas as pd
import recordlinkage as rl

## Load Data

In [2]:
# Input files: the project-leader spreadsheet (names + email columns) and the
# grant details export (grant holder names + grant metadata).
projects_path = '../data/porteurs_projets0712_finances_helios.xlsx'
grants_path = '../data/inca_grants_details.csv'

projects = pd.read_excel(projects_path)
grants = pd.read_csv(grants_path)

In [3]:
# Keep only the rows that carry both a first name and a surname.
has_full_name = projects['prénoms'].notnull() & projects['nom'].notnull()
projects = projects[has_full_name].reset_index(drop=True)

# Remove the literal 'Prenom' token wherever it occurs in the first-name column.
projects['prénoms'] = projects['prénoms'].str.replace('Prenom', '')

# The title columns are not referenced anywhere else in this notebook.
projects = projects.drop(columns=['titre_en', 'titre_fr'])

In [4]:
# Preview the first rows of the filtered projects table.
projects.head()

Unnamed: 0,année,acronyme,no_projet,nom,prénoms,email1,email2
0,2007,ACI07,ACI07-001,BERGER,François,fberger@ujf-grenoble.fr ; Francois.Berger@ujf-...,fbergez@me.com
1,2007,ACI07,ACI07-002,FAVROT,Marie-Christine,mc.favrot@afssa.fr,
2,2007,ACI07,ACI07-003,PLUMAS,Joël,Joel.plumas@efs.sante.fr; joel.plumas@wanadoo.fr,
3,2007,ACI07,ACI07-004,BALDI,Isabelle,isabelle.baldi@isped.u-bordeaux2.fr,
4,2007,ACI07,ACI07-005,CAZAUX,Christophe,Christophe.Cazaux@ipbs.fr,


In [5]:
# Preview the first two rows of the grants table.
grants.head(2)

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date
0,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,grant.7426242,"Cancer, Environment and metabolomics: the dog ...",Our innovative project aims to use the dog as ...,French Institute of Health and Medical Research,Inserm_6181,Oniris,grid.418682.1,1117 Public Health and Health Services;1112 On...,Rare Diseases;Prevention;Cancer;Clinical Research,65061.0,2011-12-13,2013-06-12
1,inca_2,Julien,ADAM,Hôpital Necker- Enfants malades APHP,grant.7426178,Creating tools to assess DNA repair dysfunctio...,Scientific context Non-small cell lung carcin...,French Institute of Health and Medical Research,Inserm_4631,Necker-Enfants Malades Hospital;Institut Gusta...,grid.412134.1;grid.14925.3b,0601 Biochemistry and Cell Biology;1112 Oncolo...,Biotechnology;Lung;Cancer;Lung Cancer;Genetics...,240110.0,2011-11-01,2014-11-01


In [6]:
# (rows, columns) of the projects table after the name filter above.
projects.shape

(2005, 7)

## Create Name and Name ID

In [7]:
# Build an upper-cased "FIRSTNAME SURNAME" key in both tables; this is the
# column the two sources are later joined on.
project_full_name = projects['prénoms'] + ' ' + projects['nom']
grant_full_name = grants['prenom_port'] + ' ' + grants['nom_port']

projects['name'] = project_full_name.str.upper()
grants['name'] = grant_full_name.str.upper()

In [8]:
# Give every distinct name an integer id. The ids are computed separately for
# each table, so name_id_proj and name_id_grnt are NOT comparable to each other.
projects['name_id_proj'] = projects.groupby('name').ngroup()
grants['name_id_grnt'] = grants.groupby('name').ngroup()

## Clean Email

In [9]:
# Combine both email columns into one lower-cased, '; '-separated string per row.
projects['emails'] = (
    projects['email1'].fillna('').str.lower()
    + '; '
    + projects['email2'].fillna('').str.lower()
)

# NOTE: regex=True is passed explicitly below. pandas >= 2.0 treats the pattern
# as a literal string by default, which would silently skip these clean-ups.

# Drop the separator left dangling when one or both email columns were empty.
projects['emails'] = projects['emails'].str.replace(r'(; $|^; |^; $)', '', regex=True)

# Normalise every separator to exactly '; ' (the delimiter the next cells split
# on). \s* rather than \s? so that runs of several spaces around ';' do not
# leave stray whitespace attached to an address.
projects['emails'] = projects['emails'].str.replace(r'\s*;\s*', '; ', regex=True)

# A comma inside an address is treated as a mistyped dot.
projects['emails'] = projects['emails'].str.replace(',', '.', regex=False)

In [10]:
# One row per distinct (name id, name, email string), ordered by name.
emails = (
    projects[['name_id_proj', 'name', 'emails']]
    .drop_duplicates()
    .sort_values('name')
    .reset_index(drop=True)
)

In [11]:
# One short Series per row of `emails`: the index holds that row's individual
# addresses, the value is the row's name_id_proj.
per_row = []
for _, record in emails.iterrows():
    addresses = record['emails'].split('; ')
    per_row.append(pd.Series(record['name_id_proj'], index=addresses))
email_to_id = pd.concat(per_row)

# Discard the empty strings produced by rows without any address, then keep
# each (email, name id) pair only once.
non_empty = email_to_id[email_to_id.index != ""]
temp = non_empty.reset_index().drop_duplicates().reset_index(drop=True)
temp.columns = ['email', 'name_id_proj']

# Re-attach the researcher name and put the columns in a fixed order.
id_to_name = emails[['name_id_proj', 'name']].drop_duplicates()
temp = temp.merge(id_to_name, how='left', on='name_id_proj')
temp = temp[['name_id_proj', 'name', 'email']]

**TODO:** Insert a section on restricting temp email addresses?

In [12]:
# Collapse back to one row per researcher: how many distinct addresses they
# have, and those addresses joined with '; '.
by_person = temp.groupby(['name_id_proj', 'name'])
email_count = by_person.size()
joined_emails = by_person['email'].apply('; '.join)

temp = pd.DataFrame({'email_count': email_count, 'emails': joined_emails}).reset_index()
# The name is dropped here; the next cell joins on name_id_proj only.
temp = temp.drop(columns='name')

In [13]:
# Start again from the distinct researchers and attach the per-person email
# summary; researchers with no address get NaN in the summary columns.
people = emails[['name_id_proj', 'name']].drop_duplicates()
emails = people.merge(temp, how='left', on='name_id_proj')

In [14]:
# Researchers without any address have no row in the email summary; give them
# an explicit count of 0. Later cells rely on this: a null email_count after
# the name merge then means "name not found", not "no email on file".
emails['email_count'] = emails['email_count'].fillna(0)

## Create Frames for Record Linkage

In [15]:
# Sanity check of the project-side frame (one row per researcher name).
emails.describe(include = 'all')

Unnamed: 0,name_id_proj,name,email_count,emails
count,1499.0,1499,1499.0,1380
unique,,1499,,1379
top,,HERVÉ MIGNOTTE,,cd67@ligue-cancer.net
freq,,1,,2
mean,749.0,,1.142762,
std,432.86834,,0.57626,
min,0.0,,0.0,
25%,374.5,,1.0,
50%,749.0,,1.0,
75%,1123.5,,1.0,


In [16]:
# Grant-side frame: one row per distinct researcher name, ordered by name.
names = (
    grants[['name_id_grnt', 'name']]
    .drop_duplicates()
    .sort_values('name')
    .reset_index(drop=True)
)

In [17]:
# Sanity check of the grant-side frame (one row per researcher name).
names.describe(include = 'all')

Unnamed: 0,name_id_grnt,name
count,1000.0,1000
unique,,1000
top,,HERVÉ MIGNOTTE
freq,,1
mean,499.5,
std,288.819436,
min,0.0,
25%,249.75,
50%,499.5,
75%,749.25,


## Name Cleaning

In [18]:
# Replace hyphens (with an optional whitespace character on either side) by a
# single space so that e.g. hyphenated and space-separated compound names
# compare equal.
# regex=True is required: pandas >= 2.0 defaults to literal matching, in which
# case the pattern would never match and no hyphen would be replaced.
names['name'] = names['name'].str.replace(r'\s?-\s?', ' ', regex=True)
emails['name'] = emails['name'].str.replace(r'\s?-\s?', ' ', regex=True)

In [19]:
# Strip accents from the name key in both frames: decompose characters (NFKD),
# drop every byte that is not ASCII, and decode back to str.
for frame in (names, emails):
    decomposed = frame['name'].str.normalize('NFKD')
    ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
    frame['name'] = ascii_bytes.str.decode('utf-8')

## First Merge on Name

In [20]:
# Exact join on the cleaned name, keeping every grant-side researcher.
df = names.merge(emails, how='left', on='name')

In [21]:
# email_count was filled with 0 on the project side, so it is only null here
# when the name found no partner in the merge: keep the matched rows.
df_1 = df.dropna(subset=['email_count']).reset_index(drop=True)

## String Comparison on Remaining Names

In [22]:
# Grant-side names with no exact match; these go through fuzzy comparison.
unmatched = df['email_count'].isnull()
names_2 = df.loc[unmatched, ['name_id_grnt', 'name']].reset_index(drop=True)

In [23]:
# Number of grant-side names still unmatched after the exact merge.
names_2.shape

(15, 2)

In [24]:
# Index both frames by their own name id so that the candidate pairs built
# below are (name_id_grnt, name_id_proj) tuples.
names_2_comp = names_2.set_index('name_id_grnt')
emails_comp = emails.set_index('name_id_proj')

In [25]:
# Build every (unmatched grant name, project name) candidate pair; no blocking
# is applied, so the pair count is len(names_2_comp) * len(emails_comp).
indexer = rl.FullIndex()
pairs = indexer.index(names_2_comp, emails_comp)
# Number of candidate pairs that will be scored.
print(len(pairs))

22485


In [26]:
# Score every candidate pair on the 'name' column with four comparisons.
# The computed frame labels its feature columns 0..3 — presumably in the order
# the comparisons are registered here (0 = exact, 1 = levenshtein,
# 2 = jarowinkler, 3 = qgram); verify if the order is ever changed.
comp = rl.Compare()
exact = comp.exact('name','name')
levenshtein = comp.string('name','name', method='levenshtein')
jarowinkler = comp.string('name','name', method='jarowinkler')
qgram = comp.string('name', 'name', method='qgram')
# Author's note of available string methods: 'jaro', 'jarowinkler', 'levenshtein',
# 'damerau_levenshtein', 'qgram', 'cosine', 'smith_waterman', 'lcs'.
# From here on `comp` is the result DataFrame, no longer the Compare object.
comp = comp.compute(pairs, names_2_comp, emails_comp).reset_index()
# Combined score = sum of the three string similarities (the exact flag in
# column 0 is left out).
comp['score'] = comp[1]+comp[2]+comp[3]
# Keep, for each grant-side name, only its highest-scoring project-side candidate.
comp = comp.sort_values(['name_id_grnt', 'score'], ascending=[True, False])
comp = comp.drop_duplicates('name_id_grnt', keep='first')

### Merging on Closest Name

In [27]:
# Attach the best-scoring candidate to each unmatched grant-side name, then
# pull in that candidate's project-side name and email summary. Both frames
# carry a 'name' column, hence the name_x / name_y suffixes in the result.
best_candidate = names_2.merge(comp, how='left', on='name_id_grnt')
df = best_candidate.merge(emails, how='left', on='name_id_proj')

### Manual Corrections

In [28]:
# Display the fuzzy matches for manual review (name_x = grant side,
# name_y = closest project-side name).
df

Unnamed: 0,name_id_grnt,name_x,name_id_proj,0,1,2,3,score,name_y,email_count,emails
0,83,ARNAUD VILLERS,118,0,0.933333,0.958095,0.875,2.766429,ARNAULD VILLERS,2.0,arnauld.villers@wanadoo.fr; a-villers@chru-lil...
1,90,AURA CARREIRA MORENO,124,0,0.65,0.93,0.666667,2.246667,AURA CARREIRA,1.0,aura.carreira@curie.fr
2,184,CHRISTELE DESBOIS MOUTON,274,0,0.96,0.992,0.923077,2.875077,CHRISTELE DESBOIS MOUTHON,2.0,christele.desbois-mouthon@inserm.fr; desbois@s...
3,272,ELLEN BENHAMOU,405,0,0.608696,0.921739,0.625,2.155435,ELLEN BENHAMOU BOROWSKI,2.0,ellen.benhamou@gustaveroussy.fr; benhamou@gust...
4,296,ESMA SAADA,440,0,0.588235,0.917647,0.611111,2.116993,ESMA SAADA BOUZID,1.0,esma.saada-bouzid@nice.unicancer.fr
5,312,FEDERICO VERGA,528,0,0.5,0.732873,0.470588,1.703462,FREDERIC COULAMA,1.0,asso.apal@hotmail.com
6,439,IGNACIO GARRIDO STOWHAS,630,0,0.652174,0.930435,0.666667,2.249275,IGNACIO GARRIDO,2.0,garrido-stowhas.ignacio@claudiusregaud.fr; drg...
7,473,JEAN BOURHIS,716,0,0.666667,0.883333,0.684211,2.234211,JEAN HENRI BOURHIS,2.0,bourhis@gustaveroussy.fr; ana.chauvain@gustave...
8,542,JULIE MERVILLE DECHANET,791,0,0.391304,0.93913,1.0,2.330435,JULIE DECHANET MERVILLE,2.0,julie.dechanet@umr5164.u-bordeaux2.fr; julie.d...
9,582,LAURENCE ALBIGES SAUVIN,836,0,0.695652,0.93913,0.708333,2.343116,LAURENCE ALBIGES,2.0,laurence.albiges@gmail.com; laurence.albiges@g...


In [29]:
# Grant-side name ids whose closest candidate was judged wrong in the manual
# review above; they are removed from the fuzzy-match results.
incorrect_matching_grnt_ids = [312]

# Vectorised membership test instead of a per-row Python lambda.
is_incorrect = df['name_id_grnt'].isin(incorrect_matching_grnt_ids)
df_2 = df[~is_incorrect].reset_index(drop = True)

## Combining All Results

In [30]:
# The per-method similarity columns and the combined score are only needed for
# the review step; drop them before combining results.
df_2 = df_2.drop(columns=[0, 1, 2, 3, 'score'])

# name_x is the grant-side name, name_y the matched project-side name.
df_2 = df_2.rename(index=str, columns={'name_x': "name", 'name_y': 'inca_name'})

In [31]:
# For exact matches the project-side name equals the grant-side name, so the
# column is a straight copy; it exists so df_1 has the same columns as df_2.
df_1['inca_name'] = df_1['name']

In [32]:
# Stack exact and fuzzy matches and keep only the output columns.
output_columns = ['name_id_grnt', 'name', 'inca_name', 'email_count', 'emails']
all_matches = pd.concat([df_1, df_2])
df = all_matches[output_columns]

In [33]:
# Re-anchor on the full list of grant-side names so that researchers without
# any match still appear (with nulls) in the final table.
df = names.merge(df, on=['name_id_grnt', 'name'], how='left')

In [34]:
# Persist the linkage result; index=False keeps the positional index out of
# the file.
df.to_csv('../output/email_linkage.csv', index = False)

## Summary Statistics

In [35]:
# Overview of the final linkage table.
df.describe(include = 'all')

Unnamed: 0,name_id_grnt,name,inca_name,email_count,emails
count,1000.0,1000,999,999.0,908
unique,,1000,999,,908
top,,SAMUEL VALABLE,SAMUEL VALABLE,,marc.ychou@montpellier.unicancer.fr; marc.ycho...
freq,,1,1,,1
mean,499.5,,,1.168168,
std,288.819436,,,0.624923,
min,0.0,,,0.0,
25%,249.75,,,1.0,
50%,499.5,,,1.0,
75%,749.25,,,1.0,


In [36]:
# Each message reports the number of non-null values in one column of the
# final table.
summary_lines = [
    ("There are {} unique INCA researchers in the grant data.", 'name'),
    ("{} researchers were matched to the INCA database.", 'inca_name'),
    ("The INCA database has email information for {} of these researchers.", 'emails'),
]
for template, column in summary_lines:
    print(template.format(df[column].count()))

There are 1000 unique INCA researchers in the grant data.
999 researchers were matched to the INCA database.
The INCA database has email information for 908 of these researchers.


In [37]:
# Integer email counts for matched researchers only (unmatched rows have a
# null count). Computed once with a vectorised cast and reused for both
# tables below, instead of filtering and casting element-by-element twice.
email_counts = df['email_count'].dropna().astype('int64')

print("Some researchers have several emails. Here is the distribution:\n")
print(email_counts.value_counts())
print("\nAs % of total:\n")
print(email_counts.value_counts(normalize = True))

Some researchers have several emails. Here is the distribution:

1    680
2    200
0     91
3     25
4      3
Name: email_count, dtype: int64

As % of total:

1    0.680681
2    0.200200
0    0.091091
3    0.025025
4    0.003003
Name: email_count, dtype: float64


## Sandbox