In [1]:
# extracted in extract-datasets.ipynb
# CSV inputs for this notebook; all three are loaded with pd.read_csv further down.
generic_dataset = '../output/generic_orgs_and_variants.csv'
generic_dataset2 = '../output/specific_orgs_generic_variants.csv'
specific_dataset = '../output/specific_orgs_and_specific_variants.csv'

In [2]:
import sys
sys.path.append('../generic-classifier/')

In [3]:
# from importlib import reload
# import classifyGenericModified
# reload(classifyGenericModified)

In [4]:
import pandas as pd
from classifyGenericModified import extract_acronyms, typeDict, subjectDict, subjApproxDict, subjModDict, orgModDict

In [5]:
# Load every extracted dataset and stack them into a single frame.
# Row labels are not reset, so index values from the individual CSVs may repeat.
paths = [generic_dataset, generic_dataset2, specific_dataset]
frames = [pd.read_csv(csv_path) for csv_path in paths]
all_datasets = pd.concat(frames)

# Extracting ambiguous acronyms (those used by two or more orgs)

An acronym is considered ambiguous if it is associated with two or more institutional IDs. Due to noise in the datasets, some of these institutions might actually be divisions of the same institution.

It's OK to include more acronyms in this list than there actually are, because that would make the generic classifier more conservative and, eventually, the matcher more accurate.

In [6]:
# Extra positional arguments forwarded to extract_acronyms by Series.apply.
# Kept as a tuple because that is the type pandas documents for `args`.
extract_acronyms_args = (typeDict, subjectDict, subjApproxDict, subjModDict, orgModDict)

In [7]:
# Run the acronym extractor over every raw input string and store the result per row.
all_datasets['acronyms'] = all_datasets['input'].apply(extract_acronyms, args=extract_acronyms_args)

In [8]:
# Keep only the rows where at least one acronym was extracted; copy so that the
# column assignment below does not touch all_datasets.
has_acronym = all_datasets['acronyms'].apply(len) > 0
filtered_dataset = all_datasets.loc[has_acronym].copy()
# Tuples are hashable, which the later groupby on this column requires.
filtered_dataset['acronyms'] = filtered_dataset['acronyms'].apply(tuple)

In [9]:
# Preview ten random rows that carry at least one acronym.
filtered_dataset.sample(n=10)

Unnamed: 0,input,id,label,acronyms
414560,Service Central d'Anatomie et de Cytologie Pat...,60002810,SPE,"(CHU,)"
1326671,KLES&apos;s Institute of Dental Sciences,60108934,SPE,"(KLES,)"
1672188,Clin. ORL Hop. Croix Rousse,60010942,SPE,"(ORL,)"
1307228,National Institute of Economic and Social Rese...,60109561,SPE,"(NIESR,)"
1174656,LABEIN Technological Cent,60102749,SPE,"(LABEIN,)"
1265115,MOL Hungarian Oil and Gas Co,60100927,SPE,"(MOL,)"
1350302,Centro de Investigación Y Tecnología Agroali...,60096984,SPE,"(CITA,)"
317872,ADAS Drayton,60006769,SPE,"(ADAS,)"
1188218,IBS Center for RNA Research,60104544,SPE,"(IBS, RNA)"
992466,Fraunhofer Institut Fertigungstechnik Material...,60010500,SPE,"(IFAM,)"


In [10]:
# For each distinct acronym tuple, gather the values of every other column into a set.
grouped_by_acronym = filtered_dataset.groupby('acronyms')
acronym2doc_id = grouped_by_acronym.agg(set)

In [11]:
# An acronym tuple is ambiguous when it maps to two or more distinct ids.
id_count = acronym2doc_id['id'].apply(len)
ambiguous_acronyms = acronym2doc_id.loc[id_count >= 2]

In [12]:
# Number of ambiguous acronym tuples.
ambiguous_acronyms.shape[0]

6424

## Eye-balling

In [13]:
def _print_row(row):
    """Print one row as `acronyms : id : input`, followed by a blank line."""
    print(f"{row['acronyms']} : {row['id']} : {row['input']} \n")


# Show a random handful of ambiguous acronyms for manual inspection.
sampled_rows = ambiguous_acronyms.sample(10).reset_index()
_ = sampled_rows.apply(_print_row, axis=1)

('CRTI',) : {60105579, 60087261, 60109806} : {'Research Center in Industrial Tcchnologies CRTI', 'Research Centre in Industrial Technologies CRTI', 'Research Center in Industrial Technologies CRTI', 'CRTI', 'Centre pour la Recherche en Transplantation et Immunointervention (CRTI)', 'Research Center in Idustrial Technologies CRTI'} 

('EPC',) : {60070011, 60031828} : {'McMaster University Evidence-based Practice Center (EPC)', 'European Policy Center (EPC)'} 

('THOMSON',) : {60007716, 60088549, 60020519, 60048520, 60081943} : {'S.G.S. THOMSON Microelectronics', 'Laboratoire Central de Recherches THOMSON-C. S. F', 'THOMSON Deutsche Thomson Brandt GmbH', 'THOMSON Telecom Belgium', 'S.G.S.-THOMSON Microelectronics', 'THOMSON'} 

('CS1C',) : {60010769, 60029205, 60068643, 60021421} : {'Institut de Ciències de la Terra Jaume Almera (CS1C)', 'Institute de Acuicultura de Torre de la Sal (CS1C)', 'Instituto de la Grasa (CS1C)', 'C.I.D. (CS1C)'} 

('ULA',) : {60002439, 60052106, 60016081, 6006

# Extract generic acronyms

These are acronyms associated with a generic org.

In [14]:
# Reload the generic-org dataset on its own.
generics = pd.read_csv(filepath_or_buffer=generic_dataset)

In [15]:
# Extract acronyms from each generic-org input string.
generics['acronyms'] = generics['input'].apply(extract_acronyms, args=extract_acronyms_args)

In [16]:
# Keep only the generic rows for which at least one acronym was extracted.
generic_has_acronym = generics['acronyms'].apply(len) > 0
filtered_generics = generics.loc[generic_has_acronym]

In [17]:
# Number of generic rows that carry an acronym.
filtered_generics.shape[0]

334

In [18]:
# Preview ten random generic rows that carry an acronym.
filtered_generics.sample(n=10)

Unnamed: 0,input,id,label,acronyms
10979,Department of Botany MRC-166,60001291,GEN,[MRC]
10710,Department of Chemistry (BCC582),60031084,GEN,[BCC582]
14135,Department of Surgery MDC16,60015245,GEN,[MDC16]
18126,Department of Immunology IMM-23,60022232,GEN,[IMM]
16072,Department of Electrical Engineering (DEE),60009736,GEN,[DEE]
13188,ECE Department,60033206,GEN,[ECE]
5646,Orthopaedic Department (RKU),60019361,GEN,[RKU]
5647,Orthopaedic Department (RKU),60019361,GEN,[RKU]
12260,Departamento Cirugía II-ORL,60023169,GEN,[ORL]
14062,Department of Surgery II (ORL),60015245,GEN,[ORL]


# Writing

In [19]:
# Stack the generic rows and all acronym-bearing rows, then flatten each
# acronym sequence into one space-separated string.
# NOTE(review): ambiguous_acronyms is computed earlier but is not used here;
# confirm that including every row of filtered_dataset is intended.
combined = [filtered_generics, filtered_dataset]
acronym_blacklist = pd.concat(combined, sort=False)
acronym_blacklist['acronyms'] = acronym_blacklist['acronyms'].apply(' '.join)

In [22]:
# Write one entry per line, de-duplicated and sorted.
# NOTE(review): the data comes from `acronym_blacklist` but the file is named
# acronym_whitelist.txt; confirm which name reflects the intended use.
# The encoding is pinned so the output does not depend on the platform default.
with open('dicts/acronym_whitelist.txt', 'w', encoding='utf-8') as f:
    unique_entries = sorted(acronym_blacklist['acronyms'].drop_duplicates())
    f.writelines(f'{entry}\n' for entry in unique_entries)