In [1]:
# Raw input files; all paths are relative to the notebook's working directory.
# Read with read_dataset_with_variants below; every resulting string is labelled GEN.
raw_generic_dataset = '../../datasets/generic_orgs_and_variants.json'
# Read with read_dataset below (one `variant.name` per doc); also labelled GEN.
raw_generic_dataset2 = '../../datasets/specific_orgs_generic_variants.json'
# Read with read_dataset_with_variants below; every resulting string is labelled SPE.
raw_specific_dataset = '../../datasets/specific_orgs_and_specific_variants.json'
# CSV with a `name` column and a `numdocs` count column; labelled SPE below.
ipr_dataset = '../../datasets/ipr_20191030_names.csv'

In [2]:
import sys
# Put ../generic-classifier/ on the module search path so its modules can be imported from this notebook.
sys.path.append('../generic-classifier/')


In [3]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Utility functions

In [11]:
!head -n 25 $raw_generic_dataset

{
  "responseHeader":{
    "status":0,
    "QTime":1,
    "params":{
      "q":"{!parent which=\"contentType:org\"}address.country:xd",
      "fl":"*,[child parentFilter=\"contentType:org\" childFilter=\"contentType:variant AND variant.name:*\" limit=10000]",
      "rows":"2000"}},
  "response":{"numFound":1926,"start":0,"docs":[
      {
        "id":"60018049",
        "contentType":"org",
        "name":"Department of Critical Care Medicine",
        "normalizedName":"department of critical care medicine",
        "primary":false,
        "scope":"unknown",
        "type":["ddep"],
        "standAlone":false,
        "main":true,
        "generic":false,
        "val":"unv",
        "_version_":1639813966023622656},
      {
        "id":"60016180",
        "contentType":"org",


In [6]:
def _read_child_docs(row):
    """Turn one parent row's nested child documents into a DataFrame.

    Parameters
    ----------
    row : mapping (e.g. a ``pd.Series`` yielded by ``DataFrame.iterrows``)
        Must provide the keys ``'_childDocuments_'`` and ``'id'``.

    Returns
    -------
    pandas.DataFrame or None
        One row per child document, with the ``id`` column set to the parent's
        ``row['id']`` (any ``id`` carried by the children is overwritten).
        ``None`` when the row has no child documents (value is ``None`` or NaN).
    """
    children = row['_childDocuments_']
    # NaN is never equal to anything, itself included, so comparing against
    # float('nan') with `!=` cannot detect a missing value. Self-inequality is
    # the reliable scalar NaN test and is safe for list values too.
    if children is None or (isinstance(children, float) and children != children):
        return None
    df = pd.DataFrame(children)
    df['id'] = row['id']
    return df

def read_dataset_with_variants(path):
    """Load org names and their variant names from a JSON response dump.

    The file is expected to hold a JSON object whose ``['response']['docs']``
    entry is a list of org documents. Each document contributes one row from
    its ``name`` field and, when it has ``_childDocuments_``, one row per
    child from the child's ``variant.name`` field.

    Parameters
    ----------
    path : str
        Path to the JSON file.

    Returns
    -------
    pandas.DataFrame
        Columns ``input`` (the string), ``id`` (the org's id; variant rows
        carry their parent's id) and ``name_type`` (``'name'`` or
        ``'variant'``). The index is not reset, so labels repeat.
    """
    # JSON interchange files are UTF-8; do not rely on the platform default.
    with open(path, encoding='utf-8') as f:
        ds = json.load(f)
    # The file is only needed for parsing, so everything else runs after it is closed.
    df = pd.DataFrame(ds['response']['docs'])
    df = df.rename(columns={'name': 'input'})
    df['name_type'] = 'name'

    frames = [df]
    # The column is absent when no document in the dump has child documents.
    if '_childDocuments_' in df.columns:
        parents = df.dropna(subset=['_childDocuments_'])
        child_frames = [_read_child_docs(row) for _, row in parents.iterrows()]
        child_frames = [frame for frame in child_frames if frame is not None]
        # pd.concat raises on an empty sequence, so only build the variant
        # frame when there is at least one child frame.
        if child_frames:
            child_docs = pd.concat(child_frames, sort=False)
            child_docs = child_docs.rename(columns={'variant.name': 'input'})
            child_docs['name_type'] = 'variant'
            frames.append(child_docs)

    df = pd.concat(frames, sort=False)
    return df[['input', 'id', 'name_type']]
    
def read_dataset(path):
    """Load variant names from a JSON response dump without nested children.

    The file is expected to hold a JSON object whose ``['response']['docs']``
    entry is a list of documents with ``variant.name`` and ``id`` fields.

    Parameters
    ----------
    path : str
        Path to the JSON file.

    Returns
    -------
    pandas.DataFrame
        Columns ``input`` (from ``variant.name``) and ``id``. Ids of the form
        ``<digits>-<suffix>`` are reduced to the leading digits; ids that do
        not match that shape are left unchanged.
    """
    # JSON interchange files are UTF-8; do not rely on the platform default.
    with open(path, encoding='utf-8') as f:
        ds = json.load(f)
    df = pd.DataFrame(ds['response']['docs'])
    df = df.rename(columns={'variant.name': 'input'})
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave every id untouched.
    # The raw string avoids the invalid "\d" escape in a normal literal.
    df['id'] = df['id'].str.replace(r'^(\d+)-.+$', r'\1', regex=True)
    return df[['input', 'id']]

# Reading data in

In [7]:
%%time
generics = read_dataset_with_variants(raw_generic_dataset)
generics['label'] = 'GEN'
generics_in_specific = read_dataset(raw_generic_dataset2)
generics_in_specific['label'] = 'GEN'

CPU times: user 2.97 s, sys: 139 ms, total: 3.11 s
Wall time: 3.13 s


In [14]:
generics.sample(10)

Unnamed: 0,input,id,label
493,B6 Department of Medicine,60015766,GEN
1,Faculty of Science and Technology,60069272,GEN
52,Dépt. de Mathématiques,60005657,GEN
25,Dep. Human Genet.,60006093,GEN
275,Department of Pathology (HSC-4N67),60014898,GEN
68,Dept Cell & Molec.Biol.,60004041,GEN
329,Pathol. Dept.,60014898,GEN
1,Dept. of Mech. Eng.,60007744,GEN
622,Department of Medicine (111W),60015766,GEN
36,Dipartimento di Biologia Cellulare,60021002,GEN


In [20]:
%%time
specifics = read_dataset_with_variants(raw_specific_dataset)
specifics['label'] = 'SPE'

CPU times: user 5min 53s, sys: 26.9 s, total: 6min 20s
Wall time: 6min 24s


In [21]:
specifics[specifics['name_type'] == 'name'].sample(10)

Unnamed: 0,input,id,name_type,label
18750,Sandton Medi-Clinic,60014365,name,SPE
41985,"Birjand University of Medical Sciences, Infect...",60115049,name,SPE
22649,Geological Survey of Canada Sidney,60024757,name,SPE
58081,"University of Zurich, Sociological Institute",60070528,name,SPE
17795,Human-centered Design Organization,60091851,name,SPE
53018,E.K. Fedorov Institute of Applied Geophysics,60069624,name,SPE
15929,Mcwest Corporation,60109355,name,SPE
81256,Department of Heat and Gas Supply and Ventilation,60120965,name,SPE
7411,Hungarian Institute for Building Science,60068475,name,SPE
6518,Reproductive Toxicology Center - DC,60025362,name,SPE


In [25]:
specifics[specifics['name_type'] == 'variant'].sample(10)

Unnamed: 0,input,id,name_type,label
32,Canada Research Chair in IT in Healthcare,60088929,variant,SPE
599,Universität Bonn,60007493,variant,SPE
2,University of Azad,60089171,variant,SPE
58,Samung Electronics Yongin,60003780,variant,SPE
17,Moudsley Hospital,60032249,variant,SPE
152,Academy of Sciences of the Latvian S.S.R,60069098,variant,SPE
22,Inst. Univ. Euroéen de la Mer,60026314,variant,SPE
172,Slovak Acad. Sci.,60010330,variant,SPE
53,Federal Research Institute of Nutrition and Bi...,60116207,variant,SPE
7,Enviro-Chemie GmbH Wasser- und Abwassertechnik,60074166,variant,SPE


In [11]:
# Load the IPR name counts; rows without a `numdocs` value are counted as 1.
specifics_from_ipr = (
    pd.read_csv(ipr_dataset)
    .fillna({'numdocs': 1})
    .rename(columns={'name': 'input'})
)

In [12]:
specifics_from_ipr.head()

Unnamed: 0,input,numdocs
0,Guizhou Minzu University,8.0
1,Showa General Hospital,20.0
2,National Taipeiuniversity Of Technology,2.0
3,Ministry Of Science Technology,1.0
4,At The University Of Colorado Health Sciences ...,4.0


In [13]:
# Force every name to a Python str.
# NOTE: astype(str) turns missing names (NaN) into the literal string 'nan'.
specifics_from_ipr['input'] = specifics_from_ipr['input'].astype(str)
# Every IPR-derived string is labelled SPE.
specifics_from_ipr['label'] = 'SPE'

In [47]:
specifics_from_ipr.sample(10)

Unnamed: 0,input,numdocs,label
1019416,Houston Orthopaedic Surgery And Sports Medicine,2.0,SPE
3349548,Polytechnic School Of Porto,1.0,SPE
238738,Instanes Consulting Engineers,1.0,SPE
2202675,Paducah Veterinary Clinic,1.0,SPE
3638473,Danish Seed Savers,1.0,SPE
4371103,Design Enablement Team,5.0,SPE
6112302,Mljet Health Centre,1.0,SPE
1794835,Laboratoire d'anatomopathologie Zohour,1.0,SPE
2586898,Université de Hearst,1.0,SPE
4726706,Divisione Chirurgia Generale e Servizio Chirur...,1.0,SPE


In [75]:
# Org strings made of a single whitespace-delimited token.
single_word_org_strings = specifics_from_ipr[specifics_from_ipr.input.str.split().str.len() == 1]
# Show the 20 with the highest document counts.
single_word_org_strings.sort_values('numdocs', ascending=False).iloc[:20]

Unnamed: 0,input,numdocs,label
5161967,Cnrs,110100.0,SPE
3466847,Ieee,63177.0,SPE
4909710,Cern,51073.0,SPE
3140632,Cnr,35270.0,SPE
1631136,Ucla,32028.0,SPE
1946387,Inserm,30850.0,SPE
578886,Mit,27933.0,SPE
4857670,Csiro,27469.0,SPE
232130,Kaist,26003.0,SPE
4985740,Riken,25138.0,SPE


# Stats

In [17]:
len(generics)

26903

In [18]:
len(generics_in_specific)

19898

In [19]:
len(specifics)

2059623

In [31]:
len(specifics_from_ipr)

6319720

# Writing to files

... so that we don't have to do this again

In [13]:
!mkdir ../output

In [14]:
# CSV destinations for the frames loaded above (written in the next cell).
# Destination for `generics`.
generic_dataset = '../output/generic_orgs_and_variants.csv'
# Destination for `generics_in_specific`.
generic_dataset2 = '../output/specific_orgs_generic_variants.csv'
# Destination for `specifics`.
specific_dataset = '../output/specific_orgs_and_specific_variants.csv'

In [15]:
%%time
# Persist the parsed frames as CSV; index=False drops the (non-unique) row index.
generics.to_csv(generic_dataset, index=False)
generics_in_specific.to_csv(generic_dataset2, index=False)
specifics.to_csv(specific_dataset, index=False)

CPU times: user 4.53 s, sys: 313 ms, total: 4.84 s
Wall time: 4.88 s


## Adding counts to generic orgs

Generic orgs come without any indication of how prevalent they are in the data, which makes it hard to tune models towards the more common org strings. To address this, I joined the generic org strings to their counts in an ANI table. The processing was done on Databricks to avoid downloading a large amount of data: https://elsevier.cloud.databricks.com/#notebook/2157896/command/2157897

The result is stored in `../../datasets/mle_linked_generic_orgs.csv`.

## Joined dataset

At first, I used org strings extracted from `specific_orgs_and_specific_variants.json`, but that dataset turned out not to be clean enough. Only a small fraction of its strings are actually generic, but because the dataset is so large, these mislabeled strings are as numerous as the true generics, or more so. As a result, precision would look terrible when we evaluate.

Therefore, I decided to use org strings extracted from Scopus IPR instead, and to include fewer specific org strings (only the most frequent ones).

In [4]:
# Destinations for the two halves of the train/test split written at the end of this section.
training_set_path = '../output/dataset-200k-noisy.train.csv'
test_set_path = '../output/dataset-200k-noisy.test.csv'

In [57]:
# Keep only the 150,000 most frequent IPR strings: the more often a string
# occurs, the less likely it is to be an error.
specifics_from_ipr_small = (
    specifics_from_ipr
    .sort_values('numdocs', ascending=False)
    .head(150000)
)
# Generic org strings with their counts; `count` is renamed so both sources share `numdocs`.
generic_orgs_with_count = (
    pd.read_csv('../../datasets/mle_linked_generic_orgs.csv')
    .rename(columns={'count': 'numdocs'})
)
# Stack both sources; columns missing from one side are filled with NaN.
joined = pd.concat([generic_orgs_with_count, specifics_from_ipr_small], sort=False)

In [66]:
joined.sample(3)

Unnamed: 0,input,id,label,numdocs
1677503,Uka Tarsadia University,,SPE,194.0
1431571,Indraprastha Institute Of Information Technolo...,,SPE,43.0
4663895,National Academy Of Sciences,,SPE,250.0


In [67]:
# 50/50 split stratified on the label. The fixed seed makes the split, and the
# CSV files written from it, reproducible across kernel restarts.
train, test = train_test_split(joined, test_size=0.5, stratify=joined.label, random_state=42)

In [68]:
train.label.value_counts()

SPE    75000
GEN    15147
Name: label, dtype: int64

In [69]:
test.label.value_counts()

SPE    75000
GEN    15148
Name: label, dtype: int64

In [72]:
train.sample(10)

Unnamed: 0,input,id,label,numdocs
3746580,Information And Communications Univ.,,SPE,93.0
7130,Systems Engineering Department,60007222.0,GEN,1098.0
3587007,Egis Pharmaceuticals Ltd,,SPE,40.0
3137932,Instituto De Atención Pediátrica,,SPE,55.0
3570742,National Research Center (nrc),,SPE,305.0
22519,School of Infonnation Science and Engineering,60118697.0,GEN,10.0
3609912,Instytutu Włókiennictwa,,SPE,52.0
2382028,Klinikum Schwabing,,SPE,200.0
3993,Abteilung Gynakologie,60019107.0,GEN,3.0
2409078,Edhec,,SPE,69.0


In [71]:
# Write both halves of the split; index=False drops the row index inherited from the source frames.
train.to_csv(training_set_path, index=False)
test.to_csv(test_set_path, index=False)

## Stats

In [73]:
!du -sh ../output/*.csv

3.7M	../output/dataset-200k-noisy.test.csv
3.7M	../output/dataset-200k-noisy.train.csv
 41M	../output/dataset-2M-noisy.test.csv
 41M	../output/dataset-2M-noisy.train.csv
1.1M	../output/generic_orgs_and_variants.csv
 99M	../output/specific_orgs_and_specific_variants.csv
888K	../output/specific_orgs_generic_variants.csv
