In [4]:
train_ds_path = 'output/dataset-200k-noisy.train.csv'
test_ds_path = 'output/dataset-200k-noisy.test.csv'
abbreviation_mapping_path = 'dicts/mle_abbreviation_mapping_from_ani_20190925.csv'

In [5]:
import sys
sys.path.append('.')

In [6]:
import os
if os.getcwd().endswith('/notebooks'):
    os.chdir(os.path.join(os.getcwd(), '..'))

In [7]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict, OrderedDict
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import dump_svmlight_file
from classifyGenericModified import ExperimentalClassifier, ExperimentalHybridClassifier, normalizeName
from tqdm import tqdm
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.kernel_approximation import Nystroem
import itertools
import re
import joblib
from model import *

In [8]:
# import classifyGenericModified
# import importlib
# importlib.reload(classifyGenericModified)
# from classifyGenericModified import ExperimentalClassifier, ExperimentalHybridClassifier, normalizeName

In [9]:
train_ds = pd.read_csv(train_ds_path)
test_ds = pd.read_csv(test_ds_path)

In [10]:
train_ds.sample(3)

Unnamed: 0,input,id,label,numdocs
12435,Chemical Engineering and Analytical Science,60099298.0,GEN,17.0
14705,Hopital Bichat-claude Bernard,,SPE,516.0
12802,Division Of Cardiology,,SPE,92.0


In [11]:
train_ds.label.value_counts()

SPE    75000
GEN    15147
Name: label, dtype: int64

In [12]:
class_weight = {'SPE': 1, 'GEN': 15}

# Word frequency

In [13]:
tokens = Counter()
for s, c in zip(train_ds.input, train_ds.numdocs):
    for t in s.split():
        # tried adding c instead of 1 but it promotes popular university so much that
        # names frequently pop up in the most common words
        tokens[t.lower()] += 1

In [14]:
tokens_ordered = [t for t, _ in tokens.most_common()]

In [15]:
tokens_group1 = set(tokens_ordered[:1000])
tokens_group2 = set(tokens_ordered[1000:10000])
tokens_group3 = set(tokens_ordered[10000:])

In [16]:
tokens_ordered[:10]

['of',
 'university',
 'and',
 'institute',
 'hospital',
 'de',
 'research',
 'department',
 'center',
 'for']

In [17]:
tokens_ordered[200:220] # more frequent tokens are more likely to be common words

['agency',
 'mathematics',
 'università',
 'services',
 'life',
 'surg.',
 'en',
 'station',
 'heart',
 'a',
 'veterans',
 'army',
 'plant',
 'ras',
 'physical',
 'coll.',
 'hospitalier',
 'unit',
 'fisica',
 'association']

In [18]:
tokens_ordered[10000:10010] # less frequent tokens are more likely to be names

['klinischer',
 'csu',
 'xingtai',
 'setsunan',
 'ambientale',
 'det',
 'strangeways',
 'stockport',
 'torrecárdenas',
 'stn.']

# Feature extraction

In [19]:
def classifyOrg(rule_classifier, ml_classifier, df):
    df['prediction'] = ml_classifier.predict(df.input)
    def override_prediction(row):
        org, norm, norm_no_sw = rule_classifier.preprocess(row['input'])
        if norm_no_sw in rule_classifier.whiteList:
            return "GEN"
        if norm_no_sw in rule_classifier.blackList:
            return "SPE"
        return row['prediction']
    df['prediction'] = df.apply(override_prediction, axis=1)

In [20]:
rule_based_classifier = ExperimentalClassifier('rule', use_multilingual_dicts=True, use_zipcode_us_rule=True)

Loading dictionary...


generating approximate dict: 100%|██████████| 50434/50434 [00:32<00:00, 1546.14it/s]


Loading dictionary done in 34.05 sec.


In [21]:
with open('dicts/topLevelTypes.txt') as f:
    top_level_types = f.read().lower().strip().split()

In [22]:
top_level_words = itertools.chain(rule_based_classifier.cn, rule_based_classifier.univ,
                                  rule_based_classifier.companyTypes, top_level_types)
top_level_word_map = {
    w: '^top' for w in top_level_words
}

In [23]:
word_lists = [ # later lists have precedence
    ('^L', rule_based_classifier.allLoc),
    ('^sbE', rule_based_classifier.expandedSubjectDict),
    ('^S', rule_based_classifier.subjectDict),
    ('^cS', rule_based_classifier.commonSubjectsDict),
    ('^sb', rule_based_classifier.subjModDict),
    ('^mO', rule_based_classifier.orgModDict),
    ('^cn', rule_based_classifier.cn),
    ('^univ', rule_based_classifier.univ),
    ('^cnS', rule_based_classifier.companySuffixes),
    ('^cnT', rule_based_classifier.companyTypes),
    ('^T', rule_based_classifier.typeDict),
    ('^e', rule_based_classifier.wordEndingsDict),
    ('^sw', rule_based_classifier.sw),
    ('^lf', tokens_group3),
    ('^and', 'and|&|y|und|e|og|i|και|ja|et|és|en|ból|ve'.split('|')),
    ('^of', 'of|de|des|der|di|fur|fr|for|für|voor|in|zu|és'.split('|')),
    ('^in', 'in|a|op|zu'.split('|')),
    ('^lat', 'I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX'.split('|')),
]

In [24]:
word2tag = {}
for name, word_list in word_lists:
    # word lists from ExperimentalClassifier are preprocessed already, 
    # strings were lower-cased and there are no duplicates
    for w in tqdm(word_list, position=0, leave=True):
        if w:
            # ideally, each word occurs in only one list
            # in reality, it's commonly that they occur in many lists
            word2tag[w] = name

100%|██████████| 72276/72276 [00:00<00:00, 1073012.59it/s]
100%|██████████| 15072/15072 [00:00<00:00, 1484897.70it/s]
100%|██████████| 50434/50434 [00:00<00:00, 911940.44it/s]
100%|██████████| 236/236 [00:00<00:00, 146125.74it/s]
100%|██████████| 3077/3077 [00:00<00:00, 627961.92it/s]
100%|██████████| 112/112 [00:00<00:00, 102950.26it/s]
100%|██████████| 542/542 [00:00<00:00, 531769.07it/s]
100%|██████████| 36/36 [00:00<00:00, 102300.10it/s]
100%|██████████| 99/99 [00:00<00:00, 151601.35it/s]
100%|██████████| 95/95 [00:00<00:00, 93865.46it/s]
100%|██████████| 569/569 [00:00<00:00, 625244.69it/s]
100%|██████████| 154/154 [00:00<00:00, 181847.64it/s]
100%|██████████| 1911/1911 [00:00<00:00, 859735.59it/s]
100%|██████████| 27474/27474 [00:00<00:00, 1013200.29it/s]
100%|██████████| 14/14 [00:00<00:00, 75379.02it/s]
100%|██████████| 13/13 [00:00<00:00, 26051.58it/s]
100%|██████████| 4/4 [00:00<00:00, 9010.32it/s]
100%|██████████| 20/20 [00:00<00:00, 20815.40it/s]


In [25]:
[(name, len(wl)) for name, wl in word_lists]

[('^L', 72276),
 ('^sbE', 15072),
 ('^S', 50434),
 ('^cS', 236),
 ('^sb', 3077),
 ('^mO', 112),
 ('^cn', 542),
 ('^univ', 36),
 ('^cnS', 99),
 ('^cnT', 95),
 ('^T', 569),
 ('^e', 154),
 ('^sw', 1911),
 ('^lf', 27474),
 ('^and', 14),
 ('^of', 13),
 ('^in', 4),
 ('^lat', 20)]

In [26]:
len(word2tag)

150015

In [34]:
def build_pipeline():
    rule_encoder = OneHotEncoder(handle_unknown='ignore')
    freq_number_vectorizer_func = lambda: CountVectorizer(analyzer=identity_analyzer, max_features=20, binary=True)
    ngram_pattern_vectorizer = CountVectorizer(analyzer=identity_analyzer, ngram_range=(2,5), min_df=10, binary=True)
    joined_pattern_vectorizer = CountVectorizer(analyzer=identity_analyzer, min_df=20, binary=True)
    ngram_pattern_vectorizer2 = CountVectorizer(analyzer=identity_analyzer, ngram_range=(2,5), min_df=10, binary=True)
    joined_pattern_vectorizer2 = CountVectorizer(analyzer=identity_analyzer, min_df=20, binary=True)
    discretizer = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
    feature_extractor = make_pipeline(
        TokenTranslator(abbreviation_mapping_path),
        FeatureUnion([
            ('freq-char-len', make_pipeline(CharLenFeature(), freq_number_vectorizer_func())),
            ('freq-word-len', make_pipeline(WordLenFeature(), freq_number_vectorizer_func())),
            ('freq-min-word-freq', make_pipeline(WordFrequencyFeatures(tokens, stats=['min']), freq_number_vectorizer_func())),
            ('freq-max-word-freq', make_pipeline(WordFrequencyFeatures(tokens, stats=['max']), freq_number_vectorizer_func())),
            ('numerical-discretized', make_pipeline(
                FeatureUnion([
                    ('char_len', CharLenFeature()),
                    ('word_len', WordLenFeature()),
                    ('freq_upper', FreqCapitalsFeature()),
                    ('freq_all_caps', FreqAllCapsFeature()),
                    ('word_freq', WordFrequencyFeatures(tokens)),
                ]),
                discretizer,
            )),
            ('word_list', make_pipeline(
                # didn't use num_top_freq_words because I don't want to override "^top" tags
                WordListTagger(word2tag, 0, default_tag='^nf'),
                FeatureUnion([
                    ('joined', make_pipeline(StringConcat(), joined_pattern_vectorizer)),
                    ('n-grams', ngram_pattern_vectorizer)
                ])
            )),
            ('word_list_top_freq', make_pipeline(
                WordListTagger(word2tag, 100, default_tag='^nf'),
                FeatureUnion([
                    ('joined', make_pipeline(StringConcat(), joined_pattern_vectorizer2)),
                    ('n-grams', ngram_pattern_vectorizer2)
                ])
            )),
            ('top_level_word', make_pipeline( # check for words such as "university", "institution", "ltd."
                WordListTagger(top_level_word_map, 1000, default_tag='other'),
                CountVectorizer(analyzer=identity_analyzer, ngram_range=(1,3), binary=True)
            )),
            ('rule', make_pipeline( 
                # ignore the last one (default rule) because it just emit 'GEN' for everything
                RuleFeatures(rule_based_classifier), 
                rule_encoder
            ))
        ]),
        TfidfTransformer(norm=None), # so that feature vectors are more meaningful (for debugging)
    )
    pipeline = Pipeline([
        ('feature_extractor', feature_extractor),
        ('polynomial_approx', Nystroem(kernel='poly', degree=2)),
        ('classifier', LinearSVC(class_weight=class_weight))
    ])
    return pipeline, feature_extractor

In [35]:
%%time
_, train_sample = train_test_split(train_ds, test_size=0.1, stratify=train_ds.label)
train_sample = train_sample.copy()

CPU times: user 81.3 ms, sys: 3.85 ms, total: 85.1 ms
Wall time: 83.2 ms


In [36]:
%%time
# train_sample = train_ds
pipeline, feature_extractor = build_pipeline()
pipeline.fit(train_sample.input, y=train_sample.label, classifier__sample_weight=train_sample.numdocs)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


CPU times: user 15.5 s, sys: 262 ms, total: 15.8 s
Wall time: 14.7 s


Pipeline(memory=None,
         steps=[('feature_extractor',
                 Pipeline(memory=None,
                          steps=[('tokentranslator',
                                  <model.TokenTranslator object at 0x125e1ea20>),
                                 ('featureunion',
                                  FeatureUnion(n_jobs=None,
                                               transformer_list=[('freq-char-len',
                                                                  Pipeline(memory=None,
                                                                           steps=[('charlenfeature',
                                                                                   <model.CharLenFeature object at 0x125e1ea90>),
                                                                                  ('countvectorizer',
                                                                                   CountVectorizer(analyzer=<...
                ('polynomial_approx',
 

In [37]:
_, test_sample = train_test_split(test_ds, test_size=0.1, stratify=test_ds.label)
test_sample = test_sample.copy()
# test_sample = test_ds

In [38]:
%%time
classifyOrg(rule_based_classifier, pipeline, test_sample)

CPU times: user 8.09 s, sys: 78.4 ms, total: 8.17 s
Wall time: 7.35 s


In [39]:
test_sample

Unnamed: 0,input,id,label,numdocs,prediction
20990,Inst. Natl. Des Telecom.,,SPE,112.0,GEN
18288,Universidade Federal De Santa Maria - Ufsm,,SPE,677.0,SPE
27265,Yashima General Hospital,,SPE,42.0,SPE
36998,Department of Oral Medicine and Pathology,60073765.0,GEN,528.0,GEN
50561,Academy Of Sciences Of The Estonian Ssr,,SPE,138.0,SPE
...,...,...,...,...,...
86028,Drury University,,SPE,292.0,SPE
59305,Pacific Medical Center,,SPE,132.0,SPE
11253,A. A. Bogomoletz Institute Of Physiology,,SPE,92.0,SPE
37646,Departamento de Química Orgánica e Inorgáni...,60031266.0,GEN,1.0,SPE


In [40]:
(test_sample['label'] == test_sample['prediction']).mean()

0.8544647809206878

In [41]:
(test_sample['prediction'] == 'GEN').sum()

2619

In [42]:
len(test_sample)

9015

In [43]:
precision_recall_fscore_support(test_sample.label, test_sample.prediction, labels=['GEN'])

(array([0.53875525]), array([0.93135314]), array([0.68263183]), array([1515]))

In [44]:
precision_recall_fscore_support(test_sample.label, test_sample.prediction, sample_weight=test_sample.numdocs, labels=['GEN'])

(array([0.8765384]),
 array([0.97835137]),
 array([0.92465068]),
 array([2543764.]))

## Examining transformations

### Abbreviation resolution

In [None]:
abbreviation_solver = make_pipeline(
    WordSplitter(),
    TokenTranslator(abbreviation_mapping_path)
)

In [None]:
train_ds['input_abbr_solved'] = abbreviation_solver.transform(train_ds.input)

In [None]:
train_ds[train_ds.input.str.match(r'\b(\w{2,}\.){2,}')][['input', 'input_abbr_solved']].sample(5)

In [113]:
train_ds[train_ds.input.str.match(r'\b\w{3,}\.?(/\w{3,}\.?)+')][['input', 'input_abbr_solved']].sample(5)

Unnamed: 0,input,input_abbr_solved
28531,Anatomy/Cellular Biology Dept.,Anatomy Cellular Biology department
48597,Electrical/Electronic Engg. School,Electrical Electronic Engg. School
8455,Reconstructive/Plastic Surgery Dept.,Reconstructive Plastic Surgery department
63787,Laas/cnrs,Laas cnrs
33851,Noaa/nesdis/center For Satellite Applications ...,Noaa nesdis center For Satellite Applications ...


In [114]:
train_ds[train_ds.input.str.match(r'\b\w+\.\s')][['input', 'input_abbr_solved']].sample(50)

Unnamed: 0,input,input_abbr_solved
44368,Univ. Paris Sud,university Paris Sud
1610,Univ. Hosp.,university hospital
50410,Div. Clin. Immunol. and Rheumatology,division clinical immunology and Rheumatology
60432,Sts. Cyril And Methodius University,Sts. Cyril And Methodius University
20732,S. Carlo Hospital,S. Carlo Hospital
12053,Inst. Genet. Select. Industr. Microorgan.,institute genetics selective industrial microo...
71982,Dept. Surg. IV,department surgery IV
21504,Inst. Pitan.,institute Pitan.
49021,St. Joseph's Medical Center,St. Joseph's Medical Center
39535,Univ. Of Arkansas,university Of Arkansas


### Pattern tagging

In [115]:
pattern_tagger = make_pipeline(
    WordSplitter(),
    TokenTranslator(abbreviation_mapping_path),
    WordListTagger(word2tag, 0, default_tag='^nf'),
)
pattern_tagger2 = make_pipeline(
    WordSplitter(),
    TokenTranslator('dicts/mle_abbreviation_mapping_from_ani_20190925.csv'),
    WordListTagger(word2tag, 100, default_tag='^nf'),
)

In [116]:
train_sample['input_tagged'] = pattern_tagger.fit_transform(train_sample.input)
train_sample['input_tagged2'] = pattern_tagger2.fit_transform(train_sample.input)

In [117]:
(train_sample['input_tagged'] != train_sample['input_tagged2']).mean()

0.8347199112590128

In [118]:
train_sample[['input', 'input_tagged', 'input_tagged2']].sample(50)

Unnamed: 0,input,input_tagged,input_tagged2
28652,Cesi,[^nf],[^nf]
3568,Universidad Autónoma De San Luis Potosí,"[^T, ^nf, ^of, ^S, ^nf]","[universidad, ^nf, de, san, ^nf]"
58361,Heymans Institute Of Pharmacology,"[^lf, ^T, ^of, ^S]","[^lf, institute, of, ^S]"
68372,Fraunhofer-institut Für Fertigungstechnik Und ...,"[^nf, ^of, ^lf, ^and, ^S, ^nf]","[^nf, für, ^lf, und, ^S, ^nf]"
38341,Department of BiomedicalScience,"[^T, ^of, ^lf]","[department, of, ^lf]"
38474,Voronezhskij Gu,"[^lf, ^nf]","[^lf, ^nf]"
53064,"Dr. Harisingh Gour University, Sagar","[^nf, ^L]","[^nf, ^L]"
3992,San Diego Va Healthcare System,"[^S, ^nf, ^S, ^cnS, ^sb]","[san, ^nf, ^S, ^cnS, ^sb]"
56185,Ernst-moritz-arndt-universitat,[^lf],[^lf]
86929,International Space Science Institute-beijing,"[^sb, ^cS, ^lf]","[international, ^sb, science, ^lf]"


In [119]:
pattern_tagger2.steps[2][1].top_freq_words

OrderedDict([('of', 2593),
             ('university', 1626),
             ('institute', 984),
             ('and', 962),
             ('department', 887),
             ('hospital', 864),
             ('research', 648),
             ('de', 639),
             ('center', 456),
             ('for', 401),
             ('medical', 348),
             ('science', 326),
             ('medicine', 325),
             ('college', 322),
             ('sciences', 300),
             ('technology', 277),
             ('engineering', 273),
             ('school', 272),
             ('national', 263),
             ('health', 215),
             ('centre', 200),
             ('the', 183),
             ('academy', 177),
             ('state', 165),
             ('laboratory', 144),
             ('inc.', 117),
             ('institut', 109),
             ('biology', 96),
             ('general', 94),
             ('chemistry', 87),
             ('molecular', 84),
             ('cancer', 81),
             ('

In [120]:
top_level_tagger = WordListTagger(top_level_word_map, 0, default_tag='other')

In [121]:
train_sample['input_top_lvl_tagged'] = top_level_tagger.fit_transform(train_sample.input)

In [122]:
train_sample[['input', 'input_top_lvl_tagged']].sample(10)

Unnamed: 0,input,input_top_lvl_tagged
32566,Bethesda Physiocare,[other]
19782,Physik Department E21,[other]
78781,Ospedale S. Giovanni,[other]
51158,"Institute Of Chemistry, Technology And Metallurgy","[^top, other]"
25233,Wuxi Institute Of Technology,"[other, ^top, other]"
52588,Electronics And Telecommunication Research Ins...,"[other, ^top]"
27185,Div. of Cardiovasc. Dis.-I.,[other]
37793,Medizinische Universitätsklinik,[other]
16770,Linyi Normal University,"[other, ^top]"
35732,Columbia River Inter-tribal Fish Commission,[other]


## Examining feaures

In [123]:
discretizer.bin_edges_

array([array([  3.,  20.,  26.,  32.,  41., 126.]),
       array([ 1.,  2.,  3.,  4.,  5., 17.]),
       array([0.        , 0.09090909, 0.11428571, 0.13043478, 0.14814815,
       1.        ]),
       array([0., 1.]), array([0., 1.]),
       array([1.0000e+00, 5.7600e+02, 6.0640e+03, 9.9680e+03, 2.6746e+04]),
       array([6.66666667e-01, 1.69370000e+02, 1.58412000e+03, 2.90276000e+03,
       5.92960000e+03, 1.36140000e+04])], dtype=object)

In [124]:
ngram_pattern_vectorizer.vocabulary_

{'^T': 2,
 '^of': 13,
 '^sb': 14,
 '^cS': 4,
 '^nf': 12,
 '^cnS': 6,
 '^and': 3,
 '^cnT': 7,
 '^sw': 16,
 '^L': 0,
 '^univ': 17,
 '^lf': 10,
 '^mO': 11,
 '^S': 1,
 '^in': 9,
 '^sbE': 15,
 '^e': 8,
 '^cn': 5}

In [125]:
joined_pattern_vectorizer.vocabulary_

{'^T_^of_^sb': 21,
 '^lf_^sb_^T': 35,
 '^T_^of_^S_^and_^S': 16,
 '^T_^of_^S': 15,
 '^lf': 28,
 '^nf': 37,
 '^lf_^T': 30,
 '^T_^of_^sb_^cS': 25,
 '^nf_^sb_^T': 51,
 '^L_^nf_^T': 5,
 '^sb_^T': 55,
 '^nf_^T_^of_^sb_^S': 44,
 '^lf_^mO_^T': 32,
 '^nf_^cnT': 46,
 '^univ_^of_^L': 57,
 '^sb_^S_^T': 54,
 '^T_^S': 12,
 '^lf_^nf_^T': 34,
 '^T_^of_^sb_^and_^sb': 23,
 '^lf_^nf': 33,
 '^nf_^cnS_^T': 45,
 '^S_^T': 9,
 '^nf_^lf': 47,
 '^nf_^T_^of_^cS': 43,
 '^S_^sb_^T': 11,
 '^nf_^T': 41,
 '^nf_^S': 39,
 '^nf_^univ': 52,
 '^L_^T_^of_^S': 2,
 '^T_^nf': 14,
 '^nf_^S_^nf': 40,
 '^T_^sb': 26,
 '^nf_^T_^of_^S': 42,
 '^T_^of_^cS': 18,
 '^nf_^of_^L': 49,
 '^T_^lf': 13,
 '^lf_^cnT': 31,
 '^sb_^cnS_^T': 56,
 '^L_^univ': 8,
 '^nf_^lf_^nf': 48,
 '^nf_^univ_^T': 53,
 '^T_^of_^sb_^S': 22,
 '^lf_^univ': 36,
 '^nf_^of_^nf': 50,
 '^L_^sb_^T': 7,
 '^L_^T': 1,
 '^univ_^of_^nf': 58,
 '^T_^of_^lf': 19,
 '^T_^of_^S_^lf': 17,
 '^S_^nf': 10,
 '^nf_^L': 38,
 '^T_^sb_^cS': 27,
 '^T_^of_^nf': 20,
 '^T_^of_^sb_^and_^sb_^cS': 24

In [126]:
len(joined_pattern_vectorizer.vocabulary_)

59

# Debugging

In [127]:
gen_orgs = train_ds[train_ds.label == 'GEN'].iloc[:10000].copy()
spe_orgs = train_ds[train_ds.label == 'SPE'].iloc[:10000]

In [128]:
classifyOrg(rule_based_classifier, pipeline, gen_orgs)

In [129]:
gen_orgs_misclassified = gen_orgs[gen_orgs['prediction'] != 'GEN']

In [130]:
len(gen_orgs_misclassified)

560

In [131]:
_ = gen_orgs_misclassified.sample(50).input.apply(print)

Primate Resarch Institute
Department of German Studies
Research Institute for Mathmatical Sciences
Electrical and Computer Engineering (ECE)
Institut für Biochemie (FB 08)
Second Affiliated Hosptial
Education Seoul National University Seoul
Inst. for kulturgeografi
Research Institute of Eye Diseases
Health & Human Rights Program
Centre for Eye Research
Aquarius Project
Primate Reseach Institute
Informaton Technology and Systems Center
Alliance Laboratory for Advanced Medical Research
RST
Plastic/Reconstructive SurgeryDept.
CIMR
Res. Inst. Pharmaceutical Sciences
Wiener University
Dept.Obstet.
Institute for Cancer Research (IRCC)
Institute for Chamical Research
Institute for Natural Sciences
St. Catherine’s College
VIRUTUS
Natura1l Products Research Institute
Vlaams Interuniversitair Instituut voor Biotechnologie (VIB)
Robinson College
Faculty of Arts and Social Sciences
University of Chicago Biological Sciences
National Vegatable Research Station
III Neurological Clinic
BEAR C Enter
C

In [132]:
gen_vecs = feature_extractor.transform(gen_orgs_misclassified.input)
spe_vecs = feature_extractor.transform(spe_orgs.input)

In [133]:
muls = gen_vecs.dot(spe_vecs.transpose())

In [135]:
# sorted_indices = muls.argsort(axis=None) # only work for dense matrix
sorted_items = sorted(muls[:100].todok().items(), key=lambda x: -x[1])

In [136]:
def print_similar_orgs(i):
#     idx = sorted_indices[-i-1] # only work for dense matrix
#     gen_idx, spe_idx = idx // muls.shape[0], idx % muls.shape[0] # only work for dense matrix
    gen_idx, spe_idx = sorted_items[i][0]
    print('Generic:', gen_orgs['input'].iloc[gen_idx])
    print('Specific:', spe_orgs['input'].iloc[spe_idx])
    # print vector elements (not very informative without feature names)
#     for v in zip(gen_vecs[gen_idx].todok().items(), spe_vecs[spe_idx].todok().items()):
#         print(v)

In [137]:
print_similar_orgs(1)

Generic: Institute of Radio Engineering and Information Technology
Specific: Forest Research Institute


In [138]:
print_similar_orgs(2)

Generic: Institute of Radio Engineering and Information Technology
Specific: Space Research Institute


In [139]:
print_similar_orgs(3)

Generic: Institute of Radio Engineering and Information Technology
Specific: Food Research Institute


In [140]:
print_similar_orgs(5)

Generic: Departmento de Geofisica
Specific: Cardiovascular Research Institute


In [141]:
print_similar_orgs(6)

Generic: Departmento de Geofisica
Specific: Child Health Research Institute


In [142]:
print_similar_orgs(10)

Generic: Institute of Radio Engineering and Information Technology
Specific: Child Health Research Institute


In [143]:
print_similar_orgs(13)

Generic: Dept. of Materials Chemistry
Specific: Ues, Inc


In [144]:
print_similar_orgs(15)

Generic: Dept. of Materials Chemistry
Specific: Sparta, Inc


In [145]:
print_similar_orgs(25)

Generic: Dept. of Materials Chemistry
Specific: Honeywell, Inc


In [146]:
print_similar_orgs(35)

Generic: Departamento de Anestesiología
Specific: Veter. Res. Lab.


In [147]:
print_similar_orgs(100)

Generic: Dept. of Materials Chemistry
Specific: Inco Ltd


# Storing to file in LibSVM-friendly format

In [45]:
%%time
pipeline, feature_extractor = build_pipeline()
pipeline.fit(train_ds.input, y=train_ds.label)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


CPU times: user 1min 58s, sys: 1.16 s, total: 1min 59s
Wall time: 1min 55s


Pipeline(memory=None,
         steps=[('feature_extractor',
                 Pipeline(memory=None,
                          steps=[('tokentranslator',
                                  <model.TokenTranslator object at 0x1262e6320>),
                                 ('featureunion',
                                  FeatureUnion(n_jobs=None,
                                               transformer_list=[('freq-char-len',
                                                                  Pipeline(memory=None,
                                                                           steps=[('charlenfeature',
                                                                                   <model.CharLenFeature object at 0x1262e6710>),
                                                                                  ('countvectorizer',
                                                                                   CountVectorizer(analyzer=<...
                ('polynomial_approx',
 

In [50]:
%%time
pipeline_weighted, feature_extractor = build_pipeline()
pipeline_weighted.fit(train_ds.input, y=train_ds.label, classifier__sample_weight=train_ds.numdocs)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


CPU times: user 1min 51s, sys: 1.05 s, total: 1min 52s
Wall time: 1min 47s


Pipeline(memory=None,
         steps=[('feature_extractor',
                 Pipeline(memory=None,
                          steps=[('tokentranslator',
                                  <model.TokenTranslator object at 0x20d066160>),
                                 ('featureunion',
                                  FeatureUnion(n_jobs=None,
                                               transformer_list=[('freq-char-len',
                                                                  Pipeline(memory=None,
                                                                           steps=[('charlenfeature',
                                                                                   <model.CharLenFeature object at 0x20d066b00>),
                                                                                  ('countvectorizer',
                                                                                   CountVectorizer(analyzer=<...
                ('polynomial_approx',
 

In [51]:
%%time
label_encoder = LabelEncoder()
features = feature_extractor.transform(train_ds.input)
labels = label_encoder.fit_transform(train_ds.label)

CPU times: user 1min 26s, sys: 673 ms, total: 1min 27s
Wall time: 1min 24s


In [150]:
dump_svmlight_file(features, labels, 'output/dataset-200k-noisy.train.svm')

In [151]:
%%time
test_features = feature_extractor.transform(test_ds.input)
test_labels = label_encoder.transform(test_ds.label)

CPU times: user 1min 27s, sys: 1.16 s, total: 1min 28s
Wall time: 1min 26s


In [152]:
dump_svmlight_file(test_features, test_labels, 'output/dataset-200k-noisy.test.svm')

In [153]:
joblib.dump(label_encoder, 'output/label_encoder.pkl')

['output/label_encoder.pkl']