In [1]:
! pip -q install datasets

In [2]:
import re
import time
import pickle
import pandas as pd
import numpy as np
from collections import Counter
from tqdm.notebook import tqdm

In [3]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

Reusing dataset conll2003 (/opt/tmp/huggingface/datasets/conll2003/conll2003/1.0.0/26b70ce2b0f32cb35a27151dbfa2dbe88c82bcdaf8f29433bcdc612a9b314e83)


In [4]:
cn = pd.read_csv('conceptnet_en.csv')
cn_isa = pd.read_csv('data/conceptnet_isa.csv')

In [7]:
cn_isa.object.unique()

array(['film', 'band', 'weapon', 'album', 'device', 'book', 'company',
       'musical_artist', 'tangible_thing', 'person', 'river',
       'given_name', 'software', 'city', 'disease', 'capital',
       'agent_non_geographical', 'area', 'event', 'activity', 'state',
       'human', 'magazine', 'asian', 'person_with_nationality', 'country',
       'administrative_region', 'region', 'place', 'ethnic_group',
       'soccer_player', 'african', 'organization', 'people', 'structure',
       'station', 'artifact', 'part', 'location', 'town'], dtype=object)

In [4]:
cn_keys = set(cn.subject.values)

In [5]:
len(cn_keys)

1165189

In [6]:
cn.subject.values[1800:1823]

array(['bona_fide', 'bona_fide', 'bonanza', 'bondage', 'bone', 'bone',
       'bone', 'bone', 'bone', 'boo', 'book', 'book', 'book',
       'book_smart', 'boom', 'boon', 'bore', 'bore', 'bore', 'boreal',
       'boreal', 'boreas', 'boreas'], dtype=object)

In [45]:
word2labels = {}
for w in cn_isa.subject.unique():
    subcn = cn_isa[cn_isa.subject == w]
    word2labels[w] = subcn.object.values.tolist()

In [46]:
pickle.dump(word2labels, open('edges/word2labels.pickle', 'wb'))

In [8]:
! ls data

conceptnet_isa.csv	       eng.testa	  eng.train
conll2003_sparse_dev.pickle    eng.testa.openNLP  eng.train.openNLP
conll2003_sparse_test.pickle   eng.testb	  ws2_40r
conll2003_sparse_train.pickle  eng.testb.openNLP  ws3_40r


In [9]:
dataset['train']['words'][:3], dataset['train']['pos'][:3], dataset['train']['ner'][:3] 

([['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
  ['Peter', 'Blackburn'],
  ['BRUSSELS', '1996-08-22']],
 [['NNP', 'VBZ', 'JJ', 'NN', 'TO', 'VB', 'JJ', 'NN', '.'],
  ['NNP', 'NNP'],
  ['NNP', 'CD']],
 [['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
  ['B-PER', 'I-PER'],
  ['B-LOC', 'O']])

In [47]:
''.join(['!', '$', '%', '&', "'", '*', '+', ',', '-', '.', ':', ';', '<', '=', '>', '?', '@', '`'])

"!$%&'*+,-.:;<=>?@`"

In [10]:
punctuation = ['!', '$', '%', '&', "'", '*', '+', ',', '-', '.', ':', ';', '<', '=', '>', '?', '@', '`']

vocabulary = {}
data = {}

special_cases = []
special_num_cases = []
special_O_cases = []

for split in['train', 'validation', 'test']:
    # print(split)
    data[split] = []
    vocabulary[split] = set()
    
    for doc in tqdm(dataset[split], desc=split.upper()):
        tokens, labels, extras = [], [], []
        
        for token, pos, label in zip(doc['words'], doc['pos'], doc['ner']):
            if token == pos:
                continue # this is punctuation
            
            elif pos == ',':
                pos = 'NNP'
            
            if token.endswith('='):
                token = token[:-1]
            
            while token and token[0] in punctuation:
                token = token[1:]
            
            token = re.sub(r'\d+', '<NUM>', token)
            token = token.replace('`', "'")
            
            if not token:
                continue
            
            if all([c in ',.-' for c in token.split('<NUM>')]):
                special_num_cases.append((token, label))
                token = '<NUM>'
            elif not token.isalpha() and label != 'O':
                special_cases.append((token, label))
            elif not token.isalpha() and label == 'O':
                special_O_cases.append((token, label))
            
            extra = ['<'+pos.lower()+'>']
            if token.lower() in word2labels:
                extra.extend(['<'+l.lower()+'>' for l in word2labels[token.lower()]])
                
            if token.lower() not in cn_keys:
                extra.append('<not_in_dict>')
            if token == token.upper():
                extra.append('<all_caps>')
            if token.count('.') > 0 and (token.count('.') + 1) == len(token.split('.')): # C.J or C.J.
                extra.append('<accronym>')
            if token[0] == token[0].upper() and token[:1] == token[:1].lower(): 
                extra.append('<capitalized>')
                
                
            vocabulary[split].add(token.lower())
            tokens.append(token)
            labels.append(label)
            extras.append(extra)
            

        data[split].append((tokens, labels, extras))

HBox(children=(FloatProgress(value=0.0, description='TRAIN', max=14041.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='VALIDATION', max=3250.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='TEST', max=3453.0, style=ProgressStyle(description_width=…




In [11]:
data['train'][0]

(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb'],
 ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O'],
 [['<nnp>', '<all_caps>'],
  ['<vbz>'],
  ['<jj>', '<human>', '<person>', '<person_with_nationality>'],
  ['<nn>'],
  ['<to>'],
  ['<vb>'],
  ['<jj>', '<person>', '<person_with_nationality>'],
  ['<nn>']])

In [12]:
extra_vocab = list(set([e for example in data['train'] for l in example[2] for e in l]))
print(extra_vocab)

['<weapon>', '<capitalized>', '<wp$>', '<in>', '<cd>', '<jjs>', '<vbd>', '<wrb>', '<jj>', '<rbr>', '<band>', '<human>', '<ex>', '<(>', '<to>', '<state>', '<disease>', '<fw>', '<vbg>', '<film>', '<agent_non_geographical>', '<dt>', '<artifact>', '<asian>', '<given_name>', '<nns>', '<wp>', '<soccer_player>', '<part>', '<vbz>', '<location>', '<city>', '<vbp>', '<vbn>', '<jjr>', '<person_with_nationality>', '<book>', '<river>', '<pdt>', '<rb>', '<software>', '<all_caps>', '<organization>', '<town>', '<african>', '<region>', '<wdt>', '<nnps>', '<pos>', '<not_in_dict>', '<vb>', '<people>', '<musical_artist>', '<ls>', '<nnp>', '<company>', '<station>', '<country>', '<area>', '<sym>', '<nn|sym>', '<activity>', '<$>', '<ethnic_group>', '<album>', '<uh>', '<prp>', '<rp>', '<administrative_region>', '<magazine>', '<event>', '<prp$>', '<person>', '<tangible_thing>', '<place>', '<nn>', '<cc>', '<rbs>', '<device>', '<capital>', '<md>', '<)>', '<accronym>', '<structure>']


In [13]:
# print(set([x[0] for x in special_O_cases]))

In [14]:
all_voc = set([w for split in vocabulary for w in vocabulary[split]])
print(len(all_voc))

21647


In [15]:
[len(vocabulary[split]) for split in vocabulary]

[17292, 7754, 7161]

In [16]:
all_words = [x for l in data['train'] for x in l[0]]
train_counter = Counter(all_words)

In [17]:
train_labels_counter = Counter([x for l in data['test'] for x in l[1]])
train_labels_counter

Counter({'O': 32515,
         'B-LOC': 1668,
         'B-PER': 1617,
         'I-PER': 1156,
         'I-LOC': 257,
         'B-MISC': 702,
         'I-MISC': 210,
         'B-ORG': 1661,
         'I-ORG': 830})

In [18]:
intials = []
accronyms = []
whatelse = []
hyphenated = []

for term, label in special_cases:
    if term == term.upper() and term.count('.') > 0 and term.count('.') == len(term.split('.')) - 1 and len(term) <= 2:
        intials.append((term, label))
    elif term == term.upper() and term.count('.') > 0 and term.count('.') == len(term.split('.')) - 1 and len(term) > 2:
        accronyms.append((term, label))
    elif '-' in term and len(term.split('-')) > 1 and  (term.split('-')[0] == term.split('-')[0].lower() or (term.split('-')[1] == term.split('-')[1].lower())):
        hyphenated.append((term, label))
    else:
        whatelse.append((term, label))
# print('\n'.join(str(c) for c in set(whatelse)))
print(len(whatelse))

602


# Build the edgelist

In [19]:
final_vocab = sorted(list(vocabulary['train']) + ['<span>'] + list(extra_vocab))
len(final_vocab)

17377

In [28]:
final_vocab[:90]

['/',
 '<$>',
 '<(>',
 '<)>',
 '<accronym>',
 '<activity>',
 '<administrative_region>',
 '<african>',
 '<agent_non_geographical>',
 '<album>',
 '<all_caps>',
 '<area>',
 '<artifact>',
 '<asian>',
 '<band>',
 '<book>',
 '<capital>',
 '<capitalized>',
 '<cc>',
 '<cd>',
 '<city>',
 '<company>',
 '<country>',
 '<device>',
 '<disease>',
 '<dt>',
 '<ethnic_group>',
 '<event>',
 '<ex>',
 '<film>',
 '<fw>',
 '<given_name>',
 '<human>',
 '<in>',
 '<jj>',
 '<jjr>',
 '<jjs>',
 '<location>',
 '<ls>',
 '<magazine>',
 '<md>',
 '<musical_artist>',
 '<nn>',
 '<nnp>',
 '<nnps>',
 '<nns>',
 '<nn|sym>',
 '<not_in_dict>',
 '<num>',
 '<num>)<num>',
 '<num>*',
 '<num>,<num>-a',
 '<num>,<num>-acre',
 '<num>,<num>-b',
 '<num>,<num>-hectare',
 '<num>,<num>-km',
 '<num>,<num>-seat',
 '<num>,<num>-strong',
 '<num>,<num>-student',
 '<num>,<num>nd',
 '<num>,<num>th',
 '<num>--',
 '<num>--<num>-<num>-<num>-<num>',
 '<num>--ruehe',
 '<num>-<num>(<num>-<num>',
 '<num>-<num>-<num>--',
 '<num>-<num>/<num>',
 '<num>-<nu

In [21]:
word2id = {w:i for i,w in enumerate(final_vocab)}

In [29]:
word2id['ismail']

7858

In [30]:
len(word2id)

17377

In [31]:
pickle.dump(final_vocab, open('edges/vocabulary.pickle', 'wb'))

In [32]:
before_edges = {w: [] for w in final_vocab}
after_edges  = {w: [] for w in final_vocab}
isa_edges    = {w: [] for w in final_vocab}
vocab_dict   = {w: [] for w in final_vocab}

window_size = 2

for split in data:
    for example in tqdm(data[split], desc=split.upper()):
        text = [w.lower() for w in example[0]]
        for i, word in enumerate(text):
            term = word.lower()
            if term not in vocab_dict: # new words appearing only in the eval and test
                term = '<span>'
            left_context  = text[max(i-window_size, 0):i] + ([] if i >= window_size else ['<span>'])
            right_context = text[i+1:i+1+window_size] + ([] if i + window_size < len(text) else ['<span>'])
            left_context  = [w if w in vocab_dict else '<span>' for w in left_context]
            right_context = [w if w in vocab_dict else '<span>' for w in right_context]
            isa_context   = example[2][i]
            
            before_edges[term].extend(right_context)
            after_edges[term].extend(left_context)
            isa_edges[term].extend(isa_context)



HBox(children=(FloatProgress(value=0.0, description='TRAIN', max=14041.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='VALIDATION', max=3250.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='TEST', max=3453.0, style=ProgressStyle(description_width=…




In [33]:
[l[0] for l in sorted(isa_edges.items(), key=lambda k: len(set(k[1])), reverse=True)[:20]]

['<span>',
 '<num>',
 'city',
 'off',
 'record',
 'top',
 'back',
 'end',
 'japanese',
 'market',
 'open',
 'police',
 'put',
 'quarter',
 'south',
 'area',
 'balance',
 'base',
 'book',
 'bucharest']

In [34]:
edge_list_before = []
edge_list_after  = []
edge_list_isa    = []

for word in vocab_dict:
    edge_list_before.extend((word2id[word], word2id[w]) for w in before_edges[word])
    edge_list_after.extend((word2id[word], word2id[w]) for w in after_edges[word])
    edge_list_isa.extend((word2id[word], word2id[w]) for w in isa_edges[word])

In [35]:
len(edge_list_before), len(edge_list_after), len(edge_list_isa)

(505816, 505816, 460646)

In [36]:
len(set(edge_list_before)), len(set(edge_list_after)), len(set(edge_list_isa))

(220835, 221104, 32046)

In [37]:
len_all_edges = len(set(edge_list_before)) + len(set(edge_list_after)) + len(set(edge_list_isa))
edges_list_unique = sorted(set(edge_list_before).union(set(edge_list_after)))
len_unique_context_edges = len(edges_list_unique)
len_all_edges, len_unique_context_edges, len_all_edges - len_unique_context_edges - len(set(edge_list_isa))

(473985, 401452, 40487)

In [38]:
edges_list_all = sorted(set(edge_list_before).union(set(edge_list_after).union(set(edge_list_isa))))

In [39]:
len(edges_list_all)

433498

In [40]:
! ls

20M_classifier_all_available_features.ipynb   nodes_classifier.ipynb
conceptnet_en.csv			      onehot_pytorch_lightning.ipynb
data					      prepare_dataset.ipynb
edge_list_generation.ipynb		      README.md
edges					      runs
graph_embeddings_generation.ipynb	      snap
GraphNER_binary_representation_pytorch.ipynb  tempGraph.emb
hparams.yaml				      tempGraph.graph
lightning_logs


In [41]:
Counter(edge_list_before).most_common(10)

[((48, 48), 21248),
 ((48, 186), 11338),
 ((186, 186), 2662),
 ((7470, 15542), 1553),
 ((10757, 15542), 1445),
 ((15542, 186), 1335),
 ((15542, 10757), 1170),
 ((186, 48), 1152),
 ((13451, 186), 948),
 ((15736, 15542), 836)]

In [42]:
final_vocab[7470]

'in'

In [44]:
final_vocab[:100]

['/',
 '<$>',
 '<(>',
 '<)>',
 '<accronym>',
 '<activity>',
 '<administrative_region>',
 '<african>',
 '<agent_non_geographical>',
 '<album>',
 '<all_caps>',
 '<area>',
 '<artifact>',
 '<asian>',
 '<band>',
 '<book>',
 '<capital>',
 '<capitalized>',
 '<cc>',
 '<cd>',
 '<city>',
 '<company>',
 '<country>',
 '<device>',
 '<disease>',
 '<dt>',
 '<ethnic_group>',
 '<event>',
 '<ex>',
 '<film>',
 '<fw>',
 '<given_name>',
 '<human>',
 '<in>',
 '<jj>',
 '<jjr>',
 '<jjs>',
 '<location>',
 '<ls>',
 '<magazine>',
 '<md>',
 '<musical_artist>',
 '<nn>',
 '<nnp>',
 '<nnps>',
 '<nns>',
 '<nn|sym>',
 '<not_in_dict>',
 '<num>',
 '<num>)<num>',
 '<num>*',
 '<num>,<num>-a',
 '<num>,<num>-acre',
 '<num>,<num>-b',
 '<num>,<num>-hectare',
 '<num>,<num>-km',
 '<num>,<num>-seat',
 '<num>,<num>-strong',
 '<num>,<num>-student',
 '<num>,<num>nd',
 '<num>,<num>th',
 '<num>--',
 '<num>--<num>-<num>-<num>-<num>',
 '<num>--ruehe',
 '<num>-<num>(<num>-<num>',
 '<num>-<num>-<num>--',
 '<num>-<num>/<num>',
 '<num>-<nu

In [43]:
edge_lists = {'before_edges': edge_list_before,
              'after_edges': edge_list_after,
              'isa_edges': edge_list_isa,
              'context_edges': edges_list_unique,
              'all_edges': edges_list_all,
              }

In [97]:
for filename in edge_lists:
    pickle.dump(edge_lists[filename], open('edges/' + filename + '.pickle', 'wb'))

In [114]:
for filename in edge_lists:
    with open('edges/' + filename + '.edgelist', 'w') as f:
        for s, t in edge_lists[filename]:
            f.write(f'{s} {t}\r')