In [2]:
import pandas as pd

import numpy as np
# Seed NumPy's global RNG once, up front, so that random steps further down
# (presumably the row sub-sampling via DataFrame.sample) are repeatable when
# the notebook is run top to bottom on a fresh kernel.
np.random.seed(1234)

In [None]:
# Load ./FIGER/train.json (a JSON list of records) into a DataFrame.
train_all = pd.read_json('./FIGER/train.json', orient='records')

In [None]:
# Cache the training frame as CSV; the "Subsampling and Cleaning" section
# reloads it from this file instead of re-parsing the JSON.
train_all.to_csv('./data/train_all.csv')

In [None]:
# Same JSON -> CSV conversion for the test file.
test_all = pd.read_json('./FIGER/test.json', orient='records')
test_all.to_csv('./data/test_all.csv')

In [None]:
# Same JSON -> CSV conversion for the dev file; the CSV is read back before cleaning.
dev_all = pd.read_json('./FIGER/dev.json', orient='records')
dev_all.to_csv('./data/dev_all.csv')

### Subsampling and Cleaning

In [3]:
# Reload the cached training CSV; index_col=0 turns the first column
# (the index written by to_csv) back into the frame's index.
train_all = pd.read_csv('./data/train_all.csv', index_col=0)

  mask |= (ar1 == a)


In [4]:
# Sub-sample 10,000 sentences.
# An explicit random_state (same value as the notebook-wide NumPy seed) makes
# this cell return the same rows every time it is executed, not only on the
# first run after the seed cell.
train_sample = train_all.sample(n=10000, random_state=1234)
train_sample

Unnamed: 0,sent,labels,start,end,ents
1220282,Teams come from all over the world to play : U...,['/location'],79,85,"[['Q30', 45, 58, 0.21681213], ['Q16', 61, 67, ..."
274295,He was signed to the Chicago American Giants i...,"['/news_agency', '/organization', '/organizati...",192,204,"[['Q2963291', 21, 44, 0.64166665], ['Q2399847'..."
544083,It is bounded on the northwest by the U.S. sta...,"['/location', '/location/province']",194,199,"[['Q35657', 38, 48, 0.51353383], ['Q797', 52, ..."
887317,The story is about an Old English Sheepdog who...,['/living_thing'],22,42,"[['Q37704', 22, 42, 0.5], ['Q6691229', 92, 95,..."
1633023,Ardiles-Waku Menga -LRB- born 28 September 198...,"['/organization/sports_team', '/organization']",124,142,"[['Q92125', 13, 18, 0.17703058], ['Q1869064', ..."
...,...,...,...,...,...
699527,Alison Hennegan is a lecturer at the Universit...,"['/organization', '/organization/educational_i...",37,60,"[['Q4727092', 0, 15, 0.5], ['Q35794', 37, 60, ..."
1388387,This album features Integrity worship leaders ...,"['/person/artist', '/person']",109,121,"[['Q15401689', 20, 29, 0.2793484], ['Q842194',..."
35773,"Ghulam Sarwar Nashir -LRB- 1922-1984 -RRB- , a...","['/location/city', '/location']",153,159,"[['Q5557675', 0, 13, 0.12280702], ['Q1869064',..."
487793,"The Birmingham , Northfield by-election of 28 ...","['/government', '/organization', '/government/...",87,99,"[['Q3136714', 4, 27, 0.67899287], ['Q9626', 87..."


In [5]:
import ast

def clean(dataset, name='train'):
    """Explode the per-row label lists and write ``./data/{name}_clean.csv``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least the columns ``sent``, ``labels``, ``start`` and
        ``end``. Each ``labels`` value is either a list of label strings or
        the string repr of such a list (what a CSV round trip produces).
    name : str
        Prefix of the output file name.

    Notes
    -----
    The written file has one row per (row, label) pair: an input row with k
    labels becomes k output rows that share the original index value.
    The caller's frame is not modified, so the function can safely be re-run
    on the same input.
    """
    # Parse only values that are still strings; lists pass through unchanged
    # (ast.literal_eval raises on a list, which made a second call fail).
    parsed_labels = dataset['labels'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    # assign() builds a new frame instead of overwriting the input's column.
    exploded = dataset.assign(labels=parsed_labels).explode('labels')
    exploded[['sent', 'labels', 'start', 'end']].to_csv(f'./data/{name}_clean.csv')

In [7]:
# Writes ./data/train_clean.csv from the 10,000-row sample.
clean(train_sample, 'train')

In [8]:
# The dev split is reloaded from its cached CSV and cleaned in full
# (no sub-sampling); this writes ./data/dev_clean.csv.
dev_all = pd.read_csv('./data/dev_all.csv', index_col=0)
clean(dev_all, 'dev')

### Creating BIO files that can be processed by spaCy

Example of the target format (one sentence per line, each token written as `token|TAG`): https://github.com/explosion/spaCy/blob/master/extra/example_data/ner_example_data/ner-sent-per-line.iob

In [9]:
# Reload the cleaned splits written by clean(); each row now carries a single
# label, so one sentence can appear on several rows with the same index value.
train_clean = pd.read_csv('./data/train_clean.csv', index_col=0)
dev_clean = pd.read_csv('./data/dev_clean.csv', index_col=0)
train_clean

Unnamed: 0,sent,labels,start,end
1220282,Teams come from all over the world to play : U...,/location,79,85
274295,He was signed to the Chicago American Giants i...,/news_agency,192,204
274295,He was signed to the Chicago American Giants i...,/organization,192,204
274295,He was signed to the Chicago American Giants i...,/organization/sports_league,192,204
544083,It is bounded on the northwest by the U.S. sta...,/location,194,199
...,...,...,...,...
487793,"The Birmingham , Northfield by-election of 28 ...",/organization,87,99
487793,"The Birmingham , Northfield by-election of 28 ...",/government/political_party,87,99
367186,Semaphore is a north-western seaside suburb of...,/language,154,163
367186,Semaphore is a north-western seaside suburb of...,/location/country,154,163


In [10]:
# Show the cleaned dev split for comparison with the training frame above.
dev_clean

Unnamed: 0,sent,labels,start,end
0,"He taught at the universities of Aberdeen , Li...",/organization,70,80
0,"He taught at the universities of Aberdeen , Li...",/organization/educational_institution,70,80
0,"He taught at the universities of Aberdeen , Li...",/location,70,80
1,Carl Theodor 's mother was a sister of Queen E...,/person,75,104
2,As a result of the Bolshevik Revolution in 191...,/event/protest,19,39
...,...,...,...,...
9998,"In August , the Ayyubids conquered Ramla , Dar...",/location,35,40
9999,Gemerska Ves is a village and municipality in ...,/location/country,95,103
9999,Gemerska Ves is a village and municipality in ...,/government,95,103
9999,Gemerska Ves is a village and municipality in ...,/government/government,95,103


In [11]:
# '[' and ']' are used below as span-boundary markers, so neither character
# may already occur in a sentence. Both splits go through the same marking,
# so both are checked: train counts first, then dev counts (all should be 0).
display(
    train_clean.sent.str.contains('[', regex=False).sum(),
    train_clean.sent.str.contains(']', regex=False).sum(),
    dev_clean.sent.str.contains('[', regex=False).sum(),
    dev_clean.sent.str.contains(']', regex=False).sum(),
)

0

0

In [12]:
# Surround the labelled span with '[' and ']' (characters that are not used in
# the text) so the span can be found again once the sentence is tokenised.
def mark_start_end(x):
    """Return ``x['sent']`` with ``[`` inserted at ``x['start']`` and ``]`` at ``x['end']``."""
    sentence, begin, stop = x['sent'], x['start'], x['end']
    pieces = (sentence[:begin], '[', sentence[begin:stop], ']', sentence[stop:])
    return ''.join(pieces)

In [13]:
# using [, ] marks, assign B/I/O tags to each token
def mark_BIO(x, tag):
    """Convert a bracket-marked sentence into a line of ``token|TAG`` items.

    Parameters
    ----------
    x : str
        Sentence in which the labelled span is delimited by ``[`` and ``]``.
    tag : str
        Label written after the ``B-`` / ``I-`` prefix.

    Returns
    -------
    str
        The tokens joined by single spaces. The first token overlapping the
        span is written as ``token|B-<tag>``, further overlapping tokens as
        ``token|I-<tag>`` and all other tokens as ``token|O``.

    Notes
    -----
    The markers are recognised anywhere inside a token, not only at its edges.
    A span boundary that falls in the middle of a whitespace token (for
    example ``York]'s``) therefore tags that whole token and still closes the
    span, instead of leaving every following token tagged ``I-``.
    A marker that forms a token on its own (span padded by spaces) is dropped
    rather than emitted as a token with empty text.
    """
    # split on single spaces and drop the empty strings produced by runs of spaces
    tokens = [tok for tok in x.split(' ') if tok]
    marked_tokens = []
    inside = False        # currently between an opening '[' and its closing ']'
    needs_begin = False   # no B- tag has been emitted yet for the current span

    for tok in tokens:
        if '[' in tok:
            tok = tok.replace('[', '', 1)
            inside = True
            needs_begin = True
        # Remember the state before a possible close: the token that carries
        # the closing marker still belongs to the span.
        overlaps_span = inside
        if inside and ']' in tok:
            tok = tok.replace(']', '', 1)
            inside = False

        if not tok:
            # the token consisted only of marker characters; nothing to tag
            continue

        if not overlaps_span:
            marked_tokens.append(f'{tok}|O')
        elif needs_begin:
            marked_tokens.append(f'{tok}|B-{tag}')
            needs_begin = False
        else:
            marked_tokens.append(f'{tok}|I-{tag}')

    return ' '.join(marked_tokens)

In [14]:
def bio_lambda(x):
    """Row-wise helper: bracket-mark the row's span, then tag the sentence with the row's label."""
    return mark_BIO(mark_start_end(x), x['labels'])

In [15]:
# axis=1 passes each row to bio_lambda; the result is one tagged sentence string per row.
train_clean['sent_BIO'] = train_clean.apply(bio_lambda, axis=1)

In [16]:
# Same row-wise tagging for the dev split.
dev_clean['sent_BIO'] = dev_clean.apply(bio_lambda, axis=1)

In [17]:
# Write one tagged sentence per line. The encoding is fixed to UTF-8 so the
# file does not depend on the platform's default text encoding
# (NOTE(review): spaCy's converter is expected to read UTF-8 - confirm).
np.savetxt('./data/train_clean.iob', train_clean.sent_BIO, fmt='%s', encoding='utf-8')

In [18]:
# Write one tagged sentence per line. The encoding is fixed to UTF-8 so the
# file does not depend on the platform's default text encoding
# (NOTE(review): spaCy's converter is expected to read UTF-8 - confirm).
np.savetxt('./data/dev_clean.iob', dev_clean.sent_BIO, fmt='%s', encoding='utf-8')

### NER with spaCy

In [19]:
# Convert the training .iob file with spaCy's IOB converter (-c iob);
# the output file (.spacy) is written into ./data.
!python -m spacy convert ./data/train_clean.iob ./data -c iob

[38;5;4m[i] Auto-detected sentence-per-line NER format[0m
[38;5;4m[i] Grouping every 1 sentences into a document.[0m
[38;5;3m[!] To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m[+] Generated output file (24027 documents): data\train_clean.spacy[0m


In [20]:
# Same conversion for the dev .iob file.
!python -m spacy convert ./data/dev_clean.iob ./data -c iob

[38;5;4m[i] Auto-detected sentence-per-line NER format[0m
[38;5;4m[i] Grouping every 1 sentences into a document.[0m
[38;5;3m[!] To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m[+] Generated output file (23701 documents): data\dev_clean.spacy[0m


In [21]:
# Validate the training config file before using it.
!python -m spacy debug config ./cfg/config.cfg

[1m
[1m
[1m
[38;5;2m[+] Config is valid[0m


In [22]:
# Check that the corpora referenced by the config load, and print corpus/label statistics.
!python -m spacy debug data ./cfg/config.cfg

[1m
[38;5;2m[+] Corpus is loadable[0m
[38;5;2m[+] Pipeline can be initialized with data[0m
[1m
Language: en
Training pipeline: tok2vec, ner
24026 training docs
23700 evaluation docs
[38;5;3m[!] 62 training examples also in evaluation data[0m
[1m
[38;5;4m[i] 698332 total word(s) in the data (42276 unique)[0m
[38;5;4m[i] No word vectors present in the package[0m
[1m
[38;5;4m[i] 0 new label(s), 112 existing label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;3m[!] 280 entity span(s) with punctuation[0m
[38;5;2m[+] Good amount of examples for all labels[0m
[38;5;2m[+] Examples without occurrences available for all labels[0m
[38;5;2m[+] No entities consisting of or starting/ending with whitespace[0m
Entity spans consisting of or starting/ending with punctuation can not be
trained with a noise level > 0.
[1m
[38;5;2m[+] 5 checks passed[0m


In [None]:
# Run the project's train.py script (its contents are not part of this notebook).
!python train.py

### Exploration

In [None]:
# Start the exploration from the cleaned training CSV on disk
# (this reload does not include the sent_BIO column added in memory earlier).
train_clean = pd.read_csv('./data/train_clean.csv', index_col=0)
train_clean.head()

In [None]:
# extract spans using start, end
# Operates on `train_clean` (loaded in the previous cell) so that the cell
# also runs on a fresh kernel.
train_clean['span'] = train_clean.apply(lambda x: x['sent'][x['start'] : x['end']], axis=1)