In [2]:
import pandas as pd

import numpy as np
np.random.seed(1234)

In [None]:
# read the training split from ./FIGER; orient='records' expects a JSON list of row objects
train_all = pd.read_json('./FIGER/train.json', orient='records')

In [None]:
# write the frame to CSV; the "Subsampling and Cleaning" section reloads it from this file
train_all.to_csv('./data/train_all.csv')

In [None]:
# same JSON -> CSV conversion for the test split
test_all = pd.read_json('./FIGER/test.json', orient='records')
test_all.to_csv('./data/test_all.csv')

In [None]:
# same JSON -> CSV conversion for the dev split; the CSV is read back later when drawing the dev sample
dev_all = pd.read_json('./FIGER/dev.json', orient='records')
dev_all.to_csv('./data/dev_all.csv')

### Subsampling and Cleaning

In [3]:
# reload the full training set from the CSV written earlier; column 0 of that file is the saved index
train_all = pd.read_csv('./data/train_all.csv', index_col=0)

  mask |= (ar1 == a)


In [32]:
# sub-sample 1,000 sentences
train_sample = train_all.sample(n=1000)
train_sample

Unnamed: 0,sent,labels,start,end,ents
1630249,He is best known for his contributions to the ...,['/location'],156,167,"[['Q211248', 76, 87, 0.8112072], ['Q28567', 13..."
339534,"Other languages taught at GEOS include , Frenc...",['/language'],41,47,"[['Q315', 0, 15, 0.14799862], ['Q1210707', 26,..."
496123,The final was played at Centre Court in Wimble...,"['/location', '/building/sports_facility', '/b...",24,36,"[['Q2603183', 24, 36, 0.27905092], ['Q736742',..."
1123512,"It stars Keeley Hawes -LRB- Spooks , Under the...","['/person/actor', '/person']",210,224,"[['Q237290', 9, 21, 0.65806925], ['Q6691229', ..."
1717565,"Students are of all ages , from senior citizen...","['/location/city', '/location']",131,145,"[['Q48282', 0, 8, 0.13177414], ['Q213205', 102..."
...,...,...,...,...,...
1025527,He had suggested earlier that a suitable site ...,"['/location/city', '/location']",84,103,"[['Q1691479', 68, 74, 0.18739218], ['Q37100', ..."
1715943,Jim Lynagh -LRB- 13 April 1956 & ndash ; 8 May...,"['/military', '/organization', '/person']",169,202,"[['Q6196505', 0, 10, 0.7862777999999999], ['Q1..."
1219528,He is currently a member of Sinn Fein 's Ard C...,"['/government', '/organization', '/government/...",28,37,"[['Q4788007', 41, 55, 0.5131183], ['Q1869064',..."
1294826,"Rolf McPherson , 96 , American evangelist , so...","['/person/artist', '/location/cemetery', '/lan...",22,30,"[['Q7360766', 0, 14, 0.6029375], ['Q2563141', ..."


In [5]:
import ast

def clean(dataset, name='train'):
    """Explode multi-label rows and write them to ``./data/{name}_clean.csv``.

    Each row's ``labels`` value is turned into a Python list and the frame is
    exploded so that every output row carries exactly one label. Only the
    ``sent``, ``labels``, ``start`` and ``end`` columns are written; the
    index is written as the first CSV column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``sent``, ``labels``, ``start`` and ``end`` columns.
        ``labels`` may hold list literals stored as strings (for example
        ``"['/person']"``, which is what a CSV round-trip produces) or
        lists that are already parsed.
    name : str, default 'train'
        Prefix of the output file name.

    Returns
    -------
    None
        The result is only written to disk. ``dataset`` itself is left
        unmodified, so calling this twice on the same frame is safe.
    """
    # Parse only string values: a frame whose labels are already lists
    # passes through unchanged instead of crashing in literal_eval.
    parsed_labels = dataset['labels'].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )
    # assign() builds a new frame, so the caller's `labels` column is not overwritten.
    exploded = dataset.assign(labels=parsed_labels).explode('labels')
    exploded[['sent', 'labels', 'start', 'end']].to_csv(f'./data/{name}_clean.csv')

In [33]:
# explode the sampled training rows to one label per row; writes ./data/train_1000_clean.csv
clean(train_sample, 'train_1000')

In [31]:
# draw 1,000 random dev rows, then explode their labels into ./data/dev_1000_clean.csv
dev_sample = (
    pd.read_csv('./data/dev_all.csv', index_col=0)
    .sample(n=1000)
)
clean(dev_sample, 'dev_1000')

### Creating BIO files that can be processed by spaCy

Example of the target format (one sentence per line, each token written as `token|TAG`): https://github.com/explosion/spaCy/blob/master/extra/example_data/ner_example_data/ner-sent-per-line.iob

In [34]:
# reload the cleaned samples (one label per row) that clean() wrote to ./data
train_clean = pd.read_csv('./data/train_1000_clean.csv', index_col=0)
dev_clean = pd.read_csv('./data/dev_1000_clean.csv', index_col=0)
train_clean

Unnamed: 0,sent,labels,start,end
1630249,He is best known for his contributions to the ...,/location,156,167
339534,"Other languages taught at GEOS include , Frenc...",/language,41,47
496123,The final was played at Centre Court in Wimble...,/location,24,36
496123,The final was played at Centre Court in Wimble...,/building/sports_facility,24,36
496123,The final was played at Centre Court in Wimble...,/building,24,36
...,...,...,...,...
1294826,"Rolf McPherson , 96 , American evangelist , so...",/location,22,30
1294826,"Rolf McPherson , 96 , American evangelist , so...",/government_agency,22,30
1294826,"Rolf McPherson , 96 , American evangelist , so...",/person,22,30
306765,But another Grand Final loss followed in 1944 ...,/person/athlete,53,60


In [35]:
# inspect the cleaned dev rows
dev_clean

Unnamed: 0,sent,labels,start,end
6624,She was nominated for the Academy Award as Bes...,/art,190,207
6624,She was nominated for the Academy Award as Bes...,/art/film,190,207
5964,He has an uncredited appearance in the 1984 fi...,/written_work,49,69
6734,"The region lies in east-central Rajasthan , an...",/location/county,193,202
6734,"The region lies in east-central Rajasthan , an...",/location,193,202
...,...,...,...,...
283,"He died in Portland , Oregon on June 3 , 1995 ...",/location,11,28
270,'' Headfirst for Halos '' is the sixth track f...,/music,87,138
4627,He received his music education at the Samad V...,/person/politician,39,52
4627,He received his music education at the Samad V...,/person/author,39,52


In [36]:
# count training sentences that contain a literal '[' or ']' (plain substring match, not a regex)
display(*(
    train_clean.sent.str.contains(bracket, regex=False).sum()
    for bracket in ('[', ']')
))

0

0

In [37]:
def mark_start_end(x):
    """Wrap the tagged span of a row's sentence in square brackets.

    ``x`` is a row-like mapping with a ``sent`` string and the character
    offsets ``start`` / ``end`` of the span. The returned string is the
    sentence with ``[`` inserted at ``start`` and ``]`` inserted at ``end``.
    The brackets act as span markers for the later BIO tagging step.
    """
    text, begin, stop = x['sent'], x['start'], x['end']
    return f"{text[:begin]}[{text[begin:stop]}]{text[stop:]}"

In [38]:
def mark_BIO(x, tag):
    """Turn a bracket-marked sentence into a ``token|TAG`` string.

    The sentence ``x`` is split on single spaces (empty pieces are dropped).
    A token that starts with ``[`` opens the span and gets ``B-{tag}``; the
    following tokens get ``I-{tag}`` up to and including the token that ends
    with ``]``; every other token gets ``O``. The bracket characters are
    stripped from the emitted tokens.

    Brackets are only recognised as the first / last character of a token,
    so a marker that falls in the middle of a token is not detected.
    """
    inside = False  # True between an opening '[' token and its closing ']' token
    tagged = []

    # filter(None, ...) discards the empty strings produced by repeated spaces
    for word in filter(None, x.split(' ')):
        if word.startswith('['):
            closes_here = word.endswith(']')
            text = word[1:-1] if closes_here else word[1:]
            if not closes_here:
                # multi-token span: keep tagging until the closing bracket shows up
                inside = True
            tagged.append(f'{text}|B-{tag}')
        elif inside:
            if word.endswith(']'):
                inside = False
                word = word[:-1]
            tagged.append(f'{word}|I-{tag}')
        else:
            tagged.append(f'{word}|O')

    return ' '.join(tagged)

In [39]:
def bio_lambda(x):
    """Row-wise helper: bracket the row's span, then BIO-tag it with the row's label."""
    return mark_BIO(mark_start_end(x), x['labels'])

In [40]:
# axis=1 passes each row to bio_lambda, which needs the row's sent/start/end/labels together
train_clean['sent_BIO'] = train_clean.apply(bio_lambda, axis=1)

In [41]:
# same row-wise tagging for the dev sample
dev_clean['sent_BIO'] = dev_clean.apply(bio_lambda, axis=1)

In [43]:
# write one tagged sentence per line; fmt='%s' emits each value as a plain string
np.savetxt('./data/train_1000_clean.iob', train_clean.sent_BIO, fmt='%s')

In [42]:
# write one tagged sentence per line; fmt='%s' emits each value as a plain string
np.savetxt('./data/dev_1000_clean.iob', dev_clean.sent_BIO, fmt='%s')

### NER with spaCy

In [46]:
!python -m spacy convert ./data/train_1000_clean.iob ./data -c iob

[38;5;4m[i] Auto-detected sentence-per-line NER format[0m
[38;5;4m[i] Grouping every 1 sentences into a document.[0m
[38;5;3m[!] To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m[+] Generated output file (2361 documents):
data\train_1000_clean.spacy[0m


In [48]:
!python -m spacy convert ./data/dev_1000_clean.iob ./data -c iob

[38;5;4m[i] Auto-detected sentence-per-line NER format[0m
[38;5;4m[i] Grouping every 1 sentences into a document.[0m
[38;5;3m[!] To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m[+] Generated output file (2355 documents):
data\dev_1000_clean.spacy[0m


In [50]:
!python -m spacy debug config ./cfg/config.cfg

[1m
[1m
[1m
[38;5;2m[+] Config is valid[0m


In [51]:
!python -m spacy debug data ./cfg/config.cfg

[1m
[38;5;2m[+] Corpus is loadable[0m
[38;5;2m[+] Pipeline can be initialized with data[0m
[1m
Language: en
Training pipeline: tok2vec, ner
2360 training docs
2354 evaluation docs
[38;5;3m[!] 1 training examples also in evaluation data[0m
[1m
[38;5;4m[i] 68497 total word(s) in the data (8627 unique)[0m
[38;5;4m[i] No word vectors present in the package[0m
[1m
[38;5;4m[i] 0 new label(s), 96 existing label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;3m[!] 30 entity span(s) with punctuation[0m
[38;5;2m[+] Good amount of examples for all labels[0m
[38;5;2m[+] Examples without occurrences available for all labels[0m
[38;5;2m[+] No entities consisting of or starting/ending with whitespace[0m
Entity spans consisting of or starting/ending with punctuation can not be
trained with a noise level > 0.
[1m
[38;5;2m[+] 5 checks passed[0m


In [None]:
!python train.py

In [52]:
# sample output
!cat output.txt

$ python train.py
ℹ Using CPU

Set up nlp object from config
Loading corpus from path: data\dev_1000_clean.spacy
Loading corpus from path: data\train_1000_clean.spacy
Pipeline: ['tok2vec', 'ner']
Loading lookups from spacy-lookups-data: ['lexeme_norm']
Added vocab lookups: lexeme_norm
Created vocabulary
Finished initializing nlp object
Initialized pipeline components: ['tok2vec', 'ner']
✔ Initialized pipeline

Loading corpus from path: data\dev_1000_clean.spacy
Loading corpus from path: data\train_1000_clean.spacy
Removed existing output directory: training\model-last
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     44.79    0.00    0.00    0.00    0.00
  0      50         23.32   1111.54    0.00    0.00    0.00    0.00
  0     100         24.35    303.90    0.00    0.00    0.00    0.00
  0     150        