In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

import numpy as np
# Seed NumPy's global random number generator so random draws repeat across fresh-kernel runs.
np.random.seed(1234)

In [None]:
# Load ./FIGER/train.json (records-oriented JSON) into a DataFrame.
train_all = pd.read_json('./FIGER/train.json', orient='records')

In [None]:
# Cache the training frame as CSV; the DataFrame index is written as the first column.
train_all.to_csv('./data/train_all.csv')

In [22]:
# Load ./FIGER/test.json (records-oriented JSON) and cache it as CSV.
test_all = pd.read_json('./FIGER/test.json', orient='records')
test_all.to_csv('./data/test_all.csv')

In [23]:
# Load ./FIGER/dev.json (records-oriented JSON) and cache it as CSV.
dev_all = pd.read_json('./FIGER/dev.json', orient='records')
dev_all.to_csv('./data/dev_all.csv')

### Subsampling and Cleaning

In [None]:
# Reload the cached training CSV, using its first column as the index.
train_all = pd.read_csv('./data/train_all.csv', index_col=0)

In [None]:
# Sub-sample 100,000 sentences. An explicit random_state (same value as the
# notebook's NumPy seed) makes the draw independent of how many random calls
# were made earlier in the kernel session.
train_100000 = train_all.sample(n=100000, random_state=1234)
train_100000

In [None]:
# Persist the sub-sample so later cells can start from it without re-sampling.
train_100000.to_csv('./data/train_100000.csv')

In [83]:
# Reload the saved sub-sample, using the first CSV column as the index.
train_100000 = pd.read_csv('./data/train_100000.csv', index_col=0)
train_100000

Unnamed: 0,sent,labels,start,end,ents
972798,In 1961 a George Jones version of the song was...,['/music'],83,126,"[['Q508202', 10, 22, 0.666855], ['Q165745', 63..."
1816704,He had already inked Gene Colan there on a lon...,['/written_work'],91,108,"[['Q1266750', 21, 31, 0.6346333], ['Q180704', ..."
1291792,"Other early pioneers were the Usher family , w...",['/food'],79,83,"[['Q165911', 30, 35, 0.27522305]]"
1265539,Blood Into Wine is a documentary about the Nor...,"['/person/actor', '/person/artist', '/person']",86,106,"[['Q4927812', 0, 15, 0.7084944], ['Q582744', 5..."
798,"In 317 BC , Cassander , after defeating Olympi...",['/person'],40,48,"[['Q207183', 12, 21, 0.770825], ['Q223134', 40..."
...,...,...,...,...,...
464708,Such films include Seven Notes In Black -LRB- ...,"['/art', '/art/film']",88,98,"[['Q190908', 19, 24, 0.15525861], ['Q1869064',..."
1299913,Released by White Noise Records in 1979 after ...,"['/person/musician', '/person/actor', '/person...",101,112,"[['Q7995120', 12, 31, 0.5784357], ['Q347986', ..."
1157145,She studied painting first with a teacher from...,"['/location/city', '/location']",68,73,"[['Q128499', 68, 73, 0.32345238], ['Q656', 86,..."
970084,He earned a Ph. D in history from the Universi...,['/written_work'],125,158,"[['Q752297', 12, 17, 0.1392031], ['Q230492', 3..."


In [84]:
# Convert the stringified label lists (as read back from CSV) into real Python lists.
# Non-string values are passed through unchanged so the cell can be re-run safely:
# ast.literal_eval raises when handed an already-parsed list.
import ast
train_100000['labels'] = train_100000['labels'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [85]:
# Slice every sentence by its start/end offsets to recover the mention text.
train_100000['span'] = [
    sentence[begin:finish]
    for sentence, begin, finish in zip(
        train_100000['sent'], train_100000['start'], train_100000['end']
    )
]

In [86]:
# Display the frame to check the parsed labels and the new 'span' column.
train_100000

Unnamed: 0,sent,labels,start,end,ents,span
972798,In 1961 a George Jones version of the song was...,[/music],83,126,"[['Q508202', 10, 22, 0.666855], ['Q165745', 63...",George Jones Sings Country and Western Hits
1816704,He had already inked Gene Colan there on a lon...,[/written_work],91,108,"[['Q1266750', 21, 31, 0.6346333], ['Q180704', ...",Tales of Suspense
1291792,"Other early pioneers were the Usher family , w...",[/food],79,83,"[['Q165911', 30, 35, 0.27522305]]",wine
1265539,Blood Into Wine is a documentary about the Nor...,"[/person/actor, /person/artist, /person]",86,106,"[['Q4927812', 0, 15, 0.7084944], ['Q582744', 5...",Maynard James Keenan
798,"In 317 BC , Cassander , after defeating Olympi...",[/person],40,48,"[['Q207183', 12, 21, 0.770825], ['Q223134', 40...",Olympias
...,...,...,...,...,...,...
464708,Such films include Seven Notes In Black -LRB- ...,"[/art, /art/film]",88,98,"[['Q190908', 19, 24, 0.15525861], ['Q1869064',...",The Beyond
1299913,Released by White Noise Records in 1979 after ...,"[/person/musician, /person/actor, /person/arti...",101,112,"[['Q7995120', 12, 31, 0.5784357], ['Q347986', ...",Steve Jones
1157145,She studied painting first with a teacher from...,"[/location/city, /location]",68,73,"[['Q128499', 68, 73, 0.32345238], ['Q656', 86,...",Yalta
970084,He earned a Ph. D in history from the Universi...,[/written_work],125,158,"[['Q752297', 12, 17, 0.1392031], ['Q230492', 3...",Science and Civilisation in China


In [88]:
# Give each label its own row; rows that came from the same sentence keep the same index value.
train_100000 = train_100000.explode('labels')

In [89]:
# Save only the columns needed downstream (the 'ents' column is dropped).
train_100000[['sent', 'labels', 'start', 'end', 'span']].to_csv('./data/train_clean.csv')

### Creating BIO files that can be processed by spaCy

Example of the sentence-per-line IOB format that `spacy convert` accepts: https://github.com/explosion/spaCy/blob/master/extra/example_data/ner_example_data/ner-sent-per-line.iob

In [91]:
# Load the cleaned, one-label-per-row data, using the first CSV column as the index.
train_clean = pd.read_csv('./data/train_clean.csv', index_col=0)
train_clean

Unnamed: 0,sent,labels,start,end,span
972798,In 1961 a George Jones version of the song was...,/music,83,126,George Jones Sings Country and Western Hits
1816704,He had already inked Gene Colan there on a lon...,/written_work,91,108,Tales of Suspense
1291792,"Other early pioneers were the Usher family , w...",/food,79,83,wine
1265539,Blood Into Wine is a documentary about the Nor...,/person/actor,86,106,Maynard James Keenan
1265539,Blood Into Wine is a documentary about the Nor...,/person/artist,86,106,Maynard James Keenan
...,...,...,...,...,...
1157145,She studied painting first with a teacher from...,/location/city,68,73,Yalta
1157145,She studied painting first with a teacher from...,/location,68,73,Yalta
970084,He earned a Ph. D in history from the Universi...,/written_work,125,158,Science and Civilisation in China
1735279,"In 1788 Van der Noot travelled to England , th...",/location/country,48,62,Dutch Republic


In [120]:
# Count sentences containing a literal '[' or ']' (one count per character);
# both must be zero for the brackets to be usable as span markers.
bracket_counts = [
    train_clean.sent.str.contains(marker, regex=False).sum()
    for marker in ('[', ']')
]
display(*bracket_counts)

0

0

In [93]:
# wrap the tagged span in [ ] markers (characters which are not used in the text)
def mark_start_end(x):
    """Return x['sent'] with '[' inserted at offset x['start'] and ']' at offset x['end']."""
    sentence, begin, finish = x['sent'], x['start'], x['end']
    return f"{sentence[:begin]}[{sentence[begin:finish]}]{sentence[finish:]}"

In [94]:
# using [, ] marks, assign B/I/O tags to each token
def mark_BIO(x, tag):
    """Convert a bracket-marked sentence into a string of `token|TAG` items.

    Parameters
    ----------
    x : str
        Sentence in which the span is delimited by '[' and ']'.
    tag : str
        Label placed after 'B-' / 'I-' for tokens that overlap the span.

    Returns
    -------
    str
        Space-joined items: `token|B-tag` for the first token overlapping the
        span, `token|I-tag` for the following ones, `token|O` elsewhere.
        Bracket markers never appear in the emitted token text.

    Notes
    -----
    The markers are located character by character, so a span whose
    boundaries fall in the middle of a whitespace-delimited token (for example
    '[Yalta]-based') is closed correctly instead of leaving every later token
    tagged as inside the span.
    """
    marked_tokens = []
    inside = False  # True while scanning characters between '[' and ']'
    begun = False   # True once the current span has received its B- token

    # split() rather than split(' '): repeated whitespace must not yield empty
    # tokens, which would be emitted as items with no token text.
    for tok in x.split():
        chars = []
        overlaps = False  # does any real character of this token lie in the span?
        for ch in tok:
            if ch == '[':
                inside = True
            elif ch == ']':
                inside = False
            else:
                chars.append(ch)
                overlaps = overlaps or inside

        text = ''.join(chars)
        if not text:
            # token consisted solely of bracket markers; nothing to emit
            continue

        if overlaps:
            marked_tokens.append(f"{text}|{'I' if begun else 'B'}-{tag}")
            begun = True
        else:
            marked_tokens.append(f'{text}|O')

        if not inside:
            # span closed (or never open): a later span starts again with B-
            begun = False

    return ' '.join(marked_tokens)

In [95]:
def bio_lambda(x):
    """Row-wise helper: bracket-mark the row's span, then BIO-tag it with the row's label."""
    return mark_BIO(mark_start_end(x), x['labels'])

# one BIO-tagged string per row
train_clean['sent_BIO'] = train_clean.apply(bio_lambda, axis=1)

In [96]:
# Display the frame to check the new 'sent_BIO' column.
train_clean

Unnamed: 0,sent,labels,start,end,span,sent_BIO
972798,In 1961 a George Jones version of the song was...,/music,83,126,George Jones Sings Country and Western Hits,In|O 1961|O a|O George|O Jones|O version|O of|...
1816704,He had already inked Gene Colan there on a lon...,/written_work,91,108,Tales of Suspense,He|O had|O already|O inked|O Gene|O Colan|O th...
1291792,"Other early pioneers were the Usher family , w...",/food,79,83,wine,Other|O early|O pioneers|O were|O the|O Usher|...
1265539,Blood Into Wine is a documentary about the Nor...,/person/actor,86,106,Maynard James Keenan,Blood|O Into|O Wine|O is|O a|O documentary|O a...
1265539,Blood Into Wine is a documentary about the Nor...,/person/artist,86,106,Maynard James Keenan,Blood|O Into|O Wine|O is|O a|O documentary|O a...
...,...,...,...,...,...,...
1157145,She studied painting first with a teacher from...,/location/city,68,73,Yalta,She|O studied|O painting|O first|O with|O a|O ...
1157145,She studied painting first with a teacher from...,/location,68,73,Yalta,She|O studied|O painting|O first|O with|O a|O ...
970084,He earned a Ph. D in history from the Universi...,/written_work,125,158,Science and Civilisation in China,He|O earned|O a|O Ph.|O D|O in|O history|O fro...
1735279,"In 1788 Van der Noot travelled to England , th...",/location/country,48,62,Dutch Republic,In|O 1788|O Van|O der|O Noot|O travelled|O to|...


In [109]:
# Write a 100-row sample of BIO-tagged sentences, one sentence per line.
# - encoding='utf-8' is requested explicitly so characters outside the default
#   text encoding do not fail on write or get stored in a different encoding.
# - random_state pins the sample so the file is the same on every run.
np.savetxt(
    './data/train_clean_BIO.txt',
    train_clean.sent_BIO.sample(100, random_state=1234),
    fmt='%s',
    encoding='utf-8',
)

### Exploration

In [3]:
# Reload the cleaned data for exploration. This rebinds `train_clean`, so the
# 'sent_BIO' column added earlier is not present on the reloaded frame.
# NOTE(review): the rendered output for this cell shows list-valued labels, unlike the
# one-label-per-row file written above - it looks stale; re-run to refresh.
train_clean = pd.read_csv('./data/train_clean.csv', index_col=0)
train_clean.head()

Unnamed: 0,sent,labels,start,end,span
972798,In 1961 a George Jones version of the song was...,['/music'],83,126,George Jones Sings Country and Western Hits
1816704,He had already inked Gene Colan there on a lon...,['/written_work'],91,108,Tales of Suspense
1291792,"Other early pioneers were the Usher family , w...",['/food'],79,83,wine
1265539,Blood Into Wine is a documentary about the Nor...,"['/person/actor', '/person/artist', '/person']",86,106,Maynard James Keenan
798,"In 317 BC , Cassander , after defeating Olympi...",['/person'],40,48,Olympias


### NER with spaCy

In [110]:
# Convert the BIO text file with spaCy's `iob` converter; output is written under ./data.
!python -m spacy convert ./data/train_clean_BIO.txt ./data -c iob

[38;5;4m[i] Auto-detected sentence-per-line NER format[0m
[38;5;4m[i] Grouping every 1 sentences into a document.[0m
[38;5;3m[!] To generate better training data, you may want to group sentences
into documents with `-n 10`.[0m
[38;5;2m[+] Generated output file (101 documents):
data\train_clean_BIO.spacy[0m


In [115]:
# Validate the training configuration file before using it.
!python -m spacy debug config ./cfg/config.cfg

[1m
[1m
[1m
[38;5;2m[+] Config is valid[0m


In [114]:
# Check the training and evaluation corpora referenced by the config.
!python -m spacy debug data ./cfg/config.cfg

[1m
[38;5;2m[+] Corpus is loadable[0m
[38;5;2m[+] Pipeline can be initialized with data[0m
[1m
Language: en
Training pipeline: tok2vec, ner
100 training docs
100 evaluation docs
[38;5;3m[!] 100 training examples also in evaluation data[0m
[38;5;3m[!] Low number of examples to train a new pipeline (100)[0m
[1m
[38;5;4m[i] 2881 total word(s) in the data (1388 unique)[0m
[38;5;4m[i] No word vectors present in the package[0m
[1m
[38;5;4m[i] 0 new label(s), 33 existing label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;2m[+] Good amount of examples for all labels[0m
[38;5;2m[+] Examples without occurrences available for all labels[0m
[38;5;2m[+] No entities consisting of or starting/ending with whitespace[0m
[38;5;2m[+] No entities consisting of or starting/ending with punctuation[0m
[1m
[38;5;2m[+] 6 checks passed[0m


In [119]:
# Run the training script (train.py is not shown in this notebook).
# NOTE(review): the `debug data` output reports that all 100 training examples are also
# in the evaluation data, so the scores printed here do not measure generalization.
!python train.py

^C
[38;5;2m[+] Created output directory: training[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     41.44    0.00    0.00    0.00    0.00
  6     200        315.32   2004.79   34.04   36.36   32.00    0.34
 14     400        394.00    849.83   95.48   95.96   95.00    0.95
 25     600         52.61    131.38  100.00  100.00  100.00    1.00
 37     800         17.55     14.03  100.00  100.00  100.00    1.00
 53    1000          2.11      1.03  100.00  100.00  100.00    1.00
 72    1200          8.95      2.72  100.00  100.00  100.00    1.00
 96    1400         92.95     37.00  100.00  100.00  100.00    1.00
125    1600         72.07     21.48  100.00  100.00  100.00    1.00
160    1800         84.40     13.

Set up nlp object from config
Pipeline: ['tok2vec', 'ner']
Created vocabulary
Finished initializing nlp object
Initialized pipeline components: ['tok2vec', 'ner']
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\vanguard_2\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\ProgramData\Anaconda3\envs\vanguard_2\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\ProgramData\Anaconda3\envs\vanguard_2\lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "C:\ProgramData\Anaconda3\envs\vanguard_2\lib\site-packages\spacy\cli\_util.py", line 68, in setup_cli
    command(prog_name=COMMAND)
  File "C:\ProgramData\Anaconda3\envs\vanguard_2\lib\site-packages\click\core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "C:\ProgramData\Anaconda3\envs\vanguard_2\lib\site-packages\click\core.py", line 782, in main
    rv = self.invoke(ctx)
  File "C: