In [3]:
import pandas as pd

In [4]:
# Load the NER dataset straight from its hosted CSV; the first row holds the
# column names and the file is decoded as Latin-1.
ner_csv_url = 'https://dl.dropboxusercontent.com/s/tlijezgr8tnpeym/ner_dataset.csv?dl=0'
df = pd.read_csv(ner_csv_url, header=0, encoding='latin')

In [5]:
# Forward-fill missing 'Sentence #' values so that every row carries the most
# recent sentence label. Assigning the result back (instead of an in-place
# fill on a column selection) avoids the deprecated `fillna(method=...)` form
# and still updates `df` when pandas Copy-on-Write is active.
df['Sentence #'] = df['Sentence #'].ffill()

In [6]:
# Use the shorter column name 'Sent' for the sentence label from here on.
df = df.rename(columns={'Sentence #': 'Sent'})

In [7]:
import re

def match(word):
    """Flag tokens that begin with a non-word character.

    ``re.match`` only anchors at the start of the string, so a token is
    flagged when its first character is a non-word character (``\\W``),
    e.g. ``"."``, ``"--"`` or ``"'s"``. A token such as ``"al-Qaida"`` is
    not flagged because it starts with a word character.

    Parameters
    ----------
    word : str
        A single token. Non-string input makes ``re.match`` raise
        ``TypeError``.

    Returns
    -------
    int
        1 if the token starts with one or more non-word characters, else 0.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
    return 1 if re.match(r'\W+', word) else 0

# flag == 1 where match() fires on the token (it starts with a non-word
# character), 0 otherwise.
df['flag'] = df['Word'].map(match)

In [8]:
# Keep only the unflagged rows; copy so later column writes do not touch `df`.
qr1 = df[df['flag'] == 0].copy()

In [9]:
# (rows, columns) remaining after dropping the flagged tokens.
qr1.shape

(948295, 5)

In [10]:
# Reduce each sentence label to its trailing integer id, e.g. 'Sentence: 12' -> 12.
# Extracting the digits with a regex (instead of slicing off a fixed
# 10-character prefix) removes the magic number and keeps this cell
# re-runnable: once 'Sent' is already an integer column, the values pass
# through unchanged. A label that does not end in a plain integer yields NaN
# and makes the final astype(int) raise rather than silently mis-parse.
qr1['Sent'] = (
    qr1['Sent']
    .astype(str)
    .str.extract(r'^\D*(\d+)\s*$', expand=False)
    .astype(int)
)

In [11]:
# Split by sentence id: ids up to and including 37000 form the training set,
# ids strictly between 37000 and 43000 form the dev set.
sent_id = qr1['Sent']
train = qr1[sent_id <= 37000]

test = qr1[(sent_id > 37000) & (sent_id < 43000)]

In [12]:
# Attach a label to each split as a plain `name` attribute; note that the
# held-out `test` frame is labelled 'dev'.
for frame, label in ((train, 'train'), (test, 'dev')):
    frame.name = label

## Dump in IOB format (one `word TAG` pair per line, blank line between sentences)

In [11]:
# Write each split to '<name>.tsv': one "word TAG" pair per line, with an
# empty line between consecutive sentences. The file name comes from the
# `name` attribute set on each frame.
#
# The loop variable is deliberately not called `df`, so the raw DataFrame
# loaded at the top of the notebook is not overwritten and earlier cells stay
# re-runnable. Iterating the three columns with zip() replaces per-row
# `.iloc[i]` lookups, which build a Series for every row and are very slow on
# a frame of this size.
for split in (train, test):

    with open(f'{split.name}.tsv', 'w', encoding='utf-8') as f:
        prev_sent = None  # sentence id of the previously written row
        for sent, word, tag in zip(split['Sent'], split['Word'], split['Tag']):
            # A change of sentence id marks a sentence boundary; nothing is
            # written before the very first row. An empty split simply
            # produces an empty file.
            if prev_sent is not None and sent != prev_sent:
                f.write('\n')
            prev_sent = sent
            f.write(word + ' ' + tag.upper() + '\n')

# Convert to spaCy's binary `.spacy` training format

In [13]:
!python -m spacy convert train.tsv ./ -t spacy -n 1 -c iob
!python -m spacy convert dev.tsv ./ -t spacy -n 1 -c iob

[i] Auto-detected token-per-line NER format
[i] Grouping every 1 sentences into a document.
[!] To generate better training data, you may want to group sentences into
documents with `-n 10`.
[+] Generated output file (37000 documents): train.spacy
[i] Auto-detected token-per-line NER format
[i] Grouping every 1 sentences into a document.
[!] To generate better training data, you may want to group sentences into
documents with `-n 10`.
[+] Generated output file (5998 documents): dev.spacy


In [14]:
# Expand base_config.cfg into a complete config.cfg with all values filled in.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [15]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy -g 0 -o ./model


^C


<a href="https://ibb.co/MsPpXn9"><img src="https://i.ibb.co/SPV0LNJ/Screenshot-2021-03-27-210544.png" alt="Screenshot-2021-03-27-210544" border="0"></a>

In [13]:
# Held-out sentences: every id from 43000 upwards, i.e. outside the train and
# dev ranges. Collapse each sentence to one row whose columns hold lists, then
# rebuild the sentence text by joining its words with single spaces.
qr2 = qr1[qr1['Sent'] >= 43000]
test_df = qr2.groupby('Sent').agg(list)
test_df['text'] = test_df['Word'].apply(' '.join)

In [1]:
import spacy

In [2]:
# Load the pipeline stored under the training output directory.
model_path = './model/model-best'
nlp = spacy.load(model_path)

In [26]:
# Run the pipeline on the first held-out sentence and list its entities as
# (text, label) pairs.
# NOTE(review): nlp() tokenizes the raw string itself, so its tokens may not
# line up with the whitespace-separated words the model was trained on
# (hyphenated words in particular) - confirm before comparing against 'Tag'.
sample_text = test_df['text'].iloc[0]
doc = nlp(sample_text)
[(span.text, span.label_) for span in doc.ents]


[('Mauritania', 'GEO'),
 ('French', 'GPE'),
 ('al', 'ORG'),
 ('-', 'ORG'),
 ('Qaida', 'ORG')]

In [22]:
# Space-joined text of the first held-out sentence.
test_df['text'].iloc[0]

'Authorities in Mauritania say suspects in the recent killing of four French tourists are members of an extremist group linked to al-Qaida'

In [24]:
# Tag list aggregated for the first held-out sentence (one entry per row of
# that sentence).
test_df['Tag'].iloc[0]

['O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-org']