In [2]:
import pandas as pd

# CoNLL 2003

In [544]:
def txt_reader(inpt_file):
    """Read a whitespace-delimited CoNLL-style file into a token/label frame.

    Every non-blank line contributes one row: the first whitespace-separated
    field is the token and the last field is the NER label (so a line with a
    single field yields that field as both token and label).  Blank or
    whitespace-only lines are kept as a row of two empty strings so sentence
    boundaries survive in the output.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner``, one row per input line, in file order.
    """
    tok_lst = []
    label_lst = []
    # `with` releases the handle even when reading/decoding raises.
    with open(inpt_file, "r") as f:
        for x in f:
            line = x.split()
            if not line:
                # Blank *or* whitespace-only line: previously only an exact
                # '\n' was recognised and anything else crashed on line[0].
                tok_lst.append('')
                label_lst.append('')
                continue
            tok_lst.append(line[0])
            label_lst.append(line[-1])
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [545]:
# Load the train / testa / testb files of the CoNLL-2003 English data,
# each into its own token/label frame.
conl2003_train, conl2003_testa, conl2003_testb = map(
    txt_reader,
    ("conll2003/eng.train", "conll2003/eng.testa", "conll2003/eng.testb"),
)

In [546]:
# Stack the three splits into one frame.  ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each split's own row labels.
conl2003 = pd.concat(
    [conl2003_train, conl2003_testa, conl2003_testb],
    ignore_index=True,
)

In [547]:
conl2003.shape

(324948, 2)

# NIST-IEER

In [16]:
import os
import nltk
nltk.download('ieer')
from nltk.corpus import ieer
import csv
import re

[nltk_data] Downloading package ieer to /home/ubuntu/nltk_data...
[nltk_data]   Package ieer is already up-to-date!


True

In [538]:
# Root directory of the IEER text files; it is walked recursively with os.walk
# further down.  NOTE(review): absolute, machine-specific path — adjust per host.
nltk_file_path = '/home/ubuntu/bert-ner-datasets/nltk_ieer_data'

In [539]:
def ieer_reader(inpt_file):
    """Convert a bracketed, one-item-per-line IEER dump into token/label rows.

    Line handling (after stripping surrounding whitespace):

    * ``(DOCUMENT`` emits a ``-DOCSTART-``/``O`` row followed by an empty row.
    * ``(LABEL ... word)`` emits ``word`` (the last whitespace-separated field)
      with ``LABEL`` (the first field).  Lines ending in ``))`` are skipped.
    * Any other non-empty line is emitted verbatim as a token labelled ``O``.
    * An empty separator row is inserted before a line when the previously
      emitted line ended with ``.`` and the current line starts with an
      alphanumeric character.
    * Blank lines and empty ``()`` groups are ignored.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    last_line = ''
    tok_lst = []
    label_lst = []
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for x in f:
            x = x.strip()
            if not x:
                # Nothing to emit; indexing x[0] below would raise IndexError.
                continue
            if x == '(DOCUMENT':
                tok_lst.append('-DOCSTART-')
                label_lst.append('O')
                tok_lst.append('')
                label_lst.append('')
                continue
            # Heuristic sentence break: previous line ended a sentence and this
            # one starts a new word/number.
            if last_line and last_line[-1] == '.' and (x[0].isupper() or x[0].isalnum()):
                tok_lst.append('')
                label_lst.append('')
            if x[0] == '(' and x[-1] == ')':
                if x[-2] == ')':
                    # Closes more than one bracket level: skipped entirely.
                    continue
                x = x[1:-1]
                line = x.split()
                if not line:
                    # Empty "()" group: no label/token to extract.
                    continue
                # NOTE(review): only the last word inside the brackets is kept
                # as the token — confirm entities are always one word per line.
                label = line[0]
                tok = line[-1]
            else:
                label = 'O'
                tok = x
            tok_lst.append(tok)
            label_lst.append(label)
            last_line = x  # remembered for the sentence-break heuristic
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [540]:
# Read every *.txt file under the IEER root and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.)
ieer_frames = []
for root, dirs, files in os.walk(nltk_file_path):
    dirs.sort()  # in-place sort fixes os.walk's descent order -> reproducible rows
    for fname in sorted(files):
        # Suffix test instead of the regex ".*.txt", whose unescaped dot and
        # missing end anchor also accepted names merely containing "txt".
        if fname.endswith(".txt"):
            ieer_frames.append(ieer_reader(os.path.join(root, fname)))
count = len(ieer_frames)
# pd.concat raises on an empty list; fall back to an empty frame as before.
nltk_ieer = pd.concat(ieer_frames, ignore_index=True) if ieer_frames else pd.DataFrame([])


In [541]:
# Print the number of files found in each directory visited under the IEER root.
for _dirpath, _subdirs, filenames in os.walk(nltk_file_path):
    n_files = len(filenames)
    print(n_files)

94
8


In [542]:
count

102

In [543]:
nltk_ieer.shape

(60063, 2)

# GMB-2.2.0

In [534]:
# Root of the GMB 2.2.0 data tree; walked recursively with os.walk further down.
# NOTE(review): absolute, machine-specific path — adjust per host.
gmb_file_path = '/home/ubuntu/bert-ner-datasets/GMB22/gmb-2.2.0/data'

In [535]:
def gmb_reader(inpt_file):
    """Read a tab-separated GMB tags file into a token/label frame.

    The output starts with a ``-DOCSTART-``/``O`` row and an empty row, and
    ends with an empty row.  For each data line the first tab-separated column
    is the token and the fourth column is the NER label.  A line equal to
    ``'\\n'`` becomes an empty row.  Lines with fewer than four columns are
    reported on stdout and skipped.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    tok_lst = ['-DOCSTART-', '']
    label_lst = ['O', '']
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for ix, x in enumerate(f):
            if x == '\n':
                tok_lst.append('')
                label_lst.append('')
                continue
            # Drop the line terminator first so a label in the last column does
            # not carry a trailing '\n'.
            split_sent = x.rstrip('\n').split('\t')
            if len(split_sent) < 4:
                # Explicit length check replaces the former bare `except:`.
                print(ix, x, tok_lst[-1], label_lst[-1])
                continue
            tok_lst.append(split_sent[0])
            label_lst.append(split_sent[3])
    tok_lst.append('')
    label_lst.append('')
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [536]:
# Read every `en.tags` file under the GMB root and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.)
gmb_frames = []
for root, dirs, files in os.walk(gmb_file_path):
    dirs.sort()  # in-place sort fixes os.walk's descent order -> reproducible rows
    for fname in sorted(files):
        # Exact comparison instead of re.match("en.tags", ...), whose unescaped
        # dot and missing end anchor also accepted e.g. "en_tags.bak".
        if fname == "en.tags":
            gmb_frames.append(gmb_reader(os.path.join(root, fname)))
count = len(gmb_frames)
# pd.concat raises on an empty list; fall back to an empty frame as before.
gmb_conll = pd.concat(gmb_frames, ignore_index=True) if gmb_frames else pd.DataFrame([])


In [537]:
gmb_conll.shape

(1436159, 2)

# GUM-3.1.0

In [529]:
# Root of the GUM CoNLL-format files; walked recursively with os.walk further down.
# NOTE(review): absolute, machine-specific path — adjust per host.
gum_file_path = '/home/ubuntu/bert-ner-datasets/GUM/CONLL-format/data-by-corpustype'

In [530]:
def gum_reader(inpt_file):
    """Read a two-column, tab-separated GUM file into a token/label frame.

    The output starts with a ``-DOCSTART-``/``O`` row and ends with an empty
    row.  A line equal to ``'\\n'`` becomes an empty row.  Every other line is
    stripped and split on tabs: column one is the token, column two the label.
    Lines with fewer than two columns are reported on stdout and skipped.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    # NOTE(review): unlike the other readers in this notebook, no empty
    # separator row follows -DOCSTART- here — confirm that is intentional.
    tok_lst = ['-DOCSTART-']
    label_lst = ['O']
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for ix, x in enumerate(f):
            if x == '\n':
                tok_lst.append('')
                label_lst.append('')
                continue
            x = x.strip()
            split_sent = x.split('\t')
            if len(split_sent) < 2:
                # Explicit length check replaces the former bare `except:`.
                print(inpt_file, ix, x)
                continue
            tok_lst.append(split_sent[0])
            label_lst.append(split_sent[1])
    tok_lst.append('')
    label_lst.append('')
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [531]:
# Read every *.tsv file under the GUM root and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.)
gum_frames = []
for root, dirs, files in os.walk(gum_file_path):
    dirs.sort()  # in-place sort fixes os.walk's descent order -> reproducible rows
    for fname in sorted(files):
        # Suffix test instead of the regex ".*.tsv" (unescaped dot, no end anchor).
        if fname.endswith(".tsv"):
            gum_frames.append(gum_reader(os.path.join(root, fname)))
count = len(gum_frames)
# pd.concat raises on an empty list; fall back to an empty frame as before.
gum_conll = pd.concat(gum_frames, ignore_index=True) if gum_frames else pd.DataFrame([])

In [532]:
count

77

In [533]:
gum_conll.shape

(67077, 2)

# wikigold

In [525]:
# Single wikigold CoNLL file, passed to wikigold_reader further down.
# NOTE(review): absolute, machine-specific path — adjust per host.
wikigold_file_path = '/home/ubuntu/bert-ner-datasets/wikigold/CONLL-format/data/wikigold.conll.txt'

In [526]:
def wikigold_reader(inpt_file):
    """Read the space-separated wikigold file into a token/label frame.

    The output starts with a ``-DOCSTART-``/``O`` row and an empty row, and
    ends with an empty row.  A line equal to ``'\\n'`` becomes an empty row.
    Every other line is stripped and split on single spaces: field one is the
    token, field two the label.  Lines with fewer than two fields are reported
    on stdout and skipped.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    tok_lst = ['-DOCSTART-', '']
    label_lst = ['O', '']
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for ix, x in enumerate(f):
            if x == '\n':
                tok_lst.append('')
                label_lst.append('')
                continue
            x = x.strip()
            # split(' ') (not split()) is kept on purpose: consecutive spaces
            # produce empty fields exactly as before.
            split_sent = x.split(' ')
            if len(split_sent) < 2:
                # Explicit length check replaces the former bare `except:`.
                print(ix, x)
                continue
            tok_lst.append(split_sent[0])
            label_lst.append(split_sent[1])
    tok_lst.append('')
    label_lst.append('')
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [527]:
# wikigold is a single file, so the reader's frame is the dataset.  The former
# empty-frame + DataFrame.append round trip is gone (append was removed in
# pandas 2.0); reset_index keeps the fresh 0..n-1 index it produced.
wikigold_conll = wikigold_reader(wikigold_file_path).reset_index(drop=True)
count = 1

In [528]:
wikigold_conll.shape

(40996, 2)

# Ritter

In [521]:
# Files that make up the Ritter/WNUT16 collection.  Line continuations are
# implicit inside the brackets, so no trailing backslashes are needed.
ritter_file_path = [
    '/home/ubuntu/bert-ner-datasets/Ritter/annotated/wnut16/data/train',
    '/home/ubuntu/bert-ner-datasets/Ritter/annotated/wnut16/data/test',
    '/home/ubuntu/bert-ner-datasets/Ritter/annotated/wnut16/data/dev',
    '/home/ubuntu/bert-ner-datasets/Ritter/annotated/ner.txt',
]

In [522]:
def ritter_reader(inpt_file):
    """Read a two-column, tab-separated Ritter/WNUT16 file into a frame.

    The output starts with a ``-DOCSTART-``/``O`` row and an empty row, and
    ends with an empty row.  A line equal to ``'\\n'`` or ``'\\t\\n'`` becomes
    an empty row.  Every other line is stripped and split on tabs: column one
    is the token, column two the label.  Lines with fewer than two columns are
    reported on stdout and skipped.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    tok_lst = ['-DOCSTART-', '']
    label_lst = ['O', '']
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for ix, x in enumerate(f):
            # Both spellings of an "empty" line mark a boundary.
            if x in ('\n', '\t\n'):
                tok_lst.append('')
                label_lst.append('')
                continue
            x = x.strip()
            split_sent = x.split('\t')
            if len(split_sent) < 2:
                # Explicit length check replaces the former bare `except:`.
                print(inpt_file, ix, x)
                continue
            tok_lst.append(split_sent[0])
            label_lst.append(split_sent[1])
    tok_lst.append('')
    label_lst.append('')
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [523]:
# Read each Ritter file once and concatenate in a single call.
# (DataFrame.append was removed in pandas 2.0 and re-copied the frame per file.)
ritter_frames = [ritter_reader(path) for path in ritter_file_path]
count = len(ritter_frames)
ritter_conll = pd.concat(ritter_frames, ignore_index=True)

In [524]:
ritter_conll.shape

(180763, 2)

# BTC

In [517]:
# Section files of the BTC corpus that are read below.  Line continuations are
# implicit inside the brackets, so no trailing backslashes are needed.
btc_file_path = [
    '/home/ubuntu/bert-ner-datasets/BTC/CONLL-format/data/a.conll',
    '/home/ubuntu/bert-ner-datasets/BTC/CONLL-format/data/b.conll',
    '/home/ubuntu/bert-ner-datasets/BTC/CONLL-format/data/e.conll',
    '/home/ubuntu/bert-ner-datasets/BTC/CONLL-format/data/f.conll',
    '/home/ubuntu/bert-ner-datasets/BTC/CONLL-format/data/g.conll',
    '/home/ubuntu/bert-ner-datasets/BTC/CONLL-format/data/h.conll',
]

In [518]:
def btc_reader(inpt_file):
    """Read a two-column, tab-separated BTC file into a token/label frame.

    The output starts with a ``-DOCSTART-``/``O`` row and an empty row, and
    ends with an empty row.  A line equal to ``'\\n'`` becomes an empty row.
    A line that has no tab after stripping becomes a row with an empty token
    and the label ``O``.  Every other line contributes its first two
    tab-separated columns as token and label.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    tok_lst = ['-DOCSTART-', '']
    label_lst = ['O', '']
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for x in f:
            if x == '\n':
                tok_lst.append('')
                label_lst.append('')
                continue
            split_sent = x.strip().split('\t')
            if len(split_sent) == 1:
                # NOTE(review): single-field lines yield an *empty token* with
                # label 'O' (not an empty/empty boundary row, and the field's
                # text is dropped) — confirm this is what downstream expects.
                tok_lst.append('')
                label_lst.append('O')
                continue
            # At least two fields are guaranteed here, so the try/except that
            # used to wrap these lookups could never trigger and was removed.
            tok_lst.append(split_sent[0])
            label_lst.append(split_sent[1])
    tok_lst.append('')
    label_lst.append('')
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [519]:
# Read each BTC section once and concatenate in a single call.
# (DataFrame.append was removed in pandas 2.0 and re-copied the frame per file.)
btc_frames = [btc_reader(path) for path in btc_file_path]
count = len(btc_frames)
btc_conll = pd.concat(btc_frames, ignore_index=True)

In [520]:
btc_conll.shape

(159744, 2)

# WNUT17

In [513]:
# WNUT17 train / test / dev files.  Line continuations are implicit inside the
# brackets, so no trailing backslashes are needed.
wnut_file_path = [
    '/home/ubuntu/bert-ner-datasets/WNUT17/CONLL-format/data/train/wnut17train.conll',
    '/home/ubuntu/bert-ner-datasets/WNUT17/CONLL-format/data/test/emerging.test.annotated',
    '/home/ubuntu/bert-ner-datasets/WNUT17/CONLL-format/data/dev/emerging.dev.conll',
]

In [514]:
def wnut_reader(inpt_file):
    """Read a two-column, tab-separated WNUT17 file into a token/label frame.

    The output starts with a ``-DOCSTART-``/``O`` row and an empty row, and
    ends with an empty row.  A line equal to ``'\\n'`` becomes an empty row.
    Every other line is stripped and split on tabs: column one is the token,
    column two the label.  Lines with fewer than two columns are reported on
    stdout and skipped.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    tok_lst = ['-DOCSTART-', '']
    label_lst = ['O', '']
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for ix, x in enumerate(f):
            if x == '\n':
                tok_lst.append('')
                label_lst.append('')
                continue
            x = x.strip()
            split_sent = x.split('\t')
            if len(split_sent) < 2:
                # Explicit length check replaces the former bare `except:`.
                print(inpt_file, ix, x)
                continue
            tok_lst.append(split_sent[0])
            label_lst.append(split_sent[1])
    tok_lst.append('')
    label_lst.append('')
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [515]:
# Read each WNUT17 split once and concatenate in a single call.
# (DataFrame.append was removed in pandas 2.0 and re-copied the frame per file.)
wnut_frames = [wnut_reader(path) for path in wnut_file_path]
count = len(wnut_frames)
wnut_conll = pd.concat(wnut_frames, ignore_index=True)

In [516]:
wnut_conll.shape

(107556, 2)

# re3d

In [508]:
# Root of the re3d CoNLL files; walked recursively with os.walk further down.
# NOTE(review): absolute, machine-specific path — adjust per host.
re3d_path = '/home/ubuntu/bert-ner-datasets/re3d/CONLL-format/data-by-source/conll'

In [509]:
def re3d_reader(inpt_file):
    """Read a two-column, tab-separated re3d file into a token/label frame.

    The output starts with a ``-DOCSTART-``/``O`` row and an empty row, and
    ends with an empty row.  A line equal to ``'\\n'`` becomes an empty row.
    Every other line is stripped and split on tabs: column one is the token,
    column two the label.  Lines with fewer than two columns are reported on
    stdout and skipped.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner`` in file order.
    """
    tok_lst = ['-DOCSTART-', '']
    label_lst = ['O', '']
    # `with` closes the handle even if a read/decode error interrupts the loop.
    with open(inpt_file, "r") as f:
        for ix, x in enumerate(f):
            if x == '\n':
                tok_lst.append('')
                label_lst.append('')
                continue
            x = x.strip()
            split_sent = x.split('\t')
            if len(split_sent) < 2:
                # Explicit length check replaces the former bare `except:`.
                print(inpt_file, ix, x)
                continue
            tok_lst.append(split_sent[0])
            label_lst.append(split_sent[1])
    tok_lst.append('')
    label_lst.append('')
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [510]:
# Read every *.conll file under the re3d root and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.)
re3d_frames = []
for root, dirs, files in os.walk(re3d_path):
    dirs.sort()  # in-place sort fixes os.walk's descent order -> reproducible rows
    for fname in sorted(files):
        # Suffix test instead of the regex ".*.conll" (unescaped dot, no end anchor).
        if fname.endswith(".conll"):
            re3d_frames.append(re3d_reader(os.path.join(root, fname)))
count = len(re3d_frames)
# pd.concat raises on an empty list; fall back to an empty frame as before.
re3d_conll = pd.concat(re3d_frames, ignore_index=True) if re3d_frames else pd.DataFrame([])
print(count)

99


In [512]:
re3d_conll.shape

(26723, 2)

# SEC-filings

In [493]:
# SEC-filings train and test files.  Line continuations are implicit inside the
# brackets, so no trailing backslash is needed.
sec_file_path = [
    '/home/ubuntu/bert-ner-datasets/SEC-filings/CONLL-format/data/train/FIN5.txt',
    '/home/ubuntu/bert-ner-datasets/SEC-filings/CONLL-format/data/test/FIN3.txt',
]

In [502]:
def sec_reader(inpt_file):
    """Read a whitespace-delimited SEC-filings file into a token/label frame.

    For each non-blank line the first whitespace-separated field is the token
    and the last field is the NER label (a single-field line yields that field
    for both).  Blank or whitespace-only lines are kept as a row of two empty
    strings.  No ``-DOCSTART-`` row is added by this reader.

    Parameters
    ----------
    inpt_file : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``token`` and ``ner``, one row per input line, in file order.
    """
    tok_lst = []
    label_lst = []
    # `with` releases the handle even when reading/decoding raises.
    with open(inpt_file, "r") as f:
        for x in f:
            line = x.split()
            if len(line) == 0:
                # Covers '\n' as well as whitespace-only lines; the latter used
                # to crash on line[0] before the old try/except was reached.
                tok_lst.append('')
                label_lst.append('')
            else:
                tok_lst.append(line[0])
                label_lst.append(line[-1])
    d = {'token': tok_lst, 'ner': label_lst}
    return pd.DataFrame(d, columns=['token', 'ner'])

In [503]:
# Read both SEC-filings files once and concatenate in a single call.
# (DataFrame.append was removed in pandas 2.0 and re-copied the frame per file.)
sec_frames = [sec_reader(path) for path in sec_file_path]
count = len(sec_frames)
sec_conll = pd.concat(sec_frames, ignore_index=True)

In [504]:
sec_conll.shape

(55739, 2)

# Combined_dfs

In [557]:
# Stack all ten corpus frames into one table with a fresh 0..n-1 index
# (ignore_index=True replaces the reset_index + drop-'index'-column sequence).
ner_combine = pd.concat(
    [
        conl2003,
        nltk_ieer,
        gmb_conll,
        gum_conll,
        wikigold_conll,
        ritter_conll,
        btc_conll,
        wnut_conll,
        re3d_conll,
        sec_conll,
    ],
    ignore_index=True,
)

In [558]:
ner_combine.shape

(2459768, 2)

In [561]:
# Persist the combined table to 'ner_combine.csv' (relative to the working directory).
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an extra leading column — confirm downstream readers expect it.
ner_combine.to_csv('ner_combine.csv')