In [1]:
import re
import time
import string
import pickle
import numpy as np
import pandas as pd

from datasets import load_dataset

from tqdm.notebook import tqdm
from collections import Counter

from sklearn import metrics
from sklearn.metrics import classification_report, f1_score

In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Load the pickled dataset from the working directory and print how many
# entries each split contains.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('conll_graph_all.pickle', 'rb') as dataset_file:
    # The context manager guarantees the file handle is closed, even if
    # unpickling raises.
    dataset = pickle.load(dataset_file)

print(', '.join(f'{split} : {len(dataset[split])}' for split in dataset))

train : 178610, validation : 44900, test : 40760


In [4]:
# Group labels by the first element of each document's 'word' field, pooling
# every split together. Repeated labels are kept, so the list lengths reflect
# how often each (key, label) pair occurs.
# presumably doc['word'][0] is a token string -- confirm against the pickle schema.
word2labels = {}
for split in dataset:
    for doc in dataset[split]:
        # setdefault creates the empty list the first time a key is seen.
        word2labels.setdefault(doc['word'][0], []).append(doc['label'])

In [5]:
# Split the keys of word2labels into two tallies:
#   counter     -- keys whose label list contains more than one distinct label
#   anticounter -- keys whose label list contains a single distinct label
# Each key in the first group is printed with its label frequencies.
counter = 0
anticounter = 0
for w in word2labels:
    label_counts = Counter(word2labels[w])
    # A Counter has one entry per distinct label, so its length equals the
    # number of distinct labels recorded for this key.
    if len(label_counts) <= 1:
        anticounter += 1
        continue
    print(w, label_counts.most_common())
    counter += 1

german [('MISC', 139), ('ORG', 2), ('PER', 1)]
british [('MISC', 105), ('ORG', 21), ('LOC', 5), ('O', 1)]
blackburn [('ORG', 16), ('PER', 1)]
brussels [('LOC', 44), ('MISC', 2), ('ORG', 2)]
<UNK> [('O', 30339), ('PER', 1146), ('ORG', 695), ('MISC', 372), ('LOC', 188)]
the [('O', 12228), ('ORG', 34), ('LOC', 30), ('MISC', 17), ('PER', 1)]
european [('MISC', 107), ('ORG', 37)]
commission [('ORG', 51), ('O', 24), ('MISC', 4)]
said [('O', 2690), ('PER', 4)]
on [('O', 3108), ('MISC', 6), ('ORG', 1)]
shun [('O', 1), ('MISC', 1)]
disease [('O', 39), ('MISC', 2)]
can [('O', 100), ('LOC', 1)]
germany [('LOC', 240), ('ORG', 2)]
s [('O', 2277), ('ORG', 54), ('LOC', 7), ('MISC', 4), ('PER', 1)]
union [('ORG', 70), ('O', 33), ('LOC', 7)]
veterinary [('O', 8), ('ORG', 1)]
committee [('O', 32), ('ORG', 9), ('MISC', 2)]
wednesday [('O', 287), ('ORG', 13)]
countries [('O', 54), ('MISC', 1)]
than [('O', 219), ('PER', 1)]
britain [('LOC', 167), ('ORG', 1)]
do [('O', 144), ('PER', 3), ('ORG', 1)]
n't [('O

plate [('O', 2), ('ORG', 1)]
key [('O', 31), ('PER', 2)]
interim [('O', 16), ('ORG', 1)]
stanley [('PER', 2), ('ORG', 2)]
athletics [('O', 19), ('ORG', 6)]
career [('O', 37), ('ORG', 2)]
mo [('PER', 2), ('LOC', 2), ('O', 1)]
juan [('PER', 20), ('LOC', 1)]
era [('MISC', 5), ('O', 4)]
blue [('O', 9), ('ORG', 5)]
brewers [('ORG', 7), ('O', 6)]
jose [('PER', 32), ('LOC', 8), ('ORG', 2)]
eindhoven [('ORG', 15), ('LOC', 1)]
arthur [('PER', 3), ('ORG', 2), ('LOC', 1)]
numan [('PER', 2), ('O', 1)]
defence [('O', 39), ('ORG', 4)]
breda [('ORG', 10), ('LOC', 1)]
swiss [('MISC', 42), ('ORG', 2), ('O', 1)]
hamburg [('ORG', 12), ('LOC', 7)]
antonio [('PER', 11), ('ORG', 3), ('LOC', 1)]
stuttgart [('ORG', 10), ('LOC', 6)]
lausanne [('LOC', 1), ('ORG', 1)]
milan [('ORG', 27), ('LOC', 11), ('PER', 2)]
house [('O', 44), ('ORG', 17), ('LOC', 17)]
grand [('MISC', 46), ('O', 20), ('LOC', 1)]
glass [('O', 5), ('PER', 1)]
carl [('PER', 11), ('MISC', 2)]
max [('PER', 11), ('O', 1)]
miles [('O', 72), ('PER', 

lion [('O', 1), ('ORG', 1)]
marine [('O', 1), ('ORG', 1)]
lions [('O', 5), ('ORG', 2)]
scientist [('O', 6), ('ORG', 1)]
plastic [('O', 23), ('ORG', 1)]
cups [('O', 1), ('MISC', 1)]
hack [('O', 1), ('PER', 1)]
environment [('O', 5), ('ORG', 1)]
labor [('MISC', 7), ('O', 2), ('ORG', 1)]
toyota [('MISC', 9), ('ORG', 5)]
melbourne [('LOC', 20), ('ORG', 8)]
mountain [('O', 9), ('ORG', 3), ('LOC', 3)]
copper [('O', 30), ('ORG', 3)]
project [('O', 13), ('ORG', 1), ('PER', 1)]
shanghai [('LOC', 15), ('ORG', 4)]
metal [('O', 9), ('ORG', 1)]
metals [('O', 10), ('ORG', 1)]
economics [('ORG', 1), ('O', 1)]
commodities [('ORG', 11), ('O', 3)]
companion [('ORG', 4), ('O', 4)]
marble [('ORG', 3), ('O', 1)]
posts [('O', 10), ('ORG', 1)]
holdings [('ORG', 7), ('O', 2)]
construction [('O', 10), ('ORG', 1)]
boards [('O', 3), ('MISC', 1)]
asset [('O', 1), ('ORG', 1)]
les [('PER', 5), ('ORG', 2)]
permanent [('O', 4), ('ORG', 1)]
taylor [('PER', 8), ('ORG', 3)]
corps [('ORG', 3), ('O', 1)]
enterprise [('MIS

In [6]:
# Number of word2labels keys that were recorded with more than one distinct label.
counter

1698

In [7]:
# Number of word2labels keys whose recorded labels are all the same.
anticounter

17295