In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import time
import pickle

import numpy as np
from tqdm.notebook import tqdm
from collections import Counter

from sklearn import metrics
from sklearn.metrics import classification_report, f1_score


# Todo

[X]  Adding other info such as "Capitalized" and "contains punctuation"

[_]  Two-stage encoding (first the node then the context)

[_]  Use Graph Embeddings instead of one hot embeddings

[_]  Data Aug: add edges from embedding-similar words who share the same class

[_]  Accounting for OOV (randomly injecting some "UNK" token/only when the token doesn't appear in Conceptnet/Gazetteer)

[_]  Use Word Pieces

[_]  Autoencoder / Cloze pretraining

In [3]:
! nvidia-smi

Sat Nov  7 02:50:17 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04    Driver Version: 455.23.04    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000000:05:00.0 Off |                    0 |
| N/A   24C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000000:06:00.0 Off |                    0 |
| N/A   30C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|       

In [4]:
torch.manual_seed(42)
np.random.seed(42)

# Setup

In [5]:
%%time
X_train, Y_train = zip(*pickle.load(open('data/conll2003_sparse_train.pickle', 'rb')))
X_dev,   Y_dev   = zip(*pickle.load(open('data/conll2003_sparse_dev.pickle',   'rb')))
X_test,  Y_test  = zip(*pickle.load(open('data/conll2003_sparse_test.pickle',  'rb')))

In [6]:
len(X_train), len(X_dev), len(X_test)

(172046, 43525, 39107)

In [8]:
input_dim = 62981
labels = ['PER', 'ORG', 'LOC', 'MISC', 'O']
labels_to_id = {l:i for i, l in enumerate(labels)}

In [124]:
62981 - 62937 

44

In [125]:
label_counter   = Counter(Y_train)
labels_freqs    = [label_counter[label] / sum(label_counter.values()) for label in labels]
labels_weights  = [min(label_counter.values()) / label_counter[label] for label in labels]
labels_weights2 = [np.sqrt(min(label_counter.values())) / np.sqrt(label_counter[label]) for label in labels]
print(labels_freqs)
print(labels_weights)
print(labels_weights2)

[0.06465712658242563, 0.057781058554107624, 0.048167350592283456, 0.026562663473722145, 0.8028318007974612]
[0.4108234448040273, 0.459712302585253, 0.5514661518040304, 1.0, 0.033086212388867976]
[0.6409551035790473, 0.6780208717917561, 0.7426076701758677, 1.0, 0.1818961582575838]


In [126]:
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print('Device:', device)
torch.backends.cudnn.benchmark = True

Device: cuda


# Data

In [11]:
class Dataset(data.Dataset):
    def __init__(self, X, Y, labels):
        'Initialization'
        self.X = X
        self.y2index = {l: i for i, l in enumerate(labels)}
        self.Y = Y
        self.labels = labels
        assert(len(X) == len(Y))

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.X)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        x = self.X[index].to_dense().clone().detach() #.to('cuda') # [:voc_size]
        y = self.y2index[self.Y[index]]
        
        return x, y

In [127]:
batch_size  = 32
num_workers = 4


sampling_probs = [labels_weights2[labels_to_id[l]] for l in Y_train]
sampler = torch.utils.data.sampler.WeightedRandomSampler(sampling_probs, len(Y_train), replacement=True)
train_set = Dataset(X_train, Y_train, labels)
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, shuffle=True) # , sampler=sampler) #

dev_set = Dataset(X_dev, Y_dev, labels)
dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False, num_workers=num_workers)

test_set = Dataset(X_test, Y_test, labels)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [128]:
t = time.time()
print(len(train_loader))
for batch_X, batch_Y in train_loader:
    print(batch_X.shape)
    print(batch_Y.shape)
    print(sum(batch_X[0]))
    print('Class distribution in this batch:', Counter(batch_Y.numpy()))
    break
print(f'time: {time.time() - t:.3}s')

5377
torch.Size([32, 62981])
torch.Size([32])
tensor(8.)
Class distribution in this batch: Counter({4: 28, 2: 2, 1: 1, 3: 1})
time: 1.05s


# Model / Training / Evaluation

In [129]:
writer = SummaryWriter(log_dir=None,filename_suffix='reproducingbest')
# writer.add_scalar("Loss/train", loss, epoch)
# writer.flush()

In [130]:
def backprop(batch_X, batch_Y, model, optimizer, loss_fn):
    Y_hat = model(batch_X)
    loss = loss_fn(Y_hat, batch_Y)
    loss.backward()
    optimizer.step()
    
    return loss.item()

In [136]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, input_dim=input_dim, hidden_dim=512, output_dim=5, dropout_rate=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fch = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

        # extra layers layers
        self.dropout = nn.Dropout(p=dropout_rate)
        # self.batchnorm1 = nn.BatchNorm1d(hidden_dim)
        # self.batchnorm2 = nn.BatchNorm1d(hidden_dim)

        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fch(x)
        x = self.relu(x)
        x = self.dropout(x)
        #x = self.fch2(x)
        # x = self.dropout(x)
        # x = self.relu(x)
        logits = self.fc2(x)

        return logits

In [137]:
logs = {'loss/train': {}, 'dev': {}}

In [138]:
ffnet = FeedForwardNetwork().to('cuda')

In [139]:
log_interval = int(len(train_loader) / 4)
weights = torch.Tensor(labels_weights2).to('cuda')
weights

tensor([0.6410, 0.6780, 0.7426, 1.0000, 0.1819], device='cuda:0')

In [147]:
optimizer_params = {'lr': 1e-5, 
                    'momentum': 0.9, 
                    'weight_decay': 5e-4,
                   }

loss_fn = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.SGD(ffnet.parameters(), **optimizer_params)

In [150]:
%%time
max_epochs = 5

for epoch in range(len(logs['loss/train']), len(logs['loss/train']) + max_epochs):
    
    # Training
    ffnet.train()
    print('Epoch', epoch)
    logs['loss/train'][epoch] = []
    writer.add_scalar("Learning_rate", optimizer_params['lr'], epoch)

    for batch, (batch_X, batch_Y) in enumerate(tqdm(train_loader)):
        # tranfer to GPU
        batch_X, batch_Y = batch_X.to(device), batch_Y.to(device)
        optimizer.zero_grad()
        l = backprop(batch_X, batch_Y, ffnet, optimizer, loss_fn)
        logs['loss/train'][epoch].append(l)
        
        if batch % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch * len(batch_X), len(train_loader.dataset),
                100. * batch / len(train_loader), l))
    
    logs['loss/train'][epoch] = np.mean(logs['loss/train'][epoch])
    writer.add_scalar("Loss/train", logs['loss/train'][epoch], epoch)
    print(f'Average loss on epoch {epoch}: {logs["loss/train"][epoch]}')
    
    # Validation
    ffnet.eval()
    with torch.no_grad():
        preds = []
        gt = []
        for batch, (batch_X, batch_Y) in enumerate(tqdm(dev_loader)):
            # Transfer to GPU
            batch_X = batch_X.to(device)
            output = nn.Softmax(dim=1)(ffnet(batch_X))
            preds.append(output.cpu())
            gt.append(batch_Y)
    
        all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
        all_gt  = [l for batch in gt for l in batch.numpy()]
        
        print(classification_report(all_out, all_gt, digits=4))
        micro_F1 = metrics.f1_score(all_gt, all_out, average='micro')
        macro_F1 = metrics.f1_score(all_gt, all_out, average='macro')
        weighted_F1 = metrics.f1_score(all_gt, all_out, average='weighted')
        writer.add_scalar("micro_F1/dev", micro_F1, epoch)
        writer.add_scalar("macro_F1/dev", macro_F1, epoch)
        writer.add_scalar("weighted_F1/dev", weighted_F1, epoch)
        logs['dev'][epoch] = (micro_F1, weighted_F1, macro_F1, (all_gt, all_out))

Epoch 20


HBox(children=(FloatProgress(value=0.0, max=5377.0), HTML(value='')))


Average loss on epoch 20: 0.05545189463163582


HBox(children=(FloatProgress(value=0.0, max=1361.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.9251    0.8875    0.9059      3270
           1     0.8197    0.7521    0.7845      2231
           2     0.8791    0.9279    0.9028      1983
           3     0.7850    0.8757    0.8278      1134
           4     0.9871    0.9893    0.9882     34907

    accuracy                         0.9637     43525
   macro avg     0.8792    0.8865    0.8819     43525
weighted avg     0.9637    0.9637    0.9635     43525

Epoch 21


HBox(children=(FloatProgress(value=0.0, max=5377.0), HTML(value='')))


Average loss on epoch 21: 0.05548100383044753


HBox(children=(FloatProgress(value=0.0, max=1361.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.9248    0.8874    0.9057      3269
           1     0.8197    0.7518    0.7843      2232
           2     0.8791    0.9279    0.9028      1983
           3     0.7850    0.8772    0.8285      1132
           4     0.9872    0.9893    0.9882     34909

    accuracy                         0.9637     43525
   macro avg     0.8792    0.8867    0.8819     43525
weighted avg     0.9637    0.9637    0.9635     43525

Epoch 22


HBox(children=(FloatProgress(value=0.0, max=5377.0), HTML(value='')))


Average loss on epoch 22: 0.05503848850699715


HBox(children=(FloatProgress(value=0.0, max=1361.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.9235    0.8881    0.9055      3262
           1     0.8212    0.7474    0.7826      2249
           2     0.8791    0.9284    0.9031      1982
           3     0.7858    0.8781    0.8294      1132
           4     0.9870    0.9894    0.9882     34900

    accuracy                         0.9636     43525
   macro avg     0.8793    0.8863    0.8817     43525
weighted avg     0.9635    0.9636    0.9634     43525

Epoch 23


HBox(children=(FloatProgress(value=0.0, max=5377.0), HTML(value='')))


Average loss on epoch 23: 0.05552467903463866


HBox(children=(FloatProgress(value=0.0, max=1361.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.9296    0.8844    0.9064      3297
           1     0.8178    0.7571    0.7863      2211
           2     0.8758    0.9290    0.9016      1973
           3     0.7826    0.8761    0.8267      1130
           4     0.9873    0.9893    0.9883     34914

    accuracy                         0.9639     43525
   macro avg     0.8786    0.8872    0.8819     43525
weighted avg     0.9640    0.9639    0.9637     43525

Epoch 24


HBox(children=(FloatProgress(value=0.0, max=5377.0), HTML(value='')))


Average loss on epoch 24: 0.05534880992705139


HBox(children=(FloatProgress(value=0.0, max=1361.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.9248    0.8880    0.9060      3267
           1     0.8202    0.7519    0.7846      2233
           2     0.8758    0.9290    0.9016      1973
           3     0.7858    0.8742    0.8276      1137
           4     0.9873    0.9892    0.9882     34915

    accuracy                         0.9637     43525
   macro avg     0.8788    0.8865    0.8816     43525
weighted avg     0.9637    0.9637    0.9635     43525

CPU times: user 9min 22s, sys: 1min 12s, total: 10min 35s
Wall time: 10min 39s


In [151]:
ffnet.eval()
with torch.no_grad():
    preds = []
    gt = []
    for batch, (batch_X, batch_Y) in enumerate(tqdm(test_loader)):
        # Transfer to GPU
        batch_X = batch_X.to(device)
        output = nn.Softmax(dim=1)(ffnet(batch_X))
        preds.append(output.cpu())
        gt.append(batch_Y)

    all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
    all_gt  = [l for batch in gt for l in batch.numpy()]

    print(classification_report(all_out, all_gt, digits=4))

HBox(children=(FloatProgress(value=0.0, max=1223.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.8918    0.8551    0.8731      2892
           1     0.7371    0.6905    0.7130      2659
           2     0.8073    0.8557    0.8308      1816
           3     0.7346    0.7001    0.7170       957
           4     0.9776    0.9847    0.9811     30783

    accuracy                         0.9421     39107
   macro avg     0.8297    0.8172    0.8230     39107
weighted avg     0.9410    0.9421    0.9415     39107



In [191]:
logs1['dev'][39][:3]

(0.8330477919554045, 0.8547009930379798, 0.6680419619738478)