# Todo

[X]  Adding other info such as "Capitalized" and "contains punctuation"

[_]  Two-stage encoding (first the node then the context)

[_]  Use Graph Embeddings instead of one hot embeddings

[_]  Accounting for OOV (randomly injecting some "UNK" token/only when the token doesn't appear in Conceptnet/Gazetteer)

[_]  Use Word Pieces

[_]  Autoencoder / Cloze pretraining

# Tutorials
https://huggingface.co/docs/datasets/loading_datasets.html
https://www.learnopencv.com/tensorboard-with-pytorch-lightning/
https://discuss.pytorch.org/t/pytorch-coding-conventions/42548
https://huggingface.co/transformers/examples.html
https://pytorch-lightning.readthedocs.io/en/latest/new-project.html
https://pytorch-lightning.readthedocs.io/en/latest/rapid_prototyping_templates.html
https://pytorch-lightning.readthedocs.io/en/latest/style_guide.html
https://pytorch-lightning.readthedocs.io/en/latest/performance.html
https://pytorch-lightning.readthedocs.io/en/latest/loggers.html
https://pytorch-lightning.readthedocs.io/en/latest/metrics.html
https://pytorch-lightning.readthedocs.io/en/stable/weights_loading.html
https://pytorch-lightning.readthedocs.io/en/0.4.9/Trainer/Checkpointing/
https://pytorch-lightning.readthedocs.io/en/stable/metrics.html#stat-scores-multiple-classes-func
https://pytorch-lightning-bolts.readthedocs.io/en/latest/classic_ml.html#logistic-regression
https://pytorch-lightning.readthedocs.io/en/latest/trainer.html
https://github.com/PyTorchLightning/deep-learning-project-template
https://github.com/PyTorchLightning/pytorch-lightning

In [1]:
# ! pip uninstall torchvision
# ! pip -q install pytorch_lightning

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2, 3"

In [2]:
import torch
from collections import Counter
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1_score, stat_scores_multiple_classes

import pickle
from sklearn.metrics import classification_report

from tqdm.notebook import tqdm

In [3]:
# torch.backends.cudnn.benchmark = True

In [4]:
import numpy as np

In [5]:
%%time
X_train, Y_train = zip(*pickle.load(open('data/conll2003_sparse_train.pickle', 'rb')))
X_dev,   Y_dev   = zip(*pickle.load(open('data/conll2003_sparse_test.pickle', 'rb')))
X_test,  Y_test  = zip(*pickle.load(open('data/conll2003_sparse_test.pickle', 'rb')))

CPU times: user 49.5 s, sys: 743 ms, total: 50.2 s
Wall time: 50.2 s


In [7]:
len(X_train)

172046

In [6]:
# %%time
# X_train = [x.to_dense()  for x in tqdm(X_train)]
# X_dev = [x.to_dense()  for x in tqdm(X_dev)]
# X_test = [x.to_dense()  for x in tqdm(X_test)]

In [7]:
input_size = 62981
labels = ['PER', 'ORG', 'LOC', 'MISC', 'O']
labels_to_id = {l:i for i, l in enumerate(labels)}

In [8]:
len(X_dev)

39107

In [12]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, Y, labels):
        """
            X = Input matrix, shape: [N_samples, dim_sample]
            Y = Output list, shape: [N_samples], contains labels strings
            labels = textual labels for the classes
        """
        assert(len(X) == len(Y))
        assert(all([label in labels for label in Y]))
        self.X = X
        self.Y = Y
        self.X_len = len(X)
        self.Y_len = len(labels)
        self.labels = sorted(labels)
        self.y2index = {l: i for i, l in enumerate(labels)}
        self.y2onehot = {l: np.eye(self.Y_len)[i] for i, l in enumerate(labels)}

    def __len__(self):
        return self.X_len

    def __getitem__(self, index):
        x = self.X[index].to_dense().clone().detach() #.to('cuda') # [:voc_size]
        y = self.y2index[self.Y[index]]

        return x, y

In [13]:
batch_size = 32
num_workers = 4

train_set = Dataset(X_train, Y_train, labels)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, shuffle=True)

dev_set = Dataset(X_dev, Y_dev, labels)
dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=batch_size, num_workers=num_workers, shuffle=False)

test_set = Dataset(X_test, Y_test, labels)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=num_workers, shuffle=False)

In [14]:
for local_features, local_labels in train_loader:
    print(local_features.shape)
    print(local_labels.shape)
    break

for local_features, local_labels in dev_loader:
    print(local_features.shape)
    print(local_labels.shape)
    break

torch.Size([32, 62981])
torch.Size([32])
torch.Size([32, 62981])
torch.Size([32])


In [15]:
# all_training_labels = []
# for local_features, local_labels in train_loader:
#     all_training_labels.extend([x for x in local_labels.numpy()])
# training_counter = Counter(all_training_labels)
training_counter = Counter(Y_train)
training_counter

Counter({'ORG': 9941, 'O': 138124, 'MISC': 4570, 'PER': 11124, 'LOC': 8287})

In [17]:
# weights = np.power(labels_df.iloc[train_IDs][action].value_counts().values, 1)
weights = torch.Tensor([min(training_counter.values())/training_counter[cls] for cls in labels]) #.to('cuda')
weights

tensor([0.4108, 0.4597, 0.5515, 1.0000, 0.0331])

In [74]:
class MLP(pl.LightningModule):
    
    def __init__(self, input_dim=input_size, hidden_dim=1024, output_dim=5, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()

        self.l1 = torch.nn.Linear(self.hparams.input_dim, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, output_dim)
        self.l3 = torch.nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim)

    def forward(self, x):
        # x = x.view(x.size(0), -1)
        x = torch.relu(self.l1(x))
        # x = torch.relu(self.l3(x))
        x = torch.relu(self.l2(x))
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=weights)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        preds = torch.argmax(y_hat, dim=1)
        f1 = f1_score(preds, y)
        self.log('valid_loss', loss)
        self.log('valid_acc', f1)

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        preds = torch.argmax(y_hat, dim=1)
        f1 = accuracy(preds, y)
        self.log('test_loss', loss)
        self.log('test_acc', f1)

    def evaluate(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        preds = torch.argmax(y_hat, dim=1)
        
        return preds.to('cpu')

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

In [75]:
pl.seed_everything(42)

42

In [76]:
%%time
# ------------
# model
# ------------
model = MLP(learning_rate=1e-3)

# ------------
# training
# ------------
trainer = pl.Trainer(max_epochs=3)#, gpus='0')
trainer.fit(model, train_loader, dev_loader)

# ------------
# testing
# ------------
trainer.test(test_dataloaders=test_loader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name | Type   | Params
--------------------------------
0 | l1   | Linear | 64 M  
1 | l2   | Linear | 5 K   
2 | l3   | Linear | 1 M   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8491),
 'test_loss': tensor(1.5345),
 'valid_acc': tensor(nan),
 'valid_loss': tensor(1.5345)}
--------------------------------------------------------------------------------

CPU times: user 19h 8min 27s, sys: 2h 17min 35s, total: 21h 26min 3s
Wall time: 1h 34min 22s


[{'valid_loss': 1.5344531536102295,
  'valid_acc': nan,
  'test_loss': 1.5344531536102295,
  'test_acc': 0.8490551710128784}]

In [70]:
preds = []
gt = []
for local_features, local_labels in tqdm(test_loader, total=len(test_loader)):
    preds.extend(torch.argmax(model(local_features), dim=1))
    gt.extend(local_labels)

print(len(preds))
results = stat_scores_multiple_classes(torch.tensor(preds), torch.tensor(gt))
results

HBox(children=(FloatProgress(value=0.0, max=1223.0), HTML(value='')))


39107


(tensor([ 2413.,     0.,     0.,     0., 30791.]),
 tensor([ 482.,    0.,    0.,    0., 5421.]),
 tensor([35852., 36616., 37182., 38195.,  2680.]),
 tensor([ 360., 2491., 1925.,  912.,  215.]),
 tensor([ 2773.,  2491.,  1925.,   912., 31006.]))

In [71]:
print(len(preds))
results = stat_scores_multiple_classes(torch.tensor(preds), torch.tensor(gt))
results

39107


(tensor([ 2413.,     0.,     0.,     0., 30791.]),
 tensor([ 482.,    0.,    0.,    0., 5421.]),
 tensor([35852., 36616., 37182., 38195.,  2680.]),
 tensor([ 360., 2491., 1925.,  912.,  215.]),
 tensor([ 2773.,  2491.,  1925.,   912., 31006.]))

In [72]:
gt_count = Counter(torch.tensor(gt).numpy())
pr_count = Counter(torch.tensor(preds).numpy())

for i, l in enumerate(labels):
    print(l, '\t', gt_count[i], '\t', pr_count[i],  '\t', str(round(pr_count[i] / gt_count[i], 4)*100) + '%')

PER 	 2773 	 2895 	 104.4%
ORG 	 2491 	 0 	 0.0%
LOC 	 1925 	 0 	 0.0%
MISC 	 912 	 0 	 0.0%
O 	 31006 	 36212 	 116.78999999999999%


In [73]:
print(classification_report(torch.tensor(gt).numpy(), torch.tensor(preds).numpy()))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      2773
           1       0.00      0.00      0.00      2491
           2       0.00      0.00      0.00      1925
           3       0.00      0.00      0.00       912
           4       0.85      0.99      0.92     31006

    accuracy                           0.85     39107
   macro avg       0.34      0.37      0.35     39107
weighted avg       0.73      0.85      0.79     39107



In [35]:
print("""lr=1e-2, n_epochs=?, ws=3, 
              precision    recall  f1-score   support

           0       0.94      0.59      0.73      2773
           1       0.80      0.59      0.68      2491
           2       0.00      0.00      0.00      1925
           3       0.00      0.00      0.00       912
           4       0.87      1.00      0.93     31006

    accuracy                           0.87     39107
   macro avg       0.52      0.44      0.47     39107
weighted avg       0.81      0.87      0.83     39107
""")

lr=1e-2
              precision    recall  f1-score   support

           0       0.94      0.59      0.73      2773
           1       0.80      0.59      0.68      2491
           2       0.00      0.00      0.00      1925
           3       0.00      0.00      0.00       912
           4       0.87      1.00      0.93     31006

    accuracy                           0.87     39107
   macro avg       0.52      0.44      0.47     39107
weighted avg       0.81      0.87      0.83     39107



In [44]:
print("""lr=1e-1, n_epochs=5, ws=3, 
                precision    recall  f1-score   support

           0       0.00      0.00      0.00      2773
           1       0.00      0.00      0.00      2491
           2       0.00      0.00      0.00      1925
           3       0.00      0.00      0.00       912
           4       0.79      1.00      0.88     31006

    accuracy                           0.79     39107
   macro avg       0.16      0.20      0.18     39107
weighted avg       0.63      0.79      0.70     39107
""")

lr=1e-2
                precision    recall  f1-score   support

           0       0.00      0.00      0.00      2773
           1       0.00      0.00      0.00      2491
           2       0.00      0.00      0.00      1925
           3       0.00      0.00      0.00       912
           4       0.79      1.00      0.88     31006

    accuracy                           0.79     39107
   macro avg       0.16      0.20      0.18     39107
weighted avg       0.63      0.79      0.70     39107



In [51]:
print("""n_epochs=30, hidden_size=512x1, 1e-3
                precision    recall  f1-score   support

           0       0.98      0.39      0.56      2773
           1       0.00      0.00      0.00      2491
           2       0.00      0.00      0.00      1925
           3       0.00      0.00      0.00       912
           4       0.82      1.00      0.90     31006

    accuracy                           0.82     39107
   macro avg       0.36      0.28      0.29     39107
weighted avg       0.72      0.82      0.75     39107""")


n_epochs=30, hidden_size=512, 1e-3
                precision    recall  f1-score   support

           0       0.98      0.39      0.56      2773
           1       0.00      0.00      0.00      2491
           2       0.00      0.00      0.00      1925
           3       0.00      0.00      0.00       912
           4       0.82      1.00      0.90     31006

    accuracy                           0.82     39107
   macro avg       0.36      0.28      0.29     39107
weighted avg       0.72      0.82      0.75     39107


In [45]:
print("""n_epochs=3, hidden_size=128x2, 
PER 	 2773 	 0 	 0.0%
ORG 	 2491 	 0 	 0.0%
LOC 	 1925 	 0 	 0.0%
MISC 	 912 	 0 	 0.0%
O 	 31006 	 39107 	 126.13000000000001%
""")

n_epochs=3, hidden_size=128x2, 
PER 	 2773 	 0 	 0.0%
ORG 	 2491 	 0 	 0.0%
LOC 	 1925 	 0 	 0.0%
MISC 	 912 	 0 	 0.0%
O 	 31006 	 39107 	 126.13000000000001%



In [37]:
print("""lr=1e-4
weights=on
epochs=2
precision    recall  f1-score   support

           0       0.00      0.00      0.00      2773
           1       0.37      0.81      0.51      2491
           2       0.70      0.84      0.76      1925
           3       0.47      0.78      0.59       912
           4       0.96      0.93      0.95     31006

    accuracy                           0.85     39107
   macro avg       0.50      0.67      0.56     39107
weighted avg       0.83      0.85      0.83     39107
""")

lr=1e-4
weights=on
epochs=2
precision    recall  f1-score   support

           0       0.00      0.00      0.00      2773
           1       0.37      0.81      0.51      2491
           2       0.70      0.84      0.76      1925
           3       0.47      0.78      0.59       912
           4       0.96      0.93      0.95     31006

    accuracy                           0.85     39107
   macro avg       0.50      0.67      0.56     39107
weighted avg       0.83      0.85      0.83     39107



In [None]:
print("""
lr=1e-3
max_epochs=3
hidden=512
weights=on
            precision    recall  f1-score   support

           0       0.91      0.79      0.85      2773
           1       0.00      0.00      0.00      2491
           2       0.78      0.84      0.81      1925
           3       0.59      0.74      0.65       912
           4       0.91      0.99      0.95     31006

    accuracy                           0.90     39107
   macro avg       0.64      0.67      0.65     39107
weighted avg       0.84      0.90      0.87     39107
""")