# Setup

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import re
import time
import pickle

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter

from sklearn import metrics
from sklearn.metrics import classification_report, f1_score

In [3]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fad385852f0>

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
dataset = {'train': [], 'validation': [], 'test': []}
dataset_path = '/data/graphner_embeddings/ae_emb_npy_2000_15epochs/'

for split in dataset:
    files_list = os.listdir(dataset_path+split)
    for i, filename in tqdm(enumerate(sorted(files_list)), total=len(files_list)):
        dataset[split].append(pickle.load(open(dataset_path+split+'/'+str(i)+'.pickle', 'rb')))

HBox(children=(FloatProgress(value=0.0, max=178610.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=44900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40760.0), HTML(value='')))




In [6]:
labels = pickle.load(open('labels.pickle', 'rb'))
label2id = {l: i for i, l in enumerate(labels)}
print(label2id)

{'LOC': 0, 'MISC': 1, 'O': 2, 'ORG': 3, 'PER': 4}


# Dataset

In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, split, label2id=label2id):
        X, Y = zip(*dataset[split])

        self.X = [torch.tensor(x) for x in X]
        self.Y = [torch.tensor(y) for y in Y]
        self.X_len = len(X)
        self.labels = sorted(label2id.keys())
        self.label2id = label2id

    def __len__(self):
        return self.X_len

    def __getitem__(self, index):
        x = self.X[index]
        y = self.Y[index]
        x.requires_grad = False
        
        return x, y

    def labels(self):
        return self.labels
    
    def Y(self):
        return self.Y

In [8]:
train_set = Dataset(dataset, 'train')
dev_set = Dataset(dataset, 'validation')
test_set = Dataset(dataset, 'test')

In [9]:
batch_size = 64
num_workers = 4

train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=batch_size, num_workers=num_workers, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=num_workers, shuffle=False)

In [10]:
input_dim = 0
for local_features, local_labels in train_loader:
    input_dim = local_features.shape[1]
    print(local_features.shape)
    print(local_labels.shape)
    break

torch.Size([64, 2000])
torch.Size([64])


In [11]:
input_dim

2000

In [12]:
training_counter = Counter([y.item() for y in train_set.Y])
print(training_counter)

Counter({2: 144631, 4: 11124, 3: 9984, 0: 8288, 1: 4583})


In [13]:
labels

['LOC', 'MISC', 'O', 'ORG', 'PER']

# The Model

In [14]:
def backprop(batch_X, batch_Y, model, optimizer, loss_fn):
    Y_hat = model(batch_X)
    loss = loss_fn(Y_hat, batch_Y)
    loss.backward()
    optimizer.step()
    
    return loss.item()

class FeedForwardNetwork(nn.Module):
    def __init__(self, input_dim=input_dim, hidden_dim=512, output_dim=5, dropout_rate=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fch = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

        # extra layers layers
        self.dropout = nn.Dropout(p=dropout_rate)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fch(x)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)

        return logits

In [15]:
ffnet = FeedForwardNetwork(dropout_rate=0.2, hidden_dim=1024).to(device)

In [16]:
logs = {'loss/train': {}, 'dev': {}}
writer = SummaryWriter(comment='xp5-autoreg-wei2-lr1e3-mom0.9-wd5e4-hd1024-dr0.2-bs64-dim2000-15', log_dir=None,)

In [17]:
label_counter   = Counter([y.item() for y in train_set.Y])
labels_freqs    = [label_counter[label] / sum(label_counter.values()) for label in range(len(labels))]
labels_weights1 = [min(label_counter.values()) / label_counter[label] for label in range(len(labels))]
labels_weights2 = [np.sqrt(min(label_counter.values())) / np.sqrt(label_counter[label]) for label in range(len(labels))]

weights = torch.Tensor(labels_weights2).to(device)
print(weights)

tensor([0.7436, 1.0000, 0.1780, 0.6775, 0.6419], device='cuda:0')


In [19]:
optimizer_params = {'lr': 5e-3, 
                    'momentum': 0.9, 
                    'weight_decay': 5e-4,
                   }

log_interval = int(len(train_loader) / 2)

loss_fn = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.SGD(ffnet.parameters(), **optimizer_params)

In [31]:
%%time
max_epochs = 1

for epoch in range(len(logs['loss/train']), len(logs['loss/train']) + max_epochs):
    
    # Training
    ffnet.train()
    print('Epoch', epoch)
    logs['loss/train'][epoch] = []
    writer.add_scalar("Learning_rate", optimizer_params['lr'], epoch)

    for batch, (batch_X, batch_Y) in enumerate(tqdm(train_loader)):
        # tranfer to GPU
        batch_X, batch_Y = batch_X.float().to(device), batch_Y.to(device)
        optimizer.zero_grad()
        l = backprop(batch_X, batch_Y, ffnet, optimizer, loss_fn)
        logs['loss/train'][epoch].append(l)
        
        if batch % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch * len(batch_X), len(train_loader.dataset),
                100. * batch / len(train_loader), l))
    
    logs['loss/train'][epoch] = np.mean(logs['loss/train'][epoch])
    writer.add_scalar("Loss/train", logs['loss/train'][epoch], epoch)
    print(f'Average loss on epoch {epoch}: {logs["loss/train"][epoch]}')
    
    # Validation
    ffnet.eval()
    with torch.no_grad():
        preds = []
        gt = []
        for batch, (batch_X, batch_Y) in enumerate(tqdm(dev_loader)):
            # Transfer to GPU
            batch_X = batch_X.float().to(device)
            output = nn.Softmax(dim=1)(ffnet(batch_X))
            preds.append(output.cpu())
            gt.append(batch_Y)

        all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
        all_gt  = [l for batch in gt for l in batch.numpy()]

        print(classification_report(all_out, all_gt, digits=4))

        micro_F1 = metrics.f1_score(all_gt, all_out, average='micro')
        macro_F1 = metrics.f1_score(all_gt, all_out, average='macro')
        weighted_F1 = metrics.f1_score(all_gt, all_out, average='weighted')
        writer.add_scalar("micro_F1/dev", micro_F1, epoch)
        writer.add_scalar("macro_F1/dev", macro_F1, epoch)
        writer.add_scalar("weighted_F1/dev", weighted_F1, epoch)
        logs['dev'][epoch] = (micro_F1, weighted_F1, macro_F1, (all_gt, all_out))

Epoch 32


HBox(children=(FloatProgress(value=0.0, max=2791.0), HTML(value='')))


Average loss on epoch 32: 0.446914252793381


HBox(children=(FloatProgress(value=0.0, max=702.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.7564    0.7367    0.7465      2150
           1     0.6377    0.4745    0.5441      1703
           2     0.9605    0.9920    0.9760     35185
           3     0.5315    0.5034    0.5171      2179
           4     0.8412    0.7165    0.7739      3683

    accuracy                         0.9138     44900
   macro avg     0.7455    0.6846    0.7115     44900
weighted avg     0.9079    0.9138    0.9098     44900

CPU times: user 15.5 s, sys: 4.32 s, total: 19.9 s
Wall time: 20 s


In [32]:
ffnet.eval()
with torch.no_grad():
    preds = []
    gt = []
    for batch, (batch_X, batch_Y) in enumerate(tqdm(test_loader)):
        # Transfer to GPU
        batch_X = batch_X.float().to(device)
        output = nn.Softmax(dim=1)(ffnet(batch_X))
        preds.append(output.cpu())
        gt.append(batch_Y)

    all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
    all_gt  = [l for batch in gt for l in batch.numpy()]

    print(classification_report(all_out, all_gt, digits=4))

HBox(children=(FloatProgress(value=0.0, max=637.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.7231    0.7109    0.7170      1958
           1     0.6558    0.3825    0.4831      1574
           2     0.9500    0.9928    0.9709     31242
           3     0.5483    0.5280    0.5380      2589
           4     0.8330    0.6800    0.7488      3397

    accuracy                         0.9001     40760
   macro avg     0.7420    0.6588    0.6916     40760
weighted avg     0.8925    0.9001    0.8939     40760

