# Setup

In [23]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"

In [24]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import re
import time
import pickle

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter

from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import classification_report, f1_score

In [25]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f7e0418bbd0>

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [27]:
dataset_path = 'transe300/embeddings/transe/'
embeddings_df = pd.read_csv(dataset_path+'ent_embedding.tsv', sep='\t', header=None)
embeddings_labels = pd.read_csv(dataset_path+'ent_labels.tsv', sep='\t', header=None)

In [28]:
len(embeddings_df)

19046

In [29]:
vocabulary = pickle.load(open('vocabulary_all.pickle', 'rb'))
word2id = {w:i for i,w in enumerate(vocabulary)}

In [30]:
sum(len(vocabulary[k]) for k in vocabulary)

19046

In [31]:
embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.376816,-2.660249,-1.720779,-3.73814,-1.410253,-4.30319,-0.44441,2.566693,-2.692881,-3.623276,...,1.702528,-2.82791,-3.283765,-2.932739,4.39658,-3.065178,-1.43615,-1.800443,1.500156,-3.188193
1,0.329408,3.283797,1.413908,-1.478796,1.787328,0.017077,3.308312,1.074971,1.750265,3.795927,...,-0.943173,1.075494,3.930749,4.084786,-0.168675,-4.480431,-4.1332,4.235694,1.88289,-3.120216
2,2.126465,0.52841,2.327674,-4.407062,2.473155,-2.692419,-3.859272,4.16491,-4.83665,-6.731491,...,0.560506,-4.260564,4.665086,6.829501,3.161113,-5.295737,-4.141501,1.192512,4.56942,-10.280411
3,2.994188,-5.458786,-4.283843,-4.673262,-0.887951,1.72341,0.915514,-2.708116,3.080909,-4.85621,...,0.334868,-5.392769,-0.364074,-2.457933,0.049093,-2.109209,0.346354,-1.231177,4.252873,-8.651485
4,3.736406,4.844031,-0.655391,-5.600396,-0.018026,0.586715,2.920149,4.049066,-7.120113,7.633911,...,-2.595132,7.347126,-3.256641,7.024849,-0.754481,2.64358,-6.352776,4.000177,-0.699216,1.370345


In [32]:
embeddings_labels.head().values[:,0]

array(['<$>', '<ACRONYM>', '<ADJP>', '<ADVP>', '<ALL_CAPS>'], dtype=object)

In [33]:
scaler = preprocessing.StandardScaler()

In [34]:
normalized_embeddings = scaler.fit_transform(embeddings_df.values)

In [35]:
pd.DataFrame(normalized_embeddings).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.107777,-0.726387,-0.463679,-1.007598,-0.379634,-1.221623,-0.215409,0.655358,-0.798581,-0.982443,...,0.58688,-0.75658,-0.894967,-0.835887,1.14127,-0.876541,-0.340067,-0.520436,0.412613,-0.838526
1,0.082893,0.89625,0.393266,-0.388164,0.498001,-0.038548,0.810153,0.246311,0.409314,1.057024,...,-0.132752,0.310807,1.090045,1.081382,-0.108296,-1.264005,-1.083799,1.148822,0.512016,-0.819825
2,0.568071,0.144069,0.643066,-1.190993,0.686239,-0.780525,-1.148639,1.093608,-1.381377,-1.836861,...,0.27625,-1.148339,1.292091,1.831269,0.803108,-1.487217,-1.086089,0.307248,1.209762,-2.789654
3,0.802343,-1.490347,-1.164355,-1.263976,-0.236279,0.428719,0.156237,-0.791055,0.771058,-1.321364,...,0.214876,-1.457941,-0.091639,-0.706164,-0.04869,-0.614818,0.151472,-0.363009,1.127548,-2.341522
4,1.00273,1.322171,-0.172428,-1.518163,0.002488,0.117443,0.704074,1.061842,-2.00215,2.112049,...,-0.582085,2.025787,-0.887504,1.884641,-0.268638,0.686389,-1.695864,1.083692,-0.158608,0.415566


In [36]:
normalized_embeddings[0].shape

(300,)

In [37]:
token_embeddings = {}
for i, vec in tqdm(enumerate(normalized_embeddings)):
    token = embeddings_labels.values[i,0]
    token_embeddings[token] = vec

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [38]:
token_embeddings['<$>']

array([-1.07776996e-01, -7.26387380e-01, -4.63678589e-01, -1.00759757e+00,
       -3.79634409e-01, -1.22162339e+00, -2.15409040e-01,  6.55358411e-01,
       -7.98580864e-01, -9.82442887e-01, -5.35165124e-01,  1.39167390e-01,
        4.99560891e-01,  1.35160254e-01, -8.72019472e-01,  1.61086451e-01,
       -5.31787610e-01,  8.58735757e-01, -2.91400316e-01,  1.03111277e+00,
       -9.60259295e-01, -3.62934227e-02,  3.08977416e-01, -6.90099667e-01,
       -6.14423925e-01,  4.19666328e-01,  1.01956651e+00,  7.68356343e-01,
        8.77950774e-01, -6.65465975e-01, -1.94618332e-01, -2.90986331e-01,
        9.23103828e-01, -7.60354060e-01,  7.93220691e-01,  2.33960834e-01,
       -6.22054066e-01,  8.38496785e-01,  7.24863198e-02,  5.19803522e-01,
       -7.06852134e-01, -8.34160306e-01, -4.01838011e-01,  2.19816135e-01,
       -1.05073685e+00, -6.56431525e-01, -7.82371669e-01, -3.29460202e-02,
        2.15551453e-01, -1.11545241e+00, -7.20596359e-01, -1.99804405e-01,
        5.88893671e-01, -

In [39]:
dataset = pickle.load(open('conll_graph_all.pickle', 'rb'))

In [40]:
class Dataset(data.Dataset):
    def __init__(self, dataset, dataset_split, nodes_embeddings, window_size = 2):
        'Initialization'
        X, Y, RAW = [], [], []
        for doc in tqdm(dataset[dataset_split]):
            word = doc['word'][0]
            chunk = doc['chunk'][0]
            pos = doc['pos'][0]
            extras = doc['extra']
            classes = doc['classes']
            left_context = [w for w in doc['left_context'] if w in nodes_embeddings][-3:]
            right_context = [w for w in doc['right_context'] if w in nodes_embeddings][:3]
            
            extra = [chunk, pos]
            extra.extend(extras)
            extra.extend(classes)
            extra = [t.replace(' ', '_') if t.startswith('<') else t for t in extra]
            
            zeros = np.zeros(nodes_embeddings[token].shape)
            
            graph_rep = np.concatenate([nodes_embeddings[token],
                                        zeros if len(left_context) == 0 else np.mean([nodes_embeddings[w] for w in left_context], axis=0),
                                        zeros if len(right_context) == 0 else np.mean([nodes_embeddings[w] for w in right_context], axis=0),
                                        np.mean([nodes_embeddings[w] for w in extra], axis=0),
                                       ])

            X.append(graph_rep)
            Y.append(doc['label'])
            RAW.append((token, left_context, right_context, extra))
                
        
        self.X = np.array(X)
        self.labels = sorted(set(Y))
        self.y2index = {l: i for i, l in enumerate(self.labels)}
        self.Y = np.array([self.y2index[y] for y in Y])
        self.RAW = RAW
    
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.X)
    
    def get_raw_item(self, index):
        'Denotes the total number of samples'
        return self.RAW[index]
    
    def get_labels(self):
        return self.labels
    
    def get_Y(self):
        return self.Y

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        x = self.X[index] #.to('cuda') # [:voc_size]
        y = self.Y[index]
        
        return x, y

In [41]:
batch_size  = 64
num_workers = 4

train_set = Dataset(dataset, 'train', token_embeddings)
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, shuffle=True) # , sampler=sampler) #

dev_set = Dataset(dataset, 'validation', token_embeddings)
dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=True, num_workers=num_workers)

test_set = Dataset(dataset, 'test', token_embeddings)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers)

HBox(children=(FloatProgress(value=0.0, max=178610.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=44900.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=40760.0), HTML(value='')))




In [42]:
labels = pickle.load(open('labels.pickle', 'rb'))
label2id = {l: i for i, l in enumerate(labels)}
print(label2id)

{'LOC': 0, 'MISC': 1, 'O': 2, 'ORG': 3, 'PER': 4}


In [43]:
labels = train_set.get_labels()
label_counter   = Counter([labels[y] for y in train_set.get_Y()])
labels_freqs    = [label_counter[label] / sum(label_counter.values()) for label in labels]
labels_weights  = [min(label_counter.values()) / label_counter[label] for label in labels]
labels_weights2 = [np.sqrt(min(label_counter.values())) / np.sqrt(label_counter[label]) for label in labels]
labels_weights

[0.5529681467181468,
 1.0,
 0.03168753586713775,
 0.4590344551282051,
 0.4119920891765552]

In [44]:
t = time.time()
print(len(train_loader))
for batch_X, batch_Y in train_loader:
    print(batch_X.shape)
    print(batch_Y.shape)
    print(sum(batch_X[0]))
    print('Class distribution in this batch:', Counter(batch_Y.numpy()))
    break
print(f'time: {time.time() - t:.3}s')

2791
torch.Size([64, 1200])
torch.Size([64])
tensor(27.0221, dtype=torch.float64)
Class distribution in this batch: Counter({2: 52, 0: 6, 3: 3, 4: 3})
time: 0.54s


In [54]:
input_dim = 1200

# The Model

In [55]:
def backprop(batch_X, batch_Y, model, optimizer, loss_fn):
    Y_hat = model(batch_X)
    loss = loss_fn(Y_hat, batch_Y)
    loss.backward()
    optimizer.step()
    
    return loss.item()

class FeedForwardNetwork(nn.Module):
    def __init__(self, input_dim=input_dim, hidden_dim=256, output_dim=5, dropout_rate=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fch = nn.Linear(hidden_dim, hidden_dim)
        # self.fch2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

        # extra layers layers
        self.dropout = nn.Dropout(p=dropout_rate)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fch(x)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)

        return logits

In [56]:
logs = {'loss/train': {}, 'dev': {}}
writer = SummaryWriter(comment='xp8-transe300-wei2-lr1e3-mom0.95-wd5e4-hd256-dr0.2-bs64-normalized', log_dir=None,)

In [57]:
ffnet = FeedForwardNetwork(dropout_rate=0.2).to(device)

In [58]:
label_counter   = Counter([y.item() for y in train_set.Y])
labels_freqs    = [label_counter[label] / sum(label_counter.values()) for label in range(len(labels))]
labels_weights1 = [min(label_counter.values()) / label_counter[label] for label in range(len(labels))]
labels_weights2 = [np.sqrt(min(label_counter.values())) / np.sqrt(label_counter[label]) for label in range(len(labels))]

weights = torch.Tensor(labels_weights2).to(device)
print(weights)

tensor([0.7436, 1.0000, 0.1780, 0.6775, 0.6419], device='cuda:0')


In [59]:
optimizer_params = {'lr': 1e-3, 
                    'momentum': 0.95, 
                    'weight_decay': 5e-4,
                   }

log_interval = int(len(train_loader) / 5)

loss_fn = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.SGD(ffnet.parameters(), **optimizer_params)

In [67]:
%%time
max_epochs = 1

for epoch in range(len(logs['loss/train']), len(logs['loss/train']) + max_epochs):
    
    # Training
    ffnet.train()
    print('Epoch', epoch)
    logs['loss/train'][epoch] = []
    writer.add_scalar("Learning_rate", optimizer_params['lr'], epoch)

    for batch, (batch_X, batch_Y) in enumerate(tqdm(train_loader)):
        # tranfer to GPU
        batch_X, batch_Y = batch_X.float().to(device), batch_Y.to(device)
        optimizer.zero_grad()
        l = backprop(batch_X, batch_Y, ffnet, optimizer, loss_fn)
        logs['loss/train'][epoch].append(l)
        
        if batch % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch * len(batch_X), len(train_loader.dataset),
                100. * batch / len(train_loader), l))
    
    logs['loss/train'][epoch] = np.mean(logs['loss/train'][epoch])
    writer.add_scalar("Loss/train", logs['loss/train'][epoch], epoch)
    print(f'Average loss on epoch {epoch}: {logs["loss/train"][epoch]}')
    
    # Validation
    ffnet.eval()
    with torch.no_grad():
        preds = []
        gt = []
        for batch, (batch_X, batch_Y) in enumerate(tqdm(dev_loader)):
            # Transfer to GPU
            batch_X = batch_X.float().to(device)
            output = nn.Softmax(dim=1)(ffnet(batch_X))
            preds.append(output.cpu())
            gt.append(batch_Y)

        all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
        all_gt  = [l for batch in gt for l in batch.numpy()]

        print(classification_report(all_out, all_gt, digits=4))

        micro_F1 = metrics.f1_score(all_gt, all_out, average='micro')
        macro_F1 = metrics.f1_score(all_gt, all_out, average='macro')
        weighted_F1 = metrics.f1_score(all_gt, all_out, average='weighted')
        writer.add_scalar("micro_F1/dev", micro_F1, epoch)
        writer.add_scalar("macro_F1/dev", macro_F1, epoch)
        writer.add_scalar("weighted_F1/dev", weighted_F1, epoch)
        logs['dev'][epoch] = (micro_F1, weighted_F1, macro_F1, (all_gt, all_out))

Epoch 32


HBox(children=(FloatProgress(value=0.0, max=2791.0), HTML(value='')))


Average loss on epoch 32: 0.1444724118093443


HBox(children=(FloatProgress(value=0.0, max=702.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.7818    0.8040    0.7927      2036
           1     0.7009    0.6384    0.6682      1391
           2     0.9745    0.9858    0.9801     35921
           3     0.6609    0.6577    0.6593      2074
           4     0.8872    0.8002    0.8414      3478

    accuracy                         0.9372     44900
   macro avg     0.8010    0.7772    0.7883     44900
weighted avg     0.9360    0.9372    0.9364     44900

CPU times: user 2min 34s, sys: 2.88 s, total: 2min 37s
Wall time: 16 s


In [68]:
ffnet.eval()
with torch.no_grad():
    preds = []
    gt = []
    for batch, (batch_X, batch_Y) in enumerate(tqdm(test_loader)):
        # Transfer to GPU
        batch_X = batch_X.float().to(device)
        output = nn.Softmax(dim=1)(ffnet(batch_X))
        preds.append(output.cpu())
        gt.append(batch_Y)

    all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
    all_gt  = [l for batch in gt for l in batch.numpy()]

    print(classification_report(all_out, all_gt, digits=4))

HBox(children=(FloatProgress(value=0.0, max=637.0), HTML(value='')))


              precision    recall  f1-score   support

           0     0.7532    0.7254    0.7390      1999
           1     0.6427    0.5203    0.5750      1134
           2     0.9660    0.9847    0.9753     32029
           3     0.6318    0.6315    0.6316      2494
           4     0.8532    0.7622    0.8052      3104

    accuracy                         0.9205     40760
   macro avg     0.7694    0.7248    0.7452     40760
weighted avg     0.9175    0.9205    0.9186     40760



In [70]:
# 
print('''
DIM = 50
Without Normalization
Eval   macro avg     0.7658    0.7460    0.7536     44900
Test   macro avg     0.7360    0.6755    0.7005     40760

Normalized 
Eval   macro         0.7703    0.7335    0.7506     44900
Test   macro avg     0.7282    0.6775    0.7001     40760


DIM = 300
Eval   macro avg     0.8010    0.7772    0.7883     44900
Test   macro avg     0.7694    0.7248    0.7452     40760


''')


DIM = 50
Without Normalization
Eval   macro avg     0.7658    0.7460    0.7536     44900
Test   macro avg     0.7360    0.6755    0.7005     40760

Normalized 
Eval   macro         0.7703    0.7335    0.7506     44900
Test   macro avg     0.7282    0.6775    0.7001     40760


DIM = 300
Eval   macro avg     0.8010    0.7772    0.7883     44900
Test   macro avg     0.7694    0.7248    0.7452     40760



