In [None]:
import os
# Expose GPUs 0-3 to this process. The cell sits ahead of the torch import so
# the variable is already in the environment when torch is loaded.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0,1,2,3"})

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import re
import time
import pickle

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter

from sklearn import metrics
from sklearn.metrics import classification_report, f1_score

In [None]:
# Seed the two global RNGs used in this notebook (PyTorch, then NumPy) with 42.
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(42)

# Compute device: the GPU when CUDA is available, the CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print('Device:', device)

# Turn on cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True

In [None]:
# ConceptNet tables read from CSV (English edges and IsA edges, going by the file names).
cn = pd.read_csv('conceptnet_en.csv')
cn_isa = pd.read_csv('data/conceptnet_isa.csv')

# Distinct values of the `subject` column, kept as a set for membership tests.
cn_keys = set(cn['subject'].values)

In [None]:
# Mapping from a lower-cased word to its labels (it is indexed with
# `token.lower()` and iterated when building the dataset features).
# NOTE: unpickling can execute arbitrary code - only load files you trust.
# The context manager closes the file handle, which the bare open() leaked.
with open('edges/word2labels.pickle', 'rb') as word2labels_file:
    word2labels = pickle.load(word2labels_file)

In [None]:
# Spot-check: display the entry stored for the word 'jacob'.
word2labels['jacob']

In [None]:
# Number of items in the train / validation / test splits, shown as a tuple.
# NOTE(review): `conll_dataset` is not created in any cell visible above this
# one - confirm the notebook still runs under Restart & Run All.
tuple(len(conll_dataset[split_name]) for split_name in ('train', 'validation', 'test'))

In [None]:
# One entry per pre-computed embedding set. Each value starts as None and is
# replaced by the object unpickled from 'edges/<name>_all_embeddings.pickle'.
nodes_embeddings = {'hope_gsvd': None,
                    'lap_eigmap_svd': None,
                    'lle_svd': None,
                    'node2vec_rw': None}

# NOTE: unpickling can execute arbitrary code - only load files you trust.
for embedding_name in nodes_embeddings:
    # `with` guarantees each file handle is closed (the bare open() leaked it).
    with open('edges/'+embedding_name+'_all_embeddings.pickle', 'rb') as embeddings_file:
        nodes_embeddings[embedding_name] = pickle.load(embeddings_file)

In [None]:
class Dataset(data.Dataset):
    """Token-level dataset whose features are built from node embeddings.

    Each kept token yields one sample. Its feature vector is the concatenation of:
      1. the embedding of the normalised token,
      2. the mean embedding of its left context,
      3. the mean embedding of its right context,
      4. the mean embedding of its "extra" feature nodes (POS tag, word labels,
         dictionary membership and casing flags).
    The target is the part of the 'ner' tag after the last '-' (e.g. 'B-PER' -> 'PER').

    Relies on the module-level names `word2labels` and `cn_keys`.
    """

    def __init__(self, dataset, dataset_split, nodes_embeddings, window_size = 2):
        """Build the feature matrix and label indices for one split.

        Args:
            dataset: mapping from split name to an iterable of documents; every
                document is indexed with 'words', 'pos' and 'ner' (zipped together).
            dataset_split: key of the split to load from `dataset`.
            nodes_embeddings: mapping from node name to embedding vector. It must
                contain '<span>' (the fallback node) and every feature node used.
            window_size: number of neighbouring words taken on each side.

        Raises:
            KeyError: if a node needed for a sample is missing from `nodes_embeddings`.
        """
        RAW, X, Y = [], [], []
        for doc in tqdm(dataset[dataset_split], desc=f'Loading split {dataset_split}'):
            text = [w.lower() for w in doc['words']]
            for i, (token, pos, label) in enumerate(zip(doc['words'], doc['pos'], doc['ner'])):
                if token == pos:
                    continue # this is punctuation

                token = self._normalize_token(token)

                # Skip tokens that normalisation emptied. This check must precede
                # the out-of-vocabulary substitution below: placed after it, the
                # empty string was turned into '<span>' and the guard never fired.
                if not token:
                    continue

                if token.lower() not in nodes_embeddings: # new words appearing only in the eval and test
                    token = '<span>'

                extra = self._extra_nodes(token, pos, nodes_embeddings)

                # Context windows come from the lower-cased raw words. A '<span>'
                # marker is appended when the window is cut by a sentence edge, and
                # words without an embedding are replaced by '<span>'.
                left_context  = text[max(i-window_size, 0):i] + ([] if i >= window_size else ['<span>'])
                left_context  = [w if w in nodes_embeddings else '<span>' for w in left_context]

                right_context = text[i+1:i+1+window_size] + ([] if i + window_size < len(text) else ['<span>'])
                right_context = [w if w in nodes_embeddings else '<span>' for w in right_context]

                graph_rep = np.concatenate([nodes_embeddings[token.lower()],
                                            np.mean([nodes_embeddings[w] for w in left_context], axis=0),
                                            np.mean([nodes_embeddings[w] for w in right_context], axis=0),
                                            np.mean([nodes_embeddings[w] for w in extra], axis=0),
                                           ])
                X.append(graph_rep)
                Y.append(label.split('-')[-1])
                RAW.append((token, left_context, right_context, extra))

        # Feature matrix, one row per sample.
        self.X = np.array(X)
        # Sorted label names found in THIS split; position == class index.
        # NOTE(review): indices are derived per split, so they only line up across
        # splits when every split contains the same label set.
        self.labels = sorted(set(Y))
        self.y2index = {l: i for i, l in enumerate(self.labels)}
        # Class index of every sample, aligned with self.X.
        self.Y = np.array([self.y2index[y] for y in Y])
        # (token, left_context, right_context, extra) per sample, for inspection.
        self.RAW = RAW

    @staticmethod
    def _normalize_token(token):
        """Strip a trailing '=', leading punctuation, digits and backticks from a token."""
        if token.endswith('='):
            token = token[:-1]

        while token and token[0] in "!$%&'*+,-.:;<=>?@`":
            token = token[1:]

        token = re.sub(r'\d+', '<NUM>', token)
        return token.replace('`', "'")

    @staticmethod
    def _extra_nodes(token, pos, nodes_embeddings):
        """Return the list of feature-node names describing a non-empty token."""
        # Test membership of the node that is actually looked up ('<pos>'); the
        # previous check used the bare tag, i.e. a different key.
        pos_node = '<'+pos.lower()+'>'
        extra = [pos_node if pos_node in nodes_embeddings else '<span>']

        lowered = token.lower()
        if lowered in word2labels:
            extra.extend(['<'+l.lower()+'>' for l in word2labels[lowered]])
        if lowered not in cn_keys:
            extra.append('<not_in_dict>')
        if token == token.upper():
            extra.append('<all_caps>')
        # Any token containing a dot (C.J or C.J.). The original second clause,
        # count('.') + 1 == len(split('.')), holds for every string.
        if '.' in token:
            extra.append('<accronym>')
        if token[0] == token[0].upper() and token[1:] == token[1:].lower():
            extra.append('<capitalized>')
        return extra

    def __len__(self):
        """Return the total number of samples."""
        return len(self.X)

    def get_raw_item(self, index):
        """Return the (token, left_context, right_context, extra) tuple of a sample."""
        return self.RAW[index]

    def get_labels(self):
        """Return the sorted list of label names (position == class index)."""
        return self.labels

    def get_Y(self):
        """Return the array of class indices for all samples."""
        return self.Y

    def __getitem__(self, index):
        """Return one sample as a (feature_vector, class_index) pair."""
        x = self.X[index]
        y = self.Y[index]

        return x, y

In [None]:
# Loader configuration and the embedding set used to build the features.
batch_size  = 64
num_workers = 4
embeddings_to_use = 'lle_svd'

# Only the training loader shuffles; evaluation loaders keep a fixed order so
# the (ground truth, prediction) lists stored per epoch stay comparable.
train_set = Dataset(conll_dataset, 'train', nodes_embeddings[embeddings_to_use])
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, shuffle=True)

dev_set = Dataset(conll_dataset, 'validation', nodes_embeddings[embeddings_to_use])
dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False, num_workers=num_workers)

test_set = Dataset(conll_dataset, 'test', nodes_embeddings[embeddings_to_use])
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# Each split derives its own label -> index mapping; the indices are only
# interchangeable between splits when the label vocabularies are identical.
assert dev_set.get_labels() == train_set.get_labels(), 'validation labels differ from train labels'
assert test_set.get_labels() == train_set.get_labels(), 'test labels differ from train labels'

In [None]:
# Label names in class-index order and their counts in the training targets.
labels = train_set.get_labels()
label_counter = Counter(labels[y] for y in train_set.get_Y())

n_train_samples = sum(label_counter.values())
rarest_count    = min(label_counter.values())

# Relative frequency of each label.
labels_freqs    = [label_counter[name] / n_train_samples for name in labels]
# Inverse-frequency weights, scaled so the rarest label gets weight 1.
labels_weights  = [rarest_count / label_counter[name] for name in labels]
# Softer variant: ratio of the square roots of the counts.
labels_weights2 = [np.sqrt(rarest_count) / np.sqrt(label_counter[name]) for name in labels]

# sampling_probs = [labels_weights2[labels_to_id[l]] for l in Y_train]
# sampler = torch.utils.data.sampler.WeightedRandomSampler(sampling_probs, len(Y_train), replacement=True)

In [None]:
# Smoke test: time how long it takes to fetch the first training batch and
# print its shapes, the element sum of its first row and its class counts.
t = time.time()
print(len(train_loader))
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    batch_X, batch_Y = first_batch
    print(batch_X.shape)
    print(batch_Y.shape)
    print(sum(batch_X[0]))
    print('Class distribution in this batch:', Counter(batch_Y.numpy()))
print(f'time: {time.time() - t:.3}s')

In [None]:
# Spot-check: display the raw (token, left_context, right_context, extra) tuple of training sample 2.
train_set.get_raw_item(2)

# Training

In [None]:
# TensorBoard writer; log_dir=None leaves the choice of run directory to SummaryWriter.
# NOTE(review): the suffix mentions n2v embeddings while `embeddings_to_use`
# is 'lle_svd' - confirm the suffix is still the intended one.
writer = SummaryWriter(
    log_dir=None,
    filename_suffix='secondattemptwithn2vembeddings',
)

def backprop(batch_X, batch_Y, model, optimizer, loss_fn):
    """Run one forward/backward pass and one optimizer step on a batch.

    Gradients are not zeroed here, so any gradient already accumulated on the
    parameters takes part in the step.

    Args:
        batch_X: input batch passed to `model`.
        batch_Y: targets passed to `loss_fn` together with the model output.
        model: callable producing predictions from `batch_X`.
        optimizer: optimizer whose `step()` is invoked after the backward pass.
        loss_fn: callable `(predictions, targets) -> loss tensor`.

    Returns:
        The batch loss as a Python float.
    """
    predictions = model(batch_X)
    batch_loss = loss_fn(predictions, batch_Y)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

input_dim = 1200
class FeedForwardNetwork(nn.Module):
    """Feed-forward classifier with two hidden layers that returns raw logits.

    Layout: fc1 -> ReLU -> dropout -> fch -> ReLU -> dropout -> fc2.
    """

    def __init__(self, input_dim=input_dim, hidden_dim=512, output_dim=5, dropout_rate=0.2):
        """Create the layers.

        Args:
            input_dim: size of each input vector.
            hidden_dim: width of both hidden layers.
            output_dim: number of logits produced per input.
            dropout_rate: drop probability applied after each hidden activation.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fch = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        """Map a batch of inputs to a batch of logits (no softmax applied)."""
        hidden = x
        # Both hidden layers share the same linear -> ReLU -> dropout pattern.
        for layer in (self.fc1, self.fch):
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.fc2(hidden)

In [None]:
# Per-epoch history: mean training loss and validation metrics, keyed by epoch.
logs = {'loss/train': {}, 'dev': {}}

# Size the network from the data instead of relying on hard-coded defaults, and
# place it on `device` (not a literal 'cuda') so the cell also runs on CPU.
ffnet = FeedForwardNetwork(input_dim=train_set.X.shape[1], output_dim=len(labels)).to(device)

# Report progress about four times per epoch; never 0, because the training
# loop uses this value as a modulus.
log_interval = max(1, len(train_loader) // 4)

# Per-class loss weights, in class-index order.
weights = torch.tensor(labels_weights2, dtype=torch.float32).to(device)
weights

In [None]:
# Keyword arguments forwarded unchanged to torch.optim.SGD.
optimizer_params = dict(lr=1e-4, momentum=0.9, weight_decay=5e-4)

# Cross-entropy with the per-class weights, optimised with SGD.
loss_fn = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.SGD(ffnet.parameters(), **optimizer_params)

In [None]:
%%time
max_epochs = 20

# Epoch numbering continues after the epochs already recorded in `logs`, so
# re-running this cell trains for `max_epochs` additional epochs.
first_epoch = len(logs['loss/train'])

for epoch in range(first_epoch, first_epoch + max_epochs):

    # Training
    ffnet.train()
    print('Epoch', epoch)
    logs['loss/train'][epoch] = []
    writer.add_scalar("Learning_rate", optimizer_params['lr'], epoch)

    for batch, (batch_X, batch_Y) in enumerate(tqdm(train_loader)):
        # Transfer to the compute device; features are cast to float32 for the model.
        batch_X, batch_Y = batch_X.float().to(device), batch_Y.to(device)
        optimizer.zero_grad()
        batch_loss = backprop(batch_X, batch_Y, ffnet, optimizer, loss_fn)
        logs['loss/train'][epoch].append(batch_loss)

        # max(..., 1) keeps the modulus valid when there are fewer than 4 batches.
        if batch % max(log_interval, 1) == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch * len(batch_X), len(train_loader.dataset),
                100. * batch / len(train_loader), batch_loss))

    # Replace the list of batch losses by their mean for this epoch.
    logs['loss/train'][epoch] = np.mean(logs['loss/train'][epoch])
    writer.add_scalar("Loss/train", logs['loss/train'][epoch], epoch)
    print(f'Average loss on epoch {epoch}: {logs["loss/train"][epoch]}')

    # Validation
    ffnet.eval()
    preds = []
    gt = []
    with torch.no_grad():
        for batch_X, batch_Y in tqdm(dev_loader):
            # Transfer to the compute device
            batch_X = batch_X.float().to(device)
            output = nn.Softmax(dim=1)(ffnet(batch_X))
            preds.append(output.cpu())
            gt.append(batch_Y)

    # Flatten the per-batch results into one predicted / true class per sample.
    all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
    all_gt  = [l for batch in gt for l in batch.numpy()]

    # classification_report expects (y_true, y_pred); the reversed order
    # swapped precision and recall in the printed report.
    print(classification_report(all_gt, all_out, digits=4))
    micro_F1 = metrics.f1_score(all_gt, all_out, average='micro')
    macro_F1 = metrics.f1_score(all_gt, all_out, average='macro')
    weighted_F1 = metrics.f1_score(all_gt, all_out, average='weighted')
    writer.add_scalar("micro_F1/dev", micro_F1, epoch)
    writer.add_scalar("macro_F1/dev", macro_F1, epoch)
    writer.add_scalar("weighted_F1/dev", weighted_F1, epoch)
    logs['dev'][epoch] = (micro_F1, weighted_F1, macro_F1, (all_gt, all_out))

In [None]:
# Final evaluation on the held-out test split.
ffnet.eval()
preds = []
gt = []
with torch.no_grad():
    for batch_X, batch_Y in tqdm(test_loader):
        # Transfer to the compute device
        batch_X = batch_X.float().to(device)
        output = nn.Softmax(dim=1)(ffnet(batch_X))
        preds.append(output.cpu())
        gt.append(batch_Y)

# Flatten the per-batch results into one predicted / true class per sample.
all_out = [np.argmax(l) for batch in preds for l in batch.numpy()]
all_gt  = [l for batch in gt for l in batch.numpy()]

# classification_report expects (y_true, y_pred); the reversed order swapped
# precision and recall in the printed report.
print(classification_report(all_gt, all_out, digits=4))