In [1]:
import copy

import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from seqeval.scheme import IOB2
from seqeval.metrics import classification_report,f1_score

In [2]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import random

from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import KFold

from torch.utils.data import TensorDataset, DataLoader ,SubsetRandomSampler ,ConcatDataset ,Dataset

from g_mlp_pytorch import gMLP

In [3]:
data_train = pd.read_csv("IOB2_Data/BC_train_IOB2_all.txt",sep = '\t', na_filter=False)
data_dev = pd.read_csv("IOB2_Data/BC_dev_IOB2_all.txt",sep = '\t', na_filter=False)

In [4]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence#").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [5]:
getter = SentenceGetter(data_train)
dev_getter = SentenceGetter(data_dev)

In [6]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
dev_sentences = [[word[0] for word in sentence] for sentence in dev_getter.sentences]
labels = [[s[1] for s in sent] for sent in getter.sentences]
dev_labels = [[s[1] for s in sent] for sent in dev_getter.sentences]

In [7]:
max_features = 40000
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token='<unk>')
tokenizer.fit_on_texts(sentences)

In [8]:
word2idx = tokenizer.word_index.copy()
word2idx['<pad>'] = 0
idx2word = {word2idx[i]:i for i in word2idx}

In [9]:
len(word2idx)

81778

In [10]:
tags = list(set(data_train['Tag'].values))
tags.append("<pad>")
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = {i: t for i, t in enumerate(tags)}

In [11]:
X_train = tokenizer.texts_to_sequences(sentences)
X_test = tokenizer.texts_to_sequences(dev_sentences)
y_train = [[tag2idx[l] for l in s] for s in labels]
y_test = [[tag2idx[l] for l in s] for s in dev_labels]

In [12]:
idx2word[5]

'!'

In [13]:
len(max(X_train,key=len))

47

In [17]:
class NERDataset(Dataset):
    def __init__(self,sentences,labels, word_pad_idx, tag_pad_idx, max_len = 50):
        self.sentences = sentences
        self.labels = labels
        self.word_pad_idx = word_pad_idx
        self.tag_pad_idx = tag_pad_idx
        self.max_len = max_len
    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        return (self.sentences[index],self.labels[index])
        
    def collate_fn(self, datasets):
        sentences = [dataset[0] for dataset in datasets]
        labels = [dataset[1] for dataset in datasets]
#         max_sent = max([len(data) for data in sentences])
#         max_len = max([min(len(sentence), self.max_len) for sentence in sentences])
        pad_sentence = []
        pad_label = []
        for sentence,label in zip(sentences,labels):
            
            if len(sentence) > self.max_len:
#                 print('asd')
                pad_sentence.append(sentence[:self.max_len])
                pad_label.append(label[:self.max_len])
            else:
#                 print('zxc')
                pad_sentence.append(sentence+[self.word_pad_idx]*(self.max_len-len(sentence)))
                pad_label.append(label+[self.tag_pad_idx]*(self.max_len-len(label)))
        return torch.LongTensor(pad_sentence), torch.LongTensor(pad_label)

In [18]:
model = gMLP(
    num_tokens = 20000,
    dim = 3,
    depth = 6,
    seq_len = 256,
    circulant_matrix = True,      # use circulant weight matrix for linear increase in parameters in respect to sequence length
    act = nn.Tanh()               # activation for spatial gate (defaults to identity)
)

x = torch.randint(0, 20000, (16, 256))
print(x.shape)
logits = model(x) # (1, 256, 20000)
print(logits.shape)

torch.Size([16, 256])
torch.Size([16, 256, 20000])


In [19]:
class GMLP_model(nn.Module):
    def __init__(self, vocab, hidden_dim, output_vocab, n_layer,word_pad_idx,tag_pad_idx):
        super(GMLP_model, self).__init__()
        self.n_layer = n_layer
        self.embedding_size = 300
#         self.hidden_dim = hidden_dim
        self.embedded = nn.Embedding(vocab, self.embedding_size , padding_idx  = word_pad_idx)
        self.gmlp = gMLP(num_tokens = vocab,
                        dim = 512,
                        depth = 6,
                        seq_len = 30,
                        circulant_matrix = True, 
                        act = nn.Tanh()
                    )
        self.fc1 = nn.Linear(hidden_dim,output_vocab)
    def forward(self, x):
        output = self.embedded(x)
        print(output.shape)
        output = self.gmlp(x)
        print(output.shape)
        output = self.fc1(output)
        return output

In [20]:
bs = 64
num_epoch = 20

tr_dataset = NERDataset(X_train,y_train,word2idx['<pad>'],tag2idx['<pad>'])
va_dataset = NERDataset(X_test,y_test,word2idx['<pad>'],tag2idx['<pad>'])
train_dataloader = DataLoader(tr_dataset, batch_size=bs,
                            collate_fn=tr_dataset.collate_fn)
valid_dataloader = DataLoader(va_dataset, batch_size=bs,
                            collate_fn=va_dataset.collate_fn)  
model = GMLP_model(len(word2idx), 256, len(tags),2,word2idx['<pad>'],tag2idx['<pad>'])
optimizer = optim.AdamW(model.parameters(), lr=5e-3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = nn.CrossEntropyLoss()
all_loader = {"train" : train_dataloader,
              "valid" : valid_dataloader}


In [21]:
Fold_score = []
for epoch in tqdm(range(num_epoch)):
    all_loss = {
        'train': [],
        'valid': [],
    }
    print('')
    for loader in all_loader:
        predictions , true_labels  = [],[]
        for x, y in all_loader[loader]:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            output = model(x)
            loss = criterion(output,y)
            if loader == 'train':
                loss.backward()
                optimizer.step()
            all_loss[loader].append(loss.cpu().item()) 
            predictions.extend([[idx2tag[j] for j in i] for i in output])
            for i in y.detach().cpu().numpy():
                _ = []
                for j in i:
                    if j != tag_pad_idx:
                        _.append(idx2tag[j])
                true_labels.append(_)
        print(f'{loader}_loss : {np.mean(np.array(all_loss[loader]))/64}')
        f_ = f1_score(true_labels,predictions,scheme = IOB2)
        print(f'{loader}_F1: {f_}')
        if loader == 'valid':
            Fold_score.append(f_)

  0%|                                                                                           | 0/20 [00:00<?, ?it/s]


torch.Size([64, 50, 300])


  0%|                                                                                           | 0/20 [00:00<?, ?it/s]


RuntimeError: size of dimension does not match previous size, operand 1, dim 2