In [1]:
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch import cuda
from tqdm import tqdm
import torch
import fasttext
from data_module import CustomDataset
from sklearn.metrics import f1_score, accuracy_score
import os.path
import pickle
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from ngram_attention import NGramAttention

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
bert = BertForSequenceClassification.from_pretrained('cointegrated/rubert-tiny2', num_labels=2).to("cuda")
bert_ckpt = torch.load('/mnt/cs/voice/korenevskaya-a/nirma/bert_ckpt.pt')
tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny2')
bert.load_state_dict(bert_ckpt)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [3]:
ngram = NGramAttention()
ngram_ckpt = torch.load('/mnt/cs/voice/korenevskaya-a/nirma/checkpoints_ngram_attention/checkpoint_4.pt')
ngram.load_state_dict(ngram_ckpt)




<All keys matched successfully>

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
# Creating the dataset and dataloader for BERT model 
df = pd.read_csv("out_data/ToxicRussianComments.csv")

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
MAX_LEN = 267

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))



FULL Dataset: (248290, 2)
TRAIN Dataset: (198632, 2)
TEST Dataset: (49658, 2)


In [6]:
# dataloaders for ngram_attention model

with open('fasttext_train.pkl', 'rb') as fp:
    training_set = pickle.load(fp) 
training_set = TensorDataset(training_set[0], training_set[1])        
    
with open('fasttext_test.pkl', 'rb') as fp:
    testing_set = pickle.load(fp)       
testing_set = TensorDataset(testing_set[0], testing_set[1]) 

train_params = {'batch_size': 10,
                'shuffle': False,
                'drop_last': True,
                'num_workers': 0
                }

test_params = {'batch_size': 10,
                'shuffle': False,
                'drop_last': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [19]:
ngram.to(device)
ngram.eval()
results_ngram_test = []
ans_test = []
for _, data in enumerate(tqdm(testing_loader), 0):
    # preprocessing
    sentences = data[0]
    targets = data[1]

    for idx, sentence in enumerate(sentences): 
        for i, word in enumerate(sentence):
            # if emb is pure zeros, then it is altered into trainable eos embedding
            if torch.all(word.eq(torch.zeros_like(word))):
                with torch.no_grad():
                    sentences[idx][i] = ngram.eos    
    sentences = torch.unsqueeze(sentences, 1)    
    
    with torch.no_grad():
        outputs = ngram(sentences.to(device, dtype=torch.float))
        results_ngram_test += outputs
        ans_test += targets 
    

  alpha1 = self.soft1(torch.matmul(x1, self.ngram_context1))
  alpha2 = self.soft2(torch.matmul(x2, self.ngram_context2))
  alpha3 = self.soft3(torch.matmul(x3, self.ngram_context3))
100%|██████████| 4965/4965 [53:59<00:00,  1.53it/s] 


In [20]:
with open('logits_ngram_test.pkl', 'wb') as f:
    pickle.dump(results_ngram_test, f)

In [10]:
with open('logits_ngram_test.pkl', 'rb') as f:
    results_ngram_test = pickle.load(f)

In [6]:
results_bert_test = []
bert.to(device='cpu')
bert.eval()
for comment in tqdm(test_dataset['comment']):
    input_ids = torch.tensor(tokenizer.encode(comment), device='cpu').unsqueeze(0)
    outputs = bert(input_ids)
    results_bert_test.append(outputs.logits)

100%|██████████| 49658/49658 [04:59<00:00, 166.00it/s]


In [9]:
ans = test_dataset['label'].values
ans

array([1, 0, 0, ..., 0, 1, 0])

In [57]:
from sklearn.metrics import f1_score
alpha = 0.8
preds = []
for i, pair in enumerate(zip(results_bert_test, results_ngram_test)): 
    pair0 = torch.sigmoid(pair[0].to(device).squeeze(0))
    pair1 = torch.sigmoid(pair[1])
    pred = pair0*alpha + pair1*(1-alpha)    
    preds.append(int(pred.argmax()))
f1 = f1_score(ans[:len(preds)], preds)
acc = np.sum((ans[:len(preds)]==preds))/len(ans)
print("F1:",  f1)
print("Acc:",  acc)

F1: 0.9223796033994335
Acc: 0.9722501913085505


In [12]:
class Fusion():
    def __init__(self, ):
        self.l1 = nn.Linear(4, 2)
        self.relu = nn.ReLU()

    def forward(self, x1, x2):
        x = torch.cat(x1, x2)
        x = self.l1(x)
        x = self.relu(x)
        return x          

In [None]:
fusion = Fusion()
fusion.train()


for index, data in enumerate(tqdm(training_loader), 0):
    # preprocessing
    sentences = data[0]
    targets_ = data[1]#.to(device, dtype = torch.float)
    targets = torch.empty((len(data[1]), 2), dtype=torch.float)
    for i, tar in enumerate(targets_):
        if tar == 0:
            targets[i] = torch.tensor([1.,0.])
        else:
            targets[i] = torch.tensor([0.,1.])

    for idx, sentence in enumerate(sentences): 
        for i, word in enumerate(sentence):
            # if emb is pure zeros, then it is altered into trainable eos embedding
            if torch.all(word.eq(torch.zeros_like(word))):
                with torch.no_grad():
                    sentences[idx][i] = model.eos    
    sentences = torch.unsqueeze(sentences, 1)    
    
    with torch.enable_grad():
        outputs = model(sentences.to(device, dtype=torch.float))
        #outputs = outputs.reshape(TRAIN_BATCH_SIZE)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets.to(device, dtype=torch.float))
        if index %50 ==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()
scheduler.step()  
print("Current LR: ", scheduler.get_last_lr())      
ckp = model.state_dict()
PATH = f"/mnt/cs/voice/korenevskaya-a/nirma/checkpoints_ngram_attention/checkpoint_{epoch}.pt"
torch.save(ckp, PATH)
print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")        