In [1]:
import numpy as np
import pandas as pd
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [3]:
TRAIN_DATA_PATH = 'train_split.json'
# TEST_DATA_PATH  = '/kaggle/input/nlp-project-dataset/test_split.json'
VAL_DATA_PATH   = 'validation_split.json'

In [4]:
def read_json(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


In [5]:
TRAIN_DATA = read_json(TRAIN_DATA_PATH)
VALIDATION_DATA = read_json(VAL_DATA_PATH)
TEST_DATA = read_json(TEST_DATA_PATH)

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
LEARNING_RATE = 0.5
MAX_LEN = 32
BATCH_SIZE = 32
SEQ_LEN = 20
# FOR THE SEQ_LEN = 20, READ THE DATA RETREIVAL FILE

In [9]:
class Chinese_Article_Data(Dataset):
    def __init__(self, file, tokenizer, max_len, seq_len):
        self.data = file
        self.tokenizer = tokenizer
        self.max_len   = max_len
        self.seq_len   = seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        news_article_input_ids   = torch.empty((self.seq_len,self.max_len))
        news_article_attention_list = torch.empty((self.seq_len,self.max_len))
        itr = 0
        for sentence_idx in range(len(self.data[idx]['news_article'])):
            news_article_instance = self.data[idx]['news_article'][itr]
            encoding = self.tokenizer.encode_plus (
                news_article_instance, 
                max_length = self.max_len,
                pad_to_max_length = True,
                truncation = True,
                padding = 'max_length',
                return_attention_mask= True,
                return_tensors = 'pt',
            )
            news_article_input_ids[itr] = encoding['input_ids']
            news_article_attention_list[itr]  = encoding['attention_mask']
            itr += 1
            if (itr == self.seq_len):
                break
            
        while(itr < self.seq_len):
            encoding = self.tokenizer.encode_plus (
                'NULL', 
                max_length = self.max_len,
                pad_to_max_length = True,
                truncation = True,
                padding = 'max_length',
                return_attention_mask= True,
                return_tensors = 'pt',
            )
            news_article_input_ids[itr] = encoding['input_ids']
            news_article_attention_list[itr]  = encoding['attention_mask']
            itr += 1
        
        question = self.data[idx]['question_stem']
        encoding = self.tokenizer.encode_plus (
                question, 
                max_length = self.max_len,
                pad_to_max_length = True,
                truncation = True,
                padding = 'max_length',
                return_attention_mask= True,
                return_tensors = 'pt',
            )
        question_input_ids = encoding['input_ids']
        question_attention_mask = encoding['attention_mask']
        answer_option = torch.tensor([float(x.replace(',', '')) for x in self.data[idx]['answer_options']])
        answer   = torch.tensor([1.0 if (x==self.data[idx]['ans']) else 0.0 for x in range(4)])
#         print(itr)
        return news_article_input_ids, news_article_attention_list, question_input_ids, question_attention_mask, answer, answer_option


In [8]:
def getloader(file = None, tokenizer = tokenizer, max_len = MAX_LEN, batch_size = BATCH_SIZE, seq_len = SEQ_LEN):
    dataset = Chinese_Article_Data (file, tokenizer, max_len,seq_len)
    loader = DataLoader(dataset = dataset, batch_size = batch_size, num_workers = 4, shuffle = False, pin_memory = True)
    return loader

In [10]:
class Chinese_answer_Model(nn.Module):
    def __init__(self):
        super(Chinese_answer_Model, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-chinese", return_dict = True)
        self.fc1 = nn.Linear(768*(SEQ_LEN+1), 4)
        self.out = nn.Linear(8, 4)
    
    def forward(self, news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer_option):
        output = torch.empty((news_article_input_ids.shape[0], 4)).to(device)
        
        for i in range(news_article_input_ids.shape[0]):
            text_out = self.bert(input_ids = news_article_input_ids[i].long(),attention_mask= news_article_attention_mask[i])
            question_out = self.bert(input_ids = question_input_ids[i].long(),attention_mask= question_attention_mask[i])
            out = torch.cat((text_out.pooler_output, question_out.pooler_output), dim = 0)
            out = out.view(-1)
            out = self.fc1(out)
            out = torch.relu(out)
            out = torch.cat((out, answer_option[i]),dim = -1)
            out = self.out(out)
            
            out = torch.softmax(out,dim = 0)
            output[i] = out
        
        
        return output

In [11]:
train_data_loader = getloader(TRAIN_DATA)
validation_data_loader = getloader(VALIDATION_DATA)
test_data_loader = getloader(TEST_DATA)

In [12]:
model = Chinese_answer_Model()
model.to(device)
model = nn.DataParallel(model)

In [13]:
def loss_fn (outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)
optimizer = torch.optim.SGD(params = model.parameters(), lr = LEARNING_RATE)

In [16]:
EPOCH = 5

In [15]:
def train(data_loader, val_loader):
    train_loss_list = []
    val_loss_list = []
    for epochs in range(EPOCH):
        train_loss = 0
        val_loss = 0
        model.train()
        for batch , data in tqdm(enumerate(data_loader, 0), unit="batch", total=len(data_loader)):
            news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer, answer_option = data
            news_article_input_ids = news_article_input_ids.to(device)
            news_article_attention_mask = news_article_attention_mask.to(device)
            question_input_ids = question_input_ids.to(device)
            question_attention_mask = question_attention_mask.to(device)
            answer = answer.to(device)
            optimizer.zero_grad()
            outputs = model(news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer_option)
            loss = loss_fn(outputs, answer)
            
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        model.eval()
        with torch.no_grad():
            for batch , data in  tqdm(enumerate(val_loader, 0), unit="batch", total=len(val_loader)):
                news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer, answer_option = data
                news_article_input_ids = news_article_input_ids.to(device)
                news_article_attention_mask = news_article_attention_mask.to(device)
                question_input_ids = question_input_ids.to(device)
                question_attention_mask = question_attention_mask.to(device)
                answer = answer.to(device)
                outputs = model(news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer_option)
                loss = loss_fn(outputs, answer)

                val_loss += loss.item()
                
        print(f"[{epochs+1}/{EPOCH}], Training Loss: {train_loss/len(data_loader)} Validation_loss: {val_loss/len(val_loader)}")
        train_loss_list.append(train_loss/len(data_loader))
        val_loss_list.append(val_loss/len(val_loader))
    return train_loss_list, val_loss_list

In [None]:
trainLoss, valLoss = train(train_data_loader, validation_data_loader)

100%|██████████| 1264/1264 [42:03<00:00,  2.00s/batch]

100%|██████████| 542/542 [06:36<00:00,  1.37batch/s]

[1/5], Training Loss: 1.0618610332566727 Validation_loss: 1.136973583720267




 37%|███▋      | 464/1264 [15:29<26:38,  2.00s/batch]

In [None]:
import matplotlib.pyplot as plt
epochs = [x for x in range(1, EPOCH+ 1)]
plt.plot(epochs, trainLoss, label='Training Loss')
plt.plot(epochs, valLoss, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss vs Epochs and Validation Loss vs Epochs')
plt.legend()
plt.show()



In [None]:
torch.save(model.module.state_dict(), 'Rishav_NLP_R1_final.pth')

In [1]:
import numpy as np
import pandas as pd
import transformers
import torch
import torch.optim
from transformers import BertModel, BertTokenizer
from torch import nn
import json
from torch.utils.data import DataLoader, Dataset

LEARNING_RATE = 5e-2
MAX_LEN = 32
BATCH_SIZE = 32
SEQ_LEN = 20
# FOR THE SEQ_LEN = 20, READ THE DATA RETREIVAL FILE

from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def read_json(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
class Chinese_answer_Model(nn.Module):
    def __init__(self):
        super(Chinese_answer_Model, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-chinese", return_dict = True)
        self.fc1 = nn.Linear(768*(SEQ_LEN+1), 4)
        self.out = nn.Linear(8, 4)
    
    def forward(self, news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer_option):
        output = torch.empty((news_article_input_ids.shape[0], 4)).to(device)
        
        for i in range(news_article_input_ids.shape[0]):
            text_out = self.bert(input_ids = news_article_input_ids[i].long(),attention_mask= news_article_attention_mask[i])
            question_out = self.bert(input_ids = question_input_ids[i].long(),attention_mask= question_attention_mask[i])
            out = torch.cat((text_out.pooler_output, question_out.pooler_output), dim = 0)
            out = out.view(-1)
            out = self.fc1(out)
            out = torch.relu(out)
            out = torch.cat((out, answer_option[i]),dim = -1)
            out = self.out(out)
            
            out = torch.softmax(out,dim = 0)
            output[i] = out
        
        
        return output

class Chinese_Article_Data(Dataset):
    def __init__(self, file, tokenizer, max_len, seq_len):
        self.data = file
        self.tokenizer = tokenizer
        self.max_len   = max_len
        self.seq_len   = seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        news_article_input_ids   = torch.empty((self.seq_len,self.max_len))
        news_article_attention_list = torch.empty((self.seq_len,self.max_len))
        itr = 0
        for sentence_idx in range(len(self.data[idx]['news_article'])):
            news_article_instance = self.data[idx]['news_article'][itr]
            encoding = self.tokenizer.encode_plus (
                news_article_instance, 
                max_length = self.max_len,
                pad_to_max_length = True,
                truncation = True,
                padding = 'max_length',
                return_attention_mask= True,
                return_tensors = 'pt',
            )
            news_article_input_ids[itr] = encoding['input_ids']
            news_article_attention_list[itr]  = encoding['attention_mask']
            itr += 1
            if (itr == self.seq_len):
                break
            
        while(itr < self.seq_len):
            encoding = self.tokenizer.encode_plus (
                'NULL', 
                max_length = self.max_len,
                pad_to_max_length = True,
                truncation = True,
                padding = 'max_length',
                return_attention_mask= True,
                return_tensors = 'pt',
            )
            news_article_input_ids[itr] = encoding['input_ids']
            news_article_attention_list[itr]  = encoding['attention_mask']
            itr += 1
        
        question = self.data[idx]['question_stem']
        encoding = self.tokenizer.encode_plus (
                question, 
                max_length = self.max_len,
                pad_to_max_length = True,
                truncation = True,
                padding = 'max_length',
                return_attention_mask= True,
                return_tensors = 'pt',
            )
        question_input_ids = encoding['input_ids']
        question_attention_mask = encoding['attention_mask']
        answer_option = torch.tensor([float(x.replace(',', '')) for x in self.data[idx]['answer_options']])
        answer   = torch.tensor([1.0 if (x==self.data[idx]['ans']-1) else 0.0 for x in range(4)])
#         print(itr)
        return news_article_input_ids, news_article_attention_list, question_input_ids, question_attention_mask, answer, answer_option

def getloader(file = None, tokenizer = tokenizer, max_len = MAX_LEN, batch_size = BATCH_SIZE, seq_len = SEQ_LEN):
    dataset = Chinese_Article_Data (file, tokenizer, max_len,seq_len)
    loader = DataLoader(dataset = dataset, batch_size = batch_size, num_workers = 4, shuffle = False, pin_memory = True)
    return loader

    

# defining the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")





from sklearn.metrics import f1_score, accuracy_score,  classification_report
def evaluate(file_path,model):
    # give the loaded file
    with open(file_path, 'r') as f:
        file = json.load(f)
    
    data = getloader(file)
    
    num_correct = 0
    num_wrong= 0
    for batch, d in enumerate(data):

        
        news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer, answer_option = d
        news_article_input_ids = news_article_input_ids.to(device)
        news_article_attention_mask = news_article_attention_mask.to(device)
        question_input_ids = question_input_ids.to(device)
        question_attention_mask = question_attention_mask.to(device)
        answer = answer.to(device)
        outputs = model(news_article_input_ids, news_article_attention_mask,question_input_ids, question_attention_mask, answer_option)

      
        
        target_indices = torch.argmax(answer, dim=1)
        prediction_indices = torch.argmax(outputs, dim=1)

        # Compare the indices to find correct and wrong predictions
        num_correct += torch.sum(target_indices == prediction_indices).item()
        num_wrong += torch.sum(target_indices != prediction_indices).item()
    
    print (f"Accuracy : {num_correct/(num_correct+num_wrong)}")

load_model = Chinese_answer_Model()
# Load the state_dict
load_model.load_state_dict(torch.load('/kaggle/input/bert-qt-semantic/pytorch/bert-qt/1/Rishav_NLP_R1_final_trained.pth'))
load_model.to(device)
load_model = nn.DataParallel(load_model)
file_path = "/kaggle/input/nlp-project-dataset/test_split.json"
evaluate(file_path, load_model)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Accuracy : 0.48545504917578614
