In [None]:
!pip install torchcontrib

# IMPORT

In [None]:
import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torchcontrib.optim import SWA

from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import AutoModelForPreTraining, AutoTokenizer, AutoConfig
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from tqdm.autonotebook import tqdm
import utils

# CONFIG

In [None]:
# Maximum length of the model input (label tokens + text tokens + special tokens).
MAX_LEN = 1050
# Batch sizes for the training and validation loaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 2
# Maximum number of training epochs (early stopping may end training sooner).
EPOCHS = 35
# Hugging Face name of the pretrained BERT checkpoint.
BERT_PATH = "cointegrated/rubert-tiny2"
# Path to save the trained model.
MODEL_SAVE_PATH = 'model.bin'
# Path to the training file.
TRAINING_FILE = '/kaggle/input/kontur2023/nlp_test_task_2023/nlp_test_task_2023/dataset/train.json'
# Tokenizer matching the backbone; loaded from BERT_PATH so the tokenizer and the
# model cannot drift apart when the checkpoint name is changed.
# NOTE(review): `lowercase` does not appear to be a documented AutoTokenizer option
# (the documented BERT flag is `do_lower_case`) - confirm whether it has any effect.
# It is kept unchanged because the hard-coded label token ids in process_data and any
# saved checkpoints depend on the current tokenization.
TOKENIZER = AutoTokenizer.from_pretrained(BERT_PATH, lowercase=True)
# Weight added to the exact target position when building the soft labels.
SOFT_ALPHA = 0.4

# PROCESS DATA

In [None]:
def process_data(text, extracted_part, start_index, end_index, label, tokenizer, max_len):
    '''Build the model inputs and the soft start/end targets for one sample.

    The sample is laid out in question-answering form,
    ``[CLS] label tokens [SEP] text tokens [SEP]``, followed by zero padding
    up to ``max_len``. Text tokens that would not fit into ``max_len`` are
    dropped so that every sample has the same length and can be batched.

    Args:
        text: The original text of the document.
        extracted_part: The original extracted part of the text (the correct answer).
        start_index: The character index of the start of the correct answer.
        end_index: The character index of the end of the correct answer.
            ``start_index == end_index == 0`` means "no answer".
        label: Text label; must be one of the two labels known to ``label_id``.
        tokenizer: The tokenizer to use for the BERT model.
        max_len: The length of the model input, including special tokens.

    Returns:
        dictionary with keys:
            ids: Input token ids for the BERT model.
            mask: The attention mask (1 for real tokens, 0 for padding).
            token_type_ids: 0 for the "[CLS] label [SEP]" part and for padding,
                1 for the text tokens and the final [SEP].
            targets_start: Soft distribution over token positions for the answer start.
            targets_end: Soft distribution over token positions for the answer end.
            orig_text: The original text.
            orig_extracted: The original extracted part.
            label: The label of the original text.
            offsets: (start, end) character offsets of every token; (0, 0) for
                special tokens, label tokens and padding.

    Raises:
        KeyError: If ``label`` is not a key of ``label_id``.
    '''
    # Precomputed token ids of the two supported labels (the "question" part).
    label_id = {
        'обеспечение исполнения контракта': [33231, 32922, 36035],
        'обеспечение гарантийных обязательств': [33231, 55482, 2313, 38970],
    }
    label_tokens = label_id[label]
    # Number of positions in front of the text: [CLS] + label tokens + [SEP].
    prefix_len = len(label_tokens) + 2

    # Per-character mask: 1 for characters that belong to the answer.
    character_targets = [0] * len(text)
    # Only (0, 0) means "no answer"; an answer may legitimately start at character 0.
    if not (start_index == 0 and end_index == 0):
        # Clamp so that an end index equal to len(text) cannot run past the text.
        # NOTE(review): end_index is treated as inclusive here - confirm against the dataset.
        span_stop = min(end_index + 1, len(text))
        for ct in range(start_index, span_stop):
            character_targets[ct] = 1

    tok_text = tokenizer.encode_plus(text, return_offsets_mapping=True)
    # Keep only as many text tokens as fit next to the prefix and the final [SEP];
    # [1:-1] drops the special tokens added by the tokenizer itself.
    text_budget = max(max_len - prefix_len - 1, 0)
    input_ids_orig = tok_text.input_ids[1:-1][:text_budget]
    text_offsets = tok_text.offset_mapping[1:-1][:text_budget]

    # Tokens that overlap at least one answer character.
    target_idx = [
        k for k, (offset_1, offset_2) in enumerate(text_offsets)
        if sum(character_targets[offset_1: offset_2]) > 0
    ]

    # Without an answer (or when it was truncated away) both targets point at token 0.
    if len(target_idx) > 0:
        targets_start = target_idx[0]
        targets_end = target_idx[-1]
    else:
        targets_start = 0
        targets_end = 0

    # Soft targets: every candidate position gets a weight from utils.jaccard_array
    # between the true answer span and the span that candidate would produce
    # (presumably a Jaccard overlap of the two index arrays - see utils). The weights
    # are normalised to sum to (1 - SOFT_ALPHA) and SOFT_ALPHA is added on the exact
    # target position.
    n = len(input_ids_orig)
    sentence = np.arange(n)
    answer = sentence[targets_start:targets_end + 1]

    start_labels = np.zeros(n)
    end_labels = np.zeros(n)
    # With no text tokens there is no position to label; the targets stay empty.
    if n > 0:
        for i in range(targets_end + 1):
            start_labels[i] = utils.jaccard_array(answer, sentence[i:targets_end + 1])
        start_labels = (1 - SOFT_ALPHA) * start_labels / start_labels.sum()
        start_labels[targets_start] += SOFT_ALPHA

        for i in range(targets_start, n):
            end_labels[i] = utils.jaccard_array(answer, sentence[targets_start:i + 1])
        end_labels = (1 - SOFT_ALPHA) * end_labels / end_labels.sum()
        end_labels[targets_end] += SOFT_ALPHA

    # Assemble [CLS] label [SEP] text [SEP]; 2 and 3 are the ids used for [CLS] and
    # [SEP]. Every per-token array is shifted by the prefix length so that it stays
    # aligned with input_ids.
    input_ids = [2] + [*label_tokens] + [3] + input_ids_orig + [3]
    token_type_ids = [0] * prefix_len + [1] * (n + 1)
    mask = [1] * len(token_type_ids)
    text_offsets = [(0, 0)] * prefix_len + text_offsets + [(0, 0)]
    start_labels = [0] * prefix_len + list(start_labels) + [0]
    end_labels = [0] * prefix_len + list(end_labels) + [0]

    # Pad every array up to max_len.
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        text_offsets = text_offsets + ([(0, 0)] * padding_length)
        start_labels = start_labels + ([0] * padding_length)
        end_labels = end_labels + ([0] * padding_length)

    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': start_labels,
        'targets_end': end_labels,
        'orig_text': text,
        'orig_extracted': extracted_part,
        'label': label,
        'offsets': text_offsets
    }

# LOADING DATA

In [None]:
class TextLoading:
    '''Indexable dataset that turns raw samples into tensors for the model.
    '''
    def __init__(self, text, label, extracted_part, start_index, end_index):
        '''Store the per-sample sequences and the shared tokenizer settings.

        Args:
            text: The original texts of the documents.
            label: Text labels.
            extracted_part: The original extracted parts of the texts (the correct answers).
            start_index: The character indices of the start of the correct answers.
            end_index: The character indices of the end of the correct answers.
        '''
        # Shared settings come from the notebook-level configuration.
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
        # Parallel sequences: position i in each of them describes sample i.
        self.text = text
        self.label = label
        self.extracted_part = extracted_part
        self.start_index = start_index
        self.end_index = end_index

    def __len__(self):
        '''Return the number of samples.
        '''
        return len(self.text)

    def __getitem__(self, item):
        '''Return sample ``item`` as a dictionary of tensors and raw strings.
        '''
        sample = process_data(
            self.text[item],
            self.extracted_part[item],
            self.start_index[item],
            self.end_index[item],
            self.label[item],
            self.tokenizer,
            self.max_len
        )

        def as_tensor(key, dtype):
            # Convert one processed field to a tensor of the requested dtype.
            return torch.tensor(sample[key], dtype=dtype)

        # Integer fields become long tensors, the soft targets become float
        # tensors, and the strings are passed through unchanged.
        return {
            'ids': as_tensor("ids", torch.long),
            'mask': as_tensor("mask", torch.long),
            'token_type_ids': as_tensor("token_type_ids", torch.long),
            'targets_start': as_tensor("targets_start", torch.float),
            'targets_end': as_tensor("targets_end", torch.float),
            'orig_text': sample["orig_text"],
            'orig_extracted': sample["orig_extracted"],
            'label': sample["label"],
            'offsets': as_tensor("offsets", torch.long)
        }

# NEURAL NETWORK MODEL

In [None]:
class TextModel(transformers.BertPreTrainedModel):
    '''BERT backbone with a linear head that scores every token as answer start / end.'''
    def __init__(self, conf):
        super().__init__(conf)
        # Pretrained backbone loaded from BERT_PATH.
        self.bert = AutoModelForPreTraining.from_pretrained(BERT_PATH, config=conf)
        # Dropout with 50% probability, applied in front of the classifier.
        self.drop_out_high = nn.Dropout(0.5)
        # The head input is twice the hidden size because the forward pass
        # concatenates the mean and the max over the pooled hidden layers.
        # Two outputs per token: start logit and end logit.
        self.classifier = nn.Linear(conf.hidden_size * 2, 2)
        # Initialization of the weights of the linear layer.
        nn.init.normal_(self.classifier.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        '''Forward pass; returns ``(start_logits, end_logits)`` with one score per token.
        '''
        # The configuration must request hidden states (output_hidden_states=True),
        # otherwise ``hidden_states`` is not available on the output.
        encoded = self.bert(ids,
                            attention_mask=mask,
                            token_type_ids=token_type_ids)

        # Pool the last three hidden layers instead of using only the final one,
        # which can be overfitted on the original pre-training data.
        layers = torch.stack([encoded.hidden_states[idx] for idx in (-1, -2, -3)], dim=0)
        layer_mean = layers.mean(dim=0)
        layer_max = layers.max(dim=0).values
        features = torch.cat((layer_mean, layer_max), dim=-1)

        # Multi-sample dropout: run the head on five independent dropout masks
        # and average the logits to reduce overfitting.
        samples = []
        for _ in range(5):
            samples.append(self.classifier(self.drop_out_high(features)))
        logits = torch.stack(samples, dim=0).mean(dim=0)

        # The last dimension holds (start, end) scores for every token.
        start_logits = logits[..., 0]
        end_logits = logits[..., 1]

        return start_logits, end_logits

# LOSS FUNCTION AND JACCARD SCORE

In [None]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    '''Sum of the KL-divergence losses for the start and the end distributions.

    The logits are turned into log-probabilities along dim 1 and compared with
    the target distributions; each term is averaged over the batch ('batchmean').
    '''
    def kl_term(logits, target):
        # KL divergence between softmax(logits) and the target distribution.
        return F.kl_div(F.log_softmax(logits, dim=1), target, reduction='batchmean')

    return kl_term(start_logits, start_positions) + kl_term(end_logits, end_positions)


def calculate_jaccard_score(original_text, target_string, label_val,
                            idx_start, idx_end, offsets):
    '''Rebuild the predicted answer from token indices and score it against the target.

    Args:
        original_text: The original text of the document.
        target_string: Target string (extracted part).
        label_val: The label of the text (currently unused).
        idx_start: The predicted token index of the start of the answer.
        idx_end: The predicted token index of the end of the answer.
        offsets: (start, end) character offsets of every token.

    Returns:
        jac: Jaccard score between the target and the predicted string.
        filtered_output: The predicted string ('' when the prediction is rejected).
        char_start: Character index of the start of the predicted string
            (0 when the prediction is rejected).
        char_end: Character index of the end of the predicted string
            (0 when the prediction is rejected).
        acc: Accuracy between the target and the predicted string.
    '''
    # Glue the text of the predicted tokens together. If idx_start > idx_end
    # the range is empty and the prediction stays empty.
    pieces = []
    for ix in range(idx_start, idx_end + 1):
        pieces.append(original_text[offsets[ix][0]: offsets[ix][1]])
        # Put a single space between tokens that are separated in the text,
        # but never after the last token of the span.
        if ix < idx_end and offsets[ix][1] < offsets[ix + 1][0]:
            pieces.append(" ")
    filtered_output = "".join(pieces)

    # Often, the model gives the first word or letter of the first word
    # as an answer, although in fact there is no answer in the text.
    # Real answers are much longer than 4 words, so shorter predictions
    # are treated as "no answer".
    if len(filtered_output.split()) < 4:
        filtered_output = ''
        char_start = char_end = 0
    else:
        # Plain ints keep the result independent of the container type of
        # `offsets` (numpy array or tensor).
        char_start = int(offsets[idx_start][0])
        char_end = int(offsets[idx_end][1])

    # Scores are computed on whitespace-stripped strings.
    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    acc = utils.accuracy(target_string.strip(), filtered_output.strip())
    return jac, filtered_output, char_start, char_end, acc

# TRAIN

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    '''Train the model for one pass over ``data_loader``.

    Args:
        data_loader: Iterable of batches as produced by TextLoading.
        model: Model returning ``(start_logits, end_logits)``.
        optimizer: Optimizer that updates the model parameters.
        device: Device (CPU/GPU) the batches are moved to.
        scheduler: Optional learning-rate scheduler, stepped once per batch.

    Returns:
        None. Running loss, Jaccard and accuracy are shown on the progress bar.
    '''
    # Training mode: dropout is active.
    model.train()
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()
    accuracy = utils.AverageMeter()

    # tqdm to visualize the learning process.
    tk0 = tqdm(data_loader, total=len(data_loader))

    for bi, d in enumerate(tk0):
        label = d["label"]
        orig_extracted = d["orig_extracted"]
        orig_text = d["orig_text"]
        # Offsets are only used on the CPU to slice strings, same as in eval_fn.
        offsets = d["offsets"].numpy()

        # Move the model inputs and the targets to the device (CPU/GPU).
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        targets_start = d["targets_start"].to(device, dtype=torch.float)
        targets_end = d["targets_end"].to(device, dtype=torch.float)

        # Reset the gradients left over from the previous batch.
        model.zero_grad()
        # Predict start and end logits.
        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )
        # Calculate the loss, back-propagate and update the weights.
        loss = loss_fn(outputs_start, outputs_end, targets_start, targets_end)
        loss.backward()
        optimizer.step()
        # The scheduler is optional; update the learning rate only when one is given.
        if scheduler is not None:
            scheduler.step()

        # Softmax turns the logits into pseudo probabilities for start and end tokens.
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        # Jaccard and accuracy of the current batch, using the most likely
        # start and end token of every sample.
        jaccard_scores = []
        accuracy_scores = []
        for px, text in enumerate(orig_text):
            jaccard_score, _, _, _, accuracy_score = calculate_jaccard_score(
                original_text=text,
                target_string=orig_extracted[px],
                label_val=label[px],
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            jaccard_scores.append(jaccard_score)
            accuracy_scores.append(accuracy_score)

        # Update the running averages, weighted by the batch size.
        accuracy.update(np.mean(accuracy_scores), ids.size(0))
        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        losses.update(loss.item(), ids.size(0))
        # Show the running averages on the progress bar after every batch.
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg, accuracy=accuracy.avg)

# VALIDATION

In [None]:
def eval_fn(data_loader, model, device):
    '''Evaluate the model on a validation loader.

    Args:
        data_loader: Iterable of batches as produced by TextLoading.
        model: Model returning ``(start_logits, end_logits)``.
        device: Device (CPU/GPU) the batches are moved to.

    Returns:
        Tuple ``(jaccard, accuracy)`` with the running averages over all batches.
    '''
    # Evaluation mode: dropout is switched off.
    model.eval()
    loss_meter = utils.AverageMeter()
    jaccard_meter = utils.AverageMeter()
    accuracy_meter = utils.AverageMeter()

    # No gradients are needed for validation.
    with torch.no_grad():
        progress = tqdm(data_loader, total=len(data_loader))
        for batch in progress:
            labels = batch["label"]
            answers = batch["orig_extracted"]
            texts = batch["orig_text"]
            token_offsets = batch["offsets"].numpy()

            # Move the model inputs and the targets to the device (CPU/GPU).
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets_start = batch["targets_start"].to(device, dtype=torch.float)
            targets_end = batch["targets_end"].to(device, dtype=torch.float)

            # Predict start and end logits and compute the loss.
            start_logits, end_logits = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            loss = loss_fn(start_logits, end_logits, targets_start, targets_end)

            # Most likely start / end token of every sample in the batch.
            start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
            end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
            best_start = np.argmax(start_probs, axis=1)
            best_end = np.argmax(end_probs, axis=1)

            # Score every prediction; positions 0 and 4 of the result hold
            # the Jaccard score and the accuracy.
            results = [
                calculate_jaccard_score(
                    original_text=text,
                    target_string=answers[px],
                    label_val=labels[px],
                    idx_start=best_start[px],
                    idx_end=best_end[px],
                    offsets=token_offsets[px]
                )
                for px, text in enumerate(texts)
            ]
            jaccard_scores = [result[0] for result in results]
            accuracy_scores = [result[4] for result in results]

            # Update the running averages, weighted by the batch size.
            batch_size = ids.size(0)
            accuracy_meter.update(np.mean(accuracy_scores), batch_size)
            jaccard_meter.update(np.mean(jaccard_scores), batch_size)
            loss_meter.update(loss.item(), batch_size)
            # Display the running scores.
            progress.set_postfix(loss=loss_meter.avg, jaccard=jaccard_meter.avg, accuracy=accuracy_meter.avg)
    return jaccard_meter.avg, accuracy_meter.avg

# RUN MODEL

In [98]:
def run(fold):
    '''Train and validate the model on one cross-validation fold.

    Args:
        fold: Index (0-4) of the fold used for validation; the remaining
            folds are used for training.

    Returns:
        The validation accuracy of the last epoch that was run.
        NOTE(review): the checkpoint written by utils.EarlyStopping presumably
        belongs to the best epoch, which may differ from the last one - confirm.
    '''
    # Read the training file and flatten the nested "extracted_part" records.
    dfx = pd.read_json(TRAINING_FILE)
    splitted_df = pd.json_normalize(dfx.extracted_part)
    splitted_df['extracted_part'] = splitted_df['text']
    # Every field is a one-element list; keep its first element.
    for i in ['extracted_part', 'answer_start', 'answer_end']:
        dfx[i] = splitted_df[i].str[0]
    # Column that will hold the validation fold of every row.
    dfx['kfold'] = -1

    # The answer start has two peaks: zero (no answer) and a cloud of non-zero
    # values. To keep all folds comparable the rows are stratified by the
    # combination of label and answer-start bucket ("0" plus two quantile
    # buckets of the non-zero starts).
    dfx_temp = dfx.loc[dfx['answer_start'] != 0]
    dfx['q_answer_start'] = pd.qcut(dfx_temp['answer_start'], q=2).astype('str')
    # Rows without an answer got NaN from the index alignment above.
    dfx['q_answer_start'] = dfx['q_answer_start'].replace(np.nan, 0).astype('str')
    dfx['q_answer_start_label'] = dfx.q_answer_start.str.cat(dfx.label)
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y = dfx["q_answer_start_label"].values

    # Assign every row to one of 5 folds; one model is trained per fold and
    # their outputs are averaged at inference time.
    for kfold, (train_idx, valid_idx) in enumerate(kf.split(X=dfx, y=y)):
        dfx.loc[valid_idx, "kfold"] = kfold

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TextLoading(
        text=df_train.text.values,
        label=df_train.label.values,
        extracted_part=df_train.extracted_part.values,
        start_index=df_train.answer_start.values,
        end_index=df_train.answer_end.values
    )

    # Training batches are reshuffled every epoch so that the batch
    # composition does not follow the order of the rows in the file.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=2
    )

    valid_dataset = TextLoading(
        text=df_valid.text.values,
        label=df_valid.label.values,
        extracted_part=df_valid.extracted_part.values,
        start_index=df_valid.answer_start.values,
        end_index=df_valid.answer_end.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Load the pretrained configuration and expose the hidden states,
    # which TextModel pools in its forward pass.
    model_config = AutoConfig.from_pretrained(BERT_PATH)
    model_config.output_hidden_states = True
    model = TextModel(conf=model_config)
    model.to(device)

    # One optimizer/scheduler step is taken per batch, so the total number of
    # steps is the real number of batches per epoch times the number of epochs.
    num_train_steps = len(train_data_loader) * EPOCHS
    param_optimizer = list(model.named_parameters())
    # Parameters that are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    base_opt = transformers.AdamW(optimizer_parameters, lr=7e-5, no_deprecation_warning=True)
    # Stochastic weight averaging on top of AdamW, collected during the last
    # 10% of the planned steps.
    # NOTE(review): the averaged weights are never swapped into the model
    # (torchcontrib documents `swap_swa_sgd()` for that) - confirm whether
    # SWA is meant to influence the saved checkpoints.
    optimizer = SWA(base_opt, swa_start=int(num_train_steps * 0.9),
                    swa_freq=30, swa_lr=None)

    # Linear warm-up over the first 25% of the steps, then linear decay.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_train_steps * 0.25),
        num_training_steps=num_train_steps
    )

    # Early stopping on the validation accuracy with patience = 5.
    es = utils.EarlyStopping(patience=5, mode="max")
    print(f"Training is Starting for fold={fold}")

    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard, accuracy = eval_fn(valid_data_loader, model, device)
        print(f"Accuracy score = {accuracy}", f"\nJaccard Score = {jaccard}")
        es(accuracy, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
    return accuracy

# Run 5-fold cross-validation training

In [96]:
# Collects the validation accuracy returned by run() for every fold.
accuracy = []

In [None]:
# Train with fold 0 held out for validation and record its accuracy.
accuracy.append(run(fold=0))

In [None]:
# Train with fold 1 held out for validation and record its accuracy.
accuracy.append(run(fold=1))

In [None]:
# Train with fold 2 held out for validation and record its accuracy.
accuracy.append(run(fold=2))

In [None]:
# Train with fold 3 held out for validation and record its accuracy.
accuracy.append(run(fold=3))

In [None]:
# Train with fold 4 held out for validation and record its accuracy.
accuracy.append(run(fold=4))

In [None]:
# Mean validation accuracy over the folds that have been run
# (requires at least one entry in `accuracy`).
sum(accuracy) / len(accuracy)

# TEST THE MODEL

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Same configuration as in training: the hidden states must be exposed
# because TextModel pools them in its forward pass.
model_config = AutoConfig.from_pretrained(BERT_PATH)
model_config.output_hidden_states = True

In [None]:
def load_fold_model(fold):
    '''Build a TextModel, load the checkpoint of ``fold`` and switch it to eval mode.'''
    fold_model = TextModel(conf=model_config)
    fold_model.to(device)
    # map_location lets the checkpoint load on whatever device is in use.
    fold_model.load_state_dict(
        torch.load(f"/kaggle/working/model_{fold}.bin", map_location=device)
    )
    # Inference only: switch dropout off.
    fold_model.eval()
    return fold_model


# One model per cross-validation fold; the names model1..model5 are used
# by the inference cell.
model1, model2, model3, model4, model5 = (load_fold_model(fold) for fold in range(5))

In [None]:
# Load the test set and add placeholder answer columns so that it can go
# through the same TextLoading pipeline as the training data: the "answer"
# is the whole text, from character 0 to the length of the text.
dfx_test = pd.read_json('/kaggle/input/kontur2023/nlp_test_task_2023/nlp_test_task_2023/dataset/test.json')
dfx_test = dfx_test.assign(
    extracted_part=dfx_test.text.values,
    answer_start=0,
    answer_end=dfx_test.text.str.len(),
)

In [None]:
# One [predicted text, char start, char end] entry per test sample, in dataset order.
final_output = []

test_dataset = TextLoading(
    text=dfx_test.text.values,
    label=dfx_test.label.values,
    extracted_part=dfx_test.extracted_part.values,
    start_index=dfx_test.answer_start.values,
    end_index=dfx_test.answer_end.values
)

# shuffle=False keeps the predictions aligned with the rows of dfx_test.
data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=4,
    num_workers=1
)

# The five fold models form the ensemble.
ensemble = (model1, model2, model3, model4, model5)

with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    for bi, d in enumerate(tk0):
        label = d["label"]
        orig_extracted = d["orig_extracted"]
        orig_text = d["orig_text"]
        offsets = d["offsets"].numpy()

        # Only the model inputs are needed on the device; the targets are
        # placeholders at test time and are not used.
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)

        # Average the start / end logits of all fold models.
        start_sum, end_sum = 0, 0
        for fold_model in ensemble:
            fold_start, fold_end = fold_model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            start_sum = start_sum + fold_start
            end_sum = end_sum + fold_end
        outputs_start = start_sum / len(ensemble)
        outputs_end = end_sum / len(ensemble)

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        # Turn the most likely start / end token of every sample into text
        # and character offsets; the scores are meaningless here because
        # the target is a placeholder.
        for px, text in enumerate(orig_text):
            _, output_sentence, ind_start, ind_end, _ = calculate_jaccard_score(
                original_text=text,
                target_string=orig_extracted[px],
                label_val=label[px],
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            final_output.append([output_sentence, ind_start, ind_end])

In [None]:
# Write the predictions back next to the test rows: predicted text goes to
# 'extracted_path', predicted character offsets overwrite the placeholder
# 'answer_start' / 'answer_end' columns.
dfx_test[['extracted_path', 'answer_start', 'answer_end']] = final_output
# Copy the relevant columns under the names used for the submission:
# 'text' -> 'text_full' (full document), 'extracted_path' -> 'text' (predicted answer).
sample_submission = pd.DataFrame()
sample_submission[['id', 'text_full', 'label', 'text', 'answer_start', 'answer_end']] = dfx_test[['id', 'text', 'label', 'extracted_path', 
                                                                                                  'answer_start', 'answer_end']]

In [None]:
# Every answer field is stored as a one-element list in the output,
# so each value is wrapped before it is written back.
text_bracket, answer_start_bracket, answer_end_bracket = [], [], []

for _, record in sample_submission.iterrows():
    wrapped_fields = (
        (text_bracket, record.text),
        (answer_start_bracket, record.answer_start),
        (answer_end_bracket, record.answer_end),
    )
    for bucket, value in wrapped_fields:
        bucket.append([value])

sample_submission['text'] = text_bracket
sample_submission['answer_start'] = answer_start_bracket
sample_submission['answer_end'] = answer_end_bracket

# Final layout: id, full text, label and a nested "extracted_part" record
# holding the predicted text and its character offsets.
simple_json = pd.DataFrame()
simple_json['id'] = sample_submission['id']
simple_json['text'] = sample_submission['text_full']
simple_json['label'] = sample_submission['label']
answer_columns = sample_submission[['text', 'answer_start', 'answer_end']]
simple_json['extracted_part'] = answer_columns.to_dict(orient='records')
simple_json.to_json('predictions.json', orient='records', force_ascii=False)

In [None]:
# Display the frame that was written to predictions.json.
simple_json