In [None]:
from google.colab import drive
import os

drive.mount('/content/drive')
drive_path = '/content/drive/My Drive/'

!mkdir results
!pip install datasets



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup


import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from  datasets import Dataset as datasetsDataset
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef,accuracy_score

import os
import copy
import random
import math
import string

In [3]:
delimiter = ","
df = pd.read_csv( os.path.join(drive_path, "train.csv"), delimiter=delimiter)
cols = list(df.columns)
df.columns = cols
dataset = datasetsDataset.from_pandas(df)
subdataset = dataset
split = subdataset.train_test_split(test_size=0.1, seed=1)
train = split['train']
val = split['test']
test =  pd.read_csv(os.path.join(drive_path,"dev.csv"))
test.columns = cols
df_train = pd.DataFrame(train)
df_val = pd.DataFrame(val)
df_test = pd.DataFrame(test)

## Classes and functions

In [4]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer_name='albert-base-v2', max_length=128):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.data = df
        self.max_length = max_length
        self.chunked_samples = []
        self.make_chunks()

    def make_chunks(self):
        for i in range(len(self.data)):
            self.chunked_samples.append(self.make_chunk(i))

    def __getitem__(self, idx): #last for number of chunks
        return [*self.chunked_samples[idx] , self.data.iloc[idx]["label"],len(self.chunked_samples[idx][0])]


    def preprocess_claim(self,claim):
        if claim.startswith("We should "):
            claim = claim[10:]
        return self.preprocess_text(claim)

    def preprocess_evidence(self,evidence):
        evidence = evidence.replace("[REF]"," ")
        return self.preprocess_text(evidence)

    def preprocess_text(self,text):
        text = text.lower()
        punctuation = string.punctuation
        text = ''.join(char for char in text if char not in punctuation)
        return text

    def make_chunk(self, idx):

        row = self.data.iloc[idx]
        claim = row["Claim"]
        evidence = row["Evidence"]

        claim = self.preprocess_claim(claim)
        evidence = self.preprocess_evidence(evidence)

        claim_tokens = self.tokenizer.encode_plus(claim, add_special_tokens=False, return_tensors='pt', truncation=False)
        evidence_tokens = self.tokenizer.encode_plus(evidence, add_special_tokens=False, return_tensors='pt', truncation=False)

        evidence_input_ids = evidence_tokens['input_ids'][0]
        claim_input_ids = claim_tokens['input_ids'][0]

        if len(evidence_input_ids) + len(claim_input_ids) + 3 <= self.max_length:
            chunk_input_ids = [self.tokenizer.cls_token_id] + claim_input_ids.tolist()  + [self.tokenizer.sep_token_id]  + evidence_input_ids.tolist() + [self.tokenizer.sep_token_id]
            attention_mask = [1] * len(chunk_input_ids)
            padding_length = self.max_length - len(chunk_input_ids)
            chunk_input_ids += [self.tokenizer.pad_token_id] * padding_length
            attention_mask += [0] * padding_length
            ids_chunks = torch.stack([torch.tensor(chunk_input_ids)])
            attention_mask_chunks = torch.stack([torch.tensor(attention_mask)])
            return ids_chunks, attention_mask_chunks

        ids_chunks = []
        attention_mask_chunks = []

        claim_len = (
            min(
                max(
                    0.6,
                    1- len(evidence_input_ids)/(self.max_length - 3)
                    )
                ,len(claim_input_ids)/(self.max_length - 3)))



        claim_tokens_per_chunk = math.floor(claim_len*(self.max_length - 3))
        evidence_tokens_per_chunk = (self.max_length - 3) - claim_tokens_per_chunk

        overlap = 5
        stride = max(evidence_tokens_per_chunk - overlap,1)
        for i in range(0,len(evidence_input_ids),stride):

            chunk_input_ids = [self.tokenizer.cls_token_id] + claim_input_ids.tolist()[:claim_tokens_per_chunk] + [self.tokenizer.sep_token_id]  + evidence_input_ids[i:i+evidence_tokens_per_chunk].tolist() + [self.tokenizer.sep_token_id]
            attention_mask = [1] * len(chunk_input_ids)
            padding_length = self.max_length - len(chunk_input_ids)
            chunk_input_ids += [self.tokenizer.pad_token_id] * padding_length
            attention_mask += [0] * padding_length
            ids_chunks.append(torch.tensor(chunk_input_ids, dtype=torch.long))
            attention_mask_chunks.append(torch.tensor(attention_mask, dtype=torch.long))

        try:
            ids_chunks = torch.stack(ids_chunks)
        except:
            print(f"evidence_tokens_per_chunk {evidence_tokens_per_chunk}" )
            print(f"len(evidence_input_ids) {len(evidence_input_ids)}" )
            print(f"stride {stride}" )
            print(evidence)
            raise Exception()

        attention_mask_chunks = torch.stack(attention_mask_chunks)

        return ids_chunks, attention_mask_chunks

    def __len__(self):
        return len(self.chunked_samples)



def my_collate(batch):
    ids_chunks, attention_mask_chunks, labels, chunk_length = zip(*batch)
    max_num_chunks = max(len(chunks) for chunks in ids_chunks)
    padded_ids_chunks = []
    padded_attention_mask_chunks = []
    for ids_chunk, attention_chunk in zip(ids_chunks, attention_mask_chunks):
        num_padding = max_num_chunks - len(ids_chunk)
        padded_ids = torch.cat((ids_chunk, torch.zeros(num_padding, ids_chunk.size(1), dtype=ids_chunk.dtype)), dim=0)
        padded_attention = torch.cat((attention_chunk, torch.zeros(num_padding, attention_chunk.size(1), dtype=attention_chunk.dtype)), dim=0)
        padded_ids_chunks.append(padded_ids)
        padded_attention_mask_chunks.append(padded_attention)

    batch_ids_chunks = torch.stack(padded_ids_chunks)
    batch_attention_mask_chunks = torch.stack(padded_attention_mask_chunks)
    labels_tensor = torch.tensor(labels, dtype=torch.long)
    chunk_length_tensor = torch.tensor(chunk_length, dtype=torch.long)
    return batch_ids_chunks, batch_attention_mask_chunks, labels_tensor, chunk_length_tensor

class BlankSched:
    @classmethod
    def step(cls):
        return 1

In [5]:
class MaskedGlobalAvgPool1d(nn.Module):
    def __init__(self):
        super(MaskedGlobalAvgPool1d, self).__init__()

    def forward(self, x, mask):
        mask = mask
        x = torch.sum(x,dim = 1)
        batch_wise_non_masked_count = torch.sum(x,dim=1).unsqueeze(1)
        x = x / batch_wise_non_masked_count.clamp(min = 1)
        return x


class MaskedGlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(MaskedGlobalMaxPool1d, self).__init__()

    def forward(self, x):
        x = torch.max(x,dim = 1).values
        return x

class SentencePairClassifier(nn.Module):
    def __init__(self,bert_model="albert-base-v2", hidden_size =768 ,freeze_bert = False,dropout_p= 0.1):
        super(SentencePairClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model)
        self.hidden_size = hidden_size
        self.cls_layer = nn.Sequential(
            nn.Linear(768,1)
        )
        self.layer_norm = nn.LayerNorm(768)
        self.dropout = nn.Dropout(p=dropout_p)
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False
        self.global_avg_pool = MaskedGlobalAvgPool1d()
        self.global_max_pool = MaskedGlobalMaxPool1d()

    def set_freeze_bert(self,freeze):
        self.freeze_bert = freeze
        if self.freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False
        else:
            for p in self.bert.parameters():
                p.requires_grad = True

    def set_freeze_layer_in_albert(self,i,to_freeze=False):
        layer_regex = f"albert.encoder.albert_layer_groups.0.albert_layers.{i}."
        for name, param in self.bert.named_parameters():
            print(name)
            if layer_regex in name:
                param.requires_grad = True
                print(f"Unfroze layer: {name}")

    def set_bert_frozen(self,frozen = False):
        if frozen:
            for p in self.bert.parameters():
                p.requires_grad = False
            print("bert frozen")
        else:
            for p in self.bert.parameters():
                p.requires_grad = True
            print("bert unfrozen")

    def set_cls_frozen(self,frozen = False):
        for p in self.cls_layer.parameters():
            p.requires_grad = not frozen
        print(f"cls layer {(not frozen) *'un'}frozen")

    @autocast(enabled = True)
    def forward(self, input_ids,attention_masks,new_num_chunks = [1]):
        batch_size, num_chunks, chunk_len = input_ids.size()
        bert_embeds = []
        masks = []
        chunk_only_mask = torch.any(attention_masks != 0,dim=2)
        for i,chunk in enumerate(range(int(max(new_num_chunks)))):
            mask = attention_masks[:,chunk,:]
            outs = self.bert(input_ids=input_ids[:,chunk,:],attention_mask=mask)
            masks.append(mask)
            bert_embeds.append(outs["pooler_output"].unsqueeze(1))

        bert_embeds = torch.concat(bert_embeds,dim=1)
        masks = torch.concat(masks,dim=1)

        pooling_out_avg = self.global_avg_pool(bert_embeds,chunk_only_mask)
        pooling_out_max = self.global_max_pool(bert_embeds)

        pooling_joined = torch.stack((pooling_out_avg,pooling_out_max),dim = 1)
        pooling_joined = self.layer_norm(torch.sum(pooling_joined,dim = 1))

        logits = self.cls_layer(self.dropout(pooling_joined))

        return logits

In [6]:
def set_seed(seed):
    """ Set all seeds to make results reproducible """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

def get_probs_from_logits(logits):
    """
    Converts a tensor of logits into an array of probabilities by applying the sigmoid function
    """
    probs = torch.sigmoid(logits.unsqueeze(-1))
    return probs.detach().cpu().numpy()

def evaluate_loss(net, dataloader):
    net.eval()
    mean_loss = 0
    count = 0
    with torch.no_grad():
        all_predictions = []
        all_labels = []
        for it, (seq, attn_masks, labels,lengths) in enumerate(dataloader):
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)
            logits = net(seq, attn_masks,lengths)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1
            probs = get_probs_from_logits(logits)

            all_predictions.extend((probs.flatten()>0.5).astype("uint8"))
            all_labels.extend(labels.flatten().to("cpu"))

    accuracy = accuracy_score(all_labels,all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    mcc = matthews_corrcoef(all_labels, all_predictions)
    accuracy = accuracy_score(all_labels,all_predictions)
    return (mean_loss / count), accuracy, precision,recall,f1,mcc

In [7]:
def test_prediction(net, device, dataloader, with_labels=True, result_file="results/output.txt"):
    """
    Predict the probabilities on a dataset with or without labels and print the result in a file
    """
    net.eval()
    w = open(result_file, 'w')
    probs_all = []

    with torch.no_grad():
        if with_labels:
            for seq, attn_masks, token_type_ids, _ in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()
        else:
            for seq, attn_masks, token_type_ids in tqdm(dataloader):
                seq, attn_masks, token_type_ids = seq.to(device), attn_masks.to(device), token_type_ids.to(device)
                logits = net(seq, attn_masks, token_type_ids)
                probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
                probs_all += probs.tolist()

    w.writelines(str(prob)+'\n' for prob in probs_all)
    w.close()

In [8]:
from transformers import Adafactor, get_linear_schedule_with_warmup
def train_bert(net, optimizer, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5
    iters = []
    train_losses = []
    val_losses = []
    scaler = GradScaler()
    for ep in range(epochs):
        net.train()
        running_loss = 0.0
        for batch_idx, batch in enumerate(tqdm(train_loader)):
            seq = batch[0].to(device)
            attn_masks = batch[1].to(device)
            labels = batch[2].to(device)
            num_chunk = batch[3].to(device)
            with autocast():
                logits = net(seq, attn_masks,num_chunk)
                loss = criterion(logits.squeeze(-1), labels.float())
                loss = loss / iters_to_accumulate

            scaler.scale(loss).backward()
            if (batch_idx + 1) % iters_to_accumulate == 0 or (batch_idx + 1) == len(train_loader):
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(net.parameters(), gradient_clip_val)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                lr_scheduler.step()


            running_loss += loss.item()

            if (batch_idx+ 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                    .format(batch_idx+1, nb_iterations, ep+1, running_loss/print_every))

                running_loss = 0.0



        val_loss,val_acc,val_prec,val_rec,val_f1,val_mcc = evaluate_loss(net, val_loader)  # Compute validation loss
        print("Epoch : {} \n Loss : {} Accuracy : {} \n Precision : {} Recall : {} \n f1 : {} mcc : {} ".format(ep+1,val_loss,val_acc,val_prec,val_rec,val_f1,val_mcc))
        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            best_loss = val_loss
            best_ep = ep + 1

            path_to_model= os.path.join(drive_path, 'NLU_models/Transformer/{}_chunk_attention_avg_pool_maxlen_{}_val_loss_{}.pt'.format(bert_model, maxlen, round(val_loss, 3)))
            torch.save(net.state_dict(), path_to_model)
            print("The model has been saved in {}".format(path_to_model))


In [10]:
maxlen = 128
bert_model = "bert-base-uncased"
train_set = CustomDataset(df_train,bert_model,maxlen)
val_set = CustomDataset(df_val, bert_model,maxlen)
criterion = nn.BCEWithLogitsLoss()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
torch.cuda.empty_cache()
set_seed(1)
freeze_bert = False
lr = 2e-5
epochs = 2
gradient_clip_val = 5.0
bs = 32
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True,collate_fn=my_collate)
val_loader = DataLoader(val_set, batch_size=bs, shuffle = False,collate_fn=my_collate)

In [None]:
net = SentencePairClassifier(bert_model,freeze_bert=freeze_bert)
net.to(device)
net.train()
optimizer = AdamW(net.parameters(), lr=lr)
iters_to_accumulate = 1 ##Effective batch size = batch size
num_warmup_steps = 500
num_training_steps = len(train_loader) * 4 #This is to make the last learn rate to be 0.5x max, instead of 0
lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)

train_bert(net,optimizer,lr_scheduler,train_loader,val_loader,epochs,iters_to_accumulate=iters_to_accumulate)
torch.save(net,os.path.join(drive_path,"NLU_models/Transformer/AnyLengthBert.pt"))

In [16]:
print("Reading test data...")
from plotly import figure_factory as ff

test_set = CustomDataset(df_test,bert_model,maxlen)
path_to_output_file = "my_results"
test_loader = DataLoader(test_set, batch_size=bs, num_workers=2,collate_fn=my_collate)
model = net
print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set the with_labels parameter to False if your want to get predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))
labels_test = df_test['label']  # true labels
probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities
threshold = 0.5   # you can adjust this threshold for your own dataset
preds_test=(probs_test>=threshold).astype('uint8') # predicted labels using the above fixed threshold

from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef,accuracy_score, confusion_matrix

all_labels = labels_test
all_predictions = preds_test

precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)
mcc = matthews_corrcoef(all_labels, all_predictions)
accuracy = accuracy_score(all_labels,all_predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"MCC: {mcc}")
print(f"Accuracy: {accuracy}")


cm = confusion_matrix(all_labels, all_predictions)


# Labels for the axes
labels = ['Negative', 'Positive']

# Create annotated heatmap
fig = ff.create_annotated_heatmap(cm, x=labels, y=labels, colorscale='Blues', showscale=True)
fig.update_layout(title='Confusion Matrix with Plotly',
                  xaxis_title='Predicted Value',
                  yaxis_title='Actual Value',
                  yaxis_autorange='reversed')  # Reverse y-axis to have the layout similar to scikit-learn's confusion matrix plot

# Add custom xaxis and yaxis titles
for i in range(len(fig.layout.annotations)):
    fig.layout.annotations[i].font.size = 12

# Show the plot
fig.show()


Reading test data...
Predicting on test data...


  self.pid = os.fork()
100%|██████████| 186/186 [00:06<00:00, 26.92it/s]



Predictions are available in : my_results
Precision: 0.7582554517133956
Recall: 0.7611006879299562
F1 Score: 0.7596754057428214
MCC: 0.6706401144817785
Accuracy: 0.8700641241984475
