In [None]:
import os 
import warnings 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 

In [None]:
warnings.filterwarnings("ignore")

In [None]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader

In [None]:
# Colab-only helper: presumably prompts for files to upload into the runtime
# (TODO confirm which files are expected here). The recorded output `{}`
# suggests nothing was uploaded in the saved run.
from google.colab import files
files.upload()

{}

In [None]:
!pip install transformers pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Using cached transformers-4.25.1-py3-none-any.whl (5.8 MB)
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.8.6-py3-none-any.whl (800 kB)
[K     |████████████████████████████████| 800 kB 24.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Using cached huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Collecting tensorboardX>=2.2
  Downloading tensorboardX-2.5.1-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 66.2 MB/s 
[?25hCollecting lightning-utilities!=0.4.0,>=0.3.0
  Downloading lightning_utilities-0.5.0-py3-none-any.whl (18 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 

In [None]:
import transformers
from transformers import *

In [None]:
# Run configuration: every tunable value for this notebook lives in this cell.

# Fall back to the CPU when no GPU is attached so the notebook still runs end to end.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
EPOCHS = 3
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
ACCUMULATION_STEPS = 4
MAX_LEN = 256  # token budget passed to the tokenizer for each encoded (text, reason) pair

TRAINING_DATASET = '/content/train_dataset_random_negative_sample_from_iteself.csv'
BERT_PATH = '/content/bert_base_uncased'

In [None]:
# Shared tokenizer instance used by the dataset class to encode sentence pairs.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_positio

In [None]:
# Dataset Approach

## Approach 1: 
## -----------

# [CLS] tokens_a [SEP] tokens_b [SEP]

# similarly for our dataset 
# [CLS] text [SEP] reason [SEP]. If we remove the middle [SEP] token and try to
# concat like this: [CLS] text || reason [SEP], then we might lose a lot of info


## Approach 2:
## ------------

# input1 : [CLS] text [SEP]
# input2 : [CLS] reason [SEP]

## TODO: Explore both dataset formats and then see which one works better

In [None]:
from typing import List

# Creating Dataset based on Approach 1 

class BERTDataset(Dataset):
    """Map-style dataset that encodes (text, reason) pairs as single BERT inputs.

    Each item follows Approach 1, ``[CLS] text [SEP] reason [SEP]``, and is padded or
    truncated to exactly ``max_len`` tokens so that items can be stacked into batches.

    Args:
        texts: First sentence of every pair.
        reasons: Second sentence of every pair, aligned with ``texts``.
        targets: Integer labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus``; defaults to the notebook-level
            ``TOKENIZER``.
        max_len: Encoded sequence length; defaults to the notebook-level ``MAX_LEN``.

    Raises:
        ValueError: If ``texts``, ``reasons`` and ``targets`` differ in length.
    """

    def __init__(self, texts: List[str], reasons: List[str], targets: List[int],
                 tokenizer=None, max_len=None):
        if not (len(texts) == len(reasons) == len(targets)):
            raise ValueError(
                "texts, reasons and targets must have the same length, got "
                f"{len(texts)}, {len(reasons)} and {len(targets)}"
            )
        self.texts = texts
        self.reasons = reasons
        self.targets = targets
        # Only touch the notebook globals when the caller did not supply an override.
        self.tokenizer = tokenizer if tokenizer is not None else TOKENIZER
        self.max_len = max_len if max_len is not None else MAX_LEN

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item: int):
        # str() tolerates non-string cells; split/join collapses runs of whitespace.
        text = " ".join(str(self.texts[item]).split())
        reason = " ".join(str(self.reasons[item]).split())

        # encode_plus builds the pair layout and adds the special tokens. Truncation is
        # required: without it, pairs longer than max_len keep their full length and
        # the default collate function cannot stack them into a batch.
        inputs = self.tokenizer.encode_plus(
            text,
            reason,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(int(self.targets[item]), dtype=torch.long),
        }

In [None]:
# for Approach 2: we will pass the same format of input viz: text, reason
# we will also use encode_plus function to get our encodings
# but, instead of text2, we will pass None something like this

'''
inputs1=TOKENIZER.encode_plus(
        text, 
        None, 
        add_special_tokens=True, 
        max_length=MAX_LEN,
        pad_to_max_len=True
    )

inputs2 = ...

ids = [
    inputs1['input_ids'], inputs2['input_ids']
]

'''
# similarly for others and return same type of dict

"\ninputs1=TOKENIZER.encode_plus(\n        text, \n        None, \n        add_special_tokens=True, \n        max_length=MAX_LEN,\n        pad_to_max_len=True\n    )\n\ninputs2 = ...\n\nids = [\n    inputs1['input_ids'], inputs2['input_ids']\n]\n\n"

In [None]:
# Building our model with Approach 1 Dataset

class BERTBaseUncasedSimilarityModel(nn.Module):
    """BERT encoder with a single-logit head that scores a (text, reason) pair.

    Built for the Approach 1 dataset format, where both sentences are packed into
    one input sequence.

    Args:
        dropout_prob: Dropout probability applied to the pooled BERT output before
            the classifier.
    """

    def __init__(self, dropout_prob=0.2):
        super(BERTBaseUncasedSimilarityModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        # The attribute name must match the one used in forward().
        self.bert_drop = nn.Dropout(dropout_prob)
        # Take the width from the loaded config (768 for bert-base) instead of hard-coding it.
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, token_type_ids, mask):
        """Return one raw logit per input sequence.

        Args:
            ids: Token ids.
            token_type_ids: Segment ids separating the text from the reason.
            mask: Attention mask.

        Returns:
            The classifier output for the dropped-out pooled encoding. No sigmoid is
            applied, so it is suited to a BCE-with-logits loss.
        """
        # The model returns a ModelOutput; tuple-unpacking it would yield its keys
        # (strings) rather than tensors, so the fields are read by name.
        outputs = self.bert(
            ids,
            token_type_ids=token_type_ids,
            attention_mask=mask,
            return_dict=True,
        )

        # outputs.last_hidden_state is the sequence output: one hidden-size vector per
        # token. outputs.pooler_output is the pooled output: one hidden-size vector per
        # sequence. If wanted, the (mean, max) pools of the sequence output could be
        # concatenated and used instead.
        #
        # Our standard loss here is BCE.
        #
        # For Approach 2:
        # ----------------
        # Inputs could be passed as [ids1, ids2], [ttt1, ttt2], [mask1, mask2], each
        # half encoded separately to get pooled vectors b1 and b2 (optionally passed
        # through a linear layer), followed by either:
        #   1. concatenating them and passing them through a classifier, or
        #   2. computing their cosine similarity.
        # With cosine distance the loss could be MSE, aiming to reduce the difference
        # of distances, judged by {0: greater than 90 degrees, 1: less than 90 degrees}
        # -- which probably needs a better judging criterion.
        pooled = self.bert_drop(outputs.pooler_output)
        return self.classifier(pooled)

In [None]:
# Build the Approach 1 classifier, then place its parameters on the configured device.
model = BERTBaseUncasedSimilarityModel()
model = model.to(DEVICE)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pytorch_model.bin
Some weights of the model check

In [None]:
# NOTE: For each batch it is important to have an equal distribution of 0s and 1s, which is not the case here (not implemented)

In [None]:
def train_valid_split(df, test_split=0.2, shuffle=False, random_state=None):
    """Split a DataFrame into a leading training part and a trailing validation part.

    Args:
        df: Frame to split.
        test_split: Fraction of rows (between 0 and 1) held out for validation.
        shuffle: When True, rows are shuffled before splitting so that a frame sorted
            by label does not end up with a single-class validation set. Defaults to
            False, which keeps the original order-preserving behaviour.
        random_state: Seed forwarded to ``DataFrame.sample`` when ``shuffle`` is True.

    Returns:
        ``(train_data, valid_data)`` as two independent DataFrames.

    Raises:
        ValueError: If ``test_split`` is outside ``[0, 1]``.
    """
    if not 0.0 <= test_split <= 1.0:
        raise ValueError(f"test_split must be between 0 and 1, got {test_split}")

    if shuffle:
        df = df.sample(frac=1.0, random_state=random_state)

    train_length = int(len(df) * (1 - test_split))
    # Copy so that later edits to one split never write through to the source frame.
    train_data = df.iloc[:train_length, :].copy()
    valid_data = df.iloc[train_length:, :].copy()
    return (train_data, valid_data)

In [None]:
# Read the held-out evaluation file and the labelled training file, then carve the
# training frame into train and validation parts.
test_dataset = pd.read_csv('/content/evaluation.csv')
dataset = pd.read_csv('/content/train_data.csv')
train_dataset, valid_dataset = train_valid_split(dataset)

In [None]:
# Report the number of rows in each split.
for _split_name, _split_frame in (
    ("Training", train_dataset),
    ("Validation", valid_dataset),
    ("Test", test_dataset),
):
    print(f"Length of {_split_name} data: {len(_split_frame)}")

Length of Training data: 1648
Length of Validation data: 413
Length of Test data: 9000


In [None]:
def get_text_reason_target(df):
    """Return the ``text``, ``reason`` and ``label`` columns of ``df`` as Python lists.

    The result is a list of three lists, in that column order.
    """
    wanted_columns = ('text', 'reason', 'label')
    return [df[column].tolist() for column in wanted_columns]

In [None]:
# Pull the text / reason / label columns out of each split, in train, valid, test order.
(
    (train_text, train_reason, train_targets),
    (valid_text, valid_reason, valid_targets),
    (test_text, test_reason, test_targets),
) = (
    get_text_reason_target(split_frame)
    for split_frame in (train_dataset, valid_dataset, test_dataset)
)

In [None]:
# TODO: move this into a utils helper that builds all three datasets at once and lets
# the caller specify the dataset format; do the same for the dataloaders as well.

# Wrap each split's parallel lists (texts, reasons, targets) in a BERTDataset.
train_bert_dataset = BERTDataset(train_text, train_reason, train_targets)
valid_bert_dataset = BERTDataset(valid_text, valid_reason, valid_targets)
test_bert_dataset = BERTDataset(test_text, test_reason, test_targets)

In [None]:
# One DataLoader per split. Training batches are shuffled so that consecutive rows of
# the source file do not end up in the same batch every epoch.
train_dataloader = DataLoader(
    train_bert_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
)

valid_dataloader = DataLoader(
    valid_bert_dataset, batch_size=VALID_BATCH_SIZE
)

# The test loader gets its own name: binding it to `valid_dataloader` would discard the
# validation loader defined just above and make validation run on the test split.
test_dataloader = DataLoader(
    test_bert_dataset, batch_size=TEST_BATCH_SIZE
)

In [None]:
def bce_loss(outputs, targets):
    """Binary cross-entropy between raw logits and binary labels.

    Args:
        outputs: Raw (pre-sigmoid) logits, e.g. of shape ``(batch, 1)``.
        targets: Labels with as many elements as ``outputs``; any numeric dtype and
            any shape that can be reshaped to ``outputs.shape``.

    Returns:
        Scalar tensor holding the mean loss.
    """
    # BCEWithLogitsLoss needs floating-point targets shaped exactly like the logits;
    # the dataset yields integer labels of shape (batch,), so align both here.
    targets = targets.to(dtype=outputs.dtype).reshape(outputs.shape)
    return nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
import pytorch_lightning as pl 

class BertSemanticSimilarity(pl.LightningModule):
    """Lightning wrapper that trains a pair-scoring model with a BCE-with-logits loss.

    Args:
        model: Module called as ``model(ids=..., token_type_ids=..., mask=...)`` that
            returns one raw logit per example.
        lr: Learning rate for the Adam optimizer.
            NOTE(review): the default of 0.01 is kept for compatibility; it looks high
            for fine-tuning a pretrained encoder -- confirm.
    """

    def __init__(self, model, lr=0.01):
        super(BertSemanticSimilarity, self).__init__()
        self.model = model
        self.lr = lr

    def process_step(self, batch):
        """Run the model on one batch dict and return the scalar loss."""
        logits = self.model(
            ids=batch['ids'],
            token_type_ids=batch['token_type_ids'],
            mask=batch['mask'],
        )

        # Labels arrive as integers of shape (batch,); the loss needs float targets
        # shaped like the logits.
        targets = batch['targets'].to(dtype=logits.dtype).reshape(logits.shape)
        return bce_loss(logits, targets)

    def training_step(self, batch, batch_idx):
        # `batch` is a dict of tensors and has no `.to()` method; Lightning already
        # moves it to the module's device before calling this hook.
        loss = self.process_step(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.process_step(batch)
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        loss = self.process_step(batch)
        self.log('test_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Wrap the classifier in the Lightning module that defines the training loop.
similarity_model = BertSemanticSimilarity(model)

In [None]:
# Root directory handed to the Lightning trainer.
checkpoint_path = "/content/checkpoints"

In [None]:
# Wire the config-cell values into the trainer: without `max_epochs` Lightning falls
# back to its own (much larger) default epoch limit, and without an accelerator setting
# training may stay on the CPU even when a GPU is attached.
trainer = pl.Trainer(
    default_root_dir=checkpoint_path,
    max_epochs=EPOCHS,
    accumulate_grad_batches=ACCUMULATION_STEPS,
    accelerator="auto",
    devices=1,
)
trainer.fit(
    similarity_model, train_dataloader, valid_dataloader
)

In [None]:
# Run the test loop of the fitted module.
# NOTE(review): the loader passed here is named `valid_dataloader`; confirm that it
# holds the split this evaluation is meant to use.
trainer.test(
    model=similarity_model, dataloaders=valid_dataloader
)

In [None]:
'''

# disable randomness, dropout, etc...
model.eval()

# predict with the model
y_hat = model(x)
'''

In [None]:
# Grab one training batch for manual experiments. The values are unpacked in the
# batch dict's key order -- presumably ids, mask, token_type_ids, targets, as
# returned by the dataset (verify if the dataset's dict changes).
i, m, t, o = next(iter(train_dataloader)).values()

In [None]:
# Move all four sample tensors to the configured device.
i, m, t, o = (sample_tensor.to(DEVICE) for sample_tensor in (i, m, t, o))

In [None]:
class BertBaseUncasedSingleSentence(nn.Module):
    """BERT encoder followed by dropout and a single-logit classification head.

    Args:
        dropout_prob: Dropout probability applied to the pooled BERT output.
    """

    def __init__(self, dropout_prob=0.2):
        super(BertBaseUncasedSingleSentence, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(768, 1)

    def forward(self, ids: torch.Tensor, token_type_ids: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """Score the sentence pair encoded in the given inputs.

        Args:
            ids (torch.Tensor): Token ids to be used
            token_type_ids (torch.Tensor): Token type ids to be used
            mask (torch.Tensor): Attention mask

        Returns:
            torch.Tensor: One raw logit per example. Apply ``torch.sigmoid`` to obtain
            a probability of similarity between 0 and 1.
        """
        # Keyword arguments are essential: BertModel's positional order is
        # (input_ids, attention_mask, token_type_ids), so passing
        # (ids, token_type_ids, mask) positionally would swap the mask and segment ids.
        # The result is a ModelOutput, so its fields are read by name -- unpacking it
        # like a tuple would yield its keys instead of tensors.
        outputs = self.bert(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )

        embeddings = self.bert_drop(outputs.pooler_output)
        # Raw logits are returned so the output pairs with a BCE-with-logits loss.
        # TODO: check whether applying a sigmoid here does better than plain logits.
        return self.classifier(embeddings)

In [None]:
# Rebind `model` to a fresh instance of the second architecture on the configured device.
model = BertBaseUncasedSingleSentence()
model = model.to(DEVICE)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pytorch_model.bin
Some weights of the model check

In [None]:
# Smoke-test the forward pass on the sample batch. Gradients are not needed for this
# check, so autograd is disabled to avoid keeping activations in GPU memory.
with torch.no_grad():
    sample_logits = model(i, t, m)
sample_logits

OutOfMemoryError: ignored

In [None]:
# Bare BERT encoder (no classification head) for inspecting raw outputs.
bert = transformers.BertModel.from_pretrained('bert-base-uncased')
bert = bert.to(DEVICE)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pytorch_model.bin
Some weights of the model check

OutOfMemoryError: ignored

In [None]:
# Inspect the encoder outputs for the sample batch. Keyword arguments are required:
# BertModel's positional order is (input_ids, attention_mask, token_type_ids), so
# `bert(i, t, m)` would feed the segment ids as the attention mask and vice versa.
with torch.no_grad():
    bert_outputs = bert(input_ids=i, attention_mask=m, token_type_ids=t)

# Show the shapes rather than dumping the full tensors.
bert_outputs.last_hidden_state.shape, bert_outputs.pooler_output.shape

In [None]:
# Ask PyTorch to release cached GPU memory it is no longer using -- presumably added
# in response to the OutOfMemoryError recorded in the cells above.
import torch
torch.cuda.empty_cache()

In [None]:
# Sanity check: displays whether PyTorch reports a usable CUDA device in this runtime.
torch.cuda.is_available()