## Importing packages

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm
from sklearn import model_selection
import shutil

In [None]:
torch.__version__

'1.11.0+cu113'

## Configurations

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
BATCH_SIZE = 16  # examples per optimisation step
MAXLEN = 512     # token budget for one (comment, parent_comment) pair after tokenisation
# Every BERT weight is updated by Adam, so the step size has to stay in the
# fine-tuning range (about 2e-5 .. 5e-5). The previous value of 5e-3 is large
# enough to wipe out the pretrained representation within a few batches.
LR = 2e-5
EPOCHS = 4       # full passes over the training loader


## Import data

In [None]:
# Only the header of key.csv is used below: it supplies the column names for the data files.
keys = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/proj_sem4/data/key.csv',
    delimiter='\t',
)

In [None]:
# Column names, in file order, as a plain Python list.
keyList = list(keys.columns)

In [None]:
# The data files have no header row, so the column names come from keyList.
test_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/proj_sem4/data/test-balanced.csv',
    sep='\t',
    header=None,
    names=keyList,
)
train_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/proj_sem4/data/train-balanced.csv',
    sep='\t',
    header=None,
    names=keyList,
)

In [None]:
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,"I highly doubt this mostly ignored, surely uns...",What_I_Thought,politics,0,0,0,2016-09,1473426915,"The GOP has the reputation, in recent times, o..."
1,0,Holy shit they are dropping an Halloween surpr...,Quinnjester,politics,0,-1,-1,2016-11,1477961322,Donald Trump Used Legally Dubious Method to Av...
2,0,Chafetz is a known liar (see PP vids) why does...,TrumpsMonkeyPaw,politics,8,-1,-1,2016-10,1477928901,Some principles you've got there
3,0,Kansas Number 1 in imaginary Muslim terrorists...,Ginsengstrip_2002,politics,19,-1,-1,2016-10,1477864377,Kansas is probably the last state to have a te...
4,1,wow it is totally unreasonable to assume that ...,pb2crazy,politics,2,-1,-1,2016-11,1477968131,Clinton campaign accuses FBI of 'blatant doubl...


## SAVE AND LOAD FUNCTIONS

In [None]:
def load_ckp(ckpt_path,model,optimizer=None):
    """Restore a checkpoint written by ``save_ckp``.

    Args:
        ckpt_path: path of the checkpoint file.
        model: module whose ``load_state_dict`` receives ``checkpoint['state_dict']``.
        optimizer: optional optimizer; when given, its state is restored from
            ``checkpoint['optimizer']``.

    Returns:
        ``(model, optimizer, epoch, valid_loss_min)`` where ``valid_loss_min``
        is always a plain Python number.
    """
    # map_location lets a checkpoint saved on a GPU be read on a CPU-only
    # machine; load_state_dict then copies values onto the model's own device.
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    # The stored value may be a float (what validation() writes) or a 0-dim
    # tensor; only the latter has .item().
    if hasattr(valid_loss_min, 'item'):
        valid_loss_min = valid_loss_min.item()
    return model, optimizer, checkpoint['epoch'], valid_loss_min
def save_ckp(state, is_best, ckpt_path,best_model_path):
    """Write ``state`` to ``ckpt_path``; if ``is_best`` is truthy, also copy that file to ``best_model_path``."""
    torch.save(state, ckpt_path)
    if not is_best:
        return
    shutil.copyfile(ckpt_path, best_model_path)

## Download BERT Model and Tokenizer

In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
#bert= BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

In [None]:
#torch.save({'state_dict': bert.state_dict()}, "/content/drive/MyDrive/Colab Notebooks/proj_sem4/bert_state_dict/bert.pt")

In [None]:
# Read the cached initial weights onto the CPU so this cell also works on a
# runtime without a GPU; the model is moved to `device` later via .to(device).
checkpt = torch.load("/content/drive/MyDrive/Colab Notebooks/proj_sem4/bert_state_dict/bert.pt", map_location="cpu")
#use same initial weights for all models
bert = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True, state_dict = checkpt['state_dict'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenizer for the 'bert-base-uncased' vocabulary. It is read as a global by
# truncation() and passed explicitly to my_Dataset.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## CUSTOM DATA CLEANING

In [None]:
def clean_text(text):
    """Normalise one raw comment for tokenisation.

    Steps, in order: stringify, collapse whitespace, lower-case, strip URLs,
    drop tokens starting with '@', strip emoji, expand English contractions,
    and replace most punctuation with a space ('.', '!' and '?' are kept).

    Args:
        text: any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string. If the normalised input has at most one character,
        an empty list ``[]`` is returned instead (kept for backward
        compatibility; callers filter on ``len(result) <= 1``).
    """
    text = " ".join(str(text).split()).lower()
    if len(text) <= 1:
        return []

    # Raw string: the pattern contains backslash escapes that are meant for the
    # regex engine, not for Python's string-literal parser.
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    text = url_pattern.sub('', text)

    # Drop @-mentions. split() never yields empty tokens, so token[0] is safe.
    text = " ".join(token for token in text.split() if token[0] != '@')

    # NOTE(review): the \U000024C2-\U0001F251 range is much wider than emoji and
    # also removes other scripts; left unchanged here, confirm this is intended.
    emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji.sub(r'', text)

    # (pattern, replacement) pairs, applied in this order. Apostrophe-less
    # spellings are matched as whole words (\b...\b) so that neighbouring
    # spaces are preserved and prefixes of longer words ("image", "hesitate")
    # are left alone.
    contractions = (
        (r"i'm", "i am"),              (r"\bim\b", "i am"),
        (r"he's", "he is"),            (r"\bhes\b", "he is"),
        (r"she's", "she is"),          (r"\bshes\b", "she is"),
        (r"that's", "that is"),        (r"\bthats\b", "that is"),
        (r"what's", "what is"),        (r"\bwhats\b", "what is"),
        (r"where's", "where is"),      (r"\bwheres\b", "where is"),
        (r"ain't", "is not"),          (r"\baint\b", "is not"),
        (r"won't", "will not"),        (r"\bwont\b", "will not"),
        (r"wasn't", "was not"),        (r"\bwasnt\b", "was not"),
        (r"hasn't", "has not"),        (r"\bhasnt\b", "has not"),
        (r"don't", "do not"),          (r"\bdont\b", "do not"),
        (r"didn't", "did not"),        (r"\bdidnt\b", "did not"),
        (r"can't", "can not"),         (r"\bcant\b", "can not"),
        (r"it's", "it is"),            (r"\bits\b", "it is"),
        (r"couldn't", "could not"),    (r"\bcouldnt\b", "could not"),
        (r"haven't", "have not"),      (r"\bhavent\b", "have not"),
        (r"\btheyre\b", "they are"),
        # Generic suffixes last, so the specific forms above win.
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"n't", " not"),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Based on EDA, the symbols . ! ? are preserved. Everything else is replaced
    # with a space rather than removed, so two words separated only by ',' or
    # '-' stay separate words instead of being concatenated.
    text = re.sub(r"[,\"'@#$%^&*(){}/;`~:<>+=-]", " ", text)
    return text

In [None]:
def clean_data(df):
    """Clean the comment / parent-comment pair of every row and keep usable rows.

    Columns are read by position from ``df.values``: 0 is the label, 1 the
    comment, 9 the parent comment. A row is kept only when both cleaned texts
    are longer than one character.

    Returns:
        dict with the keys 'comment', 'parent_comment' and 'label', each a list
        with one entry per kept row.
    """
    comments, parents, labels = [], [], []
    for record in df.values:
        cleaned_comment = clean_text(record[1])
        cleaned_parent = clean_text(record[9])
        if len(cleaned_comment) > 1 and len(cleaned_parent) > 1:
            comments.append(cleaned_comment)
            parents.append(cleaned_parent)
            labels.append(record[0])
    return {'comment': comments, 'parent_comment': parents, 'label': labels}

In [None]:
"""
train_df['comment']= train_df['comment'].apply(lambda x: clean_text(x))
train_df['comment']= train_df['comment'].apply(lambda x: clean_text(x))
train_df['parent_comment'] = train_df['parent_comment'].apply(lambda x: clean_text(x))
test_df['parent_comment'] = test_df['parent_comment'].apply(lambda x: clean_text(x))
""";

In [None]:
# Rows with a missing comment or parent comment are removed *before* cleaning:
# clean_text() stringifies its input, so a NaN would otherwise survive as the
# literal text "nan" and be kept as a training example.
train_df_set = clean_data(train_df.dropna(subset=['comment', 'parent_comment']))
train_df2 = pd.DataFrame(train_df_set)

test_df_set = clean_data(test_df.dropna(subset=['comment', 'parent_comment']))
test_df2 = pd.DataFrame(test_df_set)

In [None]:
# Drop any remaining missing text from the *cleaned* frames themselves.
# Assigning columns from the raw frames would discard the cleaning and, because
# assignment aligns on the index and the cleaned frames have fewer rows, would
# pair comments with the labels of different rows.
train_df2 = train_df2.dropna(subset=['comment', 'parent_comment']).reset_index(drop=True)
test_df2 = test_df2.dropna(subset=['comment', 'parent_comment']).reset_index(drop=True)

In [None]:
train_df2.shape , train_df.shape

((47959, 3), (47970, 10))

## CUSTOM TRUNCATION FUNCTION

In [None]:
#Truncation: longest first, middle part, last

def truncation(text,
               maxLen,
               main_comment_Length=0,
               is_context=False,
               main_comment_has_special_tokens=False ,
               has_special_tokens=False,
               truncation_strategy='longest_first',
               is_tokenized=False):
    """Cut a token sequence down to the token budget that is left for it.

    The budget is ``maxLen - main_comment_Length``, minus 2 more when
    ``main_comment_has_special_tokens`` is set.
    NOTE(review): the constant 2 presumably stands for [CLS] and [SEP]; a
    sentence pair normally carries three special tokens - confirm.

    Args:
        text: raw string (tokenised with the global ``tokenizer``) or, when
            ``is_tokenized`` is True, a list of tokens.
        maxLen: total token budget; must be a positive int.
        main_comment_Length: tokens already used by the main comment; must be
            > 0 when ``is_context`` is True.
        is_context: True when ``text`` is the parent comment.
        main_comment_has_special_tokens: whether the main comment already
            includes special tokens.
        has_special_tokens: whether ``text`` includes special tokens; must be
            False for context and equal to the previous flag otherwise.
        truncation_strategy: 'longest_first' (keep the head), 'middle_part'
            (keep the centre) or 'last' (keep the tail).
        is_tokenized: True when ``text`` is already a token list.

    Returns:
        The token list, unchanged when it already fits the budget.

    Raises:
        AssertionError: on inconsistent arguments, or when the sequence does
            not fit and the remaining budget is not positive.
        KeyError: when truncation is needed and the strategy is unknown.
    """
    assert type(maxLen)==int and maxLen>0 , "Max length can't be <=0 if you want to truncate"
    assert not (is_context and has_special_tokens) , "Context shouldn't have special tokens while using this function"
    assert is_context or main_comment_has_special_tokens==has_special_tokens, "If text is main comment, then it can't be tokenized and not tokenized at the same time"
    # `and`, not `or`: the previous `text!=None or text!=[]` was true for every input.
    assert text is not None and text!=[] , "Text can't be None or empty list, i.e, []"
    assert not (is_context and main_comment_Length <=0) , "Main comment length cannot be 0"
    assert not is_tokenized or type(text)==list , "Tokenized input has to be a list"

    textList = text if is_tokenized else tokenizer.tokenize(text) #global tokenizer
    length = len(textList)

    budget = maxLen - main_comment_Length
    if main_comment_has_special_tokens:
        budget -= 2
    assert type(budget)==int

    if length<=budget:
        return textList

    # A budget of 0 would make textList[-0:] return the *whole* list, and a
    # negative one would slice from the wrong end, so fail loudly instead.
    assert budget>0, f"no token budget left for text\nlength: {length}\nmaxLen:{maxLen}\nmain_comment_length:{main_comment_Length}"

    start = (length - budget) // 2  # left edge of the centred window
    trunc = {
        'longest_first': textList[:budget],
        'middle_part': textList[start:start + budget],
        'last': textList[-budget:],
    }
    selected = trunc[truncation_strategy]

    assert len (selected)>0, f"length cannot be 0, for \ntext: {text}\ntextList: {textList}\nlength: {length}\nmaxLen:{budget}\nmain_comment_length:{main_comment_Length}"
    assert len (selected)<=512, f"length cannot be > 512, for \ntext: {text}\ntextList: {textList}\nlength: {length}\nmaxLen:{budget}\nmain_comment_length:{main_comment_Length}"

    return selected

## Split into Training and validation sets

In [None]:
# Work on a 20% subsample of the cleaned training frame; random_state is fixed so
# the same rows are drawn on every run, and the index is reset to 0..n-1.
training_df = train_df2.sample(frac=0.2, random_state=200).reset_index(drop=True)

In [None]:
# 80/20 split: draw the training rows, use everything not drawn for validation,
# then give both frames a fresh 0..n-1 index.
train_part = training_df.sample(frac=0.8, random_state= 200)
val_examples = training_df.drop(index=train_part.index).reset_index(drop=True)
training_examples = train_part.reset_index(drop=True)
"""
trainSet, valSet = model_selection.train_test_split(train_df,train_size= 0.8, test_size=0.2, stratify=train_df.label.values)
trainSet = trainSet.reset_index(drop=True )
valSet = valSet.reset_index(drop=True )
training_examples =  train_df.sample(frac=0.001, random_state= 200). reset_index(drop=True)
train_df= df_train.sample(frac=0.8,random_state=200).reset_index(drop=True)
val_df= df_train.drop(train_df.index).reset_index(drop=True)
print ("training example shape: ", training_examples.shape)
""";

In [None]:
training_examples.shape , val_examples.shape

((7674, 3), (1918, 3))

## **DATASET Class**

In [None]:
class my_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that encodes (comment, parent_comment) pairs for BERT.

    Args:
        df: frame with the columns 'comment', 'parent_comment' and 'label'.
        tokenizer: object exposing ``encode_plus`` (Hugging Face tokenizer API).
        max_len: length every encoded pair is padded / truncated to.
    """
    def __init__(self,df, tokenizer, max_len):
        self.df = df
        self.tokenizer= tokenizer
        self.max_len = max_len
        self.text= self.df['comment']
        self.context= self.df['parent_comment']
        self.targets = self.df['label'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self,index):
        """Return the encoded pair at *position* ``index`` as a dict of 1-D tensors."""
        # .iloc: samplers hand out positions 0..len-1, which only coincide with
        # index labels when the frame was reset beforehand.
        text = " ".join(str(self.text.iloc[index]).split())
        # Cleaning is done in earlier stages; only warn about degenerate rows here.
        if len(text) <=1 :
          print("text is empty or has length =1 :",text)

        context = " ".join(str(self.context.iloc[index]).split())

        # TODO: switch to the custom truncation() (strategy 'middle_part') once
        # its issues are sorted out; until then the tokenizer truncates and
        # is_split_into_words stays False.
        inputs = self.tokenizer.encode_plus(text ,
                                      context,
                               max_length=self.max_len,
                               add_special_tokens=True,
                               is_split_into_words=False,
                               truncation= 'longest_first',
                               padding='max_length',
                               return_tensors='pt',
                               return_attention_mask=True,
                               return_token_type_ids=True )
        # With return_tensors='pt' the leading dimension is the batch (size 1);
        # the sequence length is the last dimension.
        seq_len = inputs['input_ids'].shape[-1]
        assert seq_len<=self.max_len , f"input length can't be >{self.max_len}\n index: {index}\n text: {text}\n context:{context}"
        target=self.targets[index]

        return {
            'input_ids':inputs['input_ids'].flatten(),
            'attention_mask':inputs['attention_mask'].flatten(),
            'token_type_ids':inputs['token_type_ids'].flatten(),
            'targets':torch.tensor([float(target)], dtype=torch.float)
        }

#### **Initialize my_Dataset object**

In [None]:
# Wrap both splits; every pair is padded / truncated to MAXLEN tokens by my_Dataset.
train_set= my_Dataset(training_examples, tokenizer, MAXLEN)
val_set= my_Dataset(val_examples, tokenizer, MAXLEN)

## Data Loader objects

In [None]:
# Training batches are reshuffled every epoch.
trainLoader = torch.utils.data.DataLoader(
    train_set,
    shuffle=True,
    batch_size = BATCH_SIZE,
    num_workers= 0,
    )
# Validation keeps dataset order: shuffling does not change the averaged loss,
# but it makes the predictions stored per batch impossible to line up with rows.
valLoader = torch.utils.data.DataLoader(
    val_set,
    shuffle=False,
    batch_size = BATCH_SIZE,
    num_workers= 0,
    )

## Loss function ,optimizer and accuracy functions

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits (sigmoid is applied inside the loss)."""
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

def optimizer_fn(model , learn_rate):
  """Build an Adam optimizer over all parameters of ``model`` with step size ``learn_rate``."""
  trainable = model.parameters()
  return torch.optim.Adam(trainable, lr=learn_rate)

def accuracy(outputs, targets):
  """Percentage of predictions that match the targets.

  Args:
      outputs: tensor of raw logits; a prediction is positive when
          sigmoid(logit) > 0.5.
      targets: tensor of the same number of elements; a target is positive
          when it equals 1.

  Returns:
      Accuracy in percent (0..100); 0.0 for an empty batch.
  """
  probs = torch.sigmoid(outputs).detach().cpu().flatten().numpy()
  gold = targets.detach().cpu().flatten().numpy()
  if len(probs) == 0:
    # Avoid 0/0 on an empty batch.
    return 0.0
  # No per-batch printing here: this runs inside the training and validation
  # loops, which already log the returned value.
  return np.sum((probs > 0.5) == (gold == 1.)) / len(probs) * 100

## Training and Testing functions

#### Dictionary

In [None]:
# Per-model, per-split log of raw predictions, targets, batch losses and batch
# accuracies. Every leaf list is a distinct object.
prediction_dict = {
    model_key: {
        split_key: {'predictions': [], 'targets': [], 'loss': [], 'accuracy': []}
        for split_key in ('train', 'val')
    }
    for model_key in ('Vanilla_BUPO', 'Vanilla_SOL')
}

### Training fn

In [None]:
def train(trainLoader,
          model,
          model_name,
          optimizer,
          epochs,
          save=False,
          validate=False,
          valLoader=None,
          curr_ckpt=None,
          best_ckpt=None,
          ):
  """Train ``model`` for ``epochs`` passes over ``trainLoader``.

  Args:
      trainLoader: iterable of batches with the keys 'input_ids',
          'attention_mask', 'token_type_ids' and 'targets'.
      model: module called as model(input_ids=..., attention_mask=..., token_type_ids=...).
      model_name: 'Vanilla_BUPO' or 'Vanilla_SOL' (key into prediction_dict).
      optimizer: optimizer over the model's parameters.
      epochs: number of passes.
      save: forwarded to validation(); needs curr_ckpt and best_ckpt.
      validate: run validation() after every epoch; needs valLoader.
      valLoader: validation batches.
      curr_ckpt: path for the latest checkpoint.
      best_ckpt: path for the best checkpoint.

  Returns:
      (train_loss_track, val_loss_track, prediction_dict): the running mean of
      the training loss after every batch, the values collected from
      validation(), and the global prediction log.
  """
  assert model_name=='Vanilla_BUPO' or model_name=='Vanilla_SOL', "model_name doesn't exist"

  assert not validate or valLoader is not None, "valLoader can't be None if you want to perform the validation step"

  assert not save or (curr_ckpt is not None and best_ckpt is not None), "To save the model, mention paths curr_ckpt and best_ckpt"

  train_loss_track = []
  val_loss_track = []
  val_loss_min = np.inf  # np.Inf was removed in NumPy 2.0

  for epoch in range(epochs):
    train_loss = 0
    model.train()
    for index, batch in tqdm (enumerate (trainLoader), total= len(trainLoader)):

      targets = batch['targets'].to(device, dtype=torch.float)
      input_ids = batch['input_ids'].to(device, dtype=torch.long)
      token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
      attention_mask = batch['attention_mask'].to(device, dtype=torch.long)

      optimizer.zero_grad()

      # Keyword arguments: the models' forward() takes
      # (input_ids, attention_mask, token_type_ids); passing the tensors
      # positionally in another order silently swaps mask and segment ids.
      outputs = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids)
      loss = loss_fn(outputs, targets)

      loss.backward()

      optimizer.step()

      train_batch_accuracy = accuracy(outputs, targets)

      # Incremental mean of the batch losses seen so far in this epoch.
      train_loss = train_loss + ((1/(index+1))*(loss.item()-train_loss))

      train_loss_track.append(train_loss)

      print("\nAvg train_loss :{:.8f} | accuracy :{:.8f} | loss :{:.8f}".format(train_loss,train_batch_accuracy,loss.item()) )

      # Per-batch training predictions are intentionally not stored in
      # prediction_dict[model_name]['train'].

    if validate:
      vlt , val_loss_min = validation(model, model_name, valLoader,  val_loss_min, optimizer, epoch, curr_ckpt, best_ckpt, save)
      val_loss_track.extend(vlt)

  return train_loss_track, val_loss_track, prediction_dict

### Validation fn

In [None]:

def validation(model, model_name, valLoader, val_loss_min ,optimizer, epoch, ckpt_path=None, best_model_path=None, save=False):
    """Evaluate ``model`` on all of ``valLoader`` and optionally checkpoint it.

    Args:
        model: module called as model(input_ids=..., attention_mask=..., token_type_ids=...).
        model_name: key into prediction_dict.
        valLoader: iterable of validation batches.
        val_loss_min: best (lowest) mean validation loss seen so far.
        optimizer: optimizer whose state is written into the checkpoint.
        epoch: current epoch number (stored in the checkpoint, used in logs).
        ckpt_path: where the latest checkpoint is written when ``save`` is True.
        best_model_path: where the best checkpoint is copied when ``save`` is True.
        save: write checkpoints only when True.

    Returns:
        (val_loss_track, val_loss_min): the running mean loss after every
        batch, and the possibly updated minimum.
    """
    assert not save or (ckpt_path is not None and best_model_path is not None), "To save the model, mention paths curr_ckpt and best_ckpt"

    val_loss_track = []
    val_loss = 0

    #VALIDATION
    model.eval()
    with torch.no_grad():
        for index, batch in tqdm( enumerate(valLoader), total= len(valLoader)):

            targets = batch['targets'].to(device, dtype=torch.float)
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)

            # Keyword arguments so mask and segment ids cannot be swapped.
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

            loss = loss_fn(outputs, targets)

            val_batch_acc = accuracy(outputs,targets)
            # Incremental mean of the batch losses seen so far.
            val_loss = (val_loss + (1/(index+1))*(loss.item()-val_loss))

            val_loss_track.append(val_loss)

            print("\nAvg val_loss :{:.8f} | val_accuracy :{:.8f} | val_loss :{:.8f}".format(val_loss,val_batch_acc,loss.item()) )

            prediction_dict[model_name]['val']['predictions'].extend(outputs.cpu().detach().numpy().tolist())
            prediction_dict[model_name]['val']['targets'].extend(targets.cpu().detach().numpy().tolist())
            prediction_dict[model_name]['val']['loss'].append(loss.item())
            prediction_dict[model_name]['val']['accuracy'].append(val_batch_acc)

    # Everything below runs once per epoch, after the whole loader has been
    # consumed (previously it ran, and returned, after the first batch).
    if not val_loss_track:
        # Empty loader: nothing was measured, so neither save nor update the minimum.
        print("epoch {} end".format(epoch))
        return val_loss_track , val_loss_min

    is_best = val_loss < val_loss_min
    if is_best:
        print("previous Val_loss_min={:.4f}; new val_loss_min={:.4f}".format(val_loss_min, val_loss))

    if save:
        checkpoint ={
          'epoch': epoch,
          'valid_loss_min': min(val_loss, val_loss_min),
          'state_dict':model.state_dict(),
          'optimizer':optimizer.state_dict()
        }
        # One call writes the latest checkpoint and, when is_best, copies it.
        save_ckp(checkpoint, is_best, ckpt_path, best_model_path)
        if is_best:
            print("SAVED")

    if is_best:
        val_loss_min = val_loss
    print("epoch {} end".format(epoch))
    return val_loss_track , val_loss_min

## Models

### Vanilla_BERT Model using pooler output

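> BERT encoder followed by dropout and a single linear layer applied to the encoder's `pooler_output`; the model returns one raw logit per example.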



In [None]:
class Vanilla_Bert_Using_Pooler_Output(nn.Module):
  """Binary classifier head on top of the encoder's ``pooler_output``.

  ``model`` is the encoder instance to wrap (it is passed in rather than built
  here so that all models can start from the same initial BERT weights).
  ``forward`` returns raw logits of shape (batch, 1); no sigmoid is applied.
  """
  def __init__(self,model):
    super().__init__()
    self.bert = model
    self.dropout = nn.Dropout(0.3)
    self.linear_layer = nn.Linear(768,1)

  def forward(self, input_ids,attention_mask,token_type_ids):
    encoded = self.bert(input_ids, attention_mask, token_type_ids)
    pooled = self.dropout(encoded.pooler_output)
    return self.linear_layer(pooled)

In [None]:
# Wrap the shared `bert` encoder in the pooler-output classifier and move it to `device`.
# The trailing ';' suppresses the module repr in the cell output.
Vanilla_BUPO = Vanilla_Bert_Using_Pooler_Output(bert)
Vanilla_BUPO.to(device);

In [None]:
# Train the pooler-output model with per-epoch validation and checkpointing.
# NOTE(review): VANILLA_BUPO_CKPT and VANILLA_BUPO_BEST are not defined in any
# cell visible in this notebook; they must exist (as checkpoint file paths)
# before this cell can run on a fresh kernel - confirm where they are set.
optimizer_BUPO= optimizer_fn(Vanilla_BUPO, LR)
torch.cuda.empty_cache()
train_loss_track, val_loss_track, prediction_dict = train(model=Vanilla_BUPO,
      model_name= "Vanilla_BUPO",
      trainLoader= trainLoader,
      valLoader= valLoader,
      optimizer=optimizer_BUPO,
      epochs=EPOCHS,
      save= True,
      validate= True,
      curr_ckpt= VANILLA_BUPO_CKPT,
      best_ckpt=VANILLA_BUPO_BEST );

### Vanilla_BERT Model using sum of last 4 layers

In [None]:
#bert models using sum of last 4 layers
class Vanilla_Bert_Using_Sum_of_Layers(nn.Module):
  """Binary classifier on the sum of the encoder's last four hidden layers.

  Pipeline: sum last 4 hidden states -> dropout -> Linear(768, 1) per token ->
  sigmoid -> Linear(seq_len, 1) over the token axis.

  Args:
      model: encoder that returns hidden states (attribute ``hidden_states``,
          or element 2 of a tuple-like output).
      seq_len: padded sequence length the inputs are encoded to (default 512).

  ``forward`` returns raw logits of shape (batch, 1). No final sigmoid is
  applied because the notebook's loss is BCEWithLogitsLoss, which applies it
  internally. ``activation2`` and ``dropout2`` are kept as attributes for
  compatibility but are not used in ``forward``.
  """
  def __init__(self, model, seq_len=512):
    super().__init__()
    self.bert=model
    self.linear_layer1 = nn.Linear(768,1)
    self.linear_layer2 = nn.Linear(seq_len,1)
    self.activation1 = nn.Sigmoid()
    self.activation2 = nn.Sigmoid()
    self.dropout1 = nn.Dropout(0.3)
    self.dropout2 = nn.Dropout(0.3)

  def forward(self, input_ids,attention_mask,token_type_ids):

    bert_output = self.bert(input_ids, attention_mask, token_type_ids)

    # Prefer the named field; fall back to position 2 for tuple-style outputs.
    hidden_states = getattr(bert_output, "hidden_states", None)
    if hidden_states is None:
      hidden_states = bert_output[2]

    ##get sum of last 4 hidden layers
    sum_of_last_4_layers = torch.stack(tuple(hidden_states[-4:])).sum(0)

    dropout_output = self.dropout1(sum_of_last_4_layers)

    # One score per token; squeeze(-1) drops the size-1 feature axis and works
    # for any batch size (the old view(1, 512) only worked for batch size 1).
    linear1_output = self.linear_layer1(dropout_output).squeeze(-1)
    activation1_output = self.activation1(linear1_output)

    return self.linear_layer2(activation1_output)

In [None]:
# Build a fresh encoder from the cached initial weights instead of reusing `bert`:
# that instance is shared with Vanilla_BUPO and has been updated by its training,
# so reusing it would not start this model from the same initial weights.
vanilla_SOL = Vanilla_Bert_Using_Sum_of_Layers(
    BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True, state_dict=checkpt['state_dict'])
)