In [1]:
try:
    import transformers
except ImportError as e:
    print('transformers not installed')
    print('Installing now...')
    !pip install -q git+https://github.com/huggingface/transformers.git
    print("Install complete.")
    pass  

In [2]:
import torch
import io 
import os
from torch.utils.data import Dataset,DataLoader,TensorDataset
from sklearn.metrics import classification_report,accuracy_score
import transformers
import json
from tqdm.notebook import tqdm
from transformers.utils.dummy_pt_objects import AutoModelForSequenceClassification
from transformers import AutoModelForTokenClassification,AutoConfig,AutoModel,AutoTokenizer,BertModel,BertConfig,AdamW, get_constant_schedule,BertForSequenceClassification,get_linear_schedule_with_warmup
import random
import numpy as np
import torch.nn as nn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

#Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# if using Google Colab, set colab = True
colab = False

if colab == True:
    #Mounting Drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    
    %cd '/content/gdrive/Shareddrives/523 Project/Data'
    %ls
else:
    DATA_DIR = '/projectnb2/dl523/students/kjv/EC523_Project/Data/'

In [4]:
#Reading in the data
df = pd.read_json(DATA_DIR + "News Headlines/Sarcasm_Headlines_Dataset_v2.json",lines = True)
df = df.rename(columns={'is_sarcastic': 'label'})
df = df.drop('article_link', 1)
df.head()

#splits for training test validation

train_headlines, temporary_text, train_label, temporary_label = train_test_split(df['headline'], df['label'], 
                                                                    random_state=200, 
                                                                    test_size=0.2, 
                                                                    stratify=df['label'])







validation_headlines, test_headlines, validation_label, test_label = train_test_split(temporary_text, temporary_label, 
                                                                    random_state=200, 
                                                                    test_size=0.5, 
                                                                    stratify=temporary_label)

In [5]:
# set max length for padding/clipping during tokenization
max_length = 35

# create tokenized training, validation, and test splits
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

training_tokens = tokenizer.__call__(train_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
validation_tokens = tokenizer.__call__(validation_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
test_tokens = tokenizer.__call__(test_headlines.tolist(),max_length = max_length,padding= True,truncation = True)

# Stacking the inputs as tensors for use in the BERT model

training_set = TensorDataset(torch.tensor(training_tokens['input_ids']),torch.tensor(training_tokens['attention_mask']),torch.tensor(train_label.tolist()))
validation_set = TensorDataset(torch.tensor(validation_tokens['input_ids']),torch.tensor(validation_tokens['attention_mask']),torch.tensor(validation_label.tolist()))
test_set = TensorDataset(torch.tensor(test_tokens['input_ids']),torch.tensor(test_tokens['attention_mask']),torch.tensor(test_label.tolist()))

In [15]:
# initialize pre-trained BERT large

bertconfig = BertConfig()
bert_large = BertModel.from_pretrained("bert-large-uncased")

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# freeze pre-trained layers in BERT

for param in bert_large.parameters():
    param.requires_grad = False

In [17]:
class multihead_attn_bert(nn.Module):

    def __init__(self, bert_encoder, embed_dim, num_attn_layers, num_heads):
        super(multihead_attn_bert, self).__init__()
        
        self.bert = bert_encoder
        
        self.multiheads = nn.ModuleList([nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)])
        self.multiheads.extend([nn.MultiheadAttention(embed_dim, num_heads, batch_first=True) for i in range(num_attn_layers-1)])
        
        self.GRU = nn.GRU(input_size=embed_dim, hidden_size=512, bidirectional=True, batch_first=True)
        
        # 1024 = hidden_size of GRU x 2 (for bidirectionality of GRU)
        self.fc = nn.Linear(in_features=1024, out_features=1)
        
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokenized_input_values, attention_mask):
        
        output = self.bert(tokenized_input_values, attention_mask=attention_mask).last_hidden_state
        
        for multihead_layer in self.multiheads:
            output,_ = multihead_layer(query=output,key=output,value=output,key_padding_mask=(~attention_mask.bool()))
        
        _,hidden = self.GRU(output)
        
        # concatenate bidirectional outputs from GRU to pass to linear layer
        hidden = torch.cat([hidden[0,:, :], hidden[1,:,:]], dim=1).unsqueeze(0)
        
        output = self.fc(hidden)
        
        output = self.sigmoid(output)
        
        return output

In [18]:
# initialize multihead attention sarcasm model with BERT large embedder

# change 1024 to 768 if using bert_base
mh_sarcasm_model = multihead_attn_bert(bert_large, 1024, 3, 8)
mh_sarcasm_model.to(device)

multihead_attn_bert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elem

In [19]:
#Dataloaders for the sets

batch_size = 64

trainloader = DataLoader(training_set, batch_size = batch_size,num_workers=2,shuffle = True)
validationloader = DataLoader(validation_set, batch_size = batch_size,num_workers=2,shuffle = True)
testloader = DataLoader(test_set, batch_size = batch_size,num_workers=2,shuffle = True)

#Loss function (used in reference multihead attention paper)
loss_function = nn.BCELoss()

In [22]:
#Training mh_sarcasm bert
Epochs = 4

# optimizer and scheduler using learning rate from reference paper
optimizer = torch.optim.Adam(mh_sarcasm_model.parameters(),lr = 1e-4,eps = 1e-8)
# scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = len(trainloader)*Epochs)

mh_sarcasm_model.train()

for epoch in range(1, Epochs+1):
    
    running_loss = 0
    print('Epoch: ',epoch)
    
    for idx, (inputs,attention_mask,label) in enumerate(tqdm(trainloader,total = len(trainloader))):

        inputs, attention_mask, label = inputs.to(device),attention_mask.to(device),label.to(device)

        optimizer.zero_grad()

        output = mh_sarcasm_model(inputs, attention_mask)
        output = torch.flatten(output)

        # convert label type from int to float for use in BCELoss
        label = label.float()
        loss = loss_function(output,label)

        loss.backward()
        optimizer.step()
#       scheduler.step()
        running_loss += loss.item()
    
        # print loss every 100 batches
        if idx % 100 == 0:
            print('Loss: ',float(loss))
            running_loss = 0.0

Epoch:  1


  0%|          | 0/358 [00:00<?, ?it/s]

Loss:  0.1605427861213684
Loss:  0.30182021856307983
Loss:  0.26590657234191895
Loss:  0.2875272035598755
Epoch:  2


  0%|          | 0/358 [00:00<?, ?it/s]

Loss:  0.36311209201812744
Loss:  0.1934570074081421
Loss:  0.15978430211544037
Loss:  0.33119630813598633
Epoch:  3


  0%|          | 0/358 [00:00<?, ?it/s]

Loss:  0.11623425036668777
Loss:  0.23238784074783325
Loss:  0.27096614241600037
Loss:  0.31331688165664673
Epoch:  4


  0%|          | 0/358 [00:00<?, ?it/s]

Loss:  0.1627371907234192
Loss:  0.2804999351501465
Loss:  0.21763089299201965
Loss:  0.17203137278556824
