# Hyper-Parameter testing, training, and final model testing

In [None]:
try:
    import transformers
except ImportError as e:
    print('transformers not installed')
    print('Installing now...')
    !pip install -q git+https://github.com/huggingface/transformers.git
    pass  

try:
    import optuna
except ImportError as e:
    print('optuna not installed')
    print('Installing now...')
    !pip install optuna
    pass  


In [None]:
import torch
import io 
import os
from torch.utils.data import Dataset,DataLoader,TensorDataset
from sklearn.metrics import classification_report,accuracy_score
import transformers
import json
from tqdm.notebook import tqdm
from transformers.utils.dummy_pt_objects import AutoModelForSequenceClassification
from transformers import AutoModelForTokenClassification,AutoConfig, AutoModel,AutoTokenizer,BertModel,BertConfig,AdamW, get_constant_schedule,BertForSequenceClassification,get_linear_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup
import random
import numpy as np
import torch.nn as nn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from optuna.visualization.matplotlib import plot_optimization_history
from optuna.visualization.matplotlib import plot_param_importances

#Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
colab = False
if colab == True:
    #Mounting Drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    
    %cd '/content/gdrive/Shareddrives/523 Project/Data/News Headlines'
    %ls
else:
    MODEL_PATH = '/projectnb/dl523/projects/Sarcasm/Sarcasm_Models/sarcasm_bert_headline.pth'
    DATA_DIR = '/projectnb2/dl523/projects/Sarcasm'
    os.chdir(DATA_DIR)

In [None]:
#Getting the BERT outputs for use with other downstream model implementations 
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
#Reading in the data
# Read the headlines dataset (JSON-lines file: one record per line).
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
df = df.rename(columns={'is_sarcastic': 'label'})
# Drop by keyword: passing the axis positionally (`df.drop(name, 1)`) is
# rejected by pandas 2.x, where every argument after `labels` is keyword-only.
df = df.drop(columns='article_link')
df.head()

# Stratified 80/10/10 split into training / validation / test sets.
# First hold out 20% of the data ...
train_headlines, temporary_text, train_label, temporary_label = train_test_split(
    df['headline'], df['label'],
    random_state=200,
    test_size=0.2,
    stratify=df['label'])

# ... then cut the held-out part in half: validation and test.
validation_headlines, test_headlines, validation_label, test_label = train_test_split(
    temporary_text, temporary_label,
    random_state=200,
    test_size=0.5,
    stratify=temporary_label)


# Set max length for the padding/clipping.
# `count` holds the number of whitespace-separated words per headline.
count = df['headline'].str.split().str.len()
count.describe()

max_length = 35


In [None]:
# Class for modifying BERT 
class bert_for_sarcasm(nn.Module):
    """Encoder followed by a small feed-forward sarcasm classification head.

    The pooled output of ``input_model`` goes through two hidden linear
    layers (each followed by dropout and then ReLU) and one output layer:

    * ``mod_type == 'binary'``: a single unit with a sigmoid, i.e. one
      probability per example (meant to be paired with ``nn.BCELoss``).
    * any other ``mod_type``: two units with a log-softmax over dim 1, i.e.
      log-probabilities per class (meant to be paired with ``nn.NLLLoss``).

    Both output layers are always created so the module's parameter layout
    does not depend on ``mod_type``.

    Args:
        input_model: encoder called as ``input_model(ids, attention_mask=...)``.
            It must return either a mapping with a ``'pooler_output'`` entry
            or a tuple whose second element is the pooled output.
        linear1: width of the first hidden layer.
        linear2: width of the second hidden layer.
        drop: dropout probability used after both hidden layers.
        mod_type: ``'binary'`` for the sigmoid head, anything else for the
            two-class log-softmax head.
        bert_model: checkpoint name; only used to guess the encoder width
            when ``input_model`` does not expose ``config.hidden_size``.
    """

    def __init__(self, input_model, linear1=256, linear2=128, drop=.25, mod_type='binary', bert_model='bert-base-uncased'):
        super().__init__()
        self.mod_type = mod_type
        self.bert_model = bert_model

        # Prefer the width reported by the encoder itself; fall back to the
        # original name-based guess (768 for bert-base-uncased, else 1024).
        hidden_size = getattr(getattr(input_model, 'config', None), 'hidden_size', None)
        if hidden_size is None:
            hidden_size = 768 if self.bert_model == 'bert-base-uncased' else 1024
        self.model_size = hidden_size

        self.input_model = input_model

        # Classification head.
        self.linear = nn.Linear(self.model_size, linear1)
        self.linear2 = nn.Linear(linear1, linear2)
        self.linear3 = nn.Linear(linear2, 2)          # two-class head
        self.linear3_binary = nn.Linear(linear2, 1)   # binary head

        self.relu = nn.ReLU()
        self.log = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(drop)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_values, attention_mask):
        """Return class scores for a batch of token ids.

        Args:
            input_values: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.

        Returns:
            Sigmoid probabilities (binary head) or log-probabilities over two
            classes (multi head).
        """
        encoded = self.input_model(input_values, attention_mask=attention_mask)

        # Select the pooled output explicitly. Unpacking `.values()` into two
        # names fails as soon as the encoder returns extra entries (hidden
        # states, attentions) or a plain tuple.
        if isinstance(encoded, tuple):
            pooled = encoded[1]
        else:
            pooled = encoded['pooler_output']

        hidden = self.relu(self.dropout(self.linear(pooled)))
        hidden = self.relu(self.dropout(self.linear2(hidden)))

        if self.mod_type == 'binary':
            return self.sigmoid(self.linear3_binary(hidden))
        return self.log(self.linear3(hidden))
    


In [None]:
# Finding best hyper-parameters
def train_bert(model, params, trial, tokenizer):
    """Train ``model`` for one Optuna trial and return its validation accuracy.

    Uses the module-level headline splits (``train_headlines`` /
    ``train_label`` and ``validation_headlines`` / ``validation_label``) and
    the module-level ``device``. Validation accuracy is reported to ``trial``
    after every epoch so the study's pruner can stop unpromising trials.

    Args:
        model: model called as ``model(input_ids, attention_mask)``.
        params: hyper-parameter dict; reads ``'batch_size'``, ``'lr'``,
            ``'schedule'`` (``'linear'``, ``'cosine'`` or anything else for a
            constant learning rate) and ``'mod_type'`` (``'binary'`` or other).
        trial: Optuna trial used for reporting and pruning.
        tokenizer: tokenizer providing ``batch_encode_plus``.

    Returns:
        Validation accuracy of the last completed epoch.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner stops the trial.
    """
    model.to(device)
    max_length = 35
    Epochs = 60

    # Tokenize the training and validation splits. The test split is not
    # touched during hyper-parameter search.
    training_tokens = tokenizer.batch_encode_plus(train_headlines.tolist(), max_length=max_length, padding=True, truncation=True)
    validation_tokens = tokenizer.batch_encode_plus(validation_headlines.tolist(), max_length=max_length, padding=True, truncation=True)

    # Stack the inputs as tensors for use in the BERT model.
    training_set = TensorDataset(torch.tensor(training_tokens['input_ids']), torch.tensor(training_tokens['attention_mask']), torch.tensor(train_label.tolist()))
    validation_set = TensorDataset(torch.tensor(validation_tokens['input_ids']), torch.tensor(validation_tokens['attention_mask']), torch.tensor(validation_label.tolist()))

    trainloader = DataLoader(training_set, batch_size=params['batch_size'], num_workers=2, shuffle=True)
    validationloader = DataLoader(validation_set, batch_size=params['batch_size'], num_workers=2, shuffle=True)

    # Optimizer and (optional) learning-rate scheduler.
    optimizer = AdamW(model.parameters(), lr=params["lr"], eps=1e-6)
    total_steps = len(trainloader) * Epochs
    if params['schedule'] == "linear":
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=total_steps)
    elif params['schedule'] == 'cosine':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=total_steps)
    else:
        scheduler = None

    # Loss function matching the model head.
    BINARY = params['mod_type'] == 'binary'
    loss_function = nn.BCELoss() if BINARY else nn.NLLLoss()

    for epoch in range(1, Epochs + 1):
        model.train()
        for inputs, attention_mask, label in trainloader:
            inputs, attention_mask, label = inputs.to(device), attention_mask.to(device), label.to(device)

            optimizer.zero_grad()

            output = model(inputs, attention_mask)
            if BINARY:
                loss = loss_function(torch.flatten(output), label.float())
            else:
                loss = loss_function(output, label)

            loss.backward()
            optimizer.step()
            # The schedule is sized in optimizer steps (batches * epochs), so
            # it must advance once per batch. Stepping once per epoch kept
            # the learning rate at 0 for the whole first epoch (warmup starts
            # at 0) and never progressed through the schedule.
            if scheduler is not None:
                scheduler.step()

        total_acc_val = 0
        model.eval()
        with torch.no_grad():
            for inputs, attention_mask, label in validationloader:
                inputs, attention_mask, label = inputs.to(device), attention_mask.to(device), label.to(device)

                output = model(inputs, attention_mask)
                if BINARY:
                    # Threshold the sigmoid probability at 0.5.
                    predictions = (torch.flatten(output) >= 0.5).long()
                else:
                    predictions = output.argmax(dim=1)
                total_acc_val += (predictions == label).sum().item()

        # Report the epoch's validation accuracy; stop early when the pruner
        # judges this trial to be worse than the median of previous trials.
        prune_check = total_acc_val / len(validation_set)
        trial.report(prune_check, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return total_acc_val / len(validation_set)




In [None]:
def build_model(params):
    """Build the sarcasm classifier and its tokenizer from hyper-parameters.

    Loads the pretrained BERT checkpoint named by ``params['bert_model']``,
    freezes all of its weights, then re-enables training for the last
    ``params['number_to_unfreeze']`` encoder layers and for the pooler.

    Args:
        params: hyper-parameter dict; reads ``'bert_model'``,
            ``'number_to_unfreeze'``, ``'linear1'``, ``'linear2'``,
            ``'drop'`` and ``'mod_type'``.

    Returns:
        Tuple ``(model, tokenizer)`` where ``model`` is a ``bert_for_sarcasm``.
    """
    tokenizer2 = AutoTokenizer.from_pretrained(params['bert_model'])
    bert2 = BertModel.from_pretrained(params['bert_model'])

    # Only the additional layers are trained at first: freeze everything
    # that was pretrained.
    for param in bert2.parameters():
        param.requires_grad = False

    # Unfreeze the last `number_to_unfreeze` encoder layers. The flag has to
    # be set on the parameters: assigning `requires_grad` on a module object
    # only creates a plain attribute and leaves its weights frozen.
    num_layers = len(bert2.encoder.layer)
    for idx, layer in enumerate(bert2.encoder.layer):
        if 1 + idx > num_layers - params['number_to_unfreeze']:
            for param in layer.parameters():
                param.requires_grad = True

    # The pooler feeds the classification head, so it is always trainable.
    for param in bert2.pooler.parameters():
        param.requires_grad = True

    return bert_for_sarcasm(bert2, params['linear1'], params['linear2'], params['drop'], params['mod_type'], params['bert_model']), tokenizer2


In [None]:
# Setup for hyper-parameter testing
def objective(trial):
    """Optuna objective: sample hyper-parameters, train, return accuracy.

    Every hyper-parameter is drawn with ``trial.suggest_categorical`` from a
    fixed list of candidates. A model is then built and trained with the
    sampled values, and the accuracy returned by ``train_bert`` is the value
    the study optimizes.
    """
    # (name, candidate values) pairs, sampled in this order.
    search_space = (
        ('lr', [1e-5, 1e-4, 1e-3]),
        ('linear1', [256, 512]),
        ('linear2', [64, 128]),
        ('drop', [0, .1, .2]),
        ('batch_size', [32, 64, 128]),
        ('schedule', ['none', 'cosine', 'linear']),
        ('bert_model', ['bert-base-uncased', 'bert-large-uncased']),
        ('mod_type', ['binary', 'multi']),  # binary or multi
        ('number_to_unfreeze', [0, 1, 2, 4, 6, 12]),
    )
    params = {}
    for name, choices in search_space:
        params[name] = trial.suggest_categorical(name, choices)

    model, tokenizer = build_model(params)
    return train_bert(model, params, trial, tokenizer)
    

In [None]:
def optimal_values(study):
    """Return the parameter dict of the best trial found by ``study``."""
    return study.best_trial.params
    


In [None]:
# Training after hyper-parameter testing
def train_optimized_bert(model, params, tokenizer, Epochs):
    """Train the final model with early stopping and track loss/accuracy.

    Uses the module-level headline splits and ``device``. Whenever the
    validation loss improves on the previous epoch and reaches a new minimum,
    the whole model is saved to the module-level ``MODEL_PATH``. Training
    stops once the validation loss has failed to improve on the previous
    epoch ``patience`` (3) times in a row.

    Args:
        model: model called as ``model(input_ids, attention_mask)``.
        params: hyper-parameter dict; reads ``'batch_size'``, ``'lr'``,
            ``'schedule'`` and ``'mod_type'``.
        tokenizer: tokenizer providing ``batch_encode_plus``.
        Epochs: maximum number of training epochs.

    Returns:
        Tuple ``(model, running_training_loss, training_acc,
        running_validation_loss, validation_acc, epoch_count, testloader)``.
        The four histories hold one per-sample average per completed epoch;
        ``model`` is in its state after the last epoch run (the best
        checkpoint is the one stored at ``MODEL_PATH``).
    """
    model.to(device)
    # Set max length for the padding/clipping.
    max_length = 35

    # Create tokenized training, validation, and test splits.
    training_tokens = tokenizer.batch_encode_plus(train_headlines.tolist(), max_length=max_length, padding=True, truncation=True)
    validation_tokens = tokenizer.batch_encode_plus(validation_headlines.tolist(), max_length=max_length, padding=True, truncation=True)
    test_tokens = tokenizer.batch_encode_plus(test_headlines.tolist(), max_length=max_length, padding=True, truncation=True)

    # Stack the inputs as tensors for use in the BERT model.
    training_set = TensorDataset(torch.tensor(training_tokens['input_ids']), torch.tensor(training_tokens['attention_mask']), torch.tensor(train_label.tolist()))
    validation_set = TensorDataset(torch.tensor(validation_tokens['input_ids']), torch.tensor(validation_tokens['attention_mask']), torch.tensor(validation_label.tolist()))
    test_set = TensorDataset(torch.tensor(test_tokens['input_ids']), torch.tensor(test_tokens['attention_mask']), torch.tensor(test_label.tolist()))

    # Data loaders.
    trainloader = DataLoader(training_set, batch_size=params['batch_size'], num_workers=2, shuffle=True)
    validationloader = DataLoader(validation_set, batch_size=params['batch_size'], num_workers=2, shuffle=True)
    testloader = DataLoader(test_set, batch_size=params['batch_size'], num_workers=2, shuffle=True)

    # Optimizer and (optional) learning-rate scheduler.
    optimizer = AdamW(model.parameters(), lr=params["lr"], eps=1e-6)
    total_steps = len(trainloader) * Epochs
    if params['schedule'] == "linear":
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=total_steps)
    elif params['schedule'] == 'cosine':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=total_steps)
    else:
        scheduler = None

    # Loss function matching the model head.
    BINARY = params['mod_type'] == 'binary'
    loss_function = nn.BCELoss() if BINARY else nn.NLLLoss()

    # For checking early stop criteria.
    min_validation_loss = np.inf
    patience = 3
    stop_number = 0
    epoch_count = 0
    last_loss = np.inf

    # Per-epoch histories used for plotting.
    running_validation_loss = []
    running_training_loss = []
    validation_acc = []
    training_acc = []

    for epoch in range(1, Epochs + 1):
        epoch_count += 1
        print('Epoch', epoch_count)

        model.train()
        training_loss = 0
        total_acc_train = 0
        for inputs, attention_mask, label in tqdm(trainloader, total=len(trainloader)):
            inputs, attention_mask, label = inputs.to(device), attention_mask.to(device), label.to(device)

            optimizer.zero_grad()

            output = model(inputs, attention_mask)
            if BINARY:
                output = torch.flatten(output)
                loss = loss_function(output, label.float())
                # Threshold the sigmoid probability at 0.5.
                predictions = (output >= 0.5).long()
            else:
                loss = loss_function(output, label)
                predictions = output.argmax(dim=1)

            loss.backward()
            optimizer.step()
            # The schedule is sized in optimizer steps (batches * epochs), so
            # it must advance once per batch. Stepping once per epoch kept
            # the learning rate at 0 for the whole first epoch (warmup starts
            # at 0) and never progressed through the schedule.
            if scheduler is not None:
                scheduler.step()

            # Weight the batch-mean loss by the batch size so the epoch total
            # can be divided by the dataset size below.
            training_loss += loss.item() * inputs.size(0)
            total_acc_train += (predictions == label).sum().item()

        validation_loss = 0
        total_acc_val = 0
        model.eval()
        with torch.no_grad():
            for inputs, attention_mask, label in tqdm(validationloader, total=len(validationloader)):
                inputs, attention_mask, label = inputs.to(device), attention_mask.to(device), label.to(device)

                output = model(inputs, attention_mask)
                if BINARY:
                    output = torch.flatten(output)
                    loss = loss_function(output, label.float())
                    predictions = (output >= 0.5).long()
                else:
                    loss = loss_function(output, label)
                    predictions = output.argmax(dim=1)

                validation_loss += loss.item() * inputs.size(0)
                total_acc_val += (predictions == label).sum().item()

        # Record this epoch before the early-stopping check so the histories
        # also contain the epoch that triggers the stop (their length then
        # matches `epoch_count`).
        running_validation_loss.append(validation_loss / len(validation_set))
        running_training_loss.append(training_loss / len(training_set))
        validation_acc.append(total_acc_val / len(validation_set))
        training_acc.append(total_acc_train / len(training_set))

        # Early stopping: count consecutive epochs without improvement over
        # the previous epoch's validation loss.
        if validation_loss >= last_loss:
            stop_number += 1
            if stop_number >= patience:
                break
        else:
            stop_number = 0
            if validation_loss < min_validation_loss:
                min_validation_loss = validation_loss
                # Save the best performing model.
                torch.save(model, MODEL_PATH)
        last_loss = validation_loss

    return model, running_training_loss, training_acc, running_validation_loss, validation_acc, epoch_count, testloader


In [None]:
# Final Model Testing
def test_bert(model, testloader):
    """Run ``model`` over ``testloader`` and collect labels and predictions.

    Args:
        model: trained model exposing ``mod_type``; called as
            ``model(input_ids, attention_mask)``.
        testloader: loader yielding ``(input_ids, attention_mask, label)``.

    Returns:
        Tuple ``(final_lab, final_pred)`` of flat lists: the true labels and
        the predicted classes. With the binary head the predictions are the
        sigmoid outputs thresholded at 0.5; otherwise they are the index of
        the largest score along dim 1.
    """
    model.eval()
    is_binary = model.mod_type == 'binary'
    final_lab, final_pred = [], []

    with torch.no_grad():
        for inputs, attention_mask, label in tqdm(testloader, total=len(testloader)):
            scores = model(inputs.to(device), attention_mask.to(device)).cpu()
            if is_binary:
                # Snap each probability to 0 or 1 in place.
                scores = torch.flatten(scores)
                scores[scores < 0.5] = 0
                scores[scores >= 0.5] = 1
                batch_pred = scores.numpy()
            else:
                batch_pred = scores.max(1)[1].numpy()

            final_lab.extend(label.cpu().numpy())
            final_pred.extend(batch_pred)

    return final_lab, final_pred


# Functions for working with reddit data

In [None]:
# Slightly modified test function to account for the different structure of the reddit data
def test_bert_reddit(model, testloader):
    """Run ``model`` over a reddit loader and collect labels and predictions.

    Same evaluation as for the headline data, except that each batch is an
    ``(encodings, label)`` pair where ``encodings`` is a dict holding
    ``'input_ids'`` and ``'attention_mask'``.

    Args:
        model: trained model exposing ``mod_type``; called as
            ``model(input_ids, attention_mask)``.
        testloader: loader yielding ``(encodings, label)`` batches.

    Returns:
        Tuple ``(final_lab, final_pred)`` of flat lists: the true labels and
        the predicted classes.
    """
    model.eval()
    is_binary = model.mod_type == 'binary'
    final_lab, final_pred = [], []

    with torch.no_grad():
        for encodings, label in tqdm(testloader, total=len(testloader)):
            inputs = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            scores = model(inputs, attention_mask).cpu()
            if is_binary:
                # Snap each probability to 0 or 1 in place.
                scores = torch.flatten(scores)
                scores[scores < 0.5] = 0
                scores[scores >= 0.5] = 1
                batch_pred = scores.numpy()
            else:
                batch_pred = scores.max(1)[1].numpy()

            final_lab.extend(label.cpu().numpy())
            final_pred.extend(batch_pred)

    return final_lab, final_pred

class Reddit(Dataset):
    """Dataset of tokenized reddit comments paired with their labels.

    All comments are tokenized once at construction time; each item is a
    ``(text, label)`` pair where ``text`` maps every tokenizer output key to
    that comment's entry and ``label`` is an element of a float tensor.
    """

    def __init__(self, pd_text, pd_labels, selected_tokenizer, max_length=None):
        """Tokenize ``pd_text`` and store ``pd_labels`` as a tensor.

        Args:
            pd_text: comments; must provide ``tolist()``.
            pd_labels: labels; must provide ``tolist()``.
            selected_tokenizer: tokenizer providing ``batch_encode_plus``.
            max_length: maximum length forwarded to the tokenizer.
        """
        encode_options = dict(
            max_length=max_length,
            padding=True,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        self.inputs = selected_tokenizer.batch_encode_plus(pd_text.tolist(), **encode_options)
        self.labels = torch.Tensor(pd_labels.tolist())

    def __len__(self):
        """Number of labelled comments."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return ``(text, label)`` for the comment at position ``item``."""
        text = {}
        for key in self.inputs.keys():
            text[key] = self.inputs[key][item]
        return text, self.labels[item]
    

def split_reddit_data(csv_path):
    """Load the reddit csv and split it into train/validation/test sets.

    Rows without a comment are dropped, then the data is split 80/10/10 with
    stratification on the ``label`` column.

    Args:
        csv_path: path of the reddit .csv file.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``, or ``None``
        (after printing a message) when the file does not exist.
    """
    try:
        data_all = pd.read_csv(csv_path)
    except FileNotFoundError:
        print('Data csv not found')
        return None

    # Some comments are NA (see the data EDA file): drop those rows.
    data_all.dropna(subset=['comment'], inplace=True)

    comments = data_all['comment']
    labels = data_all['label']

    # Hold out 20% of the data ...
    x_train, x_testval, y_train, y_testval = train_test_split(
        comments, labels, random_state=200, test_size=0.2, stratify=labels)

    # ... and cut the held-out part in half.
    x_test, x_val, y_test, y_val = train_test_split(
        x_testval, y_testval, random_state=200, test_size=0.5, stratify=y_testval)

    return x_train, y_train, x_val, y_val, x_test, y_test


def get_data_loaders(train, val, test, batch_size, num_workers):
    """Wrap the three datasets in shuffling ``DataLoader`` objects.

    Args:
        train, val, test: datasets for the three splits.
        batch_size: batch size shared by all loaders.
        num_workers: worker process count shared by all loaders.

    Returns:
        Tuple ``(trainloader, validationloader, testloader)``.
    """
    loader_options = {
        'batch_size': batch_size,
        'num_workers': num_workers,
        'shuffle': True,
    }
    return tuple(DataLoader(split, **loader_options) for split in (train, val, test))


# Running all functions to find and evaluate best model

In [None]:
# Create and train final model 

# Find Best Hyper-Parameters
Epochs = 150 # Training epochs for the best performing model from hyper-parameter testing
n_trials = 100 # Number of combinations of hyper-parameters to test
n_jobs = 1 # Number of parallel trials
# Maximize validation accuracy; the median pruner stops trials that fall
# behind the median of earlier trials.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(),
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_jobs = n_jobs,n_trials=n_trials,show_progress_bar = True)

# Plots to visualize the hyper-parameter search
plot_optimization_history(study);
plot_param_importances(study);

# Getting best parameters to train final model

best_parameters = optimal_values(study)
print('Best Values:')
for key, value in best_parameters.items():
    print(key, ' : ', value)
final_model,tokenizer = build_model(best_parameters)

# Training the final model
trained_final_model,train_loss,train_acc,val_loss,val_acc,epoch_count,testloader = train_optimized_bert(final_model,best_parameters,tokenizer,Epochs)

# Training and Validation Loss Plots
plt.plot(train_loss,'g',label = 'Training Loss')
plt.plot(val_loss,'r',label = 'Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Training and Validation Accuracy Plots
# (each curve now carries its own label: the two legend entries were swapped)
plt.plot(train_acc,'g',label = 'Training Accuracy')
plt.plot(val_acc,'r',label = 'Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.show()

#Testing Final Model
final_lab,final_pred = test_bert(trained_final_model,testloader)

# Visualizing the results
conf_mat = confusion_matrix(final_lab, final_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
fig, ax = plt.subplots(figsize=(20,20))
plt.title('Confusion Matrix For Sarcasm_BERT')
disp.plot(ax = ax)

# target_names follow the sorted label order: 0 = not sarcastic, 1 = sarcastic
# (the label column is the dataset's `is_sarcastic` flag).
r_words = ["Not Sarcastic","Sarcastic"]
class_report = classification_report(final_lab,final_pred,target_names =r_words)
print('\033[1m'+'Precision, Recall and Accuracy for Headline Data:\n')
print(class_report)


In [None]:
# Looking at the tokenizer and its output

# Tokenize one headline and move the encoded batch to the model's device.
new = tokenizer("Smartest Man In World Dead After Papercut",return_tensors="pt")
new.to(device)
print(new)

# Inference only: disable dropout and gradient tracking.
trained_final_model.eval()
with torch.no_grad():
    outputs = trained_final_model(new['input_ids'],new['attention_mask']).cpu()

if trained_final_model.mod_type == 'binary':
    # Binary head: the output already is the sigmoid probability, so print it
    # as is and threshold at 0.5 (taking the max over dim 1 of a single
    # column would always give index 0).
    print(outputs)
    a = (torch.flatten(outputs) >= 0.5).long().numpy()
else:
    # Two-class head: the output is a log-probability per class.
    print(outputs.exp())
    a = outputs.max(1, keepdim=True)[1].squeeze(1).numpy()
print(a)


## Testing Headlines Model on Reddit Data

In [None]:
# Loading in the reddit data for testing on the model trained on headlines data
# Settings for the reddit evaluation data.
csv_path = '/projectnb/dl523/projects/Sarcasm/train-balanced-sarcasm.csv'
max_length = 35
batch_size = 64
num_workers = 2

# Stratified train / validation / test split of the reddit comments.
x_train, y_train, x_val, y_val, x_test, y_test = split_reddit_data(csv_path)

# Tokenize each split with the tokenizer of the headlines model.
reddit_train, reddit_val, reddit_test = (
    Reddit(comments, labels, tokenizer, max_length)
    for comments, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

trainloader_r, validationloader_r, testloader_r = get_data_loaders(
    reddit_train, reddit_val, reddit_test, batch_size, num_workers)

In [None]:
# Load best headlines model 

# The checkpoint is a whole pickled model written by this notebook's
# torch.save(model, MODEL_PATH); only load files you trust.
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model = torch.load(MODEL_PATH, map_location=device)

# testing
model.to(device)
final_lab_r,final_pred_r = test_bert_reddit(model,testloader_r)

# Visualizing the results
conf_mat = confusion_matrix(final_lab_r, final_pred_r)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
fig, ax = plt.subplots(figsize=(20,20))
plt.title('Confusion Matrix For Sarcasm_BERT')
disp.plot(ax = ax)

# target_names follow the sorted label order. Assuming the reddit `label`
# column uses 1 for sarcastic, like the headline data, label 0 comes first.
r_words = ["Not Sarcastic","Sarcastic"]
class_report = classification_report(final_lab_r,final_pred_r,target_names =r_words)
print('\033[1m'+'Precision, Recall and Accuracy for Reddit Data:\n')
print(class_report)