# Hyper-Parameter testing, training, and final model testing

In [1]:
try:
    import transformers
except ImportError as e:
    print('transformers not installed')
    print('Installing now...')
    !pip install -q git+https://github.com/huggingface/transformers.git
    pass  

try:
    import optuna
except ImportError as e:
    print('optuna not installed')
    print('Installing now...')
    !pip install optuna
    pass  


In [2]:
import torch
import io 
import os
from torch.utils.data import Dataset,DataLoader,TensorDataset
from sklearn.metrics import classification_report,accuracy_score
import transformers
import json
from tqdm.notebook import tqdm
from transformers.utils.dummy_pt_objects import AutoModelForSequenceClassification
from transformers import AutoModelForTokenClassification,AutoConfig, AutoModel,AutoTokenizer,BertModel,BertConfig,AdamW, get_constant_schedule,BertForSequenceClassification,get_linear_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup
import random
import numpy as np
import torch.nn as nn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from optuna.visualization.matplotlib import plot_optimization_history
from optuna.visualization.matplotlib import plot_param_importances

#Set device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [3]:
# Switch between the Google Colab setup and the cluster paths in the else branch.
colab = False
if colab == True:
    #Mounting Drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    
    # Change into the shared-drive data folder and list it (IPython magics).
    # NOTE(review): MODEL_PATH is not assigned in this branch although later cells
    # save to / load from it — confirm where it should be set when running on Colab.
    %cd '/content/gdrive/Shareddrives/523 Project/Data/News Headlines'
    %ls
else:
    # File the best model weights are saved to and later loaded from.
    MODEL_PATH = '/projectnb/dl523/projects/Sarcasm/Sarcasm_Models/sarcasm_bert.pth'
    # Working directory; the dataset is read with a relative file name afterwards.
    DATA_DIR = '/projectnb2/dl523/projects/Sarcasm'
    os.chdir(DATA_DIR)

In [4]:
#Getting the BERT outputs for use with other downstream model implementations 
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
#Reading in the data (one JSON record per line)
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json",lines = True)
df = df.rename(columns={'is_sarcastic': 'label'})
# Drop by keyword: passing the axis positionally (df.drop(name, 1)) is rejected by pandas >= 2.0
df = df.drop(columns='article_link')
df.head()

#splits for training test validation
# First split: 80% training, 20% held out, stratified on the label
train_headlines, temporary_text, train_label, temporary_label = train_test_split(df['headline'], df['label'], 
                                                                    random_state=200, 
                                                                    test_size=0.2, 
                                                                    stratify=df['label'])

# Second split: the held-out 20% is halved into validation and test, again stratified
validation_headlines, test_headlines, validation_label, test_label = train_test_split(temporary_text, temporary_label, 
                                                                    random_state=200, 
                                                                    test_size=0.5, 
                                                                    stratify=temporary_label)


#Set max length for the padding/clipping
# Words per headline (whitespace split); its summary statistics inform max_length
count = df['headline'].str.split().str.len()
count.describe()

max_length = 35


In [18]:
# Class for modifying BERT 
class bert_for_sarcasm(nn.Module):
    """Encoder followed by a three-layer head that outputs 2-class log-probabilities.

    Args:
        input_model: encoder module called as ``input_model(ids, attention_mask=mask)``
            whose result exposes a ``pooler_output`` attribute.
        linear1: width of the first hidden layer of the head.
        linear2: width of the second hidden layer of the head.
        drop: dropout probability applied after each of the first two linear layers.
    """

    def __init__(self,input_model, linear1 = 256,linear2 = 128,drop = .25):
        super(bert_for_sarcasm,self).__init__()

        self.input_model = input_model

        # Take the input width from the encoder's config so that any encoder size works;
        # fall back to 1024 (the previously hard-coded value) when no config is exposed.
        encoder_config = getattr(input_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 1024)

        self.linear = nn.Linear(hidden_size,linear1)

        self.linear2 = nn.Linear(linear1,linear2)

        self.linear3 = nn.Linear(linear2,2)

        self.relu = nn.ReLU()

        # Log-probabilities over the class dimension, to be paired with NLLLoss
        self.log = nn.LogSoftmax(dim = 1)

        self.dropout = nn.Dropout(drop)

    def forward(self,input_values,attention_mask):
        """Return log-probabilities with one row per input sequence and two columns."""
        encoded = self.input_model(input_values, attention_mask=attention_mask)
        # Read the pooled vector by name: unpacking `.values()` breaks as soon as the
        # encoder returns anything besides (last_hidden_state, pooler_output).
        output = encoded.pooler_output

        output = self.relu(self.dropout(self.linear(output)))
        output = self.relu(self.dropout(self.linear2(output)))
        output = self.linear3(output)

        return self.log(output)
    


In [21]:
# Finding best hyper-parameters
def train_bert(model, params,trial,tokenizer):
    """Train `model` for one Optuna trial and return its validation accuracy.

    Args:
        model: network called as ``model(input_ids, attention_mask)``; its output is
            passed to NLLLoss, so it is expected to be log-probabilities.
        params: dict providing 'batch_size', 'lr' and 'schedule' ('linear' selects the
            linear schedule, any other value the cosine-with-hard-restarts schedule).
        trial: Optuna trial; the validation accuracy is reported to it after every epoch.
        tokenizer: tokenizer used to encode the module-level headline splits.

    Returns:
        Validation accuracy (correct / number of validation samples) of the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner asks to stop this trial.
    """
    model.to(device)
    max_length = 35
    Epochs = 15

    # Only the training and validation splits are used during the search, so the test
    # split is not tokenized here.
    training_tokens = tokenizer.batch_encode_plus(train_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
    validation_tokens = tokenizer.batch_encode_plus(validation_headlines.tolist(),max_length = max_length,padding = True,truncation = True)

    #Stacking the inputs as tensors for use in the BERT model
    training_set = TensorDataset(torch.tensor(training_tokens['input_ids']),torch.tensor(training_tokens['attention_mask']),torch.tensor(train_label.tolist()))
    validation_set = TensorDataset(torch.tensor(validation_tokens['input_ids']),torch.tensor(validation_tokens['attention_mask']),torch.tensor(validation_label.tolist()))

    trainloader = DataLoader(training_set, batch_size = params['batch_size'],num_workers=2,shuffle = True)
    # The accuracy is a sum over all samples, so the evaluation order does not matter
    validationloader = DataLoader(validation_set, batch_size = params['batch_size'],num_workers=2,shuffle = False)

    #Loss function: NLLLoss on the log-probabilities produced by the model
    loss_function = nn.NLLLoss()

    #optimizer and scheduler for learning rate
    optimizer = AdamW(model.parameters(),lr = params["lr"],eps = 1e-6)
    total_steps = len(trainloader)*Epochs
    if params['schedule'] == "linear":
        scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 2,num_training_steps = total_steps)
    else:
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,num_warmup_steps = 2, num_training_steps = total_steps)

    total_acc_val = 0
    for epoch in range(1, Epochs+1):
        model.train()
        for inputs,attention_mask,label in trainloader:
            inputs,attention_mask,label = inputs.to(device),attention_mask.to(device),label.to(device)

            optimizer.zero_grad()
            loss = loss_function(model(inputs,attention_mask),label)
            loss.backward()
            optimizer.step()
            # The schedule is defined in optimizer steps, so it advances once per batch
            scheduler.step()

        total_acc_val = 0
        model.eval()
        with torch.no_grad():
            for inputs,attention_mask,label in validationloader:
                inputs,attention_mask,label = inputs.to(device),attention_mask.to(device),label.to(device)
                output = model(inputs,attention_mask)
                total_acc_val += (output.argmax(dim=1) == label).sum().item()

        # Report the intermediate accuracy so the pruner can stop unpromising trials
        trial.report(total_acc_val/len(validation_set), epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return total_acc_val/len(validation_set)




In [8]:
def build_model(params):
    """Build a frozen-encoder sarcasm classifier and its matching tokenizer.

    Args:
        params: dict providing 'bert_model' (pretrained model name), 'linear1',
            'linear2' and 'drop' (forwarded to bert_for_sarcasm).

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(params['bert_model'])
    encoder = BertModel.from_pretrained(params['bert_model'])

    #Only want to train the additional layers at first,freezing pretrained
    for weight in encoder.parameters():
        weight.requires_grad = False

    classifier = bert_for_sarcasm(encoder,params['linear1'],params['linear2'],params['drop'])
    return classifier,tokenizer


In [9]:
# Setup for hyper-parameter testing
def objective(trial):
    """Optuna objective: sample one configuration, train it, return validation accuracy.

    Every hyper-parameter is drawn from a fixed list of candidates through
    ``trial.suggest_categorical``; the sampled dict is handed to build_model and
    train_bert, and train_bert's result is the value Optuna optimizes.
    """
    pick = trial.suggest_categorical

    search_space = {
        'lr': pick('lr', [1e-6, 1e-5, 1e-4, 1e-3]),
        'linear1': pick('linear1', [128, 256, 512]),
        'linear2': pick('linear2', [32, 64]),
        'drop': pick("drop", [0, .1, .2, .3, .4, .5]),
        'batch_size': pick('batch_size', [32, 64, 128]),
        'schedule': pick('schedule', ['linear', 'cosine']),
        'bert_model': pick('bert_model', ['bert-large-uncased']),
    }

    candidate, candidate_tokenizer = build_model(search_space)
    return train_bert(candidate, search_space, trial, candidate_tokenizer)
    

In [15]:
def optimal_values(study):
    """Return the parameter dict of the best trial recorded in `study`."""
    return study.best_trial.params
    


In [16]:
# Training after hyper-parameter testing
def train_optimized_bert(model,params,tokenizer,Epochs):
   
    model.to(device)
    #Set max length for the padding/clipping
    max_length = 35
    #Create tokenized training, validation, and test splits

    training_tokens = tokenizer.batch_encode_plus(train_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
    validation_tokens = tokenizer.batch_encode_plus(validation_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
    test_tokens = tokenizer.batch_encode_plus(test_headlines.tolist(),max_length = max_length,padding= True,truncation = True)

    #Stacking the inputs as tensors for use in the BERT model

    training_set = TensorDataset(torch.tensor(training_tokens['input_ids']),torch.tensor(training_tokens['attention_mask']),torch.tensor(train_label.tolist()))
    validation_set = TensorDataset(torch.tensor(validation_tokens['input_ids']),torch.tensor(validation_tokens['attention_mask']),torch.tensor(validation_label.tolist()))
    test_set = TensorDataset(torch.tensor(test_tokens['input_ids']),torch.tensor(test_tokens['attention_mask']),torch.tensor(test_label.tolist()))

    # Data loaders 
    trainloader = DataLoader(training_set, batch_size = params['batch_size'],num_workers=2,shuffle = True)
    validationloader = DataLoader(validation_set, batch_size = params['batch_size'],num_workers=2,shuffle = True)
    testloader = DataLoader(test_set, batch_size = params['batch_size'],num_workers=2,shuffle = True)


    #Loss function (standard in the BERT documentation is MSELoss)
    loss_function = nn.NLLLoss()

    optimizer = AdamW(model.parameters(),lr = params["lr"],eps = 1e-6)

    if params['schedule'] == "linear":
        scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 2,num_training_steps = len(trainloader)*Epochs)
    else:
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer,num_warmup_steps = 2, num_training_steps = len(trainloader)*Epochs)
    
    # For checking early stop criteria
    min_validation_loss = np.inf
    patience = 2
    epoch_count = 0
    for epoch in range(1, Epochs+1):
        epoch_count+=1
        model.train()
        training_loss = 0 

        train_correct = 0 
        total_acc_train = 0
        for idx, (inputs,attention_mask,label) in enumerate(tqdm(validationloader,total = len(trainloader))):

            inputs,attention_mask,label = inputs.to(device),attention_mask.to(device),label.to(device)

            optimizer.zero_grad()

            output = model(inputs,attention_mask)

            loss = loss_function(output,label)

            loss.backward()
            optimizer.step()
            scheduler.step()
            training_loss +=loss.item()

            train_correct = (output.argmax(dim=1) == label).sum().item()
            total_acc_train += train_correct

        validation_loss = 0

        total_acc_val = 0
        val_correct = 0 
        stop_number = 0
        model.eval()
        with torch.no_grad():
            for idx, (inputs,attention_mask,label) in enumerate(tqdm(validationloader,total = len(validationloader))):
                inputs,attention_mask,label = inputs.to(device),attention_mask.to(device),label.to(device)

                output = model(inputs,attention_mask)


                loss = loss_function(output,label)
                validation_loss += loss.item()
                val_correct = (output.argmax(dim=1) == label).sum().item()
                total_acc_val += val_correct
        #Early Stopping Criteria
        if validation_loss>= min_validation_loss:
            stop_number+=1
            if stop_number >patience:
                return model,running_training_loss,training_acc,running_validation_loss,validation_acc,epoch_count
        else:
            min_validation_loss = validation_loss
            #Save the best performing model parameters 
            torch.save(model.state_dict(), MODEL_PATH)

        # For graphing training and testing loss, accuracy
        running_validation_loss.append(validation_loss/len(validation_set))
        running_training_loss.append(training_loss/len(training_set))

        validation_acc.append(total_acc_val/len(validation_set))
        training_acc.append(total_acc_train/len(training_set))

    return model,running_training_loss,training_acc,running_validation_loss,validation_acc


In [13]:
# Final Model Testing
def test_bert(model,testloader):
    correct = []
    final_pred = []
    final_lab = []
    model.eval()
    with torch.no_grad():
        for idx, (inputs,attention_mask,label) in enumerate(tqdm(testloader,total = len(testloader))):
            inputs,attention_mask,label = inputs.to(device),attention_mask.to(device), label.to(device)
            output = best_model(inputs,attention_mask).cpu()
            preds = output.data.max(1, keepdim=True)[1].squeeze(1).numpy()

            l = label.cpu().numpy()
            comp = l == preds
            final_lab.extend(l)
            final_pred.extend(preds)
#             for i in range(l.size):
#                 if comp[i] == True:
#                 correct.append(1)
#                 else:
#                 correct.append(0)
    return final_lab,final_pred


In [None]:
# Create and train final model 

# Find Best Hyper-Parameters
Epochs = 1
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(),
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=1,show_progress_bar = True)

# Plots to visualize the hyper-parameter search 
plot_optimization_history(study);
plot_param_importances(study);

# Getting best parameters to train final model

best_parameters = optimal_values(study)
final_model,tokenizer = build_model(best_parameters)
# Take the first five results by position; the x axis for the plots is derived from
# the length of the recorded history so that x and y always have matching lengths.
training_results = train_optimized_bert(final_model,best_parameters,tokenizer,Epochs)
trained_final_model,train_loss,train_acc,val_loss,val_acc = training_results[:5]
epoch_count = len(train_loss)
epoch_axis = range(1, epoch_count+1)

# Training and Validation Loss Plots
plt.plot(epoch_axis,train_loss,'g',label = 'Training Loss')
plt.plot(epoch_axis,val_loss,'b',label = 'Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Training and Validation Accuracy Plots
plt.plot(epoch_axis,train_acc,'g',label = 'Training Accuracy')
plt.plot(epoch_axis,val_acc,'b',label = 'Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.show()

#Testing Final Model
# The loaders built inside the training functions are local to them, so the test
# loader is created here with the tokenizer that belongs to the final model.
test_tokens = tokenizer.batch_encode_plus(test_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
test_set = TensorDataset(torch.tensor(test_tokens['input_ids']),torch.tensor(test_tokens['attention_mask']),torch.tensor(test_label.tolist()))
testloader = DataLoader(test_set, batch_size = best_parameters['batch_size'],num_workers=2,shuffle = False)
final_lab,final_pred = test_bert(trained_final_model,testloader)

# Visualizing the results

# Names are listed in label order: 0 = not sarcastic, 1 = sarcastic (the label column
# is the renamed 'is_sarcastic' field)
r_words = ["Not Sarcastic","Sarcastic"]
conf_mat = confusion_matrix(final_lab, final_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat,display_labels=r_words)
fig, ax = plt.subplots(figsize=(20,20))
plt.title('Confusion Matrix For Sarcasm_BERT')
disp.plot(ax = ax)

class_report = classification_report(final_lab,final_pred,target_names =r_words)
print('\033[1m'+'Precision, Recall and Accuracy for Headline Data:\n')
print(class_report)


In [None]:
# Looking at the tokenizer and its output

# Encode a single example headline as PyTorch tensors.
new = tokenizer("Smartest Man In World Dead After Papercut",return_tensors="pt")
# NOTE(review): the return value is discarded, so this relies on .to() moving the
# encoded tensors in place — confirm against the tokenizer output type.
new.to(device)
print(new)
outputs = trained_final_model(new['input_ids'],new['attention_mask']).detach().cpu()
# The model ends in LogSoftmax, so exponentiating yields class probabilities.
print(np.exp(outputs))
# Index of the highest-scoring class for each input row.
a = outputs.data.max(1, keepdim=True)[1].squeeze(1).numpy()
print(a)


## Original No Hyper-Parameter Testing

In [None]:
#Getting the BERT outputs for use with other downstream model implementations 
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
#Reading in the data (one JSON record per line)
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json",lines = True)
df = df.rename(columns={'is_sarcastic': 'label'})
# Drop by keyword: passing the axis positionally (df.drop(name, 1)) is rejected by pandas >= 2.0
df = df.drop(columns='article_link')
df.head()

#splits for training test validation
# First split: 80% training, 20% held out, stratified on the label
train_headlines, temporary_text, train_label, temporary_label = train_test_split(df['headline'], df['label'], 
                                                                    random_state=200, 
                                                                    test_size=0.2, 
                                                                    stratify=df['label'])

# Second split: the held-out 20% is halved into validation and test, again stratified
validation_headlines, test_headlines, validation_label, test_label = train_test_split(temporary_text, temporary_label, 
                                                                    random_state=200, 
                                                                    test_size=0.5, 
                                                                    stratify=temporary_label)

In [None]:
#initialize the BERT 

# NOTE(review): bertconfig is not referenced again in the visible code — confirm it is needed.
bertconfig = BertConfig()
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
tokenizer2 = AutoTokenizer.from_pretrained("bert-base-uncased")

bert2 = BertModel.from_pretrained("bert-base-uncased")

#Only want to train the additional layers at first,freezing pretrained
for param in bert2.parameters():
    param.requires_grad = False

# Create a model which takes output from BERT and run through new layers for classification


class bert_for_sarcasm(nn.Module):
    """Encoder followed by a fixed 256-128-2 head that outputs 2-class log-probabilities.

    Args:
        input_model: encoder module called as ``input_model(ids, attention_mask=mask)``
            whose result exposes a ``pooler_output`` attribute.
    """

    def __init__(self,input_model):
        super(bert_for_sarcasm,self).__init__()

        self.input_model = input_model

        # Take the input width from the encoder's config so that any encoder size works;
        # fall back to 768 (the previously hard-coded value) when no config is exposed.
        encoder_config = getattr(input_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)

        self.linear = nn.Linear(hidden_size,256)

        self.linear2 = nn.Linear(256,128)

        self.linear3 = nn.Linear(128,2)

        self.relu = nn.ReLU()

        # Log-probabilities over the class dimension, to be paired with NLLLoss
        self.log = nn.LogSoftmax(dim = 1)

        self.dropout = nn.Dropout(0.25)

    def forward(self,input_values,attention_mask):
        """Return log-probabilities with one row per input sequence and two columns."""
        encoded = self.input_model(input_values, attention_mask=attention_mask)
        # Read the pooled vector by name: unpacking `.values()` breaks as soon as the
        # encoder returns anything besides (last_hidden_state, pooler_output).
        output = encoded.pooler_output

        output = self.relu(self.dropout(self.linear(output)))
        output = self.relu(self.dropout(self.linear2(output)))
        output = self.linear3(output)

        return self.log(output)
    
    
#Put updated sarcasm model on GPU
# (more precisely on `device`, which is the CPU when CUDA is not available)
sarcasm_model = bert_for_sarcasm(bert2)
sarcasm_model.to(device)



In [None]:
#Set max length for the padding/clipping
# Words per headline (whitespace split); its summary statistics inform max_length.
count = df['headline'].str.split().str.len()
count.describe()

max_length = 35
#Create tokenized training, validation, and test splits
# NOTE(review): with padding=True each call presumably pads to the longest sequence in
# that split rather than to max_length, so the three splits may differ in width — confirm.

training_tokens = tokenizer2.batch_encode_plus(train_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
validation_tokens = tokenizer2.batch_encode_plus(validation_headlines.tolist(),max_length = max_length,padding = True,truncation = True)
test_tokens = tokenizer2.batch_encode_plus(test_headlines.tolist(),max_length = max_length,padding= True,truncation = True)

#Stacking the inputs as tensors for use in the BERT model
# Each dataset item is (input_ids, attention_mask, label).

training_set = TensorDataset(torch.tensor(training_tokens['input_ids']),torch.tensor(training_tokens['attention_mask']),torch.tensor(train_label.tolist()))
validation_set = TensorDataset(torch.tensor(validation_tokens['input_ids']),torch.tensor(validation_tokens['attention_mask']),torch.tensor(validation_label.tolist()))
test_set = TensorDataset(torch.tensor(test_tokens['input_ids']),torch.tensor(test_tokens['attention_mask']),torch.tensor(test_label.tolist()))




In [None]:

#Dataloaders for the sets
# NOTE(review): batch_size is not assigned anywhere in the visible code — it has to be
# defined before this cell runs.

trainloader = DataLoader(training_set, batch_size = batch_size,num_workers=2,shuffle = True)
validationloader = DataLoader(validation_set, batch_size = batch_size,num_workers=2,shuffle = True)
testloader = DataLoader(test_set, batch_size = batch_size,num_workers=2,shuffle = True)

#Loss function: NLLLoss, which pairs with the LogSoftmax output of bert_for_sarcasm
loss_function = nn.NLLLoss()



In [None]:
#Training sarcasm bert
Epochs = 150

# Per-epoch mean batch losses, kept for plotting
running_training_loss = []
running_validation_loss = []

#optimizer and scheduler for learning rate
optimizer = AdamW(sarcasm_model.parameters(),lr = 1e-4,eps = 1e-6)
# The schedule spans every optimizer step: batches per epoch times number of epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 2,num_training_steps = len(trainloader)*Epochs)

# Best (lowest) summed validation loss seen so far
min_validation_loss = np.inf

for epoch in range(1, Epochs+1):
    print('Epoch: ',epoch)
    sarcasm_model.train()
    training_loss = 0 

    for idx, (inputs,attention_mask,label) in enumerate(tqdm(trainloader,total = len(trainloader))):

        inputs,attention_mask,label = inputs.to(device),attention_mask.to(device),label.to(device)

        optimizer.zero_grad()

        output = sarcasm_model(inputs,attention_mask)

        loss = loss_function(output,label)

        loss.backward()
        optimizer.step()
        # The scheduler advances once per batch, matching num_training_steps above
        scheduler.step()
        training_loss +=loss.item()
    
    
    
    # Validation pass: no gradients, dropout disabled through eval()
    validation_loss = 0
    sarcasm_model.eval()
    with torch.no_grad():
        for idx, (inputs,attention_mask,label) in enumerate(tqdm(validationloader,total = len(validationloader))):
            inputs,attention_mask,label = inputs.to(device),attention_mask.to(device),label.to(device)

            output = sarcasm_model(inputs,attention_mask)
            loss = loss_function(output,label)
            validation_loss += loss.item()

        # For graphs
        running_validation_loss.append(validation_loss/len(validationloader))
        running_training_loss.append(training_loss/len(trainloader))

        #Store the best performing model
        if validation_loss<min_validation_loss:
            min_validation_loss = validation_loss
            #Save this model 
            torch.save(sarcasm_model.state_dict(), MODEL_PATH)

    print('Epoch',epoch, '\t\t Training Loss:',training_loss / len(trainloader), '\t\t', 'Validation Loss:', validation_loss / len(validationloader))


In [None]:
# Load best model 
best_model = bert_for_sarcasm(bert2)
# map_location keeps the load working when the checkpoint was written from a GPU but
# this session only has a CPU (and vice versa)
best_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
best_model.to(device)
# Evaluation mode: disables dropout for the test pass
best_model.eval()

In [None]:
# Evaluate the restored model on the test split, collecting the true labels, the
# predictions and a 1/0 flag per sample telling whether the prediction was right.
final_lab = []
final_pred = []
correct = []
with torch.no_grad():
    for inputs, attention_mask, label in tqdm(testloader,total = len(testloader)):
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)

        output = best_model(inputs,attention_mask).cpu()
        # Index of the highest score in each row
        preds = output.data.max(1, keepdim=True)[1].squeeze(1).numpy()

        l = label.cpu().numpy()
        comp = l == preds
        final_lab.extend(l)
        final_pred.extend(preds)
        correct.extend(int(hit) for hit in comp)
    


In [None]:
#Confusion Matrix
# Names are listed in label order: 0 = not sarcastic, 1 = sarcastic (the label column
# is the renamed 'is_sarcastic' field)
r_words = ["Not Sarcastic","Sarcastic"]
conf_mat = confusion_matrix(final_lab, final_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat,display_labels=r_words)
fig, ax = plt.subplots(figsize=(20,20))
plt.title('Confusion Matrix For Sarcasm_BERT')
disp.plot(ax = ax)

class_report = classification_report(final_lab,final_pred,target_names =r_words)
print('\033[1m'+'Precision, Recall and Accuracy for Headline Data:\n')
print(class_report)