# Download MARBERT checkpoint

In [None]:
# Mount Google Drive at /gdrive; later cells copy datasets from it and write checkpoints/outputs to it.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Download the MARBERT PyTorch checkpoint archive from the Hugging Face hub.
# NOTE: the "verison" spelling is part of the URL and is reused by the tar and model-path cells below.
!wget https://huggingface.co/UBC-NLP/MARBERT/resolve/main/MARBERT_pytorch_verison.tar.gz

In [None]:
# Unpack the downloaded archive (presumably into ./MARBERT_pytorch_verison, the path the config uses — confirm).
!tar -xvf MARBERT_pytorch_verison.tar.gz

In [None]:
# Install the extra packages this notebook imports.
# NOTE(review): versions are unpinned, so a re-run may pull releases with different APIs — consider pinning.
!pip install GPUtil pytorch_pretrained_bert transformers

In [None]:
import csv
import json
from sklearn import *
# (1)load libraries 
import json, sys, regex
import torch
import GPUtil
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import  BertTokenizer, BertConfig, BertForSequenceClassification

from tqdm import tqdm, trange
import pandas as pd
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
from transformers import XLMRobertaConfig
from transformers import XLMRobertaModel
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaModel
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from numpy.random import RandomState
import pandas as pd

In [None]:
# Create a local ./isarcastic working directory and copy the training split into it from Drive.
!mkdir -p  isarcastic
!cp "/gdrive/MyDrive/Master/Dataset/isarcastic/isarcastic_Ar_train_test/isarcastic_Ar_train.csv"  ./isarcastic/isarcastic_Ar_train.csv

In [None]:
# Copy the two held-out splits (test1, test2) from Drive into ./isarcastic.
!cp "/gdrive/MyDrive/Master/Dataset/isarcastic/isarcastic_Ar_train_test/isarcastic_Ar_test1.csv"  ./isarcastic/isarcastic_Ar_test1.csv
!cp "/gdrive/MyDrive/Master/Dataset/isarcastic/isarcastic_Ar_train_test/isarcastic_Ar_test2.csv"  ./isarcastic/isarcastic_Ar_test2.csv

In [None]:
# Copy the task-A Arabic test file from Drive into ./isarcastic; the testing section predicts on this file.
!cp "/gdrive/MyDrive/Master/Dataset/TestingData/task_A_AR_test.csv"  ./isarcastic/task_A_AR_test.csv

In [None]:
# Open the task-A test CSV for a raw look at its contents; the handle is consumed and closed by the next cell.
file = open('isarcastic/task_A_AR_test.csv')

In [None]:
# Parse the CSV through the already-open `file` handle: the first record is the
# header, every remaining record is collected into `rows`.
csvreader = csv.reader(file)
header = next(csvreader)
print(header)
rows = list(csvreader)
print(rows)
# Release the handle opened in the previous cell.
file.close()

# Fine-tuning code

In [None]:
# Pick the compute device once; train()/evaluate() read this `device` global when moving batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("your device ", device)

In [None]:
# Preview the first rows of the task-A test file. `df` is rebound by a later cell that loads the training file.
df = pd.read_csv('isarcastic/task_A_AR_test.csv', sep=",")
df.head(5)

In [None]:
def flat_pred(preds, labels):
    """Turn per-class scores and a label array into two flat Python lists.

    Args:
        preds: 2-D array of scores; the arg-max along axis 1 is the predicted class.
        labels: array of gold labels, flattened as-is.

    Returns:
        ``(predicted_classes, gold_labels)`` as plain lists.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten().tolist()
    gold_labels = labels.flatten().tolist()
    return predicted_classes, gold_labels

In [None]:

def train(model, iterator, optimizer, scheduler, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Reads the notebook-level globals ``device`` (where batches are moved) and
    ``max_grad_norm`` (gradient-norm clipping threshold).

    Args:
        model: called as ``model(input_ids, input_mask, labels=labels)``; the
            first element of its output is used as the loss.
        iterator: sized iterable of ``(input_ids, input_mask, labels)`` tensor batches.
        optimizer: stepped once per batch.
        scheduler: learning-rate scheduler, stepped once per batch after the optimizer.
        criterion: unused (the loss comes from the model output); kept so
            existing call sites keep working.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0
    # With anything other than exactly one visible GPU the loss is reduced with
    # .sum() before use (a no-op for a scalar loss), as the original code did.
    reduce_loss = torch.cuda.device_count() != 1
    for batch in iterator:
        # Move the batch to the target device and unpack it.
        input_ids, input_mask, labels = (t.to(device) for t in batch)
        outputs = model(input_ids, input_mask, labels=labels)
        loss = outputs[0]
        # Drop references to the inputs early to free device memory.
        del batch, input_ids, input_mask, labels
        optimizer.zero_grad()
        if reduce_loss:
            loss = loss.sum()
        loss.backward()
        epoch_loss += loss.item()
        # Clip BEFORE the optimizer step: clipping afterwards (as before) only
        # rescales gradients that have already been applied and are about to be zeroed.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
    # `device` is a torch.device, which never compares equal to the string
    # 'cuda'; compare the device type instead so the cache is actually released.
    if torch.device(device).type == 'cuda':
        torch.cuda.empty_cache()
    return epoch_loss / len(iterator)

In [None]:

def evaluate(model, iterator, criterion):
    """Score a labelled dataset and return loss plus classification metrics.

    Reads the notebook-level global ``device``.

    Args:
        model: called as ``model(input_ids, input_mask, labels=labels)``; the
            first two elements of its output are used as ``(loss, logits)``.
        iterator: sized iterable of ``(input_ids, input_mask, labels)`` tensor batches.
        criterion: unused (the loss comes from the model output); kept so
            existing call sites keep working.

    Returns:
        ``(mean_loss, accuracy, macro_f1, macro_recall, macro_precision, f1_sarcastic)``
        where ``f1_sarcastic`` is the binary F1 with label 1 as the positive class.
    """
    model.eval()
    epoch_loss = 0
    # Collected as plain Python ints rather than 0-d tensors, so scikit-learn
    # receives ordinary integer lists.
    all_pred = []
    all_label = []
    reduce_loss = torch.cuda.device_count() != 1
    with torch.no_grad():
        for batch in iterator:
            input_ids, input_mask, labels = (t.to(device) for t in batch)
            outputs = model(input_ids, input_mask, labels=labels)
            loss, logits = outputs[:2]
            del batch, input_ids, input_mask
            epoch_loss += (loss.sum() if reduce_loss else loss).item()
            _, predicted = torch.max(logits, 1)
            all_pred.extend(predicted.tolist())
            all_label.extend(labels.tolist())

    accuracy = accuracy_score(all_label, all_pred)
    # Debug dump of the predictions: once as a list, then one per line.
    print("all_pred", all_pred)
    for pred in all_pred:
        print(pred)
    f1_sarcastic = f1_score(all_label, all_pred, average='binary', pos_label=1)
    f1score = f1_score(all_label, all_pred, average='macro')
    recall = recall_score(all_label, all_pred, average='macro')
    precision = precision_score(all_label, all_pred, average='macro')
    return (epoch_loss / len(iterator)), accuracy, f1score, recall, precision, f1_sarcastic



# Configuration

In [None]:

# Settings for the fine-tuning phase; the next code cell unpacks them into notebook globals.
# NOTE(review): "checkpoint_dir" is later concatenated with task_name without a path
# separator, giving ".../isarcasticisarcastic_MARBERT_bert_ckpt/" — "fine_tuned_model_path"
# below already spells that directory name, so the two must be changed together.
config={"task_name": "isarcastic_MARBERT", #output directory name
             "data_dir": "./", #data directory
             "checkpoint_dir": "/gdrive/MyDrive/Master/Dataset/isarcastic",
             "train_file": "isarcastic/isarcastic_Ar_train.csv", #train file path
            #  "dev_file": "isarcastic/task_A_AR_test.csv", #dev file path or test file path
              "dev_file": "isarcastic/isarcastic_Ar_test1.csv", #dev file path or test file path

             "pretrained_model_path": 'MARBERT_pytorch_verison', #MARBERT checkpoint path
             "fine_tuned_model_path" : "/gdrive/MyDrive/Master/Dataset/isarcasticisarcastic_MARBERT_bert_ckpt/model_5",
             "epochs": 5, #number of epochs
             "content_col": "tweet", #text column
            #  "content_col": "text",
             "label_col": "sarcastic", #label column
             "lr": 2e-06, #learning rate
              "max_seq_length": 64 ,#128, #max sequence length (2 extra positions are added for [CLS]/[SEP] when padding)
              "batch_size": 32, #batch size
              "sortby":"val_acc"} #sort results based on val_acc or val_f1


# Run fine-tuning for 5 epochs

In [None]:
#---------------------------------------
# Unpack the config dict into the notebook-level names that the following cells read.
print ("[INFO] step (1) load train_test config file")
# config_file = open(config_file, 'r', encoding="utf8")
# config = json.load(config_file)
task_name = config["task_name"]
content_col = config["content_col"]
label_col = config["label_col"]
# File paths are plain string concatenations of data_dir and the relative file name.
train_file = config["data_dir"]+config["train_file"]
dev_file = config["data_dir"]+config["dev_file"]
sortby = config["sortby"]
max_seq_length= int(config["max_seq_length"])
batch_size = int(config["batch_size"])
lr_var = float(config["lr"])
model_path = config['pretrained_model_path']
num_epochs = config['epochs'] # Number of training epochs (authors recommend between 2 and 4)
# `global` at the top level of a cell has no effect; the name is a notebook global either way.
global label2idx_file
label2idx_file = config["data_dir"]+config["task_name"]+"_labels-dict.json"
#-------------------------------------------------------
print ("[INFO] step (2) convert labels2index")

In [None]:

def create_label2ind_file(file, label_col, out_path=None):
    """Build a label -> index mapping from the distinct values of one CSV column.

    Labels are sorted and numbered from 0. Keys are the string form of each
    label, which is how the data-preparation code looks labels up
    (``lab2ind[str(label)]``) and also makes the mapping JSON-serialisable
    (NumPy integer keys are rejected by ``json.dump``).

    Args:
        file: path of a comma-separated file with a header row.
        label_col: name of the label column.
        out_path: optional path; when given, the mapping is also written there
            as JSON. The default (``None``) writes nothing, as before.

    Returns:
        dict mapping ``str(label)`` to its integer index. Missing labels are ignored.
    """
    df = pd.read_csv(file, sep=",")
    # Distinct, non-null labels in ascending order.
    labels = sorted(df[label_col].dropna().unique())
    labels_json = {str(label): idx for idx, label in enumerate(labels)}
    if out_path is not None:
        with open(out_path, 'w') as json_file:
            json.dump(labels_json, json_file)
    return labels_json


In [None]:
# Derive the label set from the training file, then show the path configured for the label dictionary.
# NOTE(review): the file-writing lines inside create_label2ind_file are commented out, so this
# path is only printed here; the mapping actually used is hard-coded a few cells below.
create_label2ind_file(train_file, label_col)
print (label2idx_file)

In [None]:
#---------------------------------------------------------
print ("[INFO] step (3) check checkpoit directory and report file")
# NOTE(review): checkpoint_dir and task_name are joined without a path separator, so this is
# ".../isarcasticisarcastic_MARBERT_bert_ckpt/". The config's "fine_tuned_model_path" relies on
# that exact name, so the concatenation is intentionally left as it is.
ckpt_dir = config["checkpoint_dir"]+task_name+"_bert_ckpt/"
# Per-epoch metrics are appended to `report`; a copy sorted by `sortby` goes to `sorted_report`.
report = ckpt_dir+task_name+"_report.tsv"
sorted_report = ckpt_dir+task_name+"_report_sorted.tsv"
# makedirs also creates missing parent directories and is a no-op when the directory exists.
os.makedirs(ckpt_dir, exist_ok=True)

In [None]:

#-------------------------------------------------------
# Progress banner only; the dictionary itself is built in the next cells.
print ("[INFO] step (4) load label to number dictionary")

In [None]:
# Hard-coded label dictionary as a JSON string: label text "0"/"1" -> class index 0/1.
x =  '{ "0":0, "1":1}'

# parse x:
# NOTE(review): `y` is not read by any later cell in this notebook; `lab2ind` below parses `x` again.
y = json.loads(x)

In [None]:
# Label-text -> class-index mapping used by data_prepare_BERT and to size the classifier head.
lab2ind = json.loads(x)

In [None]:
# Echo the effective settings, then load the tokenizer from the pretrained MARBERT checkpoint path.
print ("[INFO] train_file", train_file)
print ("[INFO] dev_file", dev_file)
print ("[INFO] num_epochs", num_epochs)
print ("[INFO] model_path", model_path)
print ("max_seq_length", max_seq_length, "batch_size", batch_size)
#-------------------------------------------------------
print ("[INFO] step (5) Use defined funtion to extract tokanize data")
print ("loading BERT setting")
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
# Exploratory pass over the training file: tokenize every sentence WITHOUT padding or truncation
# so the following cells can inspect the token-length distribution. The steps mirror the first
# half of data_prepare_BERT; `input_ids` here is a list of variable-length id lists.
df = pd.read_csv(train_file, delimiter=',', header=0)
# Keep only rows that have both text and a label.
df = df[df[content_col].notnull()]
df = df[df[label_col].notnull()]
print("Data size ", df.shape)
sentences = df[content_col].values
# Wrap each sentence in the BERT sentence markers.
sentences = ["[CLS] " + sentence+ " [SEP]" for sentence in sentences]
print ("The first sentence:")
print (sentences[0])
labels = df[label_col].values
print(labels)
# Labels are looked up by their string form.
labels = [lab2ind[ str(i)] for i in labels]
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [None]:
# Number of tokenized training sentences.
len(input_ids)

In [None]:
# Token count (including the [CLS]/[SEP] markers) of every tokenized training sentence.
seq_len = list(map(len, input_ids))

# Distribution of those lengths, to judge whether max_seq_length is adequate.
pd.Series(seq_len).hist(bins=30)

In [None]:
# Longest tokenized training sentence, in tokens.
max(seq_len)

In [None]:
def data_prepare_BERT(file_path, lab2ind, tokenizer, content_col, label_col, MAX_LEN):
  """Load a labelled CSV and turn it into padded BERT input tensors.

  Each text is tokenized, cut to at most ``MAX_LEN`` word pieces, wrapped in
  ``[CLS]`` ... ``[SEP]`` and right-padded with ``[PAD]`` to ``MAX_LEN + 2``
  positions. Truncation happens before the markers are added, so long texts
  keep their closing ``[SEP]`` (padding the already-marked sequence with
  post-truncation used to cut it off).

  Args:
    file_path: comma-separated file with a header row.
    lab2ind: mapping from ``str(label)`` to class index.
    tokenizer: object providing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
    content_col: name of the text column.
    label_col: name of the label column.
    MAX_LEN: maximum number of word pieces kept per text (markers excluded).

  Returns:
    ``(inputs, labels, masks)``: ``inputs`` and ``masks`` are int64 tensors of
    shape ``(rows, MAX_LEN + 2)``; ``masks`` is 1 on real tokens and 0 on
    padding; ``labels`` holds the class index of each row.

  Raises:
    KeyError: if a label's string form is not a key of ``lab2ind``.
  """
  df = pd.read_csv(file_path, delimiter=',', header=0)
  # Rows missing either the text or the label cannot be used.
  df = df[df[content_col].notnull()]
  df = df[df[label_col].notnull()]
  print("Data size ", df.shape)
  # str() guards against non-string cells (e.g. a purely numeric text).
  texts = [str(text) for text in df[content_col].values]
  raw_labels = df[label_col].values
  print(raw_labels)
  labels = [lab2ind[str(i)] for i in raw_labels]
  tokenized_texts = [['[CLS]'] + tokenizer.tokenize(text)[:MAX_LEN] + ['[SEP]'] for text in texts]
  id_rows = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
  pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
  width = MAX_LEN + 2
  inputs = torch.full((len(id_rows), width), pad_ind, dtype=torch.long)
  # The mask is derived from each row's true length instead of "id > 0", so it
  # stays correct whatever id the tokenizer assigns to [PAD].
  masks = torch.zeros((len(id_rows), width), dtype=torch.long)
  for row, ids in enumerate(id_rows):
    inputs[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    masks[row, :len(ids)] = 1
  if texts:
    # Show the first example at every stage as a sanity check.
    print ("The first sentence:")
    print ("[CLS] " + texts[0] + " [SEP]")
    print ("Tokenize the first sentence:")
    print (tokenized_texts[0])
    print ("Index numbers of the first sentence:")
    print (id_rows[0])
    print ("Index numbers of the first sentence after padding:\n",inputs[0].numpy())
  return inputs, torch.tensor(labels), masks


In [None]:
# Tensorize the training split (token ids, class indices, attention masks).
train_inputs, train_labels, train_masks = data_prepare_BERT(train_file, lab2ind, tokenizer,content_col, label_col, max_seq_length)

In [None]:
# Tensorize the labelled dev split with the same settings as the training split.
validation_inputs, validation_labels, validation_masks = data_prepare_BERT(dev_file, lab2ind, tokenizer, content_col, label_col,max_seq_length)

In [None]:
# Load the checkpoint at model_path with a classification head sized to the number of labels in lab2ind.
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(lab2ind))

In [None]:
#--------------------------------------
print ("[INFO] step (6) Create an iterator of data with torch DataLoader.")
# Tensor order (ids, masks, labels) is the order train()/evaluate() unpack each batch in.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# Reshuffle the training examples every epoch; without it every epoch sees the
# batches in file order.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
#---------------------------
# Evaluation keeps the file order so predictions line up with the input rows.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)
#------------------------------------------

In [None]:
# Place the model on the available hardware, then build the optimizer, LR schedule and loss.
print ("[INFO] step (7) run with parallel GPUs")
if torch.cuda.is_available():
  if torch.cuda.device_count() == 1:
    print("Run", "with one GPU")
    model = model.to(device)
  else:
    # Several GPUs: wrap the model in DataParallel over the GPUs GPUtil reports (at most 4 requested).
    n_gpu = torch.cuda.device_count()
    print("Run", "with", n_gpu, "GPUs with max 4 GPUs")
    device_ids = GPUtil.getAvailable(limit = 4)
    torch.backends.cudnn.benchmark = True
    model = model.to(device)
    model = nn.DataParallel(model, device_ids=device_ids)
else:
  print("Run", "with CPU")
  model = model
#---------------------------------------------------
print ("[INFO] step (8) set Parameters, schedules, and loss function")
# `global` at cell top level has no effect; train() reads max_grad_norm as a notebook global.
global max_grad_norm
max_grad_norm = 1.0
warmup_proportion = 0.1
# One scheduler step per batch, so the schedule spans batches-per-epoch * epochs steps.
num_training_steps	= len(train_dataloader) * num_epochs
# int * 0.1, so this is a float.
num_warmup_steps = num_training_steps * warmup_proportion
optimizer = AdamW(model.parameters(), lr=lr_var, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
# Passed to train()/evaluate(), which take the loss from the model output and do not use it.
criterion = nn.CrossEntropyLoss()

In [None]:

#---------------------------------------------------
print ("[INFO] step (9) start fine_tuning")
# For every epoch: train, evaluate on the dev split, save a checkpoint under
# ckpt_dir/model_<epoch>/, append the metrics to the report and refresh the sorted report.
for epoch in trange(num_epochs, desc="Epoch"):
  epoch_num = int(epoch + 1)
  train_loss = train(model, train_dataloader, optimizer, scheduler, criterion)
  val_loss, val_acc, val_f1, val_recall, val_precision, f1_sarcastic = evaluate(model, validation_dataloader, criterion)
  epoch_dir = ckpt_dir + 'model_' + str(epoch_num) + '/'
  os.makedirs(epoch_dir, exist_ok=True)
  # A DataParallel wrapper has no save_pretrained(); save the wrapped model instead.
  model_to_save = model.module if hasattr(model, "module") else model
  model_to_save.save_pretrained(epoch_dir)
  # Store the tokenizer files next to the weights so the checkpoint directory can be
  # loaded on its own with BertTokenizer.from_pretrained (no manual vocab.txt copy).
  tokenizer.save_pretrained(epoch_dir)
  epoch_eval_results = {"epoch_num":epoch_num,
          "val_acc":val_acc, "val_recall":val_recall, "val_precision":val_precision, "val_f1":val_f1,"lr":lr_var, "f1_sarcastic":f1_sarcastic }
  # One JSON object per line; the file is opened in append mode, so re-running this
  # cell adds further lines to an existing report.
  with open(report,"a") as fOut:
    fOut.write(json.dumps(epoch_eval_results)+"\n")
  #------------------------------------
  # Re-read the full report and write a copy sorted by the configured metric, best first.
  report_df = pd.read_json(report, orient='records', lines=True)
  report_df = report_df.sort_values(by=[sortby],ascending=False)
  report_df.to_csv(sorted_report,sep="\t",index=False)


In [None]:
# Top five epochs by the configured sort metric.
report_df.head(5)

# Load the fine-tuned model and predict on the test set

In [None]:
#---------------------------------------------------
# Progress banner for the prediction phase.
print ("[INFO] step (10) testing")

In [None]:
# Same checkpoint directory as in training (checkpoint_dir and task_name are joined without a
# path separator); the report paths are re-pointed at separate "_test_report" files.
ckpt_dir = config["checkpoint_dir"]+task_name+"_bert_ckpt/"
report = ckpt_dir+task_name+"_test_report.tsv"
sorted_report = ckpt_dir+task_name+"_test_report_sorted.tsv"
# makedirs also creates missing parent directories and is a no-op when the directory exists.
os.makedirs(ckpt_dir, exist_ok=True)

In [None]:
# List the files of the checkpoint directory that the testing config points at.
ls /gdrive/MyDrive/Master/Dataset/isarcasticisarcastic_MARBERT_bert_ckpt/model_7

In [None]:

# Settings for the prediction phase. Only dev_file, content_col and fine_tuned_model_path are
# re-read by the cells below; the other globals keep the values unpacked from the training config.
# NOTE(review): "fine_tuned_model_path" points at model_7 while "epochs" is 5 and the training
# loop above saves model_1..model_<epochs> — presumably produced by a longer run; confirm.
config={"task_name": "isarcastic_MARBERT", #output directory name
             "data_dir": "./", #data directory
             "checkpoint_dir": "/gdrive/MyDrive/Master/Dataset/isarcastic",
             "train_file": "isarcastic/isarcastic_Ar_train.csv", #train file path
             "dev_file": "isarcastic/task_A_AR_test.csv", #dev file path or test file path
            # "dev_file": "isarcastic/isarcastic_Ar_test2.csv", #dev file path or test file path

             "pretrained_model_path": 'MARBERT_pytorch_verison', #MARBERT checkpoint path
             "fine_tuned_model_path" : "/gdrive/MyDrive/Master/Dataset/isarcasticisarcastic_MARBERT_bert_ckpt/model_7",
             "epochs": 5, #number of epochs
            #  "content_col": "tweet", #text column
             "content_col": "text",
             "label_col": "sarcastic", #label column
             "lr": 2e-06, #learning rate
              "max_seq_length": 64, #max sequence length
              "batch_size": 32, #batch size
              "sortby":"val_acc"} #sort results based on val_acc or val_f1


In [None]:
# Re-read the two settings that differ for the test file: its path and its text column name.
dev_file = config["data_dir"]+config["dev_file"]
content_col = config["content_col"]

In [None]:
# From here on model_path is the fine-tuned checkpoint directory, not the pretrained MARBERT path.
model_path = config['fine_tuned_model_path']

In [None]:
# Copy vocab.txt from the model_4 checkpoint into model_7 — presumably because the next cell loads
# the tokenizer from model_7 and the training loop stores only the model there; confirm.
!cp '/gdrive/MyDrive/Master/Dataset/isarcasticisarcastic_MARBERT_bert_ckpt/model_4/vocab.txt' '/gdrive/MyDrive/Master/Dataset/isarcasticisarcastic_MARBERT_bert_ckpt/model_7/vocab.txt'

In [None]:
# Reload the tokenizer, this time from the fine-tuned checkpoint directory.
print ("loading BERT setting")
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:

def data_prepare_BERT(file_path, lab2ind, tokenizer, content_col, label_col, MAX_LEN):
  """Load an unlabelled CSV and turn it into padded BERT input tensors.

  This redefinition replaces the labelled variant defined earlier in the
  notebook: it reads no labels and returns two tensors instead of three.

  Each text is tokenized, cut to at most ``MAX_LEN`` word pieces, wrapped in
  ``[CLS]`` ... ``[SEP]`` and right-padded with ``[PAD]`` to ``MAX_LEN + 2``
  positions. Truncation happens before the markers are added, so long texts
  keep their closing ``[SEP]`` (padding the already-marked sequence with
  post-truncation used to cut it off).

  Args:
    file_path: comma-separated file with a header row.
    lab2ind: unused; kept so the call signature matches the labelled variant.
    tokenizer: object providing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
    content_col: name of the text column.
    label_col: unused; kept so the call signature matches the labelled variant.
    MAX_LEN: maximum number of word pieces kept per text (markers excluded).

  Returns:
    ``(inputs, masks)``: int64 tensors of shape ``(rows, MAX_LEN + 2)``;
    ``masks`` is 1 on real tokens and 0 on padding.
  """
  # Use pandas to load dataset
  df = pd.read_csv(file_path, delimiter=',', header=0)
  df = df[df[content_col].notnull()]
  print("Data size ", df.shape)
  # str() guards against non-string cells (e.g. a purely numeric text).
  texts = [str(text) for text in df[content_col].values]
  tokenized_texts = [['[CLS]'] + tokenizer.tokenize(text)[:MAX_LEN] + ['[SEP]'] for text in texts]
  id_rows = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
  pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
  width = MAX_LEN + 2
  inputs = torch.full((len(id_rows), width), pad_ind, dtype=torch.long)
  # The mask is derived from each row's true length instead of "id > 0", so it
  # stays correct whatever id the tokenizer assigns to [PAD].
  masks = torch.zeros((len(id_rows), width), dtype=torch.long)
  for row, ids in enumerate(id_rows):
    inputs[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    masks[row, :len(ids)] = 1
  if texts:
    # Show the first example at every stage as a sanity check.
    print ("The first sentence:")
    print ("[CLS] " + texts[0] + " [SEP]")
    print ("Tokenize the first sentence:")
    print (tokenized_texts[0])
    print ("Index numbers of the first sentence:")
    print (id_rows[0])
    print ("Index numbers of the first sentence after padding:\n",inputs[0].numpy())
  return inputs, masks


In [None]:
# Explicitly point dev_file at the task-A test CSV (the same file the testing config names).
dev_file= "isarcastic/task_A_AR_test.csv" 

In [None]:
# The task-A test file keeps its text in the "text" column (the training file uses "tweet").
content_col ="text"

In [None]:
# Tensorize the unlabelled test file with the redefined two-output data_prepare_BERT, then load the
# fine-tuned checkpoint. label_col and max_seq_length still hold the values unpacked for training.
validation_inputs, validation_masks = data_prepare_BERT(dev_file, lab2ind, tokenizer, content_col, label_col,max_seq_length)
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(lab2ind))
#--------------------------------------
print ("[INFO] step (6') Create an iterator of data with torch DataLoader.")

In [None]:
# No labels here: each item is (input ids, attention mask), matching what the test-time evaluate() unpacks.
validation_data = TensorDataset(validation_inputs, validation_masks)


In [None]:

#---------------------------
# Batches are served in file order, so predictions line up with the rows of the test file.
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)
#------------------------------------------

In [None]:


# Place the freshly loaded fine-tuned model on the available hardware (same logic as before training).
print ("[INFO] step (7') run with parallel GPUs")
if torch.cuda.is_available():
  if torch.cuda.device_count() == 1:
    print("Run", "with one GPU")
    model = model.to(device)
  else:
    # Several GPUs: wrap the model in DataParallel over the GPUs GPUtil reports (at most 4 requested).
    n_gpu = torch.cuda.device_count()
    print("Run", "with", n_gpu, "GPUs with max 4 GPUs")
    device_ids = GPUtil.getAvailable(limit = 4)
    torch.backends.cudnn.benchmark = True
    model = model.to(device)
    model = nn.DataParallel(model, device_ids=device_ids)
else:
  print("Run", "with CPU")
  model = model
#---------------------------------------------------
print ("[INFO] step (8) set Parameters, schedules, and loss function")
# `global` at cell top level has no effect.
global max_grad_norm
max_grad_norm = 1.0
warmup_proportion = 0.1
# NOTE(review): no later cell in this notebook steps this optimizer; prediction only calls evaluate().
optimizer = AdamW(model.parameters(), lr=lr_var, correct_bias=False) 
# Passed to evaluate() below, which does not use it.
criterion = nn.CrossEntropyLoss()

In [None]:

def evaluate(model, iterator, criterion):
  """Predict a class index for every example served by ``iterator``.

  This redefinition replaces the labelled evaluate() defined earlier in the
  notebook: batches carry no labels and no metrics are computed. Reads the
  notebook-level global ``device``.

  Args:
    model: called as ``model(input_ids, input_mask)``; the first element of
      its output is used as the logits.
    iterator: iterable of ``(input_ids, input_mask)`` tensor batches.
    criterion: unused; kept so the call signature matches the labelled variant.

  Returns:
    List with one 0-d tensor per example holding the arg-max class index.
  """
  model.eval()
  predictions = []
  with torch.no_grad():
    for batch in iterator:
      input_ids, input_mask = tuple(t.to(device) for t in batch)
      logits = model(input_ids, input_mask)[0]
      # Drop references to the inputs early to free device memory.
      del batch, input_ids, input_mask
      # torch.max over dim 1 returns (values, indices); only the indices are kept.
      _, predicted = torch.max(logits.cpu().data, 1)
      predictions.extend(predicted)
  return predictions



In [None]:
# NOTE(review): `epoch` is not read by any later cell in this notebook — looks like a leftover; confirm before removing.
epoch = 6

In [None]:
# Predict a class for every row of the test file, using the most recently defined evaluate().
test_all_pred = evaluate(model, validation_dataloader, criterion)

In [None]:
# Write the submission file: a "task_a_ar" header line followed by one predicted
# class index per line. The context manager closes the file even if a write fails.
with open("task_a_ar.txt", "w") as textfile:
    textfile.write("task_a_ar" + "\n")
    for i in test_all_pred:
        # int() accepts both 0-d tensors and plain Python ints.
        textfile.write(str(int(i)) + "\n")

In [None]:
# Copy the submission file to Drive.
!cp    "task_a_ar.txt" /gdrive/MyDrive/Master/Dataset/Outputs/task_a_ar.txt