# Download MARBERT checkpoint

In [None]:
!wget https://huggingface.co/UBC-NLP/MARBERT/resolve/main/MARBERT_pytorch_verison.tar.gz

In [None]:
!tar -xvf MARBERT_pytorch_verison.tar.gz

In [None]:
!wget https://raw.githubusercontent.com/UBC-NLP/marbert/main/examples/UBC_AJGT_final_shuffled_train.tsv
!wget https://raw.githubusercontent.com/UBC-NLP/marbert/main/examples/UBC_AJGT_final_shuffled_test.tsv

In [None]:
!mkdir -p AJGT
!mv UBC_AJGT_final_shuffled_train.tsv ./AJGT/UBC_AJGT_final_shuffled_train.tsv
!mv UBC_AJGT_final_shuffled_test.tsv ./AJGT/UBC_AJGT_final_shuffled_test.tsv

In [None]:
!pip install GPUtil pytorch_pretrained_bert transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/")

# Fine-tuning code

In [None]:
# (1)load libraries 
import json, sys, regex
import torch
import GPUtil
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig, BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
##----------------------------------------------------
from transformers import *
from transformers import XLMRobertaConfig
from transformers import XLMRobertaModel
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaModel
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel


In [None]:
import re

In [None]:
# Pick the compute device once: CUDA when torch reports it as available,
# otherwise the CPU. `train` and `evaluate` read this module-level `device`
# when they move each batch with `t.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("your device ", device)

In [None]:
def create_label2ind_file(file, label_col, output_file=None):
    """Build the label -> index mapping of a dataset and save it as JSON.

    The CSV at ``file`` is loaded and the long-form sentiment names
    ("neutral", "negative", "positive") found in ``label_col`` are
    normalised to the short codes "NEU", "NEG" and "POS". The distinct
    labels are then numbered from 0 in ascending sorted order.

    Args:
        file: Path of the CSV file holding the labelled examples.
        label_col: Name of the column that stores the labels.
        output_file: Where to write the JSON mapping. When omitted, the
            module-level ``label2idx_file`` (set by ``fine_tuning``) is used.

    Returns:
        The ``{label: index}`` dictionary that was written to disk.
    """
    long_to_short = {"neutral": "NEU", "negative": "NEG", "positive": "POS"}

    # load train_dev_test file
    df = pd.read_csv(file)

    # Normalise the requested column itself (the column name used to be
    # hard-coded to "sentiment"). Rows without a label are skipped because
    # NaN can neither be sorted together with strings nor serve as a class.
    labels = df[label_col].dropna().map(lambda value: long_to_short.get(value, value))

    # get labels, sort them A-Z and convert them to indexes
    labels_json = {
        label: idx for idx, label in enumerate(sorted(labels.unique().tolist()))
    }

    target = label2idx_file if output_file is None else output_file
    with open(target, 'w') as json_file:
        json.dump(labels_json, json_file)
    return labels_json

In [None]:
def clean_text(text):
    """Normalise a raw tweet and return the cleaned string.

    Steps, in order: remove Arabic diacritics, drop every character that is
    neither a word character nor whitespace, drop Latin letters, turn digit
    runs into a space, collapse whitespace, collapse a doubled waw / ya /
    alef once, apply the character substitution table below, and finally
    strip surrounding whitespace.
    """
    # Substitution table (ref: https://github.com/bakrianoo/aravec), applied
    # one pair after another in exactly this order.
    substitutions = (
        ("أ", "ا"),
        ("إ", "ا"),
        ("آ", "ا"),
        ("ة", "ه"),
        ("_", " "),
        ("-", " "),
        ("/", ""),
        (".", ""),
        ("،", ""),
        (" و ", " و"),
        (" يا ", " يا"),
        ('"', ""),
        ("ـ", ""),
        ("'", ""),
        ("ى", "ي"),
        ("\\", ""),
        ('\n', ' '),
        ('\t', ' '),
        ('&quot;', ' '),
        ('?', ' ? '),
        ('؟', ' ؟ '),
        ('!', ' ! '),
    )
    regex_steps = (
        (r'[\u0617-\u061A\u064B-\u0652]', ""),  # tashkeel (diacritics)
        (r"[^\w\s]", ''),
        (r"[a-zA-Z]", ''),
        (r"\d+", ' '),
        (r"\n+", ' '),
        (r"\t+", ' '),
        (r"\r+", ' '),
        (r"\s+", ' '),
    )
    doubled_letters = (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا'))

    cleaned = text
    for pattern, replacement in regex_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    for doubled, single in doubled_letters:
        cleaned = cleaned.replace(doubled, single)
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()

In [None]:
def remove_urls(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` token deleted.

    A link runs up to the next whitespace character; the whitespace around
    it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-


def data_prepare_BERT(
    file_path,
    lab2ind,
    tokenizer,
    content_col,
    label_col,
    MAX_LEN,
    ):
    """Load a labelled CSV and turn it into padded BERT input tensors.

    Args:
        file_path: Path of the CSV file to read.
        lab2ind: Mapping from label string to class index.
        tokenizer: BERT tokenizer offering ``tokenize`` and
            ``convert_tokens_to_ids``.
        content_col: Name of the text column.
        label_col: Name of the label column.
        MAX_LEN: Maximum number of text tokens; sequences are padded or
            truncated to ``MAX_LEN + 2`` ids to leave room for [CLS]/[SEP].

    Returns:
        ``(inputs, labels, masks)`` torch tensors: token ids, class indices
        and the attention mask (1.0 for real tokens, 0.0 for padding).

    Raises:
        ValueError: If no row has both a text and a label.
        KeyError: If a label is missing from ``lab2ind``.
    """
    # Use pandas to load dataset; rows lacking text or label are dropped.
    df = pd.read_csv(file_path)
    df = df.dropna(subset=[content_col, label_col]).copy()
    if df.empty:
        raise ValueError('No usable rows found in ' + str(file_path))

    # Work on ``content_col`` (the column used to be hard-coded to "tweet")
    # and strip URLs *before* clean_text removes the punctuation that the
    # URL pattern needs in order to match.
    df[content_col] = df[content_col].apply(remove_urls).apply(clean_text)
    print ('Data size ', df.shape)

    # Create sentence list
    sentences = ['[CLS] ' + sentence + ' [SEP]'
                 for sentence in df[content_col].values]
    print ('The first sentence:')
    print (sentences[0])

    # Create label list. The label file produced during training stores the
    # short codes only, so long-form names are translated before the lookup.
    long_to_short = {'neutral': 'NEU', 'negative': 'NEG', 'positive': 'POS'}
    labels = []
    for raw_label in df[label_col].values:
        if raw_label not in lab2ind:
            raw_label = long_to_short.get(raw_label, raw_label)
        labels.append(lab2ind[raw_label])

    # Convert the text into tokens that correspond to BERT's vocabulary.
    # Over-long sequences are cut here, re-attaching the closing [SEP] that
    # plain post-truncation would silently drop.
    max_tokens = MAX_LEN + 2
    tokenized_texts = []
    for sent in sentences:
        tokens = tokenizer.tokenize(sent)
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens - 1] + ['[SEP]']
        tokenized_texts.append(tokens)
    print ('Tokenize the first sentence:')
    print (tokenized_texts[0])

    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in
                 tokenized_texts]
    print ('Index numbers of the first sentence:')
    print (input_ids[0])

    # Pad our input sequence to the fixed length with the index of the [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
    input_ids = pad_sequences(
        input_ids,
        maxlen=max_tokens,
        dtype='long',
        truncating='post',
        padding='post',
        value=pad_ind,
        )
    print ('Index numbers of the first sentence after padding:\n',
           input_ids[0])

    # Create attention masks: 1 for each real token, 0 for padding. Compare
    # against the actual pad id instead of assuming that it is 0.
    attention_masks = [[float(token_id != pad_ind) for token_id in seq]
                       for seq in input_ids]

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)
    labels = torch.tensor(labels)
    masks = torch.tensor(attention_masks)
    return inputs, labels, masks


In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-


def data_prepare_BERT_test(
    file_path,
    lab2ind,
    tokenizer,
    content_col,

    MAX_LEN,
    ):
    """Load an unlabelled CSV and turn it into padded BERT input tensors.

    Args:
        file_path: Path of the CSV file to read.
        lab2ind: Not used by this function; kept so existing callers that
            pass it positionally keep working.
        tokenizer: BERT tokenizer offering ``tokenize`` and
            ``convert_tokens_to_ids``.
        content_col: Name of the text column.
        MAX_LEN: Maximum number of text tokens; sequences are padded or
            truncated to ``MAX_LEN + 2`` ids to leave room for [CLS]/[SEP].

    Returns:
        ``(inputs, masks)`` torch tensors: token ids and the attention mask
        (1.0 for real tokens, 0.0 for padding).

    Raises:
        ValueError: If no row has a text.
    """
    # Use pandas to load dataset; rows without text are dropped.
    df = pd.read_csv(file_path)
    df = df.dropna(subset=[content_col]).copy()
    if df.empty:
        raise ValueError('No usable rows found in ' + str(file_path))

    # Same preprocessing as the training data: work on ``content_col`` (the
    # column used to be hard-coded to "tweet"), strip URLs, then normalise.
    df[content_col] = df[content_col].apply(remove_urls).apply(clean_text)

    print ('Data size ', df.shape)

    # Create sentence list
    sentences = ['[CLS] ' + sentence + ' [SEP]'
                 for sentence in df[content_col].values]
    print ('The first sentence:')
    print (sentences[0])

    # Convert the text into tokens that correspond to BERT's vocabulary.
    # Over-long sequences are cut here, re-attaching the closing [SEP] that
    # plain post-truncation would silently drop.
    max_tokens = MAX_LEN + 2
    tokenized_texts = []
    for sent in sentences:
        tokens = tokenizer.tokenize(sent)
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens - 1] + ['[SEP]']
        tokenized_texts.append(tokens)
    print ('Tokenize the first sentence:')
    print (tokenized_texts[0])

    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in
                 tokenized_texts]
    print ('Index numbers of the first sentence:')
    print (input_ids[0])

    # Pad our input sequence to the fixed length with the index of the [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
    input_ids = pad_sequences(
        input_ids,
        maxlen=max_tokens,
        dtype='long',
        truncating='post',
        padding='post',
        value=pad_ind,
        )
    print ('Index numbers of the first sentence after padding:\n',
           input_ids[0])

    # Create attention masks: 1 for each real token, 0 for padding. Compare
    # against the actual pad id instead of assuming that it is 0.
    attention_masks = [[float(token_id != pad_ind) for token_id in seq]
                       for seq in input_ids]

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    return inputs,  masks


In [None]:
# Function to calculate the accuracy of our predictions vs labels
# def flat_accuracy(preds, labels):
#	  pred_flat = np.argmax(preds, axis=1).flatten()
#	  labels_flat = labels.flatten()
#	  return np.sum(pred_flat == labels_flat) / len(labels_flat)
def flat_pred(preds, labels):
    """Return ``(predicted_classes, true_classes)`` as flat Python lists.

    The predicted class of a row is the argmax of ``preds`` along axis 1;
    ``labels`` is flattened unchanged.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten().tolist()
    true_classes = labels.flatten().tolist()
    return predicted_classes, true_classes

In [None]:

def train(model, iterator, optimizer, scheduler, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose first output is the loss.
        iterator: Sized iterable yielding ``(input_ids, input_mask, labels)``
            tensor batches.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Unused (the model computes its own loss); kept for
            interface compatibility.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.

    Relies on the module-level ``device`` and ``max_grad_norm``.
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        # Add batch to the compute device and unpack it
        input_ids, input_mask, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # Pass the mask by keyword so it cannot land in another positional slot.
        outputs = model(input_ids, attention_mask=input_mask, labels=labels)
        # With nn.DataParallel the loss holds one value per GPU; sum() is a
        # no-op for the scalar loss of the single-device case.
        loss = outputs[0].sum()
        # delete used variables to free GPU memory
        del batch, input_ids, input_mask, labels
        loss.backward()
        epoch_loss += loss.item()
        # Gradient clipping is not in AdamW anymore, so clip explicitly. It
        # has to happen BEFORE optimizer.step(); clipping after the step
        # leaves the applied update unbounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
    # free GPU memory (``device`` is a torch.device, so compare its type
    # rather than the object itself with the string 'cuda')
    if torch.device(device).type == 'cuda':
        torch.cuda.empty_cache()
    return epoch_loss / len(iterator)

In [None]:

def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` without updating its weights.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose first two outputs are the loss and the logits.
        iterator: Sized iterable yielding ``(input_ids, input_mask, labels)``
            tensor batches.
        criterion: Unused (the model computes its own loss); kept for
            interface compatibility.

    Returns:
        ``(mean_loss, accuracy, f1, recall, precision)`` where the mean loss
        is per batch and f1 / recall / precision are macro-averaged.

    Relies on the module-level ``device``.
    """
    model.eval()
    epoch_loss = 0
    all_pred = []
    all_label = []
    with torch.no_grad():
        for batch in iterator:
            # Add batch to the compute device and unpack it
            input_ids, input_mask, labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask=input_mask, labels=labels)
            loss, logits = outputs[:2]
            # delete used variables to free GPU memory
            del batch, input_ids, input_mask
            # sum() folds the per-GPU losses of nn.DataParallel and is a
            # no-op for the scalar loss of the single-device case.
            epoch_loss += loss.sum().item()
            # identify the predicted class for each example in the batch
            _, predicted = torch.max(logits, 1)
            # collect plain Python ints rather than 0-dim tensors so the
            # metric functions receive ordinary label lists
            all_pred.extend(predicted.cpu().tolist())
            all_label.extend(labels.cpu().tolist())
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro')
    recall = recall_score(all_label, all_pred, average='macro')
    precision = precision_score(all_label, all_pred, average='macro')
    return (epoch_loss / len(iterator)), accuracy, f1score, recall, precision



In [None]:

def fine_tuning(config):
    """Fine-tune a BERT sequence classifier as described by ``config``.

    Args:
        config: Dictionary with the keys
            ``task_name`` (prefix of the label file and checkpoint directory),
            ``data_dir``, ``train_file``, ``dev_file``,
            ``pretrained_model_path``, ``epochs``, ``content_col``,
            ``label_col``, ``lr``, ``max_seq_length``, ``batch_size`` and
            ``sortby`` (report column used to rank the epochs).

    Returns:
        DataFrame with one row per line of the report file, sorted
        descending by ``sortby``; ``None`` when ``epochs`` is 0.

    Side effects:
        Sets the module-level ``label2idx_file`` and ``max_grad_norm`` (read
        by ``create_label2ind_file`` and ``train``), writes the label
        mapping, saves one checkpoint directory per epoch and appends one
        JSON line per epoch to the report file.
    """
    #---------------------------------------
    print ("[INFO] step (1) load train_test config file")
    task_name = config["task_name"]
    content_col = config["content_col"]
    label_col = config["label_col"]
    data_dir = config["data_dir"]
    # os.path.join tolerates a data_dir given without a trailing separator.
    train_file = os.path.join(data_dir, config["train_file"])
    dev_file = os.path.join(data_dir, config["dev_file"])
    sortby = config["sortby"]
    max_seq_length = int(config["max_seq_length"])
    batch_size = int(config["batch_size"])
    lr_var = float(config["lr"])
    model_path = config['pretrained_model_path']
    num_epochs = int(config['epochs']) # Number of training epochs (authors recommend between 2 and 4)
    global label2idx_file
    label2idx_file = os.path.join(data_dir, task_name + "_labels-dict.json")
    #-------------------------------------------------------
    print ("[INFO] step (2) convert labels2index")
    create_label2ind_file(train_file, label_col)
    print (label2idx_file)
    #---------------------------------------------------------
    print ("[INFO] step (3) check checkpoit directory and report file")
    ckpt_dir = os.path.join(data_dir, task_name + "_bert_ckpt")
    report = os.path.join(ckpt_dir, task_name + "_report.tsv")
    sorted_report = os.path.join(ckpt_dir, task_name + "_report_sorted.tsv")
    os.makedirs(ckpt_dir, exist_ok=True)
    #-------------------------------------------------------
    print ("[INFO] step (4) load label to number dictionary")
    with open(label2idx_file) as label_file:
        lab2ind = json.load(label_file)
    print ("[INFO] train_file", train_file)
    print ("[INFO] dev_file", dev_file)
    print ("[INFO] num_epochs", num_epochs)
    print ("[INFO] model_path", model_path)
    print ("max_seq_length", max_seq_length, "batch_size", batch_size)
    #-------------------------------------------------------
    print ("[INFO] step (5) Use defined funtion to extract tokanize data")
    # tokenizer from pre-trained BERT model
    print ("loading BERT setting")
    tokenizer = BertTokenizer.from_pretrained(model_path)
    train_inputs, train_labels, train_masks = data_prepare_BERT(
        train_file, lab2ind, tokenizer, content_col, label_col, max_seq_length)
    validation_inputs, validation_labels, validation_masks = data_prepare_BERT(
        dev_file, lab2ind, tokenizer, content_col, label_col, max_seq_length)
    # Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(lab2ind))
    #--------------------------------------
    print ("[INFO] step (6) Create an iterator of data with torch DataLoader.")
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    # Draw the training batches in a fresh random order every epoch.
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
    #---------------------------
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_dataloader = DataLoader(validation_data, batch_size=batch_size)
    #------------------------------------------
    print ("[INFO] step (7) run with parallel GPUs")
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
        if n_gpu == 1:
            print("Run", "with one GPU")
            model = model.to(device)
        else:
            print("Run", "with", n_gpu, "GPUs with max 4 GPUs")
            device_ids = GPUtil.getAvailable(limit = 4)
            torch.backends.cudnn.benchmark = True
            model = model.to(device)
            model = nn.DataParallel(model, device_ids=device_ids)
    else:
        print("Run", "with CPU")
    #---------------------------------------------------
    print ("[INFO] step (8) set Parameters, schedules, and loss function")
    global max_grad_norm
    max_grad_norm = 1.0
    warmup_proportion = 0.1
    num_training_steps = len(train_dataloader) * num_epochs
    # The scheduler counts whole optimizer steps, so hand it an integer.
    num_warmup_steps = int(num_training_steps * warmup_proportion)
    ### In Transformers, optimizer and schedules are instantiated like this:
    # Note: AdamW is a class from the huggingface library
    # the 'W' stands for 'Weight Decay"
    optimizer = AdamW(model.parameters(), lr=lr_var, correct_bias=False)
    # schedules
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
    # We use nn.CrossEntropyLoss() as our loss function.
    criterion = nn.CrossEntropyLoss()
    #---------------------------------------------------
    print ("[INFO] step (9) start fine_tuning")
    report_df = None  # stays None when num_epochs is 0
    for epoch in trange(num_epochs, desc="Epoch"):
        epoch_num = int(epoch + 1)
        train_loss = train(model, train_dataloader, optimizer, scheduler, criterion)
        val_loss, val_acc, val_f1, val_recall, val_precision = evaluate(model, validation_dataloader, criterion)
        # Create checkpoint at end of each epoch
        epoch_dir = os.path.join(ckpt_dir, 'model_' + str(epoch_num))
        os.makedirs(epoch_dir, exist_ok=True)
        # nn.DataParallel has no save_pretrained(); save the wrapped model.
        model_to_save = model.module if isinstance(model, nn.DataParallel) else model
        model_to_save.save_pretrained(epoch_dir)
        epoch_eval_results = {"epoch_num": epoch_num, "train_loss": train_loss,
                              "val_acc": val_acc, "val_recall": val_recall,
                              "val_precision": val_precision, "val_f1": val_f1, "lr": lr_var}
        # The report is opened in append mode, so it accumulates across runs.
        with open(report, "a") as fOut:
            fOut.write(json.dumps(epoch_eval_results) + "\n")
        #------------------------------------
        report_df = pd.read_json(report, orient='records', lines=True)
        report_df.sort_values(by=[sortby], ascending=False, inplace=True)
        report_df.to_csv(sorted_report, sep="\t", index=False)
    return report_df

# Run fine-tuning for 5 epochs

In [None]:

# Settings consumed by fine_tuning(); fill in the empty paths before running.
config={"task_name": "Ironic_MARBERT_Task_2", # task name; prefix of the checkpoint directory and label file
             "data_dir": "", # data directory, prepended as-is to the file names (include the trailing "/")
             "train_file": "", # train file name inside data_dir
             "dev_file": "", # dev (or test) file name inside data_dir
             "pretrained_model_path": '', # MARBERT checkpoint path
             "epochs": 5, # number of epochs
             "content_col": "tweet", # text column
             "label_col": "sentiment", # label column
             "lr": 2e-06, # learning rate
              "max_seq_length": 300, # max sequence length
              "batch_size": 32, # batch size
              "sortby":"val_acc"} # sort results based on val_acc or val_f1


In [None]:
report_df = fine_tuning(config)

In [None]:
report_df.head(5)

In [None]:
def predict_extrnal_test_set(
    label2idx_path='',
    tokenizer_path='',
    test_file='',
    model_path='',
    content_col='tweet',
    max_seq_length=300,
    batch_size=32,
):
    """Predict the class index of every row of an unlabelled test file.

    All parameters default to the placeholder values that were previously
    hard-coded in the body, so the paths must be supplied for a real run.

    Args:
        label2idx_path: JSON label mapping generated in the training phase.
        tokenizer_path: Path handed to ``BertTokenizer.from_pretrained``.
        test_file: CSV file with the texts to classify.
        model_path: Path handed to
            ``BertForSequenceClassification.from_pretrained``.
        content_col: Name of the text column.
        max_seq_length: Maximum number of text tokens per example.
        batch_size: Number of examples per forward pass.

    Returns:
        List of predicted class indices (Python ints), in file order.

    Relies on the module-level ``device``.
    """
    # load label2index file generated and stored in the training phase
    with open(label2idx_path) as label_file:
        lab2ind = json.load(label_file)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)  # load bert tokenizer
    test_inputs, test_masks = data_prepare_BERT_test(
        test_file, lab2ind, tokenizer, content_col, max_seq_length)
    # Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(lab2ind))
    test_dataloader = DataLoader(TensorDataset(test_inputs, test_masks), batch_size=batch_size)
    ################ Evaluation and prediction#####################
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
        if n_gpu == 1:
            print("Run", "with one GPU")
            model = model.to(device)
        else:
            print("Run", "with", n_gpu, "GPUs with max 4 GPUs")
            device_ids = GPUtil.getAvailable(limit = 4)
            torch.backends.cudnn.benchmark = True
            model = model.to(device)
            model = nn.DataParallel(model, device_ids=device_ids)
    else:
        print("Run", "with CPU")
    model.eval()
    all_pred = []
    with torch.no_grad():
        for batch in test_dataloader:
            # Add batch to the compute device and unpack it
            input_ids, input_mask = (t.to(device) for t in batch)
            # No labels are passed, so the first output holds the logits.
            outputs = model(input_ids, attention_mask=input_mask)
            logits = outputs[0]
            # delete used variables to free GPU memory
            del batch, input_ids, input_mask
            # identify the predicted class for each example in the batch
            _, predicted = torch.max(logits, 1)
            # store plain ints on the host rather than device tensors
            all_pred.extend(predicted.cpu().tolist())
    return all_pred
    

In [None]:
all_pred = predict_extrnal_test_set()

In [None]:
def write_output_file(all_pred, output_path=""):
    """Write one label name per prediction to a text file.

    Index 1 is written as "NEU", index 0 as "NEG" and every other value as
    "POS". NOTE(review): this matches a label file whose classes sort to
    NEG=0, NEU=1, POS=2 -- confirm against the mapping used for training.

    Args:
        all_pred: Iterable of predicted class indices (ints or 0-dim tensors).
        output_path: Destination file; defaults to the empty placeholder
            that was previously hard-coded, so pass a real path.
    """
    # The context manager closes the file even if a write fails.
    with open(output_path, "w") as out_file:
        for pred in all_pred:
            if pred == 1:
                label = "NEU"
            elif pred == 0:
                label = "NEG"
            else:
                label = "POS"
            out_file.write(label + "\n")

In [None]:
write_output_file(all_pred)


