In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_metric
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import os
from sklearn.metrics import classification_report, cohen_kappa_score

In [None]:
"""Initializing the root path"""

# Root prefix for every relative location used in this notebook
# (data/, encodings/ and models/ are all resolved against it).
path = "./"

In [None]:
"""Loading the preprocessed text file"""

# Read through a context manager so the file handle is closed deterministically
# instead of being left open for the rest of the kernel session.
with open(f"{path}data/data_preprocessed.txt", "r") as data_file:
    # Strip only the newline terminator: slicing off the last character would
    # truncate the final record whenever the file does not end with "\n".
    data = [line.rstrip("\n") for line in data_file]
# The labelling cell splits the records 1:10 (first `class_size` records are
# label 0, the following 10 * `class_size` are label 1), hence the division by 11.
class_size = len(data)//11

In [None]:
"""Splitting the data and labels into training, validation, and testing sets. Stratified split is used."""

# Samples are kept as Python objects (strings) in a 1-D object array.
x = np.array(data, dtype=object)
# Label layout mirrors the file layout: `class_size` zeros followed by
# ten times as many ones.
y = np.array(class_size * [0] + class_size * (10 * [1]))

# First cut: hold out 20% of everything as the test set.
trainval_x, test_x, trainval_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Second cut: 25% of the remaining 80% becomes validation,
# i.e. an overall 60/20/20 train/validation/test split.
train_x, val_x, train_y, val_y = train_test_split(
    trainval_x,
    trainval_y,
    test_size=0.25,
    stratify=trainval_y,
    random_state=42,
)

In [None]:
"""Initializing the name of the base model that is to be fine-tuned"""

# Checkpoint identifier shared by the tokenizer and the classification model:
# the same value is passed to AutoTokenizer.from_pretrained and to
# AutoModelForSequenceClassification.from_pretrained further down.
model_checkpoint = "lanwuwei/BERTOverflow_stackoverflow_github"

In [None]:
"""Initializing the tokenizer corresponding to the 'model_checkpoint' initialized above"""

# Load the tokenizer that belongs to `model_checkpoint`, explicitly asking
# for the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [None]:
"""Tokenizing the text data and storing them in numpy files"""

def tokenizer_text(text, filename, max_len=512, pad_id=0):
    """Tokenize every sample and save the padded input ids as one ``.npy`` matrix.

    Each sample is encoded with the module-level ``tokenizer`` (special tokens
    added, truncated to ``max_len``) and right-padded with ``pad_id``. The
    result is written with ``np.save``, which appends ``.npy`` to ``filename``
    when that suffix is missing.

    Args:
        text (iterable of str): the samples to tokenize
        filename (string): destination path handed to ``np.save``
        max_len (integer): fixed row width; longer samples are truncated
        pad_id (integer): value used to fill rows shorter than ``max_len``
    """
    samples = list(text)
    # Preallocate the final matrix instead of growing a nested Python list.
    # This keeps memory bounded, pins the dtype to int64 on every platform
    # and gives an empty input a well-formed (0, max_len) shape.
    encodings = np.full((len(samples), max_len), pad_id, dtype=np.int64)
    for row, sample in enumerate(tqdm(samples)):
        input_ids = tokenizer(sample, add_special_tokens=True, max_length=max_len, truncation=True)['input_ids']
        # Positions past the real tokens keep the pad value written by np.full.
        encodings[row, :len(input_ids)] = input_ids
    np.save(filename, encodings)
    
# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# as soon as the directory was left over from an earlier run.
os.makedirs(f"{path}encodings", exist_ok=True)
# np.save appends ".npy", so the files end up as <split>_encodings.npy.
tokenizer_text(train_x, f"{path}encodings/train_encodings")
tokenizer_text(val_x, f"{path}encodings/val_encodings")
tokenizer_text(test_x, f"{path}encodings/test_encodings")

In [None]:
"""Reading a chunk from a numpy file"""

def read_npy_chunk(fhandle, row_size, dtype, shape, start_row, num_rows):
    """Read ``num_rows`` consecutive rows of a ``.npy`` array from an open file.

    The handle is rewound, the magic string and header are consumed, and the
    requested rows are then read straight from the data section. The header's
    ``fortran_order`` flag is not consulted, so rows are assumed to be stored
    contiguously one after another.

    Args:
        fhandle (file object): ``.npy`` file opened in binary read mode
        row_size (integer): number of items in one row (product of ``shape[1:]``)
        dtype (numpy.dtype): item dtype of the stored array
        shape (tuple): full shape of the stored array
        start_row (integer): index of the first row to read (must be >= 0)
        num_rows (integer): number of rows to read (must be >= 0)

    Returns:
        numpy.ndarray: array of shape ``(rows_read,) + shape[1:]``; fewer than
        ``num_rows`` rows come back when the file ends early.

    Raises:
        ValueError: if ``start_row`` or ``num_rows`` is negative, or the file
            uses a ``.npy`` format version other than 1.0 or 2.0.
    """
    # np.prod(()) is a float, so a 1-D array would hand us row_size == 1.0;
    # seek() and np.fromfile() both need true integers.
    row_size = int(row_size)
    start_row = int(start_row)
    num_rows = int(num_rows)
    if start_row < 0 or num_rows < 0:
        # A negative relative seek would land inside the header and the bytes
        # found there would silently be returned as data.
        raise ValueError(
            f"start_row and num_rows must be non-negative, got {start_row} and {num_rows}"
        )

    fhandle.seek(0)
    version = np.lib.format.read_magic(fhandle)
    # Consume the header so the file position sits at the first data byte.
    if version == (1, 0):
        np.lib.format.read_array_header_1_0(fhandle)
    elif version == (2, 0):
        np.lib.format.read_array_header_2_0(fhandle)
    else:
        raise ValueError(f"unsupported .npy format version: {version}")

    # Skip the rows before the requested window, relative to the data start.
    fhandle.seek(start_row * row_size * dtype.itemsize, 1)
    flat = np.fromfile(fhandle, count=row_size * num_rows, dtype=dtype)
    return flat.reshape((-1,) + tuple(shape[1:]))

In [None]:
class SOdata(Dataset):
    r"""
    Map-style dataset that serves tokenized samples lazily from a ``.npy`` file.

    Only the header is parsed up front; each row is read from disk on demand
    through ``read_npy_chunk``, so the full matrix never has to be in memory.
    """

    def __init__(self, filename, labels, pad_token_id=0):
        r"""
        Initializing the dataset class

        Args:
            filename (string): the path to the file where the tokenized data is stored
            labels (numpy.ndarray): the true labels of the data
            pad_token_id (integer): the id that was used to pad the rows; it is
                used to build the attention mask. The default matches the
                literal 0 that ``tokenizer_text`` pads with (assumed to be the
                tokenizer's padding id, confirm for other checkpoints).
        """
        self.filename = filename
        self.labels = labels
        self.pad_token_id = pad_token_id
        self.fhandle =  open(filename, 'rb')
        try:
            major, minor = np.lib.format.read_magic(self.fhandle)
            self.shape, fortran, self.dtype = np.lib.format.read_array_header_1_0(self.fhandle)
        except Exception:
            # Do not leak the handle when the file is not a readable .npy file.
            self.fhandle.close()
            raise
        # np.prod returns a NumPy scalar (a float for an empty tuple); the
        # byte-offset arithmetic downstream needs a plain int.
        self.row_size = int(np.prod(self.shape[1:]))

    def close(self):
        r"""
        Closing the underlying file handle (safe to call more than once)
        """
        fhandle = getattr(self, 'fhandle', None)
        if fhandle is not None and not fhandle.closed:
            fhandle.close()

    def __del__(self):
        r"""
        Releasing the file handle when the dataset is garbage collected
        """
        self.close()

    def __len__(self):
        r"""
        Returning the number of data points
        """
        return len(self.labels)

    def __getitem__(self, idx):
        r"""
        Returning the input ids, attention mask, and label corresponding to the 'idx' variable

        Args:
            idx (integer): the index corresponding to which the data needs to be fetched

        Raises:
            IndexError: if 'idx' is outside ``[-len(self), len(self))``
        """
        size = len(self)
        # Support Python-style negative indices explicitly: a negative row
        # offset would otherwise seek backwards into the file header.
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} is out of range for a dataset of size {size}")
        encodings = read_npy_chunk(self.fhandle, self.row_size, self.dtype, self.shape, idx, 1)
        input_ids = torch.tensor(encodings[0])
        item = {}
        item['input_ids'] = input_ids
        # Rows are padded to a fixed width; without a mask the model would
        # attend to the padding positions as if they were real tokens.
        item['attention_mask'] = (input_ids != self.pad_token_id).long()
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
"""Initializing the training, validation, and test datasets"""

# One lazily-read dataset per split; each pairs the saved encodings file
# with the label array produced by the stratified split.
train_dataset = SOdata(
    filename=f"{path}encodings/train_encodings.npy",
    labels=train_y,
)
val_dataset = SOdata(
    filename=f"{path}encodings/val_encodings.npy",
    labels=val_y,
)
test_dataset = SOdata(
    filename=f"{path}encodings/test_encodings.npy",
    labels=test_y,
)

In [None]:
"""Initializing the metrics to be reported while evaluating the model"""

# Load the four metric objects in one pass; the order of the names matches
# the order of the targets on the left-hand side.
acc, pre, rec, f1 = (
    load_metric(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into a flat mapping of metric name to score.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{'accuracy': ..., 'precision': ..., 'recall': ..., 'f1': ...}``
        with one scalar per metric.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scorers = (('accuracy', acc), ('precision', pre), ('recall', rec), ('f1', f1))
    # Each metric's compute() returns a one-entry dict keyed by its own name.
    # Unwrap it so the result is flat rather than {'accuracy': {'accuracy': x}},
    # which the trainer cannot log as a scalar.
    return {
        name: metric.compute(predictions=predictions, references=labels)[name]
        for name, metric in scorers
    }

In [None]:
"""Initializing arguments to be used by the 'Trainer' API while training the model"""

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# once the directory existed from a previous run.
os.makedirs(f"{path}models/", exist_ok=True)

training_args = TrainingArguments(
    output_dir = f"{path}models/",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    # Logging, checkpointing and evaluation all share the same 5000-step cadence.
    logging_steps=5000,
    save_steps=5000,
    save_total_limit=5,
    seed=42,
    evaluation_strategy="steps",
    eval_steps=5000,
    learning_rate=1e-5,
    # Checkpoints are ranked by validation loss, where lower is better.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    no_cuda=False
)

In [None]:
"""Initializing the model corresponding to the 'model_checkpoint' initialized above and the 'Trainer' object"""

# Two-label classification head on top of the pretrained checkpoint;
# hidden states are not requested in the model outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    output_hidden_states=False,
)

# The trainer trains on the training split and evaluates on the validation
# split, scoring each evaluation with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
"""Training and evaluating the model"""

# Run the training loop and print whatever trainer.train() returns.
print(
    trainer.train()
)

In [None]:
"""Testing the finetuned model"""

# Run inference on the held-out test split.
test_pred = trainer.predict(test_dataset=test_dataset)
# The first element of the prediction output holds the logits;
# the predicted class is the argmax over the class axis.
pred_class = np.argmax(test_pred[0], axis=1)

# Per-class precision/recall/F1, followed by chance-corrected agreement.
print(
    f"{classification_report(test_y, pred_class, digits=7)}\n"
)
print(
    f"cohen kappa: {cohen_kappa_score(test_y, pred_class)}\n"
)