In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoConfig
from transformers import EarlyStoppingCallback

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score, f1_score

### Helper Functions

In [None]:
def num_labels_cnvt(batch):
    """Translate a batch's sentiment strings into integer class ids.

    Each entry of ``batch['sentiment']`` is looked up in the global ``dicts``
    mapping; the ids are returned under the ``'labels'`` key. A sentiment that
    is missing from ``dicts`` raises ``KeyError``.
    """
    encoded = []
    for sentiment in batch['sentiment']:
        encoded.append(dicts[sentiment])
    return dict(labels=encoded)

def tokenize(batch, max_length=250):
    """Tokenize the ``'text'`` entries of a batch with the global ``tokenizer``.

    Every sequence is padded to, and truncated at, ``max_length`` tokens.

    Args:
        batch: Mapping whose ``'text'`` entry holds the raw strings.
        max_length: Fixed sequence length. Defaults to 250, the value that was
            previously hard-coded, so ``dataset.map(tokenize, batched=True)``
            keeps its behaviour.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(batch['text'], padding='max_length',
                     max_length=max_length, truncation=True)

def compute_metrics(pred):
    """Score one evaluation pass.

    The predicted class of each example is the arg-max over the last axis of
    ``pred.predictions``; it is compared against ``pred.label_ids``.

    Returns:
        dict with ``"accuracy"`` and the weighted-average ``"f1"``.
    """
    y_true, y_hat = pred.label_ids, pred.predictions.argmax(-1)
    weighted_f1 = f1_score(y_true, y_hat, average="weighted")
    return {"accuracy": accuracy_score(y_true, y_hat), "f1": weighted_f1}

def data_encoder(root, train_file, valid_file):
    """Load the train/valid files and return them encoded in torch format.

    Steps: read both files through ``load_dataset`` (as splits ``'train'`` and
    ``'valid'``), add integer ``labels`` via ``num_labels_cnvt``, tokenize via
    ``tokenize``, then restrict the output format to ``input_ids``,
    ``attention_mask`` and ``labels`` as torch tensors.
    """
    split_files = {'train': train_file, 'valid': valid_file}
    labelled = load_dataset(path=root, data_files=split_files).map(num_labels_cnvt, batched=True)

    dataset_encoded = labelled.map(tokenize, batched=True)
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    dataset_encoded.set_format(type='torch', columns=tensor_columns)
    return dataset_encoded

### Tokenizer & Model 

In [None]:
# Run configuration
num_labels = 2
device = 'cpu'
model_ckpt = 'distilbert-base-cased'

# Tokenizer and classification model are both loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
fine_model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
# NOTE(review): TrainingArguments further down is built with no_cuda=False, so
# the Trainer may still place the model on an accelerator regardless of this
# value - confirm that 'cpu' is really intended.
fine_model = fine_model.to(device)

### Encoding Data

In [None]:
# Sentiment string -> integer class id (read by num_labels_cnvt)
dicts = dict(negative=0, positive=1)

# Data directory and the fold-1 train / validation CSV files inside it
root = '../data/BERT/'
train = 'articles_2015_2019_train_fold-1.csv'
valid = 'articles_2015_2019_valid_fold-1.csv'

# Load, label and tokenize both splits
encoded_dataset = data_encoder(root=root, train_file=train, valid_file=valid)

### Model Training

In [None]:
batch_size = 16
# Number of batches in the training split, used as the logging interval.
# Clamped to 1: a split smaller than one batch would otherwise yield 0, which
# TrainingArguments rejects for the step-based logging strategy.
logging_steps = max(1, len(encoded_dataset['train']) // batch_size)
# NOTE(review): the CSVs loaded for this run are named fold-1 while this output
# path says fold-10 - confirm which fold this run belongs to.
model_name = "CVs/fold-10/fold_10-distilbert-base-cased-Adamw"

training_args = TrainingArguments(
    output_dir=model_name,

    # Optimisation
    num_train_epochs=15,
    learning_rate=2e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.0001,

    # Settings the EarlyStoppingCallback relies on: evaluate and checkpoint
    # every epoch, and reload the checkpoint with the best 'f1' (the key
    # returned by compute_metrics) when training ends.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',

    # Logging / runtime
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    no_cuda=False,
    log_level="error",
)

trainer = Trainer(
    model=fine_model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['valid'],
    # Stop once the tracked metric has not improved for 2 evaluations in a row
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

results = trainer.train()

### Evaluation

In [None]:
# Run the trainer's model over the validation split; the `.predictions` and
# `.label_ids` fields of the result are read in the following cells.
predictions = trainer.predict(encoded_dataset['valid'])

In [None]:
# Ground-truth labels carried in the prediction output
y_true = predictions.label_ids

In [None]:
# Predicted class = index of the largest score in each row
y_pred = predictions.predictions.argmax(axis=1)

In [None]:
# sklearn's signature is f1_score(y_true, y_pred): pass the ground truth first.
# (The swapped order happened to give the same number for average='micro', but
# would silently be wrong for any asymmetric averaging mode.)
# NOTE: micro-averaged F1 on single-label data equals plain accuracy, so this
# is not the weighted F1 that compute_metrics reports during training.
print(f1_score(y_true, y_pred, average='micro'))

In [None]:
# Pin the class order from the label mapping so the matrix always has one
# row/column per class and the ticks show class names instead of raw ids.
# normalize='true' scales each row (true class) to sum to 1.
disp = ConfusionMatrixDisplay.from_predictions(
    y_true,
    y_pred,
    labels=sorted(dicts.values()),
    display_labels=sorted(dicts, key=dicts.get),
    cmap='Blues',
    normalize='true',
)
# Trailing semicolon keeps the object repr out of the cell output
disp.ax_.set_title('Validation confusion matrix (normalised by true class)');