# DeBERTa-v3 

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from mlflow.sklearn import save_model

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

import sentencepiece
import os

from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoModel, AutoTokenizer, AutoConfig

Install requirements in terminal:
pip install transformers torch datasets

In [17]:
df = pd.read_csv('../data/data_small.csv')

Y = df["logical_fallacies"]
X = df["text"]

In [None]:
df['text'].isna().sum()

In [None]:
df.info()

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42)

### Data Preparation

https://stackoverflow.com/questions/76868251/how-to-load-deberta-v3-properly

In [None]:
# tokenization after train test split to prevent data leakage

#added use_fast=False to prevent tokenization error (might happen when using fast tokenization)
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base', use_fast=False)

In [21]:
def tokenize(texts):
    return tokenizer(
        texts,
        padding="max_length", #ensures that all tokenized sequences are padded to the same length, padding adds special tokens to shorter sequeces so they match the maximum length
        truncation=True, #if sequence exceeds max, it will be trucated
        max_length=512, #for most transformer models, 512 is a common limit for maximum length
        return_tensors="pt" #converts the output to pytorch tensors
    )

In [22]:
train_encodings = tokenize(X_train.to_list())
test_encodings = tokenize(X_test.to_list())

### Convert string labels to integers

In [23]:
le = LabelEncoder()
y_train = le.fit_transform(y_train) 
y_test = le.transform(y_test)

### Dataset Preparation for usage in model

needed to create a PyTorch Dataset object that:
- organizes tokenized text
- pairs them with corresponding labels
- structures everything for batch processing during training

In [24]:
#object oriented programming (class is the object), with class you can do different things, such as calling functions

from torch.utils.data import Dataset, DataLoader
import torch

class TextDataset(Dataset):  # Inherits from PyTorch's Dataset class
    def __init__(self, encodings, labels):
        self.input_ids = encodings['input_ids']       # Token IDs from tokenizer
        self.attention_mask = encodings['attention_mask']  # Mask for padding
        self.labels = torch.tensor(labels)  # Convert labels to tensors
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],       # Token IDs for one sample
            'attention_mask': self.attention_mask[idx],  # Mask for one sample
            'labels': self.labels[idx]              # Label for one sample
        }
    def __len__(self):
        return len(self.labels)  # Total number of samples

In [25]:
train_dataset = TextDataset(train_encodings, y_train)
test_dataset = TextDataset(test_encodings, y_test)

### Zero Shot Inference (I have computation error)

In [62]:
# # disable upper limit for memory
# os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# # Allows up to 100% of available memory
# torch.mps.set_per_process_memory_fraction(1.0)  

# torch.mps.empty_cache()  # Clears unused GPU memory

In [None]:
# # Load fresh copy of base model (not train on our data)
# num_classes = len(df["logical_fallacies"].unique())
# base_model = AutoModelForSequenceClassification.from_pretrained(
#     "microsoft/deberta-v3-small",
#     num_labels=num_classes,
#     problem_type="single_label_classification"
# )

In [67]:
# def predict(model, encodings):
#     # Set the model to evaluation mode
#     model.eval()
    
#     # Use CPU
#     device = torch.device("cpu")
#     model.to(device)
    
#     # Perform inference
#     with torch.no_grad():
#         outputs = model(
#             input_ids=encodings["input_ids"].to(device),
#             attention_mask=encodings["attention_mask"].to(device)
#         )
     
#     # Apply softmax to get probabilities
#     probabilities = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
#     return probabilities

In [None]:
# # Get predictions for test data
# base_probs = predict(base_model, test_encodings)

: 

### Model Initialization

I had to change configuration of accelerate, as it might still be configured to fp16 (mixed precision):
- type in bash accelerate confic
- this machine
- no distributed training
- do you want to run your training on CPU only, say No, as MAC has GPU
- do you wish to optimize script with torch dynamo: say "No" if using an Apple M1 Pro with MPS backend
- do you want to use mixed prexision: NO

In [26]:
num_classes = len(df["logical_fallacies"].unique())
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-small",
    num_labels=num_classes,
    problem_type="single_label_classification"
)

model.gradient_checkpointing_enable()  # force model to use gradient checkpointing to save memory

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Class imbalance (We could try)

In [None]:
# # Class-balanced trainer
# from sklearn.utils.class_weight import compute_class_weight
# class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

### Training Confguration 

In [27]:
training_args = TrainingArguments(
    # output_dir='./results', # to sve results
    num_train_epochs=3,
    per_device_train_batch_size=4, #small to save memory
    per_device_eval_batch_size=8, #small to save memory
    learning_rate=2e-5, #standard for deberta; maybe try 6e-6
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True
)

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {'accuracy': accuracy_score(p.label_ids, preds)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)



## Execute Training

In [28]:
torch.mps.empty_cache()  # Clears unused GPU memory

In [29]:
# disable upper limit for memory
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Allows up to 100% of available memory
torch.mps.set_per_process_memory_fraction(1.0)  

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9398,0.961467,0.682667
2,0.7404,0.926966,0.731333
3,0.5509,1.017781,0.749333


Test Accuracy: 0.7313


In [36]:
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")

Test Accuracy: 0.7313


In [38]:
output = trainer.predict(test_dataset)
predictions = np.argmax(output.predictions, axis=1)
y_true = output.label_ids

In [39]:
# Generate classification report
print(classification_report(y_true, predictions))

# Generate confusion matrix
cm = confusion_matrix(y_true, predictions)
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

           0       0.71      0.60      0.65       159
           1       0.69      0.32      0.44        97
           2       0.73      0.66      0.69       227
           3       0.82      0.56      0.67       133
           4       0.58      0.63      0.60       202
           5       0.77      0.91      0.83       682

    accuracy                           0.73      1500
   macro avg       0.72      0.61      0.65      1500
weighted avg       0.73      0.73      0.72      1500

Confusion Matrix:
[[ 96   6  25   2  11  19]
 [  6  31   9   0  14  37]
 [ 11   2 150   5  24  35]
 [  3   0   5  75   9  41]
 [  6   3  12   2 127  52]
 [ 14   3   5   7  35 618]]


### Save model

In [44]:
path = "models/LLM_deberta_v3_small"
# model.save_pretrained(path)
save_model(sk_model=model, path=path)

