# DeBERTa-v3 

### Import necessary packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.transformers import log_model
import logging 
from mlflow.sklearn import save_model

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from mlflow.models.signature import infer_signature
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelBinarizer
from scipy.special import softmax
import numpy as np

from torch import nn
import mlflow.pytorch

import sentencepiece
import os

# os.environ["TOKENIZERS_PARALLELISM"] = "false"  # This tells Hugging Face: “Don’t use parallel tokenization — avoid possible deadlocks.”

from torch.utils.data import Dataset, DataLoader
import torch

from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoModel, AutoTokenizer, AutoConfig

import config 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from basic_functions import(
    get_encode_tokenize_data,
    createTrainer,
    get_eval_metrics
)

### Setup


In [3]:
MODEL_NAME = "deberta_v3"

# Read the MLflow tracking URI from a local file. The context manager closes
# the handle deterministically instead of leaving it to the garbage collector.
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()
EXPERIMENT_NAME = config.EXPERIMENT_NAME

# Configure logging format to show a timestamp before every message.
logging.basicConfig(format="%(asctime)s: %(message)s")

# Root logger: only show INFO or more important records (WARNING, ERROR); DEBUG is ignored.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [4]:
# Input data and model locations (relative paths).
DATA_PATH = "../data/data_dropped_duplicates_small.csv"
# Checkpoint name handed to get_encode_tokenize_data in the "Get data" cell.
MODEL_PATH = "microsoft/deberta-v3-base"
# Checkpoint that is fine-tuned via AutoModelForSequenceClassification.
# NOTE(review): differs from MODEL_PATH ("base" vs "small") — confirm the two are meant to differ.
MODEL_TRAINING_PATH ="microsoft/deberta-v3-small"
# Trainer output directory and the target of mlflow.pytorch.save_model.
# NOTE(review): directory names say "tiny" while the trained checkpoint is "small" — confirm intended.
OUTPUT_DIR = "../models/LLM_deberta_v3_tiny/trainer_output"
SAVE_PATH = "../models/LLM_deberta_v3_tiny/pytorch_model"

### Get data

In [5]:
# Project helper: per its log output it loads the data, does a train/test split,
# encodes the label column, tokenizes and builds the train/test datasets.
# `le` is presumably the fitted label encoder (its `classes_` are used later) — TODO confirm.
# NOTE(review): neither `y_test` nor `test_encodings` is returned here, although later cells use both.
train_dataset, test_dataset, y_train, le = get_encode_tokenize_data(DATA_PATH, MODEL_PATH)

INFO:basic_functions:Loading data...
INFO:basic_functions:Train test split, test-size 0.3
INFO:root:encode the label column
INFO:root:tokenize
INFO:basic_functions:create tokenizer & load model
INFO:basic_functions:create tokenizer & load model
INFO:root:create TextDatasets (train & test)


### Zero Shot Inference 

In [6]:
# # disable upper limit for memory
# os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# # Allows up to 100% of available memory
# torch.mps.set_per_process_memory_fraction(1.0)  

# torch.mps.empty_cache()  # Clears unused GPU memory

In [7]:
# # Load fresh copy of base model (not train on our data)
# num_classes = len(df["logical_fallacies"].unique())
# base_model = AutoModelForSequenceClassification.from_pretrained(
#     "microsoft/deberta-v3-small",
#     num_labels=num_classes,
#     problem_type="single_label_classification"
# )

In [8]:
# def predict(model, encodings, batch_size=8):
#     # Set the model to evaluation mode
#     model.eval()
    
#     # Use GPU
#     device = torch.device("mps")
#     model.to(device)
    
#     # Perform inference
#     probabilities = []
#     for i in range(0, len(encodings["input_ids"]), batch_size):
#         with torch.no_grad():
#             batch = {
#                 "input_ids": encodings["input_ids"][i:i+batch_size].to(device),
#                 "attention_mask": encodings["attention_mask"][i:i+batch_size].to(device)
#             }
#             outputs = model(**batch)
#             probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
#             probabilities.extend(probs)
            
#         # Clear GPU memory after each batch
#         torch.mps.empty_cache()
    
#     return np.array(probabilities)

In [9]:
# # Get predictions for test data
# base_probs = predict(base_model, test_encodings, batch_size=8)

In [10]:
# # Get highest probability indices
# predicted_indices = np.argmax(base_probs, axis=1)  

In [11]:
# from sklearn.metrics import classification_report

# # Generate classification report
# report = classification_report(y_test, predicted_indices, target_names=le.classes_)
# print(report)

Note: This DeBERTa model is not designed for zero-shot classification. There is a DeBERTa variant by Moritz Laurer that can be used without training on our data, but the DeBERTa checkpoint used here is meant for supervised learning, so training on our data is necessary.
Another option is BART, e.g. the facebook/bart-large-mnli model.

**Zero-Shot Learning** is the concept that a model trained on enough unlabeled data (unsupervised learning) is able to generalize at inference time, even though it was not trained on the inference data. This can be used in NLP, image tasks, etc.

### Model Initialization

I had to change the configuration of accelerate, as it might still be configured for fp16 (mixed precision), which doesn't work on an Apple M1 Pro:
- run `accelerate config` in bash
- choose "This machine"
- choose no distributed training
- "Do you want to run your training on CPU only?": answer No, as the Apple M1 Pro has a GPU
- "Do you wish to optimize your script with torch dynamo?": answer No if using an Apple M1 Pro with the MPS backend
- "Do you want to use mixed precision?": answer No

In [12]:
# Number of distinct label values in the training split; echoed as the cell output.
num_classes = np.unique(y_train).size
num_classes

6

In [13]:
# Sequence-classification model built from the checkpoint to fine-tune,
# with one output per label.
classifier_kwargs = {
    "num_labels": num_classes,
    "problem_type": "single_label_classification",
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_TRAINING_PATH, **classifier_kwargs)

# Force the model to use gradient checkpointing to save memory.
model.gradient_checkpointing_enable()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# epoch = 3
# learning_rate=2e-5 #standard for deberta; maybe try 6e-6
# weight_decay=0.01
# per_device_train_batch_size=4 #small to save memory
# per_device_eval_batch_size=8 

In [None]:
# Hyperparameters of this run, logged to MLflow below. They mirror the values
# passed to createTrainer. The former "num_train_epochs": 1 entry was dropped:
# it contradicted "epochs": 4, which is what the trainer actually receives.
params = {
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "evaluation_strategy": "epoch",
    "train_batch_size": 4,
    "eval_batch_size": 8,
    "epochs": 4,
    "class_weight": False,  # real boolean, matching createTrainer(class_weight=False)
}

# Set the MLflow connection and experiment.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Re-running this cell while a run is still open would make start_run() fail,
# so close any leftover run first.
if mlflow.active_run() is not None:
    mlflow.end_run()

mlflow.start_run()
run = mlflow.active_run()
print(f"Active run_id: {run.info.run_id}")

mlflow.set_tag("model_name", MODEL_NAME)
mlflow.set_tag('mlflow.runName', 'deberta_v3_small')
mlflow.log_params(params)

Active run_id: a982d0f5999943aaba97157c28e37303


In [16]:
# Build the trainer from the same `params` dict that was logged to MLflow, so
# the recorded hyperparameters cannot drift from the ones used for training.
trainer = createTrainer(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    output_dir=OUTPUT_DIR,
    y_train=y_train,
    class_weight=False,  # kept as a literal boolean; createTrainer's handling of other types is not visible here
    epochs=params["epochs"],
    learning_rate=params["learning_rate"],  # 2e-5 is standard for deberta; maybe try 6e-6
    weight_decay=params["weight_decay"],
    train_batch_size=params["train_batch_size"],
    eval_batch_size=params["eval_batch_size"],
)


INFO:root:defining training arguments
INFO:root:get normal trainer


### Execute Training

In [17]:
# Clear unused GPU memory before training. Guarded so the cell also runs on
# machines without the MPS backend, where the torch.mps call is not usable.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [18]:
# Disable the upper limit for MPS memory.
# NOTE(review): this is set after torch was imported and the trainer was
# created — confirm the MPS allocator still picks the value up at this point.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Allow up to 100% of available memory. Guarded so the cell also runs on
# machines without the MPS backend.
if torch.backends.mps.is_available():
    torch.mps.set_per_process_memory_fraction(1.0)

In [19]:
# Log the start, then run the training loop. The TrainOutput returned by
# train() is the last expression and is therefore shown as the cell output.
logger.info('training is running')
trainer.train()

INFO:root:training is running


Epoch,Training Loss,Validation Loss,Accuracy
1,1.073,1.03124,0.651333
2,0.678,0.955212,0.747333
3,0.5112,1.077594,0.763333
4,0.4068,1.148954,0.768667


2025/04/07 10:19:09 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id a982d0f5999943aaba97157c28e37303: Failed to log run data: Exception: API request to http://127.0.0.1:5001/api/2.0/mlflow/runs/log-batch failed with exception HTTPConnectionPool(host='127.0.0.1', port=5001): Max retries exceeded with url: /api/2.0/mlflow/runs/log-batch (Caused by ResponseError('too many 500 error responses'))


TrainOutput(global_step=3500, training_loss=0.7577310218811035, metrics={'train_runtime': 3461.6108, 'train_samples_per_second': 4.044, 'train_steps_per_second': 1.011, 'total_flos': 1854741934080000.0, 'train_loss': 0.7577310218811035, 'epoch': 4.0})

### Evaluation

In [20]:
def log_metrics(cr, brier, split):
    """Send the Brier score and every classification-report entry to MLflow.

    Args:
        cr: Classification report as a dict. The "accuracy" entry is logged as
            a single value; every other entry is treated as a mapping of
            metric name to value.
        brier: Brier score, logged as "<split>_brier".
        split: Prefix for all metric names (the notebook passes "train" and "test").
    """
    mlflow.log_metric(f"{split}_brier", brier)

    for label, entry in cr.items():
        if label != "accuracy":
            # Per-class / averaged rows: one metric per (row, metric name) pair.
            for metric_name, metric_value in entry.items():
                mlflow.log_metric(f"{split}_{label}_{metric_name}", metric_value)
            continue
        # "accuracy" is a bare number, not a nested mapping.
        mlflow.log_metric(f"{split}_{label}", entry)

In [21]:
logger.info('predict on train_dataset')
train_output = trainer.predict(train_dataset)

# Bind the results to split-specific names. Reusing `classification_report`
# here would shadow the sklearn function imported at the top of the notebook.
train_report, train_brier = get_eval_metrics(train_output, le)
log_metrics(train_report, train_brier, "train")

INFO:root:predict on train_dataset


INFO:basic_functions:get evaluation metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix
INFO:basic_functions:brier score


                       precision    recall  f1-score   support

           ad_hominem       0.89      0.69      0.78       331
  appeal_to_authority       0.89      0.57      0.69       227
    appeal_to_emotion       0.79      0.86      0.82       504
        false_dilemma       0.91      0.70      0.79       319
faulty_generalization       0.81      0.67      0.73       449
                 none       0.85      0.98      0.91      1670

             accuracy                           0.84      3500
            macro avg       0.86      0.74      0.79      3500
         weighted avg       0.85      0.84      0.84      3500

[[ 228    5   49    1   13   35]
 [   3  129   30    5   18   42]
 [   8    1  434    3   16   42]
 [  11    0   10  223   16   59]
 [   6    1   19    3  302  118]
 [   1    9    7    9    8 1636]]
Multiclass Brier score: 0.24648120722082073


In [22]:
logger.info('predict on test_dataset')
test_output = trainer.predict(test_dataset)

# Bind the results to split-specific names. Reusing `classification_report`
# here would shadow the sklearn function imported at the top of the notebook.
test_report, test_brier = get_eval_metrics(test_output, le)
log_metrics(test_report, test_brier, "test")

INFO:root:predict on test_dataset


INFO:basic_functions:get evaluation metrics
INFO:basic_functions:classification_report
INFO:basic_functions:confusion_matrix
INFO:basic_functions:brier score


                       precision    recall  f1-score   support

           ad_hominem       0.75      0.55      0.63       142
  appeal_to_authority       0.80      0.37      0.51        97
    appeal_to_emotion       0.67      0.78      0.72       216
        false_dilemma       0.85      0.67      0.75       137
faulty_generalization       0.63      0.46      0.53       192
                 none       0.77      0.92      0.84       716

             accuracy                           0.75      1500
            macro avg       0.75      0.63      0.66      1500
         weighted avg       0.75      0.75      0.73      1500

[[ 78   1  28   0  11  24]
 [  7  36   9   1   7  37]
 [  6   2 168   2  10  28]
 [  2   0   2  92   7  34]
 [  4   0  27   1  89  71]
 [  7   6  16  12  17 658]]
Multiclass Brier score: 0.4123306847059788


In [23]:
# Close the MLflow run that was opened with mlflow.start_run() in the parameter-logging cell.
mlflow.end_run()

🏃 View run deberta_v3_small at: http://127.0.0.1:5001/#/experiments/823412171152425451/runs/a982d0f5999943aaba97157c28e37303
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/823412171152425451


### Save model

In [24]:
# Save `model` (the object that was passed to createTrainer) with MLflow's PyTorch flavor under SAVE_PATH.
# NOTE(review): presumably fails if SAVE_PATH already exists from an earlier run — confirm before re-running.
mlflow.pytorch.save_model(model, path=SAVE_PATH)

### Load model

In [25]:
# NOTE(review): mlflow.pytorch is already imported in the first cell; this repeated import is redundant.
import mlflow.pytorch
# NOTE(review): this path differs from SAVE_PATH used in the save cell, so the
# model loaded here is not the one saved above — confirm this is intended.
path_pt = "../models/LLM_deberta_v3_small_class_imbalance/pytorch_model"
# Rebinds `model`: from here on it refers to the loaded model, not the one trained above.
model = mlflow.pytorch.load_model(path_pt)



### Make predictions based on reloaded model

In [None]:
## Function for prediction

def predict(model, encodings, batch_size=8, device=None):
    """Run batched inference and return class probabilities.

    Args:
        model: Model whose forward pass accepts the entries of ``encodings``
            as keyword arguments and returns an object with a ``logits``
            attribute.
        encodings: Mapping of input name to tensor; must contain
            ``"input_ids"``. Every tensor is sliced along its first dimension.
        batch_size: Number of rows per forward pass; must be positive.
        device: Optional device (string or ``torch.device``). When omitted,
            MPS is used if available, then CUDA, then CPU.

    Returns:
        ``np.ndarray`` with one row of softmax probabilities per input row
        (an empty array when ``encodings`` holds no rows).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Pick the best available accelerator instead of assuming Apple MPS.
    if device is None:
        if torch.backends.mps.is_available():
            device = "mps"
        elif torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
    device = torch.device(device)

    # Set the model to evaluation mode and move it to the target device.
    model.eval()
    model.to(device)

    n_rows = len(encodings["input_ids"])
    batch_probs = []
    with torch.no_grad():
        for start in range(0, n_rows, batch_size):
            batch = {
                key: val[start:start + batch_size].to(device)
                for key, val in encodings.items()
            }
            logits = model(**batch).logits
            batch_probs.append(torch.softmax(logits, dim=-1).cpu().numpy())

            # Clear GPU memory after each batch (only meaningful on MPS).
            if device.type == "mps":
                torch.mps.empty_cache()

    if not batch_probs:
        return np.array([])
    return np.concatenate(batch_probs, axis=0)

In [None]:
# The batch size had to be reduced here, otherwise this cell raised an error.
# Get predictions for test data
# NOTE(review): `test_encodings` is not defined in any cell visible in this
# notebook — confirm where it comes from before a Restart & Run All.
base_probs = predict(model, test_encodings, batch_size=2)

In [None]:
# Index of the most probable class in every row.
predicted_labels = base_probs.argmax(axis=1)

In [None]:
# Get second highest probability indices.
# Rank classes by descending probability with a stable sort: tied classes keep
# the lowest index first, which matches np.argmax, so the runner-up can never
# be the same class as the top prediction.
second_predicted_labels = np.argsort(-base_probs, axis=1, kind="stable")[:, 1]

In [None]:
# Probability assigned to the top predicted class of every row.
row_ids = np.arange(len(predicted_labels))
predicted_label_probs = base_probs[row_ids, predicted_labels]

In [None]:
# Probability of the runner-up class: the second-largest value in every row.
probs_ascending = np.sort(base_probs, axis=1)
second_predicted_label_probs = probs_ascending[:, -2]

In [None]:
# Payload for the backend: top-2 labels together with their probabilities.
result = dict(
    predicted_labels=predicted_labels,
    predicted_label_probs=predicted_label_probs,
    second_predicted_labels=second_predicted_labels,
    second_predicted_label_probs=second_predicted_label_probs,
)

In [None]:
# Import repeated here; sklearn.metrics is also imported in the first cell.
from sklearn.metrics import classification_report

# Generate classification report
# NOTE(review): `y_test` is not defined in any cell visible in this notebook —
# confirm its origin and that its row order matches `predicted_labels`.
report = classification_report(y_test, predicted_labels, target_names=le.classes_)
print(report)

# Generate confusion matrix
cm = confusion_matrix(y_test, predicted_labels)
print("Confusion Matrix:")
print(cm)

In [None]:
# from sklearn.preprocessing import LabelBinarizer

# # 1. One-hot encode the true labels (y_test)
# lb = LabelBinarizer()
# y_true_onehot = lb.fit_transform(y_test)  # Shape: (n_samples, n_classes)

# # 2. Compute Brier score for multiclass
# brier_score = np.mean(np.sum((base_probs - y_true_onehot) ** 2, axis=1))
# print("Multiclass Brier score:", brier_score)