In [1]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import numpy as np

In [2]:
# Load tokenizer for mBERT
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenization function
def tokenize_function(texts):
    if isinstance(texts, pd.Series):
        texts = texts.fillna("")  # Replace NaN values with empty strings
    return tokenizer(list(texts), padding=True, truncation=True, max_length=256, return_tensors="pt")

# Load dataset
df = pd.read_csv("D:/FinetuningBERT-HFT/Final_Augmented_Labeled_Data.csv")
df["label"] = df["label"].astype(int)
print("Sample from dataset:")
print(df.head())
print("Class distribution:")
print(df["label"].value_counts())  

Sample from dataset:
                                                text  label
0  contratto di noleggio installazione manutenzio...      0
1  impegno spesa per ilnoleggio di apparecchiatur...      0
2  servizio di noleggio di manutenzione ordinaria...      0
3  il servizio riguarda il noleggio la manutenzio...      0
4  procedura di acquisizione fornitura soluzione ...      0
Class distribution:
label
0    75
1    57
2    51
Name: count, dtype: int64


In [3]:
# Load OOD Dataset if file exists
ood_file = "D:/FinetuningBERT-HFT/Extended_Dataset_with_Random_Topics.csv"
if os.path.exists(ood_file):
    df_ood = pd.read_csv(ood_file)
    if "text" in df_ood.columns:
        df_ood = df_ood.dropna(subset=["text"])
        df_ood["label"] = 3  # Label OOD samples as class 3
    else:
        raise ValueError("OOD dataset must contain a 'text' column.")
else:
    ood_texts = [
        "Quantum computing will revolutionize technology.",
        "The Renaissance was a cultural movement in Europe.",
        "Climate change is a major concern for future generations.",
        "Python is a popular programming language for AI.",
        "Shakespeare's plays are among the most influential works.",
    ]
    df_ood = pd.DataFrame({"text": ood_texts, "label": [3] * len(ood_texts)})

In [4]:
# Merge datasets
df_combined = pd.concat([df, df_ood], ignore_index=True)

# Prepare data for training and validation
def prepare_data():
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df_combined["text"].astype(str).tolist(),
        df_combined["label"].tolist(),
        test_size=0.2,
        stratify=df_combined["label"],
        random_state=42
    )
    return train_texts, val_texts, train_labels, val_labels

train_texts, val_texts, train_labels, val_labels = prepare_data()

In [5]:
# Dataset Wrapper
class DatasetWrapper(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create tokenized datasets
def create_dataset(texts, labels):
    encodings = tokenize_function(texts)
    return DatasetWrapper(encodings, labels)

train_dataset = create_dataset(train_texts, train_labels)
val_dataset = create_dataset(val_texts, val_labels)

In [6]:
# Load mBERT model
def load_model():
    return BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=4).to("cuda")

model = load_model()

# Compute evaluation metrics
def compute_metrics(eval_pred: EvalPrediction):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids
    accuracy = np.mean(predictions == labels)
    print(f"Validation Accuracy: {accuracy:.4f}")
    return {"accuracy": accuracy}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Training function
def train_model():
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="./logs",
        load_best_model_at_end=True,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )
    
    trainer.train()

In [8]:
# Energy-based OOD detection
def energy_score(logits):
    return -torch.logsumexp(logits, dim=1)

# Prediction with energy and confidence thresholds
def predict_with_energy(text, threshold=-3.5, confidence_threshold=0.75):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    inputs = {key: value.to("cuda") for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    softmax_scores = F.softmax(outputs.logits, dim=1)
    max_confidence = torch.max(softmax_scores).item()

    if max_confidence < confidence_threshold:
        return "Unknown", -1.0  # Reject low-confidence predictions

    score = energy_score(outputs.logits).item()
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    if score > threshold or predicted_class == 3:
        return "Unknown", score
    
    return predicted_class, score

In [9]:
# Train the model
train_model()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.02144,1.0
2,No log,0.015633,0.995595
3,No log,0.022121,0.995595
4,No log,0.021701,0.995595
5,0.059100,0.020785,0.995595


Validation Accuracy: 1.0000
Validation Accuracy: 0.9956
Validation Accuracy: 0.9956
Validation Accuracy: 0.9956
Validation Accuracy: 0.9956


In [10]:
# Test Pipeline
def run_tests():
    test_texts = [
        "servizio di manutenzione degli impianti siti al parco idroscalo di condizionamento e riscaldamento e assunzione della figura di terzo responsabile per il triennio 20232025 nan",
        "Quantum mechanics describes subatomic particles.",
        "Python programming is widely used in AI.",
        "Servizio di noleggio a lungo termine di n. 2 auto destinate alla Polizia stradale a servizio delle tratte di Autostrada Pedemontana Lombarda SpA"
    ]
    
    for text in test_texts:
        pred_class, score = predict_with_energy(text)
        print(f"Text: {text}\nPredicted Class: {pred_class}\nConfidence Score: {score}\n")

run_tests()

Text: servizio di manutenzione degli impianti siti al parco idroscalo di condizionamento e riscaldamento e assunzione della figura di terzo responsabile per il triennio 20232025 nan
Predicted Class: 0
Confidence Score: -4.067328453063965

Text: Quantum mechanics describes subatomic particles.
Predicted Class: Unknown
Confidence Score: -2.3810746669769287

Text: Python programming is widely used in AI.
Predicted Class: Unknown
Confidence Score: -2.603029251098633

Text: Servizio di noleggio a lungo termine di n. 2 auto destinate alla Polizia stradale a servizio delle tratte di Autostrada Pedemontana Lombarda SpA
Predicted Class: 0
Confidence Score: -3.580868721008301

