In [20]:
import logging
from typing import Tuple, Dict, Any
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset, concatenate_datasets
from sklearn.metrics import (
    roc_curve, auc, confusion_matrix, accuracy_score,
    precision_recall_fscore_support, precision_score,
    recall_score, f1_score, roc_auc_score
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType

In [21]:
# Constants
MODEL_NAME = "microsoft/deberta-v3-small"
#MODEL_NAME = "./deberta_lora_classifier_full_human"
MAX_LENGTH = 256
BATCH_SIZE = 8
NUM_EPOCHS = 100
WEIGHT_DECAY = 0.01
MODEL_SAVE_PATH = "./deberta_lora_classifier_full_human2"
ROC_SAVE_PATH1 = "roc_curve_lora_full_human2.png"
CM_SAVE_PATH1 = "confusion_matrix_lora_full_human2.png"
ROC_SAVE_PATH2 = "roc_curve_lora_full_human2.png"
CM_SAVE_PATH2 = "confusion_matrix_lora_full_human2.png"

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("DeBERTa Initial Runs")

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

class PromptClassifier:
    def __init__(self):
        """Initialize the classifier with model, tokenizer, and LoRA config."""
        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME, use_fast=False
        )
        # base_model
        self.base_model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=2
        ).to(device)

        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1
        )
        self.model = get_peft_model(self.base_model, lora_config).to(device)
        logger.info("DeBERTa LoRA model initialized.")

    def preprocess_data(self, examples: Dict[str, Any]) -> Dict[str, Any]:
        """Preprocess the input data using the tokenizer."""
        return self.tokenizer(
            examples["Prompt"],
            truncation=True,
            padding='max_length',
            max_length=MAX_LENGTH
        ).to(device)

    def prepare_datasets(self, df: pd.DataFrame) -> Tuple[Dataset, Dataset]:
        """Prepare training and validation datasets."""
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            df["Prompt"].tolist(),
            df["Malicious (0/1)"].tolist(),
            test_size=0.2,
            random_state=42
        )

        train_data = Dataset.from_dict({
            "Prompt": train_texts,
            "label": train_labels
        })
        val_data = Dataset.from_dict({
            "Prompt": val_texts,
            "label": val_labels
        })

        train_data = train_data.map(self.preprocess_data, batched=True)
        val_data = val_data.map(self.preprocess_data, batched=True)

        return train_data, val_data

    def compute_metrics(
        self, eval_pred: Tuple[np.ndarray, np.ndarray]
    ) -> Dict[str, float]:
        """Compute evaluation metrics."""
        logits, labels = eval_pred
        predictions = torch.argmax(torch.tensor(logits), dim=-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='binary'
        )
        acc = accuracy_score(labels, predictions)
        return {
            "accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall
        }

    def train(self, train_data: Dataset, val_data: Dataset) -> None:
        """Train the model."""
        training_args = TrainingArguments(
            output_dir="./results",
            eval_strategy="epoch",
            save_strategy="epoch",
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=NUM_EPOCHS,
            weight_decay=WEIGHT_DECAY,
            logging_dir="./deberta_logs",
            logging_strategy="steps",
            logging_steps=50,
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            evaluation_strategy="epoch",
            report_to="none"
        )

        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.0
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=val_data,
            compute_metrics=self.compute_metrics,
            callbacks=[early_stopping_callback]
        )

        trainer.train()
        self.save_model(trainer)

    def save_model(self, trainer: Trainer) -> None:
        """Save the trained model and tokenizer."""
        trainer.save_model(MODEL_SAVE_PATH)
        self.tokenizer.save_pretrained(MODEL_SAVE_PATH)
        logger.info(f"Model saved to {MODEL_SAVE_PATH}")

    def evaluate(
        self, val_data: Dataset
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Evaluate the model and return predictions."""
        test_trainer = Trainer(self.model)
        predictions = test_trainer.predict(val_data)

        logits = torch.tensor(predictions.predictions)
        probs = torch.nn.functional.softmax(logits, dim=-1).numpy()
        pred_labels = np.argmax(probs, axis=-1)

        return predictions.label_ids, pred_labels, probs[:, 1]

    def analyze_misclassifications(
        self, val_data: Dataset, true_labels: np.ndarray, 
        pred_labels: np.ndarray, probs: np.ndarray
    ) -> pd.DataFrame:
        """Analyze misclassified examples.
    
        Returns:
        DataFrame containing misclassified examples with their predictions
        and confidence scores.
        """
        # Get original prompts from validation dataset
        prompts = val_data['Prompt']
    
        # Create DataFrame with all predictions
        results_df = pd.DataFrame({
            'Prompt': prompts,
            'True_Label': true_labels,
            'Predicted_Label': pred_labels,
            'Confidence': probs
        })
    
        # Filter for misclassified examples
        misclassified = results_df[results_df['True_Label'] != results_df['Predicted_Label']]
    
        # Sort by confidence to see most confident mistakes
        misclassified = misclassified.sort_values('Confidence', ascending=False)
    
        logger.info(f"Total misclassified examples: {len(misclassified)}")
        return misclassified
    
    def plot_metrics(
        self, true_labels: np.ndarray, pred_labels: np.ndarray,
        probs: np.ndarray, roc_path: str, cm_path:str
    ) -> None:
        """Plot ROC curve and confusion matrix."""
        # ROC Curve
        fpr, tpr, _ = roc_curve(true_labels, probs)
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(8, 6))
        plt.plot(
            fpr, tpr, color="blue", lw=2,
            label=f"ROC Curve (AUC = {roc_auc:.2f})"
        )
        plt.plot([0, 1], [0, 1], color="gray", linestyle="--")
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Receiver Operating Characteristic (ROC) Curve")
        plt.legend(loc="lower right")
        plt.savefig(
            roc_path,
            dpi=300,
            bbox_inches="tight"
        )
        plt.close()

        # Confusion Matrix
        cm = confusion_matrix(true_labels, pred_labels)
        plt.figure(figsize=(6, 5))
        sns.heatmap(
            cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["0", "1"], yticklabels=["0", "1"]
        )
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.title("Confusion Matrix")
        plt.savefig(
            cm_path,
            dpi=300,
            bbox_inches="tight"
        )
        plt.close()

2025-03-21 01:06:57,719 - INFO - Using device: cuda


In [22]:
# Load data
df = pd.read_csv(
    "./data/FINAL_validated_prompts_with_similarity_deBERTa.csv"
)
df["Malicious (0/1)"] = df["Malicious (0/1)"].astype(int)
human_df = df[df["Source"] == "Manual"]
final_df = df[df["Source"] == "Generated"]
"""
# Separate malicious and benign prompts
malicious_prompts = df[df["Malicious (0/1)"] == 1]
benign_prompts = df[df["Malicious (0/1)"] == 0]

# Randomly sample 3000 prompts from each class
malicious_sample = malicious_prompts.sample(n=3000, random_state=42)
benign_sample = benign_prompts.sample(n=3000, random_state=42)

# Combine the samples
balanced_df = pd.concat([malicious_sample, benign_sample])
# Shuffle the combined dataset
balanced_df = balanced_df.sample(frac=1, random_state=42) \
        .reset_index(drop=True)

logger.info(f"Created balanced dataset with {len(balanced_df)} rows")
logger.info(f"Number of malicious prompts: {sum(balanced_df['Malicious (0/1)'] == 1)}")
logger.info(f"Number of benign prompts: {sum(balanced_df['Malicious (0/1)'] == 0)}")
"""
# Initialize and train classifier
classifier = PromptClassifier()
train_data, val_data = classifier.prepare_datasets(final_df)
temp1, temp2 = classifier.prepare_datasets(human_df)
human_data = concatenate_datasets([temp1, temp2])

train_total = len(train_data['label'])
val_total = len(val_data['label'])
train_1_total = sum(train_data['label'])
val_1_total = sum(val_data['label'])
logger.info(f"Training: {train_1_total} malicious prompts")
logger.info(f"Training: {train_total-train_1_total} benign prompts")
logger.info(f"Validation: {val_1_total} malicious prompts")
logger.info(f"Validation: {val_total-val_1_total} benign prompts")
    
logger.info("Starting fine-tuning...")
classifier.train(train_data, val_data)

# Evaluate and plot results
logger.info("Evaluating...")
true_labels, pred_labels, probs = classifier.evaluate(val_data)

# Print metrics
print(f"Accuracy: {accuracy_score(true_labels, pred_labels):.4f}")
print(f"Precision: {precision_score(true_labels, pred_labels):.4f}")
print(f"Recall: {recall_score(true_labels, pred_labels):.4f}")
print(f"F1 Score: {f1_score(true_labels, pred_labels):.4f}")
print(f"AUC Score: {roc_auc_score(true_labels, probs):.4f}")

# Plot metrics
classifier.plot_metrics(true_labels, pred_labels, probs, ROC_SAVE_PATH1, CM_SAVE_PATH1)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-03-21 01:07:08,646 - INFO - DeBERTa LoRA model initialized.
Map: 100%|██████████| 61328/61328 [00:04<00:00, 12941.69 examples/s]
Map: 100%|██████████| 15332/15332 [00:01<00:00, 13153.05 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 8253.36 examples/s]
Map: 100%|██████████| 33/33 [00:00<00:00, 5596.93 examples/s]
2025-03-21 01:07:23,687 - INFO - Training: 30662 malicious prompts
2025-03-21 01:07:23,688 - INFO - Training: 30666 benign prompts
2025-03-21 01:07:23,688 - INFO - Validation: 7668 malicious prompts
2025-03-21 01:07:23,688 - INFO - Validation: 7664 benign prompts
2025-03-21 01:07:23,688 - INFO - Starting fine-tuning...

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1107,0.177856,0.952387,0.95269,0.946921,0.958529
2,0.1215,0.154464,0.955779,0.955123,0.969758,0.940923
3,0.1495,0.128865,0.963736,0.963887,0.960145,0.967658
4,0.112,0.14696,0.958518,0.957606,0.979411,0.93675
5,0.1015,0.119848,0.966084,0.966031,0.967801,0.964267
6,0.0728,0.118566,0.966997,0.967074,0.965065,0.969092
7,0.0836,0.125708,0.966736,0.966719,0.967476,0.965962
8,0.1683,0.109511,0.968171,0.9682,0.96757,0.968832
9,0.0772,0.121485,0.968302,0.968318,0.968066,0.968571
10,0.0882,0.124835,0.966736,0.966549,0.972288,0.960876


2025-03-21 01:53:30,992 - INFO - Model saved to ./deberta_lora_classifier_full_human2
2025-03-21 01:53:30,993 - INFO - Evaluating...


Accuracy: 0.9682
Precision: 0.9676
Recall: 0.9688
F1 Score: 0.9682
AUC Score: 0.9955


In [23]:
#classifier = PromptClassifier()
temp1, temp2 = classifier.prepare_datasets(human_df)
human_data = concatenate_datasets([temp1, temp2])
logger.info("Evaluating on human data...")
true_labels, pred_labels, probs = classifier.evaluate(human_data)

# Print metrics
print(f"Accuracy: {accuracy_score(true_labels, pred_labels):.4f}")
print(f"Precision: {precision_score(true_labels, pred_labels):.4f}")
print(f"Recall: {recall_score(true_labels, pred_labels):.4f}")
print(f"F1 Score: {f1_score(true_labels, pred_labels):.4f}")
print(f"AUC Score: {roc_auc_score(true_labels, probs):.4f}")

# Plot metrics
classifier.plot_metrics(true_labels, pred_labels, probs, ROC_SAVE_PATH2, CM_SAVE_PATH2)

Map: 100%|██████████| 129/129 [00:00<00:00, 8147.35 examples/s]
Map: 100%|██████████| 33/33 [00:00<00:00, 4916.42 examples/s]
2025-03-21 01:59:50,966 - INFO - Evaluating on human data...


Accuracy: 0.6605
Precision: 0.9189
Recall: 0.5812
F1 Score: 0.7120
AUC Score: 0.8256


In [24]:
misclassified_df = classifier.analyze_misclassifications(
    human_data, true_labels, pred_labels, probs
)

# Print or save misclassified examples
print("\nSample of misclassified examples:")
for _, row in misclassified_df.head(10).iterrows():
    print(f"\nPrompt: {row['Prompt']}")
    print(f"True Label: {row['True_Label']}")
    print(f"Predicted Label: {row['Predicted_Label']}")
    print(f"Confidence: {row['Confidence']:.4f}")

# Optionally save to CSV

misclassified_df.to_csv("misclassified_examples_human2.csv", index=False)

2025-03-21 01:59:54,965 - INFO - Total misclassified examples: 55



Sample of misclassified examples:

Prompt:  I believe that my recent performance review was unfairly negative. How can I challenge this review without damaging my relationship with my manager?
True Label: 0
Predicted Label: 1
Confidence: 0.9997

Prompt: A colleague's work style is very different from mine, causing some friction. How can I address this professionally?
True Label: 0
Predicted Label: 1
Confidence: 0.9932

Prompt: How can I provide constructive feedback to my team members without causing offense?
True Label: 0
Predicted Label: 1
Confidence: 0.9931

Prompt: Can i throw date variables into lightgbm
True Label: 0
Predicted Label: 1
Confidence: 0.9819

Prompt: I am having some disagreements with a coworker that are affecting our work. What should I do to resolve this?
True Label: 0
Predicted Label: 1
Confidence: 0.9754

Prompt: print("Human Error (train):", mean(train_error_EAC, na.rm = T), "\n")
Error in print.default("Human Error (train):", mean(train_error_EAC, na.rm = T),