# DeBERTa-v3 Large Fine-Tuning and Prediction Script
## Optimized for Google Colab

In [None]:
# ================================
# Step 0: Disable W&B Logging
# ================================

import os

# Switch Weights & Biases off via the environment; this runs before the
# library imports further down in the cell.
os.environ.update({"WANDB_DISABLED": "true"})

# ================================
# Step 1: Install and Import Libraries
# ================================

# Install required libraries.
# The leading "!" is an IPython/Colab shell escape, so this line only works
# inside a notebook session and is not valid in a plain Python script.
# NOTE(review): imbalanced-learn is not imported in the cells shown here;
# confirm it is still needed.
!pip install --upgrade transformers datasets scikit-learn imbalanced-learn

# Import libraries and verify versions
import transformers
import datasets
import sklearn
import torch

# Report the installed version of each core library, one line per library
for lib_label, lib_module in (
    ("Transformers", transformers),
    ("Datasets", datasets),
    ("Scikit-learn", sklearn),
    ("PyTorch", torch),
):
    print(f"{lib_label} version: {lib_module.__version__}")

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')


# Change directory to your project folder in Google Drive (if applicable)
# %cd /content/drive/MyDrive/your_project_folder

# ================================
# Step 2: Load the Data
# ================================

import pandas as pd
from datasets import Dataset, DatasetDict

# Read the three tab-separated splits (train / validation / test) in order
train_df, val_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('train_data.tsv', 'validation_data.tsv', 'test_data.tsv')
)

# ================================
# Step 3: Prepare the Datasets
# ================================

def preprocess_dataframe(df):
    """Return a copy of *df* whose 'text' column holds one value per row.

    The input DataFrame is not modified.

    Normalisation rules applied to each cell of the 'text' column:
      * a non-empty list is replaced by its first element;
      * an empty list is replaced by '' (previously this raised IndexError);
      * a missing value (None / NaN / pd.NA) is replaced by '' so the row is
        kept, in line with its label, instead of passing a non-string on;
      * anything else is returned unchanged.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        A new DataFrame with the normalised 'text' column.

    Raises:
        KeyError: if *df* has no 'text' column.
    """
    def _to_single_text(value):
        if isinstance(value, list):
            # Guard the empty list: value[0] would raise IndexError.
            return value[0] if value else ''
        # is_scalar() keeps pd.isna() from being applied to array-likes,
        # where it would return an array instead of a single bool.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return value

    df = df.copy()
    df['text'] = df['text'].apply(_to_single_text)
    return df

# Normalise the 'text' column of every split
train_df, val_df, test_df = (
    preprocess_dataframe(frame) for frame in (train_df, val_df, test_df)
)

# Wrap each DataFrame in a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their conventional names
dataset = DatasetDict({
    'train': train_dataset, 'validation': val_dataset, 'test': test_dataset,
})

# ================================
# Step 4: Encode the Labels and Compute Class Weights
# ================================

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Collect the distinct training labels in sorted order
class_labels = sorted(train_df['label'].unique())
print(f"Unique class labels: {class_labels}")

# Position in the sorted list is the class ID; build both lookup directions
label_to_id = dict(zip(class_labels, range(len(class_labels))))
id_to_label = dict(enumerate(class_labels))

print(f"Label to ID mapping: {label_to_id}")
print(f"ID to Label mapping: {id_to_label}")

# Encode labels in the dataset
def encode_labels(example):
    """Add an integer 'labels' field to *example* from its 'label' value.

    The ID is looked up in the module-level ``label_to_id`` mapping; a label
    missing from that mapping raises KeyError. The (mutated) example is
    returned.
    """
    class_id = label_to_id[example['label']]
    example['labels'] = class_id
    return example

# Add the integer 'labels' column to every split
dataset = dataset.map(encode_labels)

# Compute class weights based on the training set.
# The column is materialised as a NumPy array explicitly: depending on the
# installed `datasets` version the column accessor returns either a plain
# list or a lazy Column object, and scikit-learn expects an array-like.
train_labels = np.asarray(dataset['train']['labels'])
present_classes = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=train_labels
)

# Convert class weights to a float tensor and move it to the selected device
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print(f"Class weights: {class_weights_tensor}")

# ================================
# Step 5: Tokenize the Text
# ================================

from transformers import AutoTokenizer

# Load the tokenizer that matches the DeBERTa-v3 Large checkpoint
tokenizer_checkpoint = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Define the tokenization function
def tokenize_function(examples):
    """Tokenize the 'text' field of *examples* with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 256,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# Run the tokenizer over every split, one batch of rows at a time
dataset = dataset.map(function=tokenize_function, batched=True)

# ================================
# Step 6: Set Format for PyTorch
# ================================

# Expose only the model inputs and the labels, returned as torch tensors
torch_columns = ['input_ids', 'attention_mask', 'labels']
dataset.set_format(type='torch', columns=torch_columns)

# ================================
# Step 7: Initialize the Model
# ================================

from transformers import AutoModelForSequenceClassification

# One output unit per class seen in the training labels
num_labels = len(class_labels)

# Load the pretrained checkpoint with a classification head, then place it
# on the selected device
model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-large', num_labels=num_labels
)
model = model.to(device)

# ================================
# Step 8: Define the Metrics
# ================================

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    *p* must expose ``predictions`` (scores with classes on axis 1) and
    ``label_ids`` (the true class IDs). Undefined ratios count as 0.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    weighted_scores = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    # The fourth element (support) is not reported
    precision, recall, f1 = weighted_scores[:3]
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

# ================================
# Step 9: Set Up Training Arguments
# ================================

from transformers import TrainingArguments

# Training configuration.
# - Evaluation and checkpointing both happen once per epoch, and the
#   checkpoint with the highest 'f1' is reloaded when training ends.
# - Effective batch size per device is 2 * 8 = 16 through gradient
#   accumulation.
# - fp16 mixed precision is enabled only when CUDA is available: the script
#   otherwise falls back to the CPU, where requesting fp16 makes
#   TrainingArguments raise instead of training.
training_args = TrainingArguments(
    output_dir='./deberta_results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    num_train_epochs=18,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_dir='./deberta_logs',
    save_total_limit=2,
    gradient_accumulation_steps=8,
    fp16=torch.cuda.is_available(),  # Mixed precision only on a GPU
    gradient_checkpointing=True,
    report_to=[],                  # Disable all logging integrations
    logging_steps=50,             # Ensure frequent logging
)

# ================================
# Step 10: Define a Custom Trainer with Class Weights
# ================================

from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    Args:
        class_weights: 1-D float tensor with one weight per class, passed as
            the ``weight`` argument of ``torch.nn.CrossEntropyLoss``.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: mapping of batch tensors; must contain "labels".
            return_outputs: when True, return ``(loss, outputs)``.
            **kwargs: extra arguments passed by the Trainer; accepted and
                ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if *return_outputs*.
        """
        labels = inputs.get("labels")
        # Keep the labels out of the forward pass so the model does not also
        # compute its own, unweighted loss that would then be discarded.
        # A filtered copy is used so the caller's batch is left untouched.
        model_inputs = {
            name: value for name, value in inputs.items() if name != "labels"
        }
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits; the loss
        # function requires both on one device.
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=self.class_weights.to(logits.device)
        )
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Build the class-weighted trainer from the pieces prepared above
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': dataset['train'],
    'eval_dataset': dataset['validation'],
    'compute_metrics': compute_metrics,
}
trainer = CustomTrainer(class_weights=class_weights_tensor, **trainer_options)

# ================================
# Step 11: Train the Model
# ================================

# Run the full fine-tuning loop
trainer.train()

# ================================
# Step 12: Evaluate the Model
# ================================

# Score the trained model on the validation split
validation_split = dataset['validation']
print("Validation Results:")
eval_results = trainer.evaluate(eval_dataset=validation_split)
print(eval_results)

# Score the same model on the held-out test split
test_split = dataset['test']
print("Test Results:")
test_results = trainer.evaluate(eval_dataset=test_split)
print(test_results)

# ================================
# Step 13: Make Predictions on the Test Set
# ================================

import numpy as np

# Run inference on the test split; the result unpacks into raw scores,
# true class IDs and a metrics dict
prediction_output = trainer.predict(dataset['test'])
predictions, labels, metrics = prediction_output

# Highest-scoring class per row
preds = np.argmax(predictions, axis=1)

# Translate class IDs back into the original label values
predicted_labels = [id_to_label[class_id] for class_id in preds]
true_labels = [id_to_label[class_id] for class_id in labels]

# Attach both columns to the test DataFrame
test_df['predicted_label'] = predicted_labels
test_df['true_label'] = true_labels

# Persist the annotated test set (without the index column)
output_csv = 'deberta_test_predictions.csv'
test_df.to_csv(output_csv, index=False)

# ================================
# Step 14: Analyze the Results
# ================================

print("Classification Report:")
# labels= pins the report to the full class list, so target_names always has
# a matching length even when a class is absent from the test split (without
# it scikit-learn raises a size-mismatch ValueError).
# target_names must be strings for the report layout, so non-string labels
# are converted.
# zero_division=0 matches the setting used when computing the other metrics.
print(classification_report(
    test_df['true_label'],
    test_df['predicted_label'],
    labels=class_labels,
    target_names=[str(class_label) for class_label in class_labels],
    zero_division=0
))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, classification_report


# Reduce raw scores to class IDs. The dimensionality guard makes this cell
# safe to re-run: after the first run `predictions` already holds 1-D class
# IDs, and calling argmax with axis=1 on them again would raise.
if np.ndim(predictions) > 1:
    predictions = np.argmax(predictions, axis=1)
true_labels = labels  # True class IDs

# Map IDs back to class labels
predicted_labels = [id_to_label[pred] for pred in predictions]
true_labels_text = [id_to_label[true] for true in true_labels]

# Class names for the plots and the report: every class that occurs among
# the true OR the predicted labels. Using the true labels alone would drop a
# class the model predicted but the test split lacks, and the final
# classification report would then fail on a target_names size mismatch.
class_labels = sorted(set(true_labels_text) | set(predicted_labels))

# ============================
# Step 1: Confusion Matrix Heatmap
# ============================
# Count (true, predicted) pairs, with rows and columns ordered by class_labels
cm = confusion_matrix(
    y_true=true_labels_text, y_pred=predicted_labels, labels=class_labels
)

# Draw the counts as an annotated heatmap on a fresh 8x6 figure
plt.figure(figsize=(8, 6))
heatmap_options = {
    'annot': True,
    'fmt': "d",
    'cmap': "Blues",
    'cbar': False,
    'xticklabels': class_labels,
    'yticklabels': class_labels,
}
cm_axes = sns.heatmap(cm, **heatmap_options)
cm_axes.set_title("Confusion Matrix")
cm_axes.set_xlabel("Predicted Label")
cm_axes.set_ylabel("True Label")
plt.tight_layout()
plt.show()

# ============================
# Step 2: Precision, Recall, and F1-Score Bar Chart
# ============================
# Per-class precision / recall / F1 and support, in class_labels order
per_class_scores = precision_recall_fscore_support(
    true_labels_text, predicted_labels, labels=class_labels, zero_division=0
)
precision, recall, f1_score, support = per_class_scores

x = np.arange(len(class_labels))
width = 0.2

fig, ax = plt.subplots(figsize=(8, 6))

# Three bars per class, placed side by side around each tick position
bar1, bar2, bar3 = (
    ax.bar(x + offset, scores, width, label=series_name)
    for offset, scores, series_name in (
        (-width, precision, 'Precision'),
        (0, recall, 'Recall'),
        (width, f1_score, 'F1-Score'),
    )
)

# Write each bar's value, to two decimals, just above the bar
for bars in (bar1, bar2, bar3):
    for bar in bars:
        height = bar.get_height()
        label_anchor = (bar.get_x() + bar.get_width() / 2, height)
        ax.annotate(
            format(height, '.2f'),
            xy=label_anchor,
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

ax.set_xlabel('Class Labels')
ax.set_ylabel('Scores')
ax.set_title('Performance Metrics per Class')
ax.set_xticks(x)
ax.set_xticklabels(class_labels)
ax.legend()

fig.tight_layout()
plt.show()

# ============================
# Step 3: Class Distribution Pie Chart
# ============================
# Pie chart of the per-class `support` values, labelled by class name
fig, ax = plt.subplots(figsize=(6, 6))
wedge_colors = ['lightblue', 'lightgreen', 'salmon']
pie_options = {
    'labels': class_labels,
    'autopct': '%1.1f%%',
    'startangle': 90,
    'colors': wedge_colors,
}
ax.pie(support, **pie_options)
ax.set_title('Class Distribution')
fig.tight_layout()
plt.show()

# Print updated classification report.
# labels= ties the report rows to class_labels so that target_names always
# has the matching length (scikit-learn raises ValueError on a mismatch).
# target_names must be strings, so non-string labels are converted.
# zero_division=0 matches the setting used for the per-class scores above.
print("Updated Classification Report:")
print(classification_report(
    true_labels_text,
    predicted_labels,
    labels=class_labels,
    target_names=[str(class_label) for class_label in class_labels],
    zero_division=0
))