In [5]:
import torch
# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✓ Using device: {device}")
if torch.cuda.is_available():
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("  ⚠ No GPU detected. Training will be slower on CPU.")

✓ Using device: cuda
  GPU: Tesla T4
  Memory: 15.83 GB


In [6]:
# ---------------------------------------------------------
# PREREQUISITES: Install these libraries first
# !pip install transformers datasets evaluate scikit-learn accelerate torch
# ---------------------------------------------------------

import pandas as pd
import numpy as np
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import evaluate

# ==========================================
# 1. CONFIGURATION
# ==========================================
MODEL_NAME = "microsoft/deberta-v3-base"  # Change to "base" if you run out of GPU memory
MAX_LENGTH = 512
BATCH_SIZE = 4 # DeBERTa-large is heavy; use 4 or 8. For 'base' you can use 16.

# ==========================================
# 2. DATA LOADING & PREPROCESSING
# ==========================================
# Load data
df = pd.read_csv("/content/Without_embeddings.csv")

# Encode Labels (Text -> Integers)
le = LabelEncoder()
df['label'] = le.fit_transform(df['final_label'])
num_labels = len(le.classes_)
label_mapping = dict(zip(le.classes_, range(len(le.classes_))))
id2label = {i: label for label, i in label_mapping.items()}
label2id = label_mapping

print(f"Labels found: {label_mapping}")

# Compute Class Weights (CRITICAL for your imbalanced data)
# This calculates how much 'attention' the model should pay to each class.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df['label']),
    y=df['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Split Data
train_df, val_df = train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)

# Convert to Hugging Face Dataset
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(val_df)
})

# ==========================================
# 3. TOKENIZATION
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    return tokenizer(
        examples["English MT"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length"
    )

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# ==========================================
# 4. CUSTOM TRAINER (To handle Class Imbalance)
# ==========================================
# We override the loss function to include our calculated weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs): # Correct signature
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Define weighted loss
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# ==========================================
# 5. MODEL SETUP & TRAINING
# ==========================================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./bias_detection_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,               # Low learning rate is best for DeBERTa
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,                        # Use mixed precision (faster on GPU)
    logging_steps=50,
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# START TRAINING
print("Starting training...")
trainer.train()

# ==========================================
# 6. SAVE & PREDICT
# ==========================================
trainer.save_model("./final_bias_model")
print("Model saved to ./final_bias_model")

# Example Prediction
text = "The israeli occupation forces attacked civilians."
inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
predicted_class_id = logits.argmax().item()
print(f"Prediction: {id2label[predicted_class_id]}")

Labels found: {'Biased against Israel': 0, 'Biased against Palestine': 1, 'Biased against both Palestine and Israel': 2, 'Biased against others': 3, 'Unbiased': 4, 'Unclear': 5}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/6443 [00:00<?, ? examples/s]

Map:   0%|          | 0/1138 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Starting training...


[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy
1,1.3763,1.38002,0.442882
2,1.4308,1.363794,0.359402
3,1.2477,1.369236,0.435852
4,1.1473,1.476515,0.467487
5,1.1095,1.603106,0.458699


Model saved to ./final_bias_model
Prediction: Unclear


In [2]:
!pip install transformers datasets evaluate scikit-learn accelerate torch

