In [None]:
"""
XLM-RoBERTa Standalone Testing for FIGNEWS-2024
================================================
Isolated script to test xlm-roberta-base on bias detection task.
This allows quick experimentation without running the full ensemble pipeline.
"""

import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from collections import Counter
from typing import Dict, List, Tuple

# Core ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
import torch

print("="*80)
print("XLM-ROBERTA STANDALONE TESTING")
print("="*80)


# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Configuration for XLM-RoBERTa testing"""

    # File paths
    MAIN_FILE = "/content/Main.xlsx"
    IAA_FILES = ["/content/IAA-1.xlsx", "/content/IAA-2.xlsx", "/content/IAA-3.xlsx", "/content/IAA-4.xlsx"]

    # Label mapping
    LABEL_MAP = {
        'Unbiased': 'Unbiased',
        'Biased against Palestine': 'Biased Against Palestine',
        'Biased Against Palestine': 'Biased Against Palestine',
        'Biased against Israel': 'Biased Against Israel',
        'Biased Against Israel': 'Biased Against Israel',
        'Unclear': 'Others',
        'Biased against others': 'Others',
        'Biased against both': 'Others',
        'Others': 'Others'
    }

    TARGET_LABELS = ['Unbiased', 'Biased Against Palestine', 'Biased Against Israel', 'Others']
    LABEL2ID = {label: idx for idx, label in enumerate(TARGET_LABELS)}
    ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}

    # Model configuration
    MODEL_NAME = "xlm-roberta-base"

    # Training parameters
    IAA_TRAIN_SPLIT = 0.8
    RANDOM_STATE = 42

    # Training arguments
    TRAINING_ARGS = {
        "output_dir": "./results_xlm_roberta",
        "num_train_epochs": 4,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "eval_strategy": "epoch",  # Changed from evaluation_strategy
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1_macro",
        "logging_steps": 50,
        "warmup_steps": 100,
        "save_total_limit": 2,
    }


# ============================================================================
# DATA LOADING & PREPROCESSING
# ============================================================================

def load_and_clean_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load MAIN and IAA files, filter out rows with missing Bias labels."""
    print("\n[STEP 1] Loading and cleaning data...")

    # Load MAIN file
    print(f"  Loading {Config.MAIN_FILE}...")
    main_df = pd.read_excel(Config.MAIN_FILE)
    initial_main_count = len(main_df)

    # Filter out missing Bias labels from MAIN
    main_df = main_df[main_df['Bias'].notna() & (main_df['Bias'] != '')]
    filtered_main_count = len(main_df)
    print(f"    MAIN: {initial_main_count} rows → {filtered_main_count} rows (removed {initial_main_count - filtered_main_count} with missing labels)")

    # Load and concatenate IAA files
    iaa_dfs = []
    for iaa_file in Config.IAA_FILES:
        if os.path.exists(iaa_file):
            print(f"  Loading {iaa_file}...")
            iaa_df_temp = pd.read_excel(iaa_file)
            initial_count = len(iaa_df_temp)

            # Check both 'Bias' and 'Bais' (typo in IAA files)
            if 'Bais' in iaa_df_temp.columns:
                iaa_df_temp['Bias'] = iaa_df_temp['Bais']

            iaa_df_temp = iaa_df_temp[iaa_df_temp['Bias'].notna() & (iaa_df_temp['Bias'] != '')]
            filtered_count = len(iaa_df_temp)

            if filtered_count > 0:
                iaa_dfs.append(iaa_df_temp)
                print(f"    {iaa_file}: {initial_count} rows → {filtered_count} rows")
            else:
                print(f"    {iaa_file}: No valid labels found, skipping.")
        else:
            print(f"  Warning: {iaa_file} not found, skipping.")

    # Concatenate all IAA data
    if iaa_dfs:
        iaa_df = pd.concat(iaa_dfs, ignore_index=True)
        print(f"\n  Total IAA data: {len(iaa_df)} rows with valid labels")
    else:
        print("\n  Warning: No valid IAA data found!")
        iaa_df = pd.DataFrame()

    return main_df, iaa_df


def map_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Map raw bias labels to standardized 4-class labels."""
    df = df.copy()
    df['Bias_Mapped'] = df['Bias'].map(Config.LABEL_MAP)

    # Handle any unmapped labels
    unmapped = df[df['Bias_Mapped'].isna()]['Bias'].unique()
    if len(unmapped) > 0:
        print(f"  Warning: Unmapped labels found: {unmapped}")
        print(f"  Mapping these to 'Others'")
        df['Bias_Mapped'] = df['Bias_Mapped'].fillna('Others')

    return df


def apply_majority_vote(df: pd.DataFrame) -> pd.DataFrame:
    """Apply majority voting to IAA data grouped by Text/ID."""
    print("\n  Applying majority vote to collapse duplicate annotations...")

    # Create a unique text identifier
    df['Text_ID'] = df['ID'].astype(str) + "_" + df['Text'].str[:50]

    gold_standard_rows = []

    for text_id, group in df.groupby('Text_ID'):
        # Get most common label (mode)
        labels = group['Bias_Mapped'].tolist()
        label_counts = Counter(labels)
        majority_label = label_counts.most_common(1)[0][0]

        # Take first row and update its label
        gold_row = group.iloc[0].copy()
        gold_row['Bias_Mapped'] = majority_label
        gold_row['Annotator_Count'] = len(group)
        gold_standard_rows.append(gold_row)

    result_df = pd.DataFrame(gold_standard_rows)
    print(f"    Collapsed {len(df)} annotations → {len(result_df)} unique texts")

    return result_df


def create_train_test_split(main_df: pd.DataFrame, iaa_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Create training and test sets following the silver/gold strategy."""
    print("\n[STEP 2] Creating train/test splits...")

    # --- UPDATED LOGIC START ---
    # Get unique text IDs from IAA (use ID if Text_ID doesn't exist yet)
    unique_ids = iaa_df['Text_ID'].unique() if 'Text_ID' in iaa_df.columns else iaa_df['ID'].unique()
    print(f"  Total unique IAA texts: {len(unique_ids)}")

    # Split by IDs (80/20)
    train_ids, test_ids = train_test_split(
        unique_ids,
        test_size=(1 - Config.IAA_TRAIN_SPLIT),
        random_state=Config.RANDOM_STATE,
        stratify=None
    )

    print(f"  IAA split: {len(train_ids)} train IDs, {len(test_ids)} test IDs")

    # Split IAA data (Using fallback logic for filtering)
    if 'Text_ID' in iaa_df.columns:
        iaa_train = iaa_df[iaa_df['Text_ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['Text_ID'].isin(test_ids)].copy()
    else:
        iaa_train = iaa_df[iaa_df['ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['ID'].isin(test_ids)].copy()
    # --- UPDATED LOGIC END ---

    # Apply majority vote to both
    iaa_train_collapsed = apply_majority_vote(iaa_train)
    iaa_test_collapsed = apply_majority_vote(iaa_test)

    # Combine MAIN + IAA_train for training set
    train_df = pd.concat([main_df, iaa_train_collapsed], ignore_index=True)
    test_df = iaa_test_collapsed

    print(f"\n  Final training set: {len(train_df)} samples")
    print(f"    - From MAIN (silver): {len(main_df)}")
    print(f"    - From IAA (gold): {len(iaa_train_collapsed)}")
    print(f"  Final test set: {len(test_df)} samples (gold standard)")

    # Print class distribution
    print("\n  Training set class distribution:")
    print(train_df['Bias_Mapped'].value_counts())
    print("\n  Test set class distribution:")
    print(test_df['Bias_Mapped'].value_counts())

    return train_df, test_df


# ============================================================================
# XLM-ROBERTA MODEL
# ============================================================================

def compute_metrics(eval_pred):
    """Compute metrics for training."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    acc = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': acc,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }


def prepare_dataset(df: pd.DataFrame, tokenizer, text_column: str = 'Text'):
    """Prepare dataset for transformer training."""

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=512
        )

    # Prepare data
    data_dict = {
        'text': df[text_column].tolist(),
        'label': df['Bias_Mapped'].map(Config.LABEL2ID).tolist()
    }

    dataset = Dataset.from_dict(data_dict)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    return tokenized_dataset


class XLMRoBERTaBiasDetector:
    """XLM-RoBERTa model for bias detection."""

    def __init__(self):
        self.tokenizer = None
        self.model = None
        self.trainer = None
        print(f"\n[MODEL] Initializing XLM-RoBERTa Bias Detector")
        print(f"  Model: {Config.MODEL_NAME}")

    def train(self, train_df: pd.DataFrame, eval_df: pd.DataFrame = None):
        """Train the model."""
        print("\n[STEP 3] Training XLM-RoBERTa...")

        # Load tokenizer and model
        print(f"  Loading tokenizer and model: {Config.MODEL_NAME}...")
        self.tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            Config.MODEL_NAME,
            num_labels=len(Config.TARGET_LABELS),
            id2label=Config.ID2LABEL,
            label2id=Config.LABEL2ID
        )

        # Prepare datasets
        print("  Preparing datasets...")
        train_dataset = prepare_dataset(train_df, self.tokenizer)
        eval_dataset = None
        if eval_df is not None and len(eval_df) > 0:
            eval_dataset = prepare_dataset(eval_df, self.tokenizer)

        # Training arguments
        training_args = TrainingArguments(
            **Config.TRAINING_ARGS,
            report_to="none"
        )

        # Trainer
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] if eval_dataset else None
        )

        # Train
        print("\n  Starting training...")
        print(f"    Epochs: {Config.TRAINING_ARGS['num_train_epochs']}")
        print(f"    Batch size: {Config.TRAINING_ARGS['per_device_train_batch_size']}")
        print(f"    Learning rate: {Config.TRAINING_ARGS['learning_rate']}")

        self.trainer.train()

        print("\n  ✓ Training completed successfully!")

    def predict(self, texts: List[str]) -> List[str]:
        """Predict labels for texts."""
        if not texts:
            return []

        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

        return [Config.ID2LABEL[pred.item()] for pred in predictions]

    def predict_proba(self, texts: List[str]) -> np.ndarray:
        """Predict probabilities for each class."""
        if not texts:
            return np.array([])

        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)

        return probs.numpy()

    def evaluate(self, test_df: pd.DataFrame):
        """Evaluate model on test set with detailed metrics."""
        print("\n[STEP 4] Evaluating XLM-RoBERTa...")

        texts = test_df['Text'].tolist()
        true_labels = test_df['Bias_Mapped'].tolist()

        # Get predictions
        print("  Generating predictions...")
        predictions = self.predict(texts)

        # Overall metrics
        print("\n" + "="*80)
        print("CLASSIFICATION REPORT")
        print("="*80)
        print(classification_report(
            true_labels,
            predictions,
            target_names=Config.TARGET_LABELS,
            digits=4
        ))

        acc = accuracy_score(true_labels, predictions)
        f1_macro = f1_score(true_labels, predictions, average='macro')
        f1_weighted = f1_score(true_labels, predictions, average='weighted')

        print("\n" + "="*80)
        print("OVERALL METRICS")
        print("="*80)
        print(f"  Accuracy:        {acc:.4f}")
        print(f"  Macro F1-Score:  {f1_macro:.4f}")
        print(f"  Weighted F1:     {f1_weighted:.4f}")

        # Language-specific analysis
        if 'Source Language' in test_df.columns:
            print("\n" + "="*80)
            print("LANGUAGE-SPECIFIC PERFORMANCE")
            print("="*80)

            for lang in ['Arabic', 'English']:
                mask = test_df['Source Language'].str.contains(lang, case=False, na=False)
                if mask.sum() > 0:
                    lang_true = [true_labels[i] for i in range(len(true_labels)) if mask.iloc[i]]
                    lang_pred = [predictions[i] for i in range(len(predictions)) if mask.iloc[i]]

                    lang_acc = accuracy_score(lang_true, lang_pred)
                    lang_f1 = f1_score(lang_true, lang_pred, average='macro')

                    print(f"\n  {lang}:")
                    print(f"    Samples: {len(lang_true)}")
                    print(f"    Accuracy: {lang_acc:.4f}")
                    print(f"    Macro F1: {lang_f1:.4f}")

        # Confusion matrix
        print("\n" + "="*80)
        print("CONFUSION MATRIX")
        print("="*80)
        cm = confusion_matrix(true_labels, predictions, labels=Config.TARGET_LABELS)
        print("\nTrue (rows) vs Predicted (columns):")
        cm_df = pd.DataFrame(cm, index=Config.TARGET_LABELS, columns=Config.TARGET_LABELS)
        print(cm_df)

        return {
            'predictions': predictions,
            'accuracy': acc,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'confusion_matrix': cm
        }


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Main execution function."""

    # Set random seeds
    np.random.seed(Config.RANDOM_STATE)
    torch.manual_seed(Config.RANDOM_STATE)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(Config.RANDOM_STATE)

    # Load and prepare data
    main_df, iaa_df = load_and_clean_data()

    # Map labels
    print("\n[PREPROCESSING] Mapping labels...")
    main_df = map_labels(main_df)
    if len(iaa_df) > 0:
        iaa_df = map_labels(iaa_df)

    # Create splits
    train_df, test_df = create_train_test_split(main_df, iaa_df)

    # Initialize and train model
    model = XLMRoBERTaBiasDetector()
    model.train(train_df, test_df)

    # Evaluate
    if len(test_df) > 0:
        results = model.evaluate(test_df)
    else:
        print("\n  Warning: No test data available for evaluation")
        results = None

    print("\n" + "="*80)
    print("TESTING COMPLETE")
    print("="*80)

    return model, results


# ============================================================================
# INFERENCE EXAMPLES
# ============================================================================

def test_predictions(model: XLMRoBERTaBiasDetector):
    """Test the model with example texts."""
    print("\n" + "="*80)
    print("EXAMPLE PREDICTIONS")
    print("="*80)

    sample_texts = [
        # English examples
        "Hamas terrorists launched brutal attacks on innocent Israeli civilians.",
        "Israeli forces killed dozens of Palestinian civilians in Gaza today.",
        "The conflict continues with casualties on both sides.",

        # Arabic examples
        "الاحتلال الإسرائيلي يقصف المدنيين في غزة",
        "حماس تطلق صواريخ على المدن الإسرائيلية",
        "استمرار الصراع مع سقوط ضحايا من الجانبين"
    ]

    predictions = model.predict(sample_texts)
    probabilities = model.predict_proba(sample_texts)

    for i, (text, pred, probs) in enumerate(zip(sample_texts, predictions, probabilities), 1):
        print(f"\n[Example {i}]")
        print(f"  Text: {text}")
        print(f"  Prediction: {pred}")
        print(f"  Confidence:")
        for label, prob in zip(Config.TARGET_LABELS, probs):
            print(f"    {label}: {prob:.4f}")


if __name__ == "__main__":
    # Run main pipeline
    model, results = main()

    # Test with examples
    if model is not None:
        test_predictions(model)

XLM-ROBERTA STANDALONE TESTING

[STEP 1] Loading and cleaning data...
  Loading /content/Main.xlsx...
    MAIN: 13500 rows → 10800 rows (removed 2700 with missing labels)
  Loading /content/IAA-1.xlsx...
    /content/IAA-1.xlsx: 1200 rows → 1200 rows
  Loading /content/IAA-2.xlsx...
    /content/IAA-2.xlsx: 1200 rows → 1200 rows
  Loading /content/IAA-3.xlsx...
    /content/IAA-3.xlsx: No valid labels found, skipping.
  Loading /content/IAA-4.xlsx...
    /content/IAA-4.xlsx: No valid labels found, skipping.

  Total IAA data: 2400 rows with valid labels

[PREPROCESSING] Mapping labels...
  Mapping these to 'Others'
  Mapping these to 'Others'

[STEP 2] Creating train/test splits...
  Total unique IAA texts: 1115
  IAA split: 892 train IDs, 223 test IDs

  Applying majority vote to collapse duplicate annotations...
    Collapsed 1920 annotations → 960 unique texts

  Applying majority vote to collapse duplicate annotations...
    Collapsed 480 annotations → 240 unique texts

  Final tra

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Preparing datasets...


Map:   0%|          | 0/11760 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]


  Starting training...
    Epochs: 4
    Batch size: 16
    Learning rate: 2e-05


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.869,0.814113,0.691667,0.274308,0.604678
2,0.8681,0.794993,0.691667,0.3091,0.635633
3,0.8482,0.73151,0.741667,0.378181,0.695256
4,0.7594,0.706352,0.75,0.394129,0.712648



  ✓ Training completed successfully!

[STEP 4] Evaluating XLM-RoBERTa...
  Generating predictions...


RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)

In [None]:
import os
import torch
import shutil
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. SETUP PATHS
# Based on your logs, this is your best checkpoint
checkpoint_path = "./results_xlm_roberta/checkpoint-2940"
original_model_name = "xlm-roberta-base"
final_output_dir = "./final_xlm_roberta_complete"

print(f"🔧 Fixing model loading...")

# 2. LOAD MODEL (From your fine-tuned checkpoint)
print(f"   Loading WEIGHTS from: {checkpoint_path}")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# 3. LOAD TOKENIZER (From the original base, since it's missing in the checkpoint)
print(f"   Loading TOKENIZER from: {original_model_name}")
tokenizer = AutoTokenizer.from_pretrained(original_model_name)

# 4. SAVE EVERYTHING TOGETHER (This creates the valid folder)
print(f"   Saving complete model to: {final_output_dir}")
model.save_pretrained(final_output_dir)
tokenizer.save_pretrained(final_output_dir)

# 5. VERIFY & ZIP
if os.path.exists(os.path.join(final_output_dir, "sentencepiece.bpe.model")):
    print("   ✅ Tokenizer file successfully restored!")

print("📦 Zipping for download...")
shutil.make_archive("xlm_roberta_bias_model", 'zip', final_output_dir)

print("\n✅ DONE! Download 'xlm_roberta_bias_model.zip' from the files tab.")

# Optional: Test Prediction to ensure it works
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
inputs = tokenizer("Testing the fixed model", return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
print("\nTest Run Successful. Model Output Shape:", outputs.logits.shape)

🔧 Fixing model loading...
   Loading WEIGHTS from: ./results_xlm_roberta/checkpoint-2940
   Loading TOKENIZER from: xlm-roberta-base
   Saving complete model to: ./final_xlm_roberta_complete
   ✅ Tokenizer file successfully restored!
📦 Zipping for download...

✅ DONE! Download 'xlm_roberta_bias_model.zip' from the files tab.

Test Run Successful. Model Output Shape: torch.Size([1, 4])


In [None]:
import os
import torch
import zipfile
import shutil
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ==========================================
# 1. SETUP: UNZIP THE MODEL
# ==========================================
zip_path = "/content/xlm_roberta_bias_model.zip"
extract_path = "./my_production_model"

# Clean up previous extraction if exists
if os.path.exists(extract_path):
    shutil.rmtree(extract_path)

print(f"📦 Unzipping {zip_path}...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"✅ Model extracted to: {extract_path}")

# ==========================================
# 2. LOAD MODEL & TOKENIZER
# ==========================================
print("   Loading model into memory...")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load from the extracted folder
tokenizer = AutoTokenizer.from_pretrained(extract_path)
model = AutoModelForSequenceClassification.from_pretrained(extract_path)
model.to(device)
model.eval()

# Helper to get human-readable labels (Reverse the mapping from your config)
# Update this if your labels are different, but this matches your previous config
id2label = {
    0: 'Unbiased',
    1: 'Biased Against Palestine',
    2: 'Biased Against Israel',
    3: 'Others'
}

# ==========================================
# 3. PREDICTION FUNCTION
# ==========================================
def predict_bias(texts):
    # Tokenize
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    # Move to GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        predictions = torch.argmax(probs, dim=-1)

    # Format results
    results = []
    for i, pred_id in enumerate(predictions):
        pred_label = id2label.get(pred_id.item(), "Unknown")
        confidence = probs[i][pred_id].item()
        results.append((pred_label, confidence))

    return results

# ==========================================
# 4. RUN TEST EXAMPLES
# ==========================================
test_sentences = [
    # English Examples
    "The army neutralized the terrorist threat in the region.",
    "Civilians are suffering due to the brutal blockade.",
    "Peace talks are scheduled to resume next week in Cairo.",

    # Arabic Examples
    "قامت القوات باستهداف المدنيين العزل في قطاع غزة.",  # Biased/Charged language
    "أعلنت وزارة الصحة عن ارتفاع عدد الضحايا.",         # Neutral reporting
    "إسرائيل تدافع عن نفسها ضد الهجمات الصاروخية."      # Biased perspective
]

print("\n" + "="*50)
print("🔎 MODEL INFERENCE TEST")
print("="*50)

predictions = predict_bias(test_sentences)

for text, (label, conf) in zip(test_sentences, predictions):
    print(f"\n📝 Text: {text}")
    print(f"🏷️ Label: {label}")
    print(f"📊 Confidence: {conf:.4f}")

📦 Unzipping /content/xlm_roberta_bias_model.zip...
✅ Model extracted to: ./my_production_model
   Loading model into memory...


The tokenizer you are loading from './my_production_model' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.



🔎 MODEL INFERENCE TEST

📝 Text: The army neutralized the terrorist threat in the region.
🏷️ Label: Biased Against Palestine
📊 Confidence: 0.7837

📝 Text: Civilians are suffering due to the brutal blockade.
🏷️ Label: Unbiased
📊 Confidence: 0.6811

📝 Text: Peace talks are scheduled to resume next week in Cairo.
🏷️ Label: Unbiased
📊 Confidence: 0.8699

📝 Text: قامت القوات باستهداف المدنيين العزل في قطاع غزة.
🏷️ Label: Unbiased
📊 Confidence: 0.6133

📝 Text: أعلنت وزارة الصحة عن ارتفاع عدد الضحايا.
🏷️ Label: Unbiased
📊 Confidence: 0.8293

📝 Text: إسرائيل تدافع عن نفسها ضد الهجمات الصاروخية.
🏷️ Label: Unbiased
📊 Confidence: 0.8706


In [1]:
"""
XLM-RoBERTa Standalone Testing for FIGNEWS-2024 (FIXED VERSION)
================================================================
Fixes applied:
1. Complete label mapping (including 'Biased against both Palestine and Israel' and 'Not Applicable')
2. Class weight balancing using CustomTrainer with weighted CrossEntropyLoss
3. Device management fix for GPU inference
"""

import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from collections import Counter
from typing import Dict, List, Tuple

# Core ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
import torch
import torch.nn as nn

print("="*80)
print("XLM-ROBERTA STANDALONE TESTING (FIXED VERSION)")
print("="*80)


# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Configuration for XLM-RoBERTa testing"""

    # File paths
    MAIN_FILE = "/content/Main.xlsx"
    IAA_FILES = ["/content/IAA-1.xlsx", "/content/IAA-2.xlsx", "/content/IAA-3.xlsx", "/content/IAA-4.xlsx"]

    # Label mapping - FIXED: Added missing mappings
    LABEL_MAP = {
        'Unbiased': 'Unbiased',
        'Biased against Palestine': 'Biased Against Palestine',
        'Biased Against Palestine': 'Biased Against Palestine',
        'Biased against Israel': 'Biased Against Israel',
        'Biased Against Israel': 'Biased Against Israel',
        'Unclear': 'Others',
        'Biased against others': 'Others',
        'Biased against both': 'Others',
        'Biased against both Palestine and Israel': 'Others',  # FIXED
        'Not Applicable': 'Others',                           # FIXED
        'Others': 'Others'
    }

    TARGET_LABELS = ['Unbiased', 'Biased Against Palestine', 'Biased Against Israel', 'Others']
    LABEL2ID = {label: idx for idx, label in enumerate(TARGET_LABELS)}
    ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}

    # Model configuration
    MODEL_NAME = "xlm-roberta-base"

    # Training parameters
    IAA_TRAIN_SPLIT = 0.8
    RANDOM_STATE = 42

    # Training arguments - ADJUSTED for better handling of imbalance
    TRAINING_ARGS = {
        "output_dir": "./results_xlm_roberta_fixed",
        "num_train_epochs": 4,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1_macro",  # Focus on macro F1 for imbalanced data
        "logging_steps": 50,
        "warmup_steps": 200,  # Increased warmup
        "save_total_limit": 2,
    }


# ============================================================================
# DATA LOADING & PREPROCESSING
# ============================================================================

def load_and_clean_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load MAIN and IAA files, filter out rows with missing Bias labels."""
    print("\n[STEP 1] Loading and cleaning data...")

    # Load MAIN file
    print(f"  Loading {Config.MAIN_FILE}...")
    main_df = pd.read_excel(Config.MAIN_FILE)
    initial_main_count = len(main_df)

    # Filter out missing Bias labels from MAIN
    main_df = main_df[main_df['Bias'].notna() & (main_df['Bias'] != '')]
    filtered_main_count = len(main_df)
    print(f"    MAIN: {initial_main_count} rows → {filtered_main_count} rows (removed {initial_main_count - filtered_main_count} with missing labels)")

    # Load and concatenate IAA files
    iaa_dfs = []
    for iaa_file in Config.IAA_FILES:
        if os.path.exists(iaa_file):
            print(f"  Loading {iaa_file}...")
            iaa_df_temp = pd.read_excel(iaa_file)
            initial_count = len(iaa_df_temp)

            # Check both 'Bias' and 'Bais' (typo in IAA files)
            if 'Bais' in iaa_df_temp.columns:
                iaa_df_temp['Bias'] = iaa_df_temp['Bais']

            iaa_df_temp = iaa_df_temp[iaa_df_temp['Bias'].notna() & (iaa_df_temp['Bias'] != '')]
            filtered_count = len(iaa_df_temp)

            if filtered_count > 0:
                iaa_dfs.append(iaa_df_temp)
                print(f"    {iaa_file}: {initial_count} rows → {filtered_count} rows")
            else:
                print(f"    {iaa_file}: No valid labels found, skipping.")
        else:
            print(f"  Warning: {iaa_file} not found, skipping.")

    # Concatenate all IAA data
    if iaa_dfs:
        iaa_df = pd.concat(iaa_dfs, ignore_index=True)
        print(f"\n  Total IAA data: {len(iaa_df)} rows with valid labels")
    else:
        print("\n  Warning: No valid IAA data found!")
        iaa_df = pd.DataFrame()

    return main_df, iaa_df


def map_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Map raw bias labels to standardized 4-class labels."""
    df = df.copy()
    df['Bias_Mapped'] = df['Bias'].map(Config.LABEL_MAP)

    # Handle any unmapped labels
    unmapped = df[df['Bias_Mapped'].isna()]['Bias'].unique()
    if len(unmapped) > 0:
        print(f"  ⚠️ WARNING: Unmapped labels found: {unmapped}")
        print(f"  These will cause errors - please update Config.LABEL_MAP!")
        # Still fill with 'Others' as fallback
        df['Bias_Mapped'] = df['Bias_Mapped'].fillna('Others')
    else:
        print(f"  ✓ All labels mapped successfully")

    return df


def apply_majority_vote(df: pd.DataFrame) -> pd.DataFrame:
    """Apply majority voting to IAA data grouped by Text/ID."""
    print("\n  Applying majority vote to collapse duplicate annotations...")

    # Create a unique text identifier
    df['Text_ID'] = df['ID'].astype(str) + "_" + df['Text'].str[:50]

    gold_standard_rows = []

    for text_id, group in df.groupby('Text_ID'):
        # Get most common label (mode)
        labels = group['Bias_Mapped'].tolist()
        label_counts = Counter(labels)
        majority_label = label_counts.most_common(1)[0][0]

        # Take first row and update its label
        gold_row = group.iloc[0].copy()
        gold_row['Bias_Mapped'] = majority_label
        gold_row['Annotator_Count'] = len(group)
        gold_standard_rows.append(gold_row)

    result_df = pd.DataFrame(gold_standard_rows)
    print(f"    Collapsed {len(df)} annotations → {len(result_df)} unique texts")

    return result_df


def create_train_test_split(main_df: pd.DataFrame, iaa_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Create training and test sets following the silver/gold strategy."""
    print("\n[STEP 2] Creating train/test splits...")

    # Get unique text IDs from IAA
    unique_ids = iaa_df['Text_ID'].unique() if 'Text_ID' in iaa_df.columns else iaa_df['ID'].unique()
    print(f"  Total unique IAA texts: {len(unique_ids)}")

    # Split by IDs (80/20)
    train_ids, test_ids = train_test_split(
        unique_ids,
        test_size=(1 - Config.IAA_TRAIN_SPLIT),
        random_state=Config.RANDOM_STATE,
        stratify=None
    )

    print(f"  IAA split: {len(train_ids)} train IDs, {len(test_ids)} test IDs")

    # Split IAA data
    if 'Text_ID' in iaa_df.columns:
        iaa_train = iaa_df[iaa_df['Text_ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['Text_ID'].isin(test_ids)].copy()
    else:
        iaa_train = iaa_df[iaa_df['ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['ID'].isin(test_ids)].copy()

    # Apply majority vote to both
    iaa_train_collapsed = apply_majority_vote(iaa_train)
    iaa_test_collapsed = apply_majority_vote(iaa_test)

    # Combine MAIN + IAA_train for training set
    train_df = pd.concat([main_df, iaa_train_collapsed], ignore_index=True)
    test_df = iaa_test_collapsed

    print(f"\n  Final training set: {len(train_df)} samples")
    print(f"    - From MAIN (silver): {len(main_df)}")
    print(f"    - From IAA (gold): {len(iaa_train_collapsed)}")
    print(f"  Final test set: {len(test_df)} samples (gold standard)")

    # Print class distribution
    print("\n  Training set class distribution:")
    train_dist = train_df['Bias_Mapped'].value_counts()
    print(train_dist)
    print("\n  Class proportions:")
    print((train_dist / len(train_df) * 100).round(2))

    print("\n  Test set class distribution:")
    print(test_df['Bias_Mapped'].value_counts())

    return train_df, test_df


# ============================================================================
# CUSTOM TRAINER WITH CLASS WEIGHTS
# ============================================================================

class WeightedTrainer(Trainer):
    """Custom Trainer that applies class weights to loss function."""

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Override compute_loss to use weighted CrossEntropyLoss."""
        labels = inputs.pop("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Compute weighted loss
        if self.class_weights is not None:
            # Move weights to same device as logits
            weights = self.class_weights.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=weights)
        else:
            loss_fct = nn.CrossEntropyLoss()

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# ============================================================================
# XLM-ROBERTA MODEL
# ============================================================================

def compute_metrics(eval_pred):
    """Compute metrics for training."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    acc = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': acc,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }


def prepare_dataset(df: pd.DataFrame, tokenizer, text_column: str = 'Text'):
    """Prepare dataset for transformer training."""

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=512
        )

    # Prepare data
    data_dict = {
        'text': df[text_column].tolist(),
        'label': df['Bias_Mapped'].map(Config.LABEL2ID).tolist()
    }

    dataset = Dataset.from_dict(data_dict)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    return tokenized_dataset


class XLMRoBERTaBiasDetector:
    """XLM-RoBERTa model for bias detection with class weight balancing."""

    def __init__(self):
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"\n[MODEL] Initializing XLM-RoBERTa Bias Detector")
        print(f"  Model: {Config.MODEL_NAME}")
        print(f"  Device: {self.device}")

    def train(self, train_df: pd.DataFrame, eval_df: pd.DataFrame = None):
        """Train the model with class weight balancing."""
        print("\n[STEP 3] Training XLM-RoBERTa with Class Weight Balancing...")

        # Load tokenizer and model
        print(f"  Loading tokenizer and model: {Config.MODEL_NAME}...")
        self.tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            Config.MODEL_NAME,
            num_labels=len(Config.TARGET_LABELS),
            id2label=Config.ID2LABEL,
            label2id=Config.LABEL2ID
        )

        # Move model to device
        self.model.to(self.device)

        # Calculate class weights
        print("\n  Calculating class weights to handle imbalance...")
        labels = train_df['Bias_Mapped'].map(Config.LABEL2ID).values
        unique_labels = np.unique(labels)

        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=unique_labels,
            y=labels
        )

        # Convert to tensor
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

        print("  Class weights:")
        for label_id, weight in zip(unique_labels, class_weights):
            label_name = Config.ID2LABEL[label_id]
            print(f"    {label_name}: {weight:.4f}")

        # Prepare datasets
        print("\n  Preparing datasets...")
        train_dataset = prepare_dataset(train_df, self.tokenizer)
        eval_dataset = None
        if eval_df is not None and len(eval_df) > 0:
            eval_dataset = prepare_dataset(eval_df, self.tokenizer)

        # Training arguments
        training_args = TrainingArguments(
            **Config.TRAINING_ARGS,
            report_to="none"
        )

        # Use custom weighted trainer
        self.trainer = WeightedTrainer(
            class_weights=class_weights_tensor,
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] if eval_dataset else None
        )

        # Train
        print("\n  Starting training...")
        print(f"    Epochs: {Config.TRAINING_ARGS['num_train_epochs']}")
        print(f"    Batch size: {Config.TRAINING_ARGS['per_device_train_batch_size']}")
        print(f"    Learning rate: {Config.TRAINING_ARGS['learning_rate']}")
        print(f"    Using class weights: Yes ✓")

        self.trainer.train()

        print("\n  ✓ Training completed successfully!")

    def predict(self, texts: List[str]) -> List[str]:
        """Predict labels for texts - FIXED: proper device management."""
        if not texts:
            return []

        # Tokenize
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # FIXED: Move inputs to the same device as model
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

        return [Config.ID2LABEL[pred.item()] for pred in predictions]

    def predict_proba(self, texts: List[str]) -> np.ndarray:
        """Predict probabilities for each class - FIXED: proper device management."""
        if not texts:
            return np.array([])

        # Tokenize
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # FIXED: Move inputs to the same device as model
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)

        return probs.cpu().numpy()

    def evaluate(self, test_df: pd.DataFrame):
        """Evaluate model on test set with detailed metrics."""
        print("\n[STEP 4] Evaluating XLM-RoBERTa...")

        texts = test_df['Text'].tolist()
        true_labels = test_df['Bias_Mapped'].tolist()

        # Get predictions
        print("  Generating predictions...")
        predictions = self.predict(texts)

        # Overall metrics
        print("\n" + "="*80)
        print("CLASSIFICATION REPORT")
        print("="*80)
        print(classification_report(
            true_labels,
            predictions,
            target_names=Config.TARGET_LABELS,
            digits=4
        ))

        acc = accuracy_score(true_labels, predictions)
        f1_macro = f1_score(true_labels, predictions, average='macro')
        f1_weighted = f1_score(true_labels, predictions, average='weighted')

        print("\n" + "="*80)
        print("OVERALL METRICS")
        print("="*80)
        print(f"  Accuracy:        {acc:.4f}")
        print(f"  Macro F1-Score:  {f1_macro:.4f}")
        print(f"  Weighted F1:     {f1_weighted:.4f}")

        # Language-specific analysis
        if 'Source Language' in test_df.columns:
            print("\n" + "="*80)
            print("LANGUAGE-SPECIFIC PERFORMANCE")
            print("="*80)

            for lang in ['Arabic', 'English']:
                mask = test_df['Source Language'].str.contains(lang, case=False, na=False)
                if mask.sum() > 0:
                    lang_true = [true_labels[i] for i in range(len(true_labels)) if mask.iloc[i]]
                    lang_pred = [predictions[i] for i in range(len(predictions)) if mask.iloc[i]]

                    lang_acc = accuracy_score(lang_true, lang_pred)
                    lang_f1 = f1_score(lang_true, lang_pred, average='macro')

                    print(f"\n  {lang}:")
                    print(f"    Samples: {len(lang_true)}")
                    print(f"    Accuracy: {lang_acc:.4f}")
                    print(f"    Macro F1: {lang_f1:.4f}")

        # Confusion matrix
        print("\n" + "="*80)
        print("CONFUSION MATRIX")
        print("="*80)
        cm = confusion_matrix(true_labels, predictions, labels=Config.TARGET_LABELS)
        print("\nTrue (rows) vs Predicted (columns):")
        cm_df = pd.DataFrame(cm, index=Config.TARGET_LABELS, columns=Config.TARGET_LABELS)
        print(cm_df)

        return {
            'predictions': predictions,
            'accuracy': acc,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'confusion_matrix': cm
        }


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Main execution function."""

    # Set random seeds
    np.random.seed(Config.RANDOM_STATE)
    torch.manual_seed(Config.RANDOM_STATE)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(Config.RANDOM_STATE)

    # Load and prepare data
    main_df, iaa_df = load_and_clean_data()

    # Map labels
    print("\n[PREPROCESSING] Mapping labels...")
    main_df = map_labels(main_df)
    if len(iaa_df) > 0:
        iaa_df = map_labels(iaa_df)

    # Create splits
    train_df, test_df = create_train_test_split(main_df, iaa_df)

    # Initialize and train model
    model = XLMRoBERTaBiasDetector()
    model.train(train_df, test_df)

    # Evaluate
    if len(test_df) > 0:
        results = model.evaluate(test_df)
    else:
        print("\n  Warning: No test data available for evaluation")
        results = None

    print("\n" + "="*80)
    print("TESTING COMPLETE")
    print("="*80)

    return model, results


# ============================================================================
# INFERENCE EXAMPLES
# ============================================================================

def test_predictions(model: XLMRoBERTaBiasDetector):
    """Test the model with example texts."""
    print("\n" + "="*80)
    print("EXAMPLE PREDICTIONS")
    print("="*80)

    sample_texts = [
        # English examples
        "Hamas terrorists launched brutal attacks on innocent Israeli civilians.",
        "Israeli forces killed dozens of Palestinian civilians in Gaza today.",
        "The conflict continues with casualties on both sides.",
        "Civilians are suffering due to the brutal blockade.",

        # Arabic examples
        "الاحتلال الإسرائيلي يقصف المدنيين في غزة",
        "حماس تطلق صواريخ على المدن الإسرائيلية",
        "استمرار الصراع مع سقوط ضحايا من الجانبين",
        "قامت القوات باستهداف المدنيين العزل في قطاع غزة"
    ]

    predictions = model.predict(sample_texts)
    probabilities = model.predict_proba(sample_texts)

    for i, (text, pred, probs) in enumerate(zip(sample_texts, predictions, probabilities), 1):
        print(f"\n[Example {i}]")
        print(f"  Text: {text}")
        print(f"  Prediction: {pred}")
        print(f"  Confidence:")
        for label, prob in zip(Config.TARGET_LABELS, probs):
            print(f"    {label}: {prob:.4f}")


if __name__ == "__main__":
    # Run main pipeline
    model, results = main()

    # Test with examples
    if model is not None:
        test_predictions(model)

XLM-ROBERTA STANDALONE TESTING (FIXED VERSION)

[STEP 1] Loading and cleaning data...
  Loading /content/Main.xlsx...
    MAIN: 13500 rows → 10800 rows (removed 2700 with missing labels)
  Loading /content/IAA-1.xlsx...
    /content/IAA-1.xlsx: 1200 rows → 1200 rows
  Loading /content/IAA-2.xlsx...
    /content/IAA-2.xlsx: 1200 rows → 1200 rows
  Loading /content/IAA-3.xlsx...
    /content/IAA-3.xlsx: No valid labels found, skipping.
  Loading /content/IAA-4.xlsx...
    /content/IAA-4.xlsx: No valid labels found, skipping.

  Total IAA data: 2400 rows with valid labels

[PREPROCESSING] Mapping labels...
  ✓ All labels mapped successfully
  ✓ All labels mapped successfully

[STEP 2] Creating train/test splits...
  Total unique IAA texts: 1115
  IAA split: 892 train IDs, 223 test IDs

  Applying majority vote to collapse duplicate annotations...
    Collapsed 1920 annotations → 960 unique texts

  Applying majority vote to collapse duplicate annotations...
    Collapsed 480 annotations →

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



  Calculating class weights to handle imbalance...
  Class weights:
    Unbiased: 0.3960
    Biased Against Palestine: 0.9277
    Biased Against Israel: 9.9661
    Others: 3.3754

  Preparing datasets...


Map:   0%|          | 0/11760 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]


  Starting training...
    Epochs: 4
    Batch size: 16
    Learning rate: 2e-05
    Using class weights: Yes ✓


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.426,1.376134,0.470833,0.243732,0.468926
2,1.3422,1.177565,0.633333,0.349721,0.628848
3,1.3388,1.150135,0.654167,0.351105,0.639177
4,1.2404,1.194473,0.6375,0.360646,0.62932



  ✓ Training completed successfully!

[STEP 4] Evaluating XLM-RoBERTa...
  Generating predictions...

CLASSIFICATION REPORT
                          precision    recall  f1-score   support

                Unbiased     0.0000    0.0000    0.0000         6
Biased Against Palestine     0.4643    0.6290    0.5342        62
   Biased Against Israel     0.2500    0.1333    0.1739        15
                  Others     0.7568    0.7134    0.7344       157

                accuracy                         0.6375       240
               macro avg     0.3678    0.3689    0.3606       240
            weighted avg     0.6306    0.6375    0.6293       240


OVERALL METRICS
  Accuracy:        0.6375
  Macro F1-Score:  0.3606
  Weighted F1:     0.6293

LANGUAGE-SPECIFIC PERFORMANCE

  Arabic:
    Samples: 39
    Accuracy: 0.6667
    Macro F1: 0.2982

  English:
    Samples: 45
    Accuracy: 0.7556
    Macro F1: 0.3668

CONFUSION MATRIX

True (rows) vs Predicted (columns):
                        

In [4]:
# Force reinstall specific compatible versions
!pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -U transformers datasets accelerate triton

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting triton
  Downloading triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.2-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninst

In [2]:
"""
XLM-RoBERTa Training for FIGNEWS-2024 (COMPLETE FIXED VERSION)
================================================================
Fixes applied:
1. Uses BOTH Arabic MT and English MT columns (2x dataset via vertical concatenation)
2. Capped class weights (max 3.0) to prevent training instability
3. Google Drive checkpoint saving for best model
4. Proper device management for GPU inference
"""

import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from collections import Counter
from typing import Dict, List, Tuple

# Core ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
import torch
import torch.nn as nn

# Google Drive (for Colab)
from google.colab import drive

print("="*80)
print("XLM-ROBERTA TRAINING WITH MT AUGMENTATION")
print("="*80)


# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Configuration for XLM-RoBERTa training"""

    # File paths
    MAIN_FILE = "/content/Main.xlsx"
    IAA_FILES = ["/content/IAA-1.xlsx", "/content/IAA-2.xlsx", "/content/IAA-3.xlsx", "/content/IAA-4.xlsx"]

    # Google Drive path for saving best model
    DRIVE_MOUNT_PATH = "/content/drive"
    MODEL_SAVE_PATH = "/content/drive/MyDrive/xlm_roberta_best_model"

    # MT Column names
    ARABIC_MT_COL = "Arabic MT"
    ENGLISH_MT_COL = "English MT"

    # Label mapping
    LABEL_MAP = {
        'Unbiased': 'Unbiased',
        'Biased against Palestine': 'Biased Against Palestine',
        'Biased Against Palestine': 'Biased Against Palestine',
        'Biased against Israel': 'Biased Against Israel',
        'Biased Against Israel': 'Biased Against Israel',
        'Unclear': 'Others',
        'Biased against others': 'Others',
        'Biased against both': 'Others',
        'Biased against both Palestine and Israel': 'Others',
        'Not Applicable': 'Others',
        'Others': 'Others'
    }

    TARGET_LABELS = ['Unbiased', 'Biased Against Palestine', 'Biased Against Israel', 'Others']
    LABEL2ID = {label: idx for idx, label in enumerate(TARGET_LABELS)}
    ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}

    # Model configuration
    MODEL_NAME = "xlm-roberta-base"

    # Training parameters
    IAA_TRAIN_SPLIT = 0.8
    RANDOM_STATE = 42
    MAX_CLASS_WEIGHT = 3.0  # Cap to prevent extreme weights

    # Training arguments - optimized for 2x dataset
    TRAINING_ARGS = {
        "output_dir": "./results_xlm_roberta_mt",
        "num_train_epochs": 4,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "save_total_limit": 1,  # Keep only best checkpoint
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1_macro",
        "greater_is_better": True,
        "logging_steps": 100,
        "warmup_steps": 300,  # Increased for larger dataset
        "fp16": True,  # Mixed precision for faster training
    }


# ============================================================================
# GOOGLE DRIVE SETUP
# ============================================================================

def setup_google_drive():
    """Mount Google Drive and create save directory."""
    print("\n[SETUP] Configuring Google Drive...")

    # Mount drive if not already mounted
    if not os.path.exists(Config.DRIVE_MOUNT_PATH):
        print("  Mounting Google Drive...")
        drive.mount(Config.DRIVE_MOUNT_PATH)
    else:
        print("  Google Drive already mounted ✓")

    # Create save directory if it doesn't exist
    save_dir = os.path.dirname(Config.MODEL_SAVE_PATH)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        print(f"  Created directory: {save_dir}")

    print(f"  Model will be saved to: {Config.MODEL_SAVE_PATH}")


# ============================================================================
# DATA LOADING & PREPROCESSING
# ============================================================================

def load_and_clean_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load MAIN and IAA files, filter out rows with missing Bias labels."""
    print("\n[STEP 1] Loading and cleaning data...")

    # Load MAIN file
    print(f"  Loading {Config.MAIN_FILE}...")
    main_df = pd.read_excel(Config.MAIN_FILE)
    initial_main_count = len(main_df)

    # Filter out missing Bias labels from MAIN
    main_df = main_df[main_df['Bias'].notna() & (main_df['Bias'] != '')]
    filtered_main_count = len(main_df)
    print(f"    MAIN: {initial_main_count} rows → {filtered_main_count} rows (removed {initial_main_count - filtered_main_count} with missing labels)")

    # Load and concatenate IAA files
    iaa_dfs = []
    for iaa_file in Config.IAA_FILES:
        if os.path.exists(iaa_file):
            print(f"  Loading {iaa_file}...")
            iaa_df_temp = pd.read_excel(iaa_file)
            initial_count = len(iaa_df_temp)

            # Check both 'Bias' and 'Bais' (typo in IAA files)
            if 'Bais' in iaa_df_temp.columns:
                iaa_df_temp['Bias'] = iaa_df_temp['Bais']

            iaa_df_temp = iaa_df_temp[iaa_df_temp['Bias'].notna() & (iaa_df_temp['Bias'] != '')]
            filtered_count = len(iaa_df_temp)

            if filtered_count > 0:
                iaa_dfs.append(iaa_df_temp)
                print(f"    {iaa_file}: {initial_count} rows → {filtered_count} rows")
            else:
                print(f"    {iaa_file}: No valid labels found, skipping.")
        else:
            print(f"  Warning: {iaa_file} not found, skipping.")

    # Concatenate all IAA data
    if iaa_dfs:
        iaa_df = pd.concat(iaa_dfs, ignore_index=True)
        print(f"\n  Total IAA data: {len(iaa_df)} rows with valid labels")
    else:
        print("\n  Warning: No valid IAA data found!")
        iaa_df = pd.DataFrame()

    return main_df, iaa_df


def augment_with_mt(df: pd.DataFrame) -> pd.DataFrame:
    """
    Augment dataset by concatenating Arabic MT and English MT vertically.
    This doubles the dataset size: each row becomes 2 samples.
    """
    print("\n[MT AUGMENTATION] Creating 2x dataset with Arabic MT + English MT...")

    # Verify MT columns exist
    if Config.ARABIC_MT_COL not in df.columns or Config.ENGLISH_MT_COL not in df.columns:
        print(f"  ⚠️ WARNING: MT columns not found!")
        print(f"  Available columns: {df.columns.tolist()}")
        print(f"  Skipping MT augmentation...")
        return df

    original_count = len(df)

    # Create Arabic MT samples
    arabic_mt_df = df[[Config.ARABIC_MT_COL, 'Bias_Mapped']].copy()
    arabic_mt_df.rename(columns={Config.ARABIC_MT_COL: 'Text'}, inplace=True)
    arabic_mt_df = arabic_mt_df[arabic_mt_df['Text'].notna() & (arabic_mt_df['Text'] != '')]

    # Create English MT samples
    english_mt_df = df[[Config.ENGLISH_MT_COL, 'Bias_Mapped']].copy()
    english_mt_df.rename(columns={Config.ENGLISH_MT_COL: 'Text'}, inplace=True)
    english_mt_df = english_mt_df[english_mt_df['Text'].notna() & (english_mt_df['Text'] != '')]

    # Concatenate vertically
    augmented_df = pd.concat([arabic_mt_df, english_mt_df], ignore_index=True)

    print(f"  Original dataset: {original_count} samples")
    print(f"  After MT augmentation: {len(augmented_df)} samples")
    print(f"    - Arabic MT: {len(arabic_mt_df)} samples")
    print(f"    - English MT: {len(english_mt_df)} samples")
    print(f"  Augmentation ratio: {len(augmented_df) / original_count:.2f}x")

    return augmented_df


def map_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Map raw bias labels to standardized 4-class labels."""
    df = df.copy()
    df['Bias_Mapped'] = df['Bias'].map(Config.LABEL_MAP)

    # Handle any unmapped labels
    unmapped = df[df['Bias_Mapped'].isna()]['Bias'].unique()
    if len(unmapped) > 0:
        print(f"  ⚠️ WARNING: Unmapped labels found: {unmapped}")
        print(f"  These will be mapped to 'Others'")
        df['Bias_Mapped'] = df['Bias_Mapped'].fillna('Others')
    else:
        print(f"  ✓ All labels mapped successfully")

    return df


def apply_majority_vote(df: pd.DataFrame) -> pd.DataFrame:
    """Apply majority voting to IAA data grouped by Text/ID."""
    print("\n  Applying majority vote to collapse duplicate annotations...")

    # Create a unique text identifier
    if 'ID' in df.columns:
        df['Text_ID'] = df['ID'].astype(str) + "_" + df['Text'].astype(str).str[:50]
    else:
        df['Text_ID'] = df['Text'].astype(str).str[:50]

    gold_standard_rows = []

    for text_id, group in df.groupby('Text_ID'):
        # Get most common label (mode)
        labels = group['Bias_Mapped'].tolist()
        label_counts = Counter(labels)
        majority_label = label_counts.most_common(1)[0][0]

        # Take first row and update its label
        gold_row = group.iloc[0].copy()
        gold_row['Bias_Mapped'] = majority_label
        gold_row['Annotator_Count'] = len(group)
        gold_standard_rows.append(gold_row)

    result_df = pd.DataFrame(gold_standard_rows)
    print(f"    Collapsed {len(df)} annotations → {len(result_df)} unique texts")

    return result_df


def create_train_test_split(main_df: pd.DataFrame, iaa_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Create training and test sets following the silver/gold strategy."""
    print("\n[STEP 2] Creating train/test splits...")

    # Get unique text IDs from IAA
    unique_ids = iaa_df['Text_ID'].unique() if 'Text_ID' in iaa_df.columns else iaa_df['ID'].unique()
    print(f"  Total unique IAA texts: {len(unique_ids)}")

    # Split by IDs (80/20)
    train_ids, test_ids = train_test_split(
        unique_ids,
        test_size=(1 - Config.IAA_TRAIN_SPLIT),
        random_state=Config.RANDOM_STATE,
        stratify=None
    )

    print(f"  IAA split: {len(train_ids)} train IDs, {len(test_ids)} test IDs")

    # Split IAA data
    if 'Text_ID' in iaa_df.columns:
        iaa_train = iaa_df[iaa_df['Text_ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['Text_ID'].isin(test_ids)].copy()
    else:
        iaa_train = iaa_df[iaa_df['ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['ID'].isin(test_ids)].copy()

    # Apply majority vote to both
    iaa_train_collapsed = apply_majority_vote(iaa_train)
    iaa_test_collapsed = apply_majority_vote(iaa_test)

    # IMPORTANT: Apply MT augmentation BEFORE combining with MAIN
    print("\n[STEP 3] Applying MT augmentation to training data...")
    main_df_augmented = augment_with_mt(main_df)
    iaa_train_augmented = augment_with_mt(iaa_train_collapsed)

    # Combine MAIN + IAA_train for training set
    train_df = pd.concat([main_df_augmented, iaa_train_augmented], ignore_index=True)

    # Also augment test set for consistency
    print("\n[STEP 4] Applying MT augmentation to test data...")
    test_df = augment_with_mt(iaa_test_collapsed)

    print(f"\n  Final training set: {len(train_df)} samples (after MT augmentation)")
    print(f"  Final test set: {len(test_df)} samples (after MT augmentation)")

    # Print class distribution
    print("\n  Training set class distribution:")
    train_dist = train_df['Bias_Mapped'].value_counts()
    print(train_dist)
    print("\n  Class proportions:")
    print((train_dist / len(train_df) * 100).round(2))

    print("\n  Test set class distribution:")
    print(test_df['Bias_Mapped'].value_counts())

    return train_df, test_df


# ============================================================================
# CUSTOM TRAINER WITH CAPPED CLASS WEIGHTS
# ============================================================================

class WeightedTrainer(Trainer):
    """Custom Trainer that applies CAPPED class weights to loss function."""

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Override compute_loss to use weighted CrossEntropyLoss."""
        labels = inputs.pop("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Compute weighted loss
        if self.class_weights is not None:
            # Move weights to same device as logits
            weights = self.class_weights.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=weights)
        else:
            loss_fct = nn.CrossEntropyLoss()

        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


# ============================================================================
# XLM-ROBERTA MODEL
# ============================================================================

def compute_metrics(eval_pred):
    """Compute metrics for training."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    acc = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': acc,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }


def prepare_dataset(df: pd.DataFrame, tokenizer, text_column: str = 'Text'):
    """Prepare dataset for transformer training."""

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=512
        )

    # Prepare data
    data_dict = {
        'text': df[text_column].tolist(),
        'label': df['Bias_Mapped'].map(Config.LABEL2ID).tolist()
    }

    dataset = Dataset.from_dict(data_dict)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    return tokenized_dataset


class XLMRoBERTaBiasDetector:
    """XLM-RoBERTa model for bias detection with capped class weights."""

    def __init__(self):
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"\n[MODEL] Initializing XLM-RoBERTa Bias Detector")
        print(f"  Model: {Config.MODEL_NAME}")
        print(f"  Device: {self.device}")

    def train(self, train_df: pd.DataFrame, eval_df: pd.DataFrame = None):
        """Train the model with capped class weight balancing."""
        print("\n[STEP 5] Training XLM-RoBERTa with Capped Class Weights...")

        # Load tokenizer and model
        print(f"  Loading tokenizer and model: {Config.MODEL_NAME}...")
        self.tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            Config.MODEL_NAME,
            num_labels=len(Config.TARGET_LABELS),
            id2label=Config.ID2LABEL,
            label2id=Config.LABEL2ID
        )

        # Move model to device
        self.model.to(self.device)

        # Calculate class weights with capping
        print("\n  Calculating CAPPED class weights...")
        labels = train_df['Bias_Mapped'].map(Config.LABEL2ID).values
        unique_labels = np.unique(labels)

        # Compute balanced weights
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=unique_labels,
            y=labels
        )

        # CAP the weights to prevent extreme values
        class_weights = np.clip(class_weights, a_min=None, a_max=Config.MAX_CLASS_WEIGHT)

        # Convert to tensor
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

        print(f"  Class weights (capped at {Config.MAX_CLASS_WEIGHT}):")
        for label_id, weight in zip(unique_labels, class_weights):
            label_name = Config.ID2LABEL[label_id]
            print(f"    {label_name}: {weight:.4f}")

        # Prepare datasets
        print("\n  Preparing datasets...")
        train_dataset = prepare_dataset(train_df, self.tokenizer)
        eval_dataset = None
        if eval_df is not None and len(eval_df) > 0:
            eval_dataset = prepare_dataset(eval_df, self.tokenizer)

        # Training arguments - save to local first, then copy to Drive
        training_args = TrainingArguments(
            **Config.TRAINING_ARGS,
            report_to="none"
        )

        # Use custom weighted trainer
        self.trainer = WeightedTrainer(
            class_weights=class_weights_tensor,
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] if eval_dataset else None
        )

        # Train
        print("\n  Starting training...")
        print(f"    Epochs: {Config.TRAINING_ARGS['num_train_epochs']}")
        print(f"    Batch size: {Config.TRAINING_ARGS['per_device_train_batch_size']}")
        print(f"    Learning rate: {Config.TRAINING_ARGS['learning_rate']}")
        print(f"    Using capped class weights: Yes ✓ (max={Config.MAX_CLASS_WEIGHT})")
        print(f"    Mixed precision (fp16): {Config.TRAINING_ARGS['fp16']}")

        self.trainer.train()

        print("\n  ✓ Training completed successfully!")

        # Save best model to Google Drive
        self.save_to_drive()

    def save_to_drive(self):
        """Save the best model to Google Drive."""
        print("\n[SAVING] Copying best model to Google Drive...")

        try:
            # Save model and tokenizer to Drive
            self.model.save_pretrained(Config.MODEL_SAVE_PATH)
            self.tokenizer.save_pretrained(Config.MODEL_SAVE_PATH)

            print(f"  ✓ Model saved to: {Config.MODEL_SAVE_PATH}")
            print(f"  You can now use this model for inference!")

        except Exception as e:
            print(f"  ⚠️ Error saving to Drive: {e}")
            print(f"  Model is still available in local directory: {Config.TRAINING_ARGS['output_dir']}")

    def predict(self, texts: List[str]) -> List[str]:
        """Predict labels for texts with proper device management."""
        if not texts:
            return []

        # Tokenize
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # Move inputs to the same device as model
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

        return [Config.ID2LABEL[pred.item()] for pred in predictions]

    def predict_proba(self, texts: List[str]) -> np.ndarray:
        """Predict probabilities for each class with proper device management."""
        if not texts:
            return np.array([])

        # Tokenize
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # Move inputs to the same device as model
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)

        return probs.cpu().numpy()

    def evaluate(self, test_df: pd.DataFrame):
        """Evaluate model on test set with detailed metrics."""
        print("\n[STEP 6] Evaluating XLM-RoBERTa...")

        texts = test_df['Text'].tolist()
        true_labels = test_df['Bias_Mapped'].tolist()

        # Get predictions
        print("  Generating predictions...")
        predictions = self.predict(texts)

        # Overall metrics
        print("\n" + "="*80)
        print("CLASSIFICATION REPORT")
        print("="*80)
        print(classification_report(
            true_labels,
            predictions,
            target_names=Config.TARGET_LABELS,
            digits=4
        ))

        acc = accuracy_score(true_labels, predictions)
        f1_macro = f1_score(true_labels, predictions, average='macro')
        f1_weighted = f1_score(true_labels, predictions, average='weighted')

        print("\n" + "="*80)
        print("OVERALL METRICS")
        print("="*80)
        print(f"  Accuracy:        {acc:.4f}")
        print(f"  Macro F1-Score:  {f1_macro:.4f}")
        print(f"  Weighted F1:     {f1_weighted:.4f}")

        # Confusion matrix
        print("\n" + "="*80)
        print("CONFUSION MATRIX")
        print("="*80)
        cm = confusion_matrix(true_labels, predictions, labels=Config.TARGET_LABELS)
        print("\nTrue Labels (rows) vs Predicted Labels (columns):")
        cm_df = pd.DataFrame(cm, index=Config.TARGET_LABELS, columns=Config.TARGET_LABELS)
        print(cm_df)

        return {
            'predictions': predictions,
            'accuracy': acc,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'confusion_matrix': cm
        }


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Main execution function."""

    # Set random seeds
    np.random.seed(Config.RANDOM_STATE)
    torch.manual_seed(Config.RANDOM_STATE)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(Config.RANDOM_STATE)

    # Setup Google Drive
    setup_google_drive()

    # Load and prepare data
    main_df, iaa_df = load_and_clean_data()

    # Map labels
    print("\n[PREPROCESSING] Mapping labels...")
    main_df = map_labels(main_df)
    if len(iaa_df) > 0:
        iaa_df = map_labels(iaa_df)

    # Create splits (includes MT augmentation)
    train_df, test_df = create_train_test_split(main_df, iaa_df)

    # Initialize and train model
    model = XLMRoBERTaBiasDetector()
    model.train(train_df, test_df)

    # Evaluate
    if len(test_df) > 0:
        results = model.evaluate(test_df)
    else:
        print("\n  Warning: No test data available for evaluation")
        results = None

    print("\n" + "="*80)
    print("TRAINING & EVALUATION COMPLETE")
    print("="*80)
    print(f"\n✓ Best model saved to: {Config.MODEL_SAVE_PATH}")

    return model, results


# ============================================================================
# INFERENCE EXAMPLES
# ============================================================================

def test_predictions(model: XLMRoBERTaBiasDetector):
    """Test the model with example texts."""
    print("\n" + "="*80)
    print("EXAMPLE PREDICTIONS")
    print("="*80)

    sample_texts = [
        # English examples
        "Hamas terrorists launched brutal attacks on innocent Israeli civilians.",
        "Israeli forces killed dozens of Palestinian civilians in Gaza today.",
        "The conflict continues with casualties on both sides.",
        "Civilians are suffering due to the brutal blockade.",

        # Arabic examples
        "الاحتلال الإسرائيلي يقصف المدنيين في غزة",
        "حماس تطلق صواريخ على المدن الإسرائيلية",
        "استمرار الصراع مع سقوط ضحايا من الجانبين",
        "قامت القوات باستهداف المدنيين العزل في قطاع غزة"
    ]

    predictions = model.predict(sample_texts)
    probabilities = model.predict_proba(sample_texts)

    for i, (text, pred, probs) in enumerate(zip(sample_texts, predictions, probabilities), 1):
        print(f"\n[Example {i}]")
        print(f"  Text: {text}")
        print(f"  Prediction: {pred}")
        print(f"  Confidence:")
        for label, prob in zip(Config.TARGET_LABELS, probs):
            print(f"    {label}: {prob:.4f}")


if __name__ == "__main__":
    # Run main pipeline
    model, results = main()

    # Test with examples
    if model is not None:
        test_predictions(model)

XLM-ROBERTA TRAINING WITH MT AUGMENTATION

[SETUP] Configuring Google Drive...
  Mounting Google Drive...
Mounted at /content/drive
  Model will be saved to: /content/drive/MyDrive/xlm_roberta_best_model

[STEP 1] Loading and cleaning data...
  Loading /content/Main.xlsx...
    MAIN: 13500 rows → 10800 rows (removed 2700 with missing labels)
  Loading /content/IAA-1.xlsx...
    /content/IAA-1.xlsx: 1200 rows → 1200 rows
  Loading /content/IAA-2.xlsx...
    /content/IAA-2.xlsx: 1200 rows → 1200 rows
  Loading /content/IAA-3.xlsx...
    /content/IAA-3.xlsx: No valid labels found, skipping.
  Loading /content/IAA-4.xlsx...
    /content/IAA-4.xlsx: No valid labels found, skipping.

  Total IAA data: 2400 rows with valid labels

[PREPROCESSING] Mapping labels...
  ✓ All labels mapped successfully
  ✓ All labels mapped successfully

[STEP 2] Creating train/test splits...
  Total unique IAA texts: 1115
  IAA split: 892 train IDs, 223 test IDs

  Applying majority vote to collapse duplicate an

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



  Calculating CAPPED class weights...
  Class weights (capped at 3.0):
    Unbiased: 0.3960
    Biased Against Palestine: 0.9277
    Biased Against Israel: 3.0000
    Others: 3.0000

  Preparing datasets...


Map:   0%|          | 0/23520 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]


  Starting training...
    Epochs: 4
    Batch size: 16
    Learning rate: 2e-05
    Using capped class weights: Yes ✓ (max=3.0)
    Mixed precision (fp16): True


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,1.2774,1.288809,0.372917,0.219193,0.339835
2,1.2532,1.256718,0.485417,0.278973,0.487792
3,1.2234,1.207785,0.691667,0.346925,0.649107
4,1.2268,1.22357,0.622917,0.337125,0.602748



  ✓ Training completed successfully!

[SAVING] Copying best model to Google Drive...
  ✓ Model saved to: /content/drive/MyDrive/xlm_roberta_best_model
  You can now use this model for inference!

[STEP 6] Evaluating XLM-RoBERTa...
  Generating predictions...

CLASSIFICATION REPORT
                          precision    recall  f1-score   support

                Unbiased     0.0000    0.0000    0.0000        12
Biased Against Palestine     0.5698    0.3952    0.4667       124
   Biased Against Israel     1.0000    0.0667    0.1250        30
                  Others     0.7168    0.8949    0.7960       314

                accuracy                         0.6917       480
               macro avg     0.5717    0.3392    0.3469       480
            weighted avg     0.6786    0.6917    0.6491       480


OVERALL METRICS
  Accuracy:        0.6917
  Macro F1-Score:  0.3469
  Weighted F1:     0.6491

CONFUSION MATRIX

True Labels (rows) vs Predicted Labels (columns):
                      