In [2]:
!pip install pandas numpy scikit-learn imbalanced-learn
!pip install transformers datasets torch accelerate
!pip install fasttext langdetect openpyxl joblib

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.1-py3-none-any.whl (293 kB)
Building wheels for collected packages: fasttext, langdetect
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp312-cp312-linux_x86_64.whl size=4498210 sha2

In [11]:
"""
FIGNEWS-2024: MARBERT ARABIC (TEAM VERSION - SHARED DRIVE)
==========================================================
- Uses shared Google Drive for storage
- Saves best model only (with checkpoint cleanup)
- Data logic: Uses Arabic MT if English source, Text if Arabic source
"""

# ============================================================================
# GOOGLE DRIVE MOUNT
# ============================================================================
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
print("✓ Google Drive mounted")

# ============================================================================
# IMPORTS
# ============================================================================
import os
import warnings
import re
import shutil
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from collections import Counter
from typing import Dict, List, Tuple
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
import torch
import torch.nn as nn

print("="*80)
print("MARBERT ARABIC PIPELINE (Shared Drive + Data Augmentation)")
print("="*80)


# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Pipeline configuration"""

    # ========== SHARED DRIVE PATHS ==========
    BASE_PATH = "/content/drive/MyDrive/fignews_shared_project/"

    # Data paths
    MAIN_FILE = BASE_PATH + "data/Main.xlsx"
    IAA_FILES = [
        BASE_PATH + "data/IAA-1.xlsx",
        BASE_PATH + "data/IAA-2.xlsx",
        BASE_PATH + "data/IAA-3.xlsx",
        BASE_PATH + "data/IAA-4.xlsx"
    ]

    # Output directory
    OUTPUT_DIR = BASE_PATH + "models/marbert_finetuned/"
    # =========================================

    # Label mapping
    LABEL_MAP = {
        'Unbiased': 'Unbiased',
        'Biased against Palestine': 'Biased Against Palestine',
        'Biased Against Palestine': 'Biased Against Palestine',
        'Biased against Israel': 'Biased Against Israel',
        'Biased Against Israel': 'Biased Against Israel',
        'Unclear': 'Others',
        'Biased against others': 'Others',
        'Biased against both': 'Others',
        'Biased against both Palestine and Israel': 'Others',
        'Not Applicable': 'Others',
        'Others': 'Others'
    }

    TARGET_LABELS = ['Unbiased', 'Biased Against Palestine',
                     'Biased Against Israel', 'Others']
    LABEL2ID = {label: idx for idx, label in enumerate(TARGET_LABELS)}
    ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}

    MODEL_NAME = "UBC-NLP/MARBERTv2"
    IAA_TRAIN_SPLIT = 0.8
    RANDOM_STATE = 42

    # Training arguments (Save best model)
    TRAINING_ARGS = {
        "output_dir": OUTPUT_DIR,
        "num_train_epochs": 4,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",  # Save after each epoch
        "save_total_limit": 1,  # Keep only best checkpoint
        "load_best_model_at_end": True,  # Load best model
        "metric_for_best_model": "f1_macro",
        "logging_steps": 50,
    }


# ============================================================================
# PREPROCESSING
# ============================================================================

def clean_urls_and_format(text: str) -> str:
    """Basic cleaning for LLM models."""
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = text.replace(':=:', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def filter_languages(df: pd.DataFrame) -> pd.DataFrame:
    """Keep ONLY English and Arabic rows."""
    mask = df['Source Language'].str.contains('English|Arabic', case=False, na=False)
    return df[mask].copy()


# ============================================================================
# DATA LOADING & PREPARATION
# ============================================================================

def load_and_clean_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load MAIN and IAA files."""
    print("\n[STEP 1] Loading and cleaning data...")

    if not os.path.exists(Config.MAIN_FILE):
        raise FileNotFoundError(f"{Config.MAIN_FILE} not found!")

    print(f"  Loading {Config.MAIN_FILE}...")
    main_df = pd.read_excel(Config.MAIN_FILE)

    main_df = main_df[main_df['Bias'].notna() & (main_df['Bias'] != '')]
    main_df['Bias'] = main_df['Bias'].astype(str).str.strip()
    main_df = filter_languages(main_df)

    # Clean all text columns
    for col in ['Text', 'Arabic MT', 'English MT']:
        if col in main_df.columns:
            main_df[col] = main_df[col].apply(clean_urls_and_format)

    print(f"    MAIN: {len(main_df)} rows")

    # Load IAA
    iaa_dfs = []
    for iaa_file in Config.IAA_FILES:
        if os.path.exists(iaa_file):
            print(f"  Loading {iaa_file}...")
            iaa_df_temp = pd.read_excel(iaa_file)

            if 'Bais' in iaa_df_temp.columns:
                iaa_df_temp['Bias'] = iaa_df_temp['Bais']

            iaa_df_temp = iaa_df_temp[iaa_df_temp['Bias'].notna() &
                                       (iaa_df_temp['Bias'] != '')]
            iaa_df_temp['Bias'] = iaa_df_temp['Bias'].astype(str).str.strip()

            if len(iaa_df_temp) > 0:
                iaa_df_temp = filter_languages(iaa_df_temp)

            if len(iaa_df_temp) > 0:
                for col in ['Text', 'Arabic MT', 'English MT']:
                    if col in iaa_df_temp.columns:
                        iaa_df_temp[col] = iaa_df_temp[col].apply(clean_urls_and_format)
                iaa_dfs.append(iaa_df_temp)
                print(f"    {iaa_file}: {len(iaa_df_temp)} rows")

    iaa_df = pd.concat(iaa_dfs, ignore_index=True) if iaa_dfs else pd.DataFrame()
    print(f"\n  Total IAA: {len(iaa_df)} rows")

    return main_df, iaa_df


def map_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Map labels to 4-class taxonomy."""
    df = df.copy()
    df['Bias_Mapped'] = df['Bias'].map(Config.LABEL_MAP)
    df['Bias_Mapped'] = df['Bias_Mapped'].fillna('Others')
    return df


def apply_majority_vote(df: pd.DataFrame) -> pd.DataFrame:
    """Apply majority voting."""
    print("\n  Applying majority vote...")
    df['Text_ID'] = df['ID'].astype(str) + "_" + df['Text'].str[:20]

    gold_rows = []
    for text_id, group in df.groupby('Text_ID'):
        labels = group['Bias_Mapped'].tolist()
        majority_label = Counter(labels).most_common(1)[0][0]

        gold_row = group.iloc[0].copy()
        gold_row['Bias_Mapped'] = majority_label
        gold_rows.append(gold_row)

    return pd.DataFrame(gold_rows)


def create_train_test_split(main_df: pd.DataFrame,
                            iaa_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Create train/test splits."""
    print("\n[STEP 2] Creating train/test splits...")

    unique_ids = (iaa_df['Text_ID'].unique() if 'Text_ID' in iaa_df.columns
                  else iaa_df['ID'].unique())

    train_ids, test_ids = train_test_split(
        unique_ids,
        test_size=(1 - Config.IAA_TRAIN_SPLIT),
        random_state=Config.RANDOM_STATE
    )

    if 'Text_ID' in iaa_df.columns:
        iaa_train = iaa_df[iaa_df['Text_ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['Text_ID'].isin(test_ids)].copy()
    else:
        iaa_train = iaa_df[iaa_df['ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['ID'].isin(test_ids)].copy()

    iaa_train_collapsed = apply_majority_vote(iaa_train)
    iaa_test_collapsed = apply_majority_vote(iaa_test)

    train_df = pd.concat([main_df, iaa_train_collapsed], ignore_index=True)
    test_df = iaa_test_collapsed

    print(f"\n  Training: {len(train_df)} samples")
    print(f"  Test: {len(test_df)} samples")

    return train_df, test_df


def prepare_arabic_text(df: pd.DataFrame) -> pd.DataFrame:
    """
    CRITICAL DATA LOGIC:
    - If Source is Arabic → use Text
    - If Source is English → use Arabic MT
    This maximizes Arabic training data.
    """
    df = df.copy()

    df['ModelText'] = df.apply(
        lambda row: row['Arabic MT'] if 'English' in str(row.get('Source Language', ''))
                   else row['Text'],
        axis=1
    )

    return df


# ============================================================================
# WEIGHTED TRAINER
# ============================================================================

class WeightedTrainer(Trainer):
    """Custom Trainer with weighted cross-entropy loss."""

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor(class_weights, dtype=torch.float32)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted loss."""
        if self.class_weights.device != model.device:
            self.class_weights = self.class_weights.to(model.device)

        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


def compute_metrics(eval_pred):
    """Compute metrics for evaluation."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    acc = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)

    return {
        'accuracy': acc,
        'f1_macro': f1_macro
    }


def prepare_dataset(df: pd.DataFrame, tokenizer, text_column: str = 'ModelText'):
    """Prepare dataset for transformer training."""

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=128
        )

    data_dict = {
        'text': df[text_column].tolist(),
        'label': df['Bias_Mapped'].map(Config.LABEL2ID).tolist()
    }

    dataset = Dataset.from_dict(data_dict)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    return tokenized_dataset


# ============================================================================
# MARBERT MODEL
# ============================================================================

class MARBERTModel:
    """MARBERTv2 model wrapper."""

    def __init__(self):
        self.tokenizer = None
        self.model = None
        self.trainer = None
        print(f"\n[MARBERT] Initializing...")
        print(f"  Model: {Config.MODEL_NAME}")

    def train(self, train_df: pd.DataFrame, eval_df: pd.DataFrame = None):
        """Train MARBERTv2 with weighted loss."""
        print("\n[STEP 3] Training MARBERTv2...")

        # Calculate class weights
        labels = train_df['Bias_Mapped'].map(Config.LABEL2ID).values
        class_weights = compute_class_weight('balanced',
                                             classes=np.unique(labels),
                                             y=labels)

        full_weights = np.ones(len(Config.TARGET_LABELS))
        for cls_idx, weight in zip(np.unique(labels), class_weights):
            full_weights[cls_idx] = weight

        print(f"\n  Class Weights: {full_weights}")

        # Load tokenizer and model
        print(f"\n  Loading tokenizer and model...")
        self.tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            Config.MODEL_NAME,
            num_labels=len(Config.TARGET_LABELS),
            id2label=Config.ID2LABEL,
            label2id=Config.LABEL2ID
        )

        # Prepare datasets
        print("  Preparing datasets...")
        train_dataset = prepare_dataset(train_df, self.tokenizer)
        eval_dataset = None
        if eval_df is not None and len(eval_df) > 0:
            eval_dataset = prepare_dataset(eval_df, self.tokenizer)

        # Training arguments
        training_args = TrainingArguments(
            **Config.TRAINING_ARGS,
            report_to="none"
        )

        # Weighted Trainer
        self.trainer = WeightedTrainer(
            class_weights=full_weights,
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] if eval_dataset else None
        )

        # Train
        print("\n  Starting training...")
        print(f"    Epochs: {Config.TRAINING_ARGS['num_train_epochs']}")
        print(f"    Batch size: {Config.TRAINING_ARGS['per_device_train_batch_size']}")

        self.trainer.train()

        print("\n  ✓ Training completed!")

    def predict(self, texts: List[str]) -> List[str]:
        """Predict labels for texts."""
        if not texts:
            return []

        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(self.model.device)

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

        return [Config.ID2LABEL[pred.item()] for pred in predictions]

    def save(self):
        """Save best model and clean up checkpoints."""
        os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

        print(f"\n  Saving best model to shared Drive...")
        print(f"  Path: {Config.OUTPUT_DIR}")

        # Best model is already loaded (load_best_model_at_end=True)
        self.model.save_pretrained(Config.OUTPUT_DIR)
        self.tokenizer.save_pretrained(Config.OUTPUT_DIR)

        # Save training metadata
        import json
        metadata = {
            "model_name": Config.MODEL_NAME,
            "training_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "num_epochs": Config.TRAINING_ARGS["num_train_epochs"],
            "batch_size": Config.TRAINING_ARGS["per_device_train_batch_size"],
            "learning_rate": Config.TRAINING_ARGS["learning_rate"],
            "data_strategy": "Uses Arabic MT if source is English, Text if source is Arabic",
            "note": "Best model only (intermediate checkpoints removed)"
        }

        with open(os.path.join(Config.OUTPUT_DIR, "training_info.json"), "w") as f:
            json.dump(metadata, f, indent=2)

        # Clean up checkpoint folders
        print("\n  Cleaning up intermediate checkpoints...")
        for item in os.listdir(Config.OUTPUT_DIR):
            item_path = os.path.join(Config.OUTPUT_DIR, item)
            if os.path.isdir(item_path) and item.startswith('checkpoint-'):
                print(f"    Removing {item}...")
                shutil.rmtree(item_path)

        print(f"  ✓ Model saved to shared Drive!")

    def evaluate(self, test_df: pd.DataFrame):
        """Evaluate model on test set."""
        print("\n[STEP 4] Evaluating MARBERTv2...")

        texts = test_df['ModelText'].tolist()
        true_labels = test_df['Bias_Mapped'].tolist()

        print("  Generating predictions...")
        predictions = self.predict(texts)

        # Get unique labels in test set
        unique_labels = sorted(set(true_labels))
        present_label_names = [Config.TARGET_LABELS[Config.LABEL2ID[label]]
                               for label in unique_labels]

        print("\n" + "="*80)
        print("CLASSIFICATION REPORT (ARABIC)")
        print("="*80)
        print(classification_report(
            true_labels,
            predictions,
            labels=unique_labels,
            target_names=present_label_names,
            digits=4,
            zero_division=0
        ))

        acc = accuracy_score(true_labels, predictions)
        f1_macro = f1_score(true_labels, predictions, average='macro', zero_division=0)

        print(f"\nAccuracy: {acc:.4f}")
        print(f"Macro F1: {f1_macro:.4f}")

        return predictions, acc, f1_macro


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Main training pipeline."""

    # Set random seeds
    np.random.seed(Config.RANDOM_STATE)
    torch.manual_seed(Config.RANDOM_STATE)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(Config.RANDOM_STATE)

    # Load data
    main_df, iaa_df = load_and_clean_data()

    # Map labels
    print("\nMapping labels...")
    main_df = map_labels(main_df)
    iaa_df = map_labels(iaa_df) if len(iaa_df) > 0 else iaa_df

    # Create splits
    train_df, test_df = create_train_test_split(main_df, iaa_df)

    # Prepare Arabic text (use Arabic MT for English sources)
    print("\n[STEP 3] Preparing Arabic text columns...")
    train_ar = prepare_arabic_text(train_df)
    test_ar = prepare_arabic_text(test_df)

    print(f"  Training samples (all rows with Arabic text): {len(train_ar)}")
    print(f"  Test samples: {len(test_ar)}")

    if len(train_ar) == 0:
        print("\n  ERROR: No training data found!")
        return

    # Train model
    print("\n" + "="*80)
    print("TRAINING MARBERT")
    print("="*80)

    model = MARBERTModel()
    eval_ar = test_ar if len(test_ar) > 0 else None
    model.train(train_ar, eval_ar)

    # Save model
    model.save()

    # Evaluate
    if len(test_ar) > 0:
        model.evaluate(test_ar)
    else:
        print("\n  Warning: No test data for evaluation")

    print("\n" + "="*80)
    print("MARBERT TRAINING COMPLETE")
    print("="*80)
    print(f"\nModel saved to shared Drive: {Config.OUTPUT_DIR}")


if __name__ == "__main__":
    main()

Mounted at /content/drive
✓ Google Drive mounted
MARBERT ARABIC PIPELINE (Shared Drive + Data Augmentation)

[STEP 1] Loading and cleaning data...
  Loading /content/drive/MyDrive/fignews_shared_project/data/Main.xlsx...
    MAIN: 4320 rows
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-1.xlsx...
    /content/drive/MyDrive/fignews_shared_project/data/IAA-1.xlsx: 480 rows
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-2.xlsx...
    /content/drive/MyDrive/fignews_shared_project/data/IAA-2.xlsx: 480 rows
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-3.xlsx...
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-4.xlsx...

  Total IAA: 960 rows

Mapping labels...

[STEP 2] Creating train/test splits...

  Applying majority vote...

  Applying majority vote...

  Training: 4704 samples
  Test: 96 samples

[STEP 3] Preparing Arabic text columns...
  Training samples (all rows with Arabic text): 4704
  Test samples: 96

TRAINI

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Preparing datasets...


Map:   0%|          | 0/4704 [00:00<?, ? examples/s]

Map:   0%|          | 0/96 [00:00<?, ? examples/s]


  Starting training...
    Epochs: 4
    Batch size: 16


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.3162,1.269186,0.364583,0.240515
2,1.1877,1.047518,0.729167,0.426971
3,0.9677,1.007246,0.677083,0.499943
4,0.7334,1.077568,0.614583,0.430198



  ✓ Training completed!

  Saving best model to shared Drive...
  Path: /content/drive/MyDrive/fignews_shared_project/models/marbert_finetuned/

  Cleaning up intermediate checkpoints...
    Removing checkpoint-882...
  ✓ Model saved to shared Drive!

[STEP 4] Evaluating MARBERTv2...
  Generating predictions...

CLASSIFICATION REPORT (ARABIC)
                          precision    recall  f1-score   support

   Biased Against Israel     0.0000    0.0000    0.0000         1
Biased Against Palestine     0.6190    0.6500    0.6341        20
                  Others     0.5000    0.7500    0.6000         4
                Unbiased     0.8596    0.6901    0.7656        71

                accuracy                         0.6771        96
               macro avg     0.4947    0.5225    0.4999        96
            weighted avg     0.7856    0.6771    0.7234        96


Accuracy: 0.6771
Macro F1: 0.4999

MARBERT TRAINING COMPLETE

Model saved to shared Drive: /content/drive/MyDrive/fignews_