In [3]:
!pip install pandas numpy scikit-learn imbalanced-learn
!pip install transformers datasets torch accelerate
!pip install fasttext langdetect openpyxl joblib

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.1-py3-none-any.whl (293 kB)
Building wheels for collected packages: fasttext, langdetect
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp312-cp312-linux_x86_64.whl size=4498212 sha2

In [4]:
"""
FIGNEWS-2024: DEBERTA ENGLISH (TEAM VERSION - SHARED DRIVE)
===========================================================
- Uses shared Google Drive for storage
- Saves best model only (with checkpoint cleanup)
- Data Logic: MAXIMIZE DATA (Matches Classical Pipeline)
  * Uses Original Text if Source is explicitly 'English'.
  * Uses English MT for 'Arabic', 'Unknown', or missing source labels.
"""

# ============================================================================
# GOOGLE DRIVE MOUNT
# ============================================================================
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
print("✓ Google Drive mounted")

# ============================================================================
# IMPORTS
# ============================================================================
import os
import warnings
import re
import shutil
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from collections import Counter
from typing import Dict, List, Tuple
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
import torch
import torch.nn as nn

print("="*80)
print("DEBERTA ENGLISH PIPELINE (Shared Drive + Data Maximization)")
print("="*80)


# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    """Pipeline configuration"""

    BASE_PATH = "/content/drive/MyDrive/fignews_shared_project/"

    # Data paths
    MAIN_FILE = BASE_PATH + "data/Main.xlsx"
    IAA_FILES = [
        BASE_PATH + "data/IAA-1.xlsx",
        BASE_PATH + "data/IAA-2.xlsx",
        BASE_PATH + "data/IAA-3.xlsx",
        BASE_PATH + "data/IAA-4.xlsx"
    ]

    # Output directory
    OUTPUT_DIR = BASE_PATH + "models/deberta_finetuned/"

    LABEL_MAP = {
        'Unbiased': 'Unbiased',
        'Biased against Palestine': 'Biased Against Palestine',
        'Biased Against Palestine': 'Biased Against Palestine',
        'Biased against Israel': 'Biased Against Israel',
        'Biased Against Israel': 'Biased Against Israel',
        'Unclear': 'Others',
        'Biased against others': 'Others',
        'Biased against both': 'Others',
        'Biased against both Palestine and Israel': 'Others',
        'Not Applicable': 'Others',
        'Others': 'Others'
    }

    TARGET_LABELS = ['Unbiased', 'Biased Against Palestine',
                     'Biased Against Israel', 'Others']
    LABEL2ID = {label: idx for idx, label in enumerate(TARGET_LABELS)}
    ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}

    MODEL_NAME = "microsoft/deberta-v3-base"
    IAA_TRAIN_SPLIT = 0.8
    RANDOM_STATE = 42

    # Training arguments
    TRAINING_ARGS = {
        "output_dir": OUTPUT_DIR,
        "num_train_epochs": 4,
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "gradient_accumulation_steps": 2,
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "save_total_limit": 1,
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1_macro",
        "logging_steps": 50,
    }


# ============================================================================
# PREPROCESSING
# ============================================================================

def clean_urls_and_format(text: str) -> str:
    """Basic cleaning for LLM models."""
    if not isinstance(text, str): return ""
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = text.replace(':=:', ' ')
    return re.sub(r'\s+', ' ', text).strip()

def filter_valid_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    CRITICAL CHANGE: Do NOT filter by 'Source Language' metadata.
    Keep rows if valid text exists in Text or English MT.
    """
    df = df.copy()

    # Columns to check
    check_cols = [c for c in ['Text', 'English MT'] if c in df.columns]
    if not check_cols: return df

    # Keep row if content exists
    mask = df[check_cols].apply(lambda x: x.astype(str).str.strip().str.len() > 0).any(axis=1)
    return df[mask].copy()


# ============================================================================
# DATA LOADING
# ============================================================================

def load_and_clean_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    print("\n[STEP 1] Loading and cleaning data...")

    if not os.path.exists(Config.MAIN_FILE):
        raise FileNotFoundError(f"{Config.MAIN_FILE} not found!")

    print(f"  Loading {Config.MAIN_FILE}...")
    main_df = pd.read_excel(Config.MAIN_FILE)
    main_df = main_df[main_df['Bias'].notna() & (main_df['Bias'] != '')]
    main_df['Bias'] = main_df['Bias'].astype(str).str.strip()

    # Clean text columns
    for col in ['Text', 'Arabic MT', 'English MT']:
        if col in main_df.columns:
            main_df[col] = main_df[col].apply(clean_urls_and_format)

    main_df = filter_valid_data(main_df)
    print(f"    MAIN: {len(main_df)} rows")

    iaa_dfs = []
    for f in Config.IAA_FILES:
        if os.path.exists(f):
            print(f"  Loading {f}...")
            t = pd.read_excel(f)
            if 'Bais' in t.columns: t['Bias'] = t['Bais']
            t = t[t['Bias'].notna() & (t['Bias'] != '')]
            t['Bias'] = t['Bias'].astype(str).str.strip()

            if len(t) > 0:
                for col in ['Text', 'Arabic MT', 'English MT']:
                    if col in t.columns: t[col] = t[col].apply(clean_urls_and_format)
                t = filter_valid_data(t)
                iaa_dfs.append(t)

    iaa_df = pd.concat(iaa_dfs, ignore_index=True) if iaa_dfs else pd.DataFrame()
    print(f"\n  Total IAA: {len(iaa_df)} rows")
    return main_df, iaa_df


def map_labels(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['Bias_Mapped'] = df['Bias'].map(Config.LABEL_MAP).fillna('Others')
    return df


def apply_majority_vote(df: pd.DataFrame) -> pd.DataFrame:
    print("\n  Applying majority vote...")
    df['Text_ID'] = df['ID'].astype(str) + "_" + df['Text'].str[:20]
    gold_rows = []
    for _, g in df.groupby('Text_ID'):
        maj_label = Counter(g['Bias_Mapped']).most_common(1)[0][0]
        r = g.iloc[0].copy()
        r['Bias_Mapped'] = maj_label
        gold_rows.append(r)
    return pd.DataFrame(gold_rows)


def create_train_test_split(main_df, iaa_df):
    print("\n[STEP 2] Creating train/test splits...")
    u_ids = (iaa_df['Text_ID'].unique() if 'Text_ID' in iaa_df.columns else iaa_df['ID'].unique())
    train_ids, test_ids = train_test_split(u_ids, test_size=(1 - Config.IAA_TRAIN_SPLIT), random_state=Config.RANDOM_STATE)

    if 'Text_ID' in iaa_df.columns:
        iaa_train = iaa_df[iaa_df['Text_ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['Text_ID'].isin(test_ids)].copy()
    else:
        iaa_train = iaa_df[iaa_df['ID'].isin(train_ids)].copy()
        iaa_test = iaa_df[iaa_df['ID'].isin(test_ids)].copy()

    train_df = pd.concat([main_df, apply_majority_vote(iaa_train)], ignore_index=True)
    test_df = apply_majority_vote(iaa_test)

    print(f"\n  Training: {len(train_df)} samples")
    print(f"  Test: {len(test_df)} samples")
    return train_df, test_df


def prepare_english_text(df: pd.DataFrame) -> pd.DataFrame:
    """
    CRITICAL DATA LOGIC: Maximize English Data.
    - If Source is explicitly 'English' -> Use 'Text' (Original)
    - OTHERWISE (Arabic, Unknown, Missing) -> Use 'English MT'
    - This is safer than the previous logic which defaulted to Text.
    """
    df = df.copy()

    df['ModelText'] = df.apply(
        lambda row: row['Text'] if 'English' in str(row.get('Source Language', ''))
                    else row['English MT'],
        axis=1
    )

    # Drop empty results
    df = df[df['ModelText'].notna() & (df['ModelText'].astype(str).str.strip() != '')]
    return df


# ============================================================================
# TRAINER SETUP
# ============================================================================

class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor(class_weights, dtype=torch.float32)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        if self.class_weights.device != model.device:
            self.class_weights = self.class_weights.to(model.device)
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro', zero_division=0)
    }

def prepare_dataset(df, tokenizer):
    def tokenize(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

    data = {'text': df['ModelText'].tolist(), 'label': df['Bias_Mapped'].map(Config.LABEL2ID).tolist()}
    return Dataset.from_dict(data).map(tokenize, batched=True)


# ============================================================================
# MODEL WRAPPER
# ============================================================================

class DeBERTaModel:
    def __init__(self):
        print(f"\n[DeBERTa] Initializing {Config.MODEL_NAME}...")
        self.tokenizer = None
        self.model = None
        self.trainer = None

    def train(self, train_df, eval_df=None):
        print("\n[STEP 3] Training...")

        # Weights
        labels = train_df['Bias_Mapped'].map(Config.LABEL2ID).values
        class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
        full_weights = np.ones(len(Config.TARGET_LABELS))
        for cls_idx, weight in zip(np.unique(labels), class_weights):
            full_weights[cls_idx] = weight
        print(f"  Class Weights: {full_weights}")

        # Load
        self.tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            Config.MODEL_NAME, num_labels=len(Config.TARGET_LABELS),
            id2label=Config.ID2LABEL, label2id=Config.LABEL2ID
        )

        # Data
        train_ds = prepare_dataset(train_df, self.tokenizer)
        eval_ds = prepare_dataset(eval_df, self.tokenizer) if eval_df is not None and len(eval_df) > 0 else None

        # Train
        args = TrainingArguments(**Config.TRAINING_ARGS, report_to="none")
        self.trainer = WeightedTrainer(
            class_weights=full_weights, model=self.model, args=args,
            train_dataset=train_ds, eval_dataset=eval_ds,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] if eval_ds else None
        )
        self.trainer.train()
        print("  ✓ Training completed")

    def save(self):
        print(f"\n[Saving] To {Config.OUTPUT_DIR}...")
        os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
        self.model.save_pretrained(Config.OUTPUT_DIR)
        self.tokenizer.save_pretrained(Config.OUTPUT_DIR)

        # Cleanup
        for item in os.listdir(Config.OUTPUT_DIR):
            path = os.path.join(Config.OUTPUT_DIR, item)
            if os.path.isdir(path) and item.startswith('checkpoint-'):
                shutil.rmtree(path)
        print("  ✓ Saved and cleaned up checkpoints")

    def evaluate(self, test_df):
        print("\n[STEP 4] Evaluating...")
        texts = test_df['ModelText'].tolist()
        inputs = self.tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt").to(self.model.device)

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        preds_labels = [Config.ID2LABEL[p] for p in preds]
        true_labels = test_df['Bias_Mapped'].tolist()

        print(classification_report(true_labels, preds_labels, digits=4, zero_division=0))
        print(f"Accuracy: {accuracy_score(true_labels, preds_labels):.4f}")
        print(f"Macro F1: {f1_score(true_labels, preds_labels, average='macro', zero_division=0):.4f}")


# ============================================================================
# MAIN
# ============================================================================

def main():
    np.random.seed(Config.RANDOM_STATE)
    torch.manual_seed(Config.RANDOM_STATE)

    try: main_df, iaa_df = load_and_clean_data()
    except FileNotFoundError: return

    main_df = map_labels(main_df)
    iaa_df = map_labels(iaa_df) if len(iaa_df) > 0 else iaa_df
    train_df, test_df = create_train_test_split(main_df, iaa_df)

    print("\nPreparing English text columns (Safer Logic)...")
    train_en = prepare_english_text(train_df)
    test_en = prepare_english_text(test_df)

    print(f"  Training samples: {len(train_en)}")

    model = DeBERTaModel()
    model.train(train_en, test_en)
    model.save()
    model.evaluate(test_en)

if __name__ == "__main__":
    main()

Mounted at /content/drive
✓ Google Drive mounted
DEBERTA ENGLISH PIPELINE (Shared Drive + Data Maximization)

[STEP 1] Loading and cleaning data...
  Loading /content/drive/MyDrive/fignews_shared_project/data/Main.xlsx...
    MAIN: 10800 rows
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-1.xlsx...
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-2.xlsx...
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-3.xlsx...
  Loading /content/drive/MyDrive/fignews_shared_project/data/IAA-4.xlsx...

  Total IAA: 2400 rows

[STEP 2] Creating train/test splits...

  Applying majority vote...

  Applying majority vote...

  Training: 11760 samples
  Test: 240 samples

Preparing English text columns (Safer Logic)...
  Training samples: 11760

[DeBERTa] Initializing microsoft/deberta-v3-base...

[STEP 3] Training...
  Class Weights: [0.3959596  0.92773746 9.96610169 3.37543054]


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11760 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.3159,1.277813,0.6375,0.38928
2,1.2565,1.200003,0.645833,0.389683
3,1.1622,1.014664,0.620833,0.467955
4,1.0969,1.015445,0.633333,0.453432


  ✓ Training completed

[Saving] To /content/drive/MyDrive/fignews_shared_project/models/deberta_finetuned/...
  ✓ Saved and cleaned up checkpoints

[STEP 4] Evaluating...
                          precision    recall  f1-score   support

   Biased Against Israel     0.1290    0.6667    0.2162         6
Biased Against Palestine     0.5507    0.6129    0.5802        62
                  Others     0.3846    0.3333    0.3571        15
                Unbiased     0.8031    0.6497    0.7183       157

                accuracy                         0.6208       240
               macro avg     0.4669    0.5656    0.4680       240
            weighted avg     0.6949    0.6208    0.6475       240

Accuracy: 0.6208
Macro F1: 0.4680
