# MARBERT Baseline Training for Arabic Polarization Detection

This notebook trains and evaluates MARBERT and MARBERT v2 models on the preprocessed Arabic dataset.

**Dataset**: Cleaned Arabic text with polarization labels  
**Models**: 
- MARBERT (UBC-NLP/MARBERT)
- MARBERT v2 (UBC-NLP/MARBERTv2)

**Task**: Binary classification (polarization detection)

## Setup: Install Required Packages

In [None]:
!pip install transformers torch scikit-learn pandas numpy tqdm accelerate -q

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Select the compute device (CUDA when present, CPU otherwise) and report it
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Using device: {device}")
if has_cuda:
    # Device 0 is the one described; memory is converted from bytes to GiB
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1024**3:.2f} GB")

## Load and Explore Dataset

In [None]:
# Load the cleaned dataset
# Note: Update the path if running on Colab and data is uploaded
data_path = 'arb_clean_basic.csv'
raw_df = pd.read_csv(data_path)

# Fail early with a clear message if the columns used by the rest of the
# notebook are not present in the file.
required_columns = ['text', 'polarization']
missing_columns = [col for col in required_columns if col not in raw_df.columns]
if missing_columns:
    raise KeyError(f"{data_path} is missing required column(s): {missing_columns}")

# pandas reads empty CSV fields back as NaN; a NaN text would crash the
# tokenizer and a NaN label turns the label column into floats. Drop such rows
# and store the labels as integers, which the classification head expects.
df = (
    raw_df
    .dropna(subset=required_columns)
    .reset_index(drop=True)
    .assign(polarization=lambda frame: frame['polarization'].astype(int))
)
n_dropped = len(raw_df) - len(df)

print(f"Dataset loaded successfully!")
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing text or label")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nClass distribution:")
print(df['polarization'].value_counts())
print(f"\nClass balance:")
print(df['polarization'].value_counts(normalize=True))

## Split Data (90/10 Train/Test with Stratification)

In [None]:
# Hold out 10% of the rows as a test set; stratifying on the label keeps the
# class ratio the same in both partitions
RANDOM_STATE = 42

train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['polarization'],
    random_state=RANDOM_STATE,
)

total_rows = len(df)
named_splits = (("Train", train_df), ("Test", test_df))

# Absolute size of each partition and its share of the full dataset
for split_name, split_df in named_splits:
    print(f"{split_name} set size: {len(split_df)} ({len(split_df)/total_rows*100:.1f}%)")

# Raw label counts per partition
for split_name, split_df in named_splits:
    print(f"\n{split_name} set class distribution:")
    print(split_df['polarization'].value_counts())

# Normalized label frequencies, to confirm the stratification worked
train_balance = train_df['polarization'].value_counts(normalize=True)
test_balance = test_df['polarization'].value_counts(normalize=True)
print("\nClass balance verification:")
print(f"Train: {train_balance}")
print(f"Test:  {test_balance}")

## Prepare Datasets for Training

In [None]:
# Convert to HuggingFace Dataset format.
# Only the text and the label are kept; the label column is renamed to `label`.
# preserve_index=False: after train_test_split the frames carry a shuffled,
# non-contiguous pandas index, which from_pandas would otherwise store as an
# extra `__index_level_0__` column in the dataset.
hf_columns = ['text', 'polarization']
label_rename = {'polarization': 'label'}

train_dataset = Dataset.from_pandas(
    train_df[hf_columns].rename(columns=label_rename),
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    test_df[hf_columns].rename(columns=label_rename),
    preserve_index=False,
)

print(f"✓ Datasets prepared")
print(f"Train dataset: {train_dataset}")
print(f"Test dataset: {test_dataset}")

## Helper Functions for Training and Evaluation

In [None]:
def tokenize_function(examples, tokenizer):
    """Apply `tokenizer` to the 'text' field of `examples`.

    The tokenizer is called with truncation=True, padding=False and
    max_length=512, and its result is returned unchanged.
    """
    encode_options = {'truncation': True, 'padding': False, 'max_length': 512}
    texts = examples['text']
    return tokenizer(texts, **encode_options)

def compute_metrics(eval_pred):
    """Score one round of evaluation outputs.

    `eval_pred` unpacks into (model outputs, reference labels). The predicted
    class is the argmax of the outputs along axis 1.

    Returns a dict with 'accuracy' and the weighted-average 'f1'.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return {
        'accuracy': accuracy_score(references, predicted_classes),
        'f1': f1_score(references, predicted_classes, average='weighted'),
    }

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    True labels are on the y axis, predicted labels on the x axis; `title` is
    used as the figure title.
    """
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    # Integer cell annotations on a blue colour scale
    heatmap_ax = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    heatmap_ax.set_title(title)
    heatmap_ax.set_ylabel('True Label')
    heatmap_ax.set_xlabel('Predicted Label')
    plt.show()

def evaluate_model(trainer, test_dataset, model_name):
    """Evaluate a trained model on `test_dataset` and print detailed metrics.

    Prints a banner, the per-class classification report, and shows the
    confusion matrix plot.

    Args:
        trainer: object whose `predict(test_dataset)` returns a result with
            `.predictions` (per-class scores) and `.label_ids`.
        test_dataset: tokenized dataset to run predictions on.
        model_name: display name used in the banner and the plot title.

    Returns:
        dict with 'accuracy', weighted 'f1', and the raw 'predictions' object
        returned by `trainer.predict`.
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"Evaluating {model_name}")
    print(f"{banner}")

    # Get predictions; the predicted class is the argmax over the class axis
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    labels = predictions.label_ids

    # Print metrics.
    # `labels=[0, 1]` pins the report to both classes: without it,
    # classification_report raises when only one class occurs in the labels and
    # predictions, because the two target names no longer match the classes found.
    print("\nClassification Report:")
    print(classification_report(
        labels,
        preds,
        labels=[0, 1],
        target_names=['Class 0', 'Class 1'],
    ))

    # Plot confusion matrix
    plot_confusion_matrix(labels, preds, title=f"{model_name} - Confusion Matrix")

    # Return metrics
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
        'predictions': predictions
    }

## Train MARBERT Model

In [None]:
# Fetch the pretrained MARBERT checkpoint: its tokenizer plus a
# sequence-classification model configured for two labels
marbert_model_name = "UBC-NLP/MARBERT"
print(f"Loading {marbert_model_name}...")

marbert_tokenizer = AutoTokenizer.from_pretrained(marbert_model_name)
marbert_model = AutoModelForSequenceClassification.from_pretrained(marbert_model_name, num_labels=2)


def _tokenize_with_marbert(batch):
    """Tokenize one batch of examples with the MARBERT tokenizer."""
    return tokenize_function(batch, marbert_tokenizer)


# Tokenize both splits in batched mode
print("Tokenizing datasets...")
marbert_train_dataset = train_dataset.map(_tokenize_with_marbert, batched=True)
marbert_test_dataset = test_dataset.map(_tokenize_with_marbert, batched=True)

for split_label, split_dataset in (("Train", marbert_train_dataset), ("Test", marbert_test_dataset)):
    print(f"{split_label} dataset size: {len(split_dataset)}")

In [None]:
# Hold out part of the training data for per-epoch evaluation and best-checkpoint
# selection. Selecting the checkpoint on the test set would leak test
# information into the model and bias the final reported metrics.
marbert_holdout = marbert_train_dataset.train_test_split(test_size=0.1, seed=RANDOM_STATE)
marbert_fit_dataset = marbert_holdout['train']
marbert_val_dataset = marbert_holdout['test']

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed; use whichever one the
# installed version defines.
eval_strategy_key = (
    "eval_strategy" if hasattr(TrainingArguments, "eval_strategy") else "evaluation_strategy"
)

# Set up training arguments
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer steps
# on a small dataset — confirm against the dataset size.
training_args = TrainingArguments(
    output_dir='./results_marbert',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_marbert',
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=RANDOM_STATE,
    **{eval_strategy_key: "epoch"},
)

# Create Trainer.
# The datasets were tokenized with padding=False, so every batch must be padded
# on the fly; without an explicit collator the Trainer's default one cannot
# stack sequences of different lengths.
marbert_trainer = Trainer(
    model=marbert_model,
    args=training_args,
    train_dataset=marbert_fit_dataset,
    eval_dataset=marbert_val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=marbert_tokenizer),
    compute_metrics=compute_metrics
)

# Train the model
print("Training MARBERT...")
marbert_trainer.train()
print("Training complete!")

## Evaluate MARBERT Model

In [None]:
# Run the fine-tuned MARBERT over the tokenized test split, then echo the
# headline numbers from the returned metrics dict
marbert_results = evaluate_model(
    trainer=marbert_trainer,
    test_dataset=marbert_test_dataset,
    model_name="MARBERT",
)
for line_prefix, metric_key in (("\nMARBERT Accuracy", 'accuracy'), ("MARBERT F1 Score", 'f1')):
    print(f"{line_prefix}: {marbert_results[metric_key]:.4f}")

## Train MARBERT v2 Model

In [None]:
# Fetch the pretrained MARBERT v2 checkpoint: its tokenizer plus a
# sequence-classification model configured for two labels
marbertv2_model_name = "UBC-NLP/MARBERTv2"
print(f"Loading {marbertv2_model_name}...")

marbertv2_tokenizer = AutoTokenizer.from_pretrained(marbertv2_model_name)
marbertv2_model = AutoModelForSequenceClassification.from_pretrained(marbertv2_model_name, num_labels=2)


def _tokenize_with_marbertv2(batch):
    """Tokenize one batch of examples with the MARBERT v2 tokenizer."""
    return tokenize_function(batch, marbertv2_tokenizer)


# Tokenize both splits in batched mode
print("Tokenizing datasets...")
marbertv2_train_dataset = train_dataset.map(_tokenize_with_marbertv2, batched=True)
marbertv2_test_dataset = test_dataset.map(_tokenize_with_marbertv2, batched=True)

for split_label, split_dataset in (("Train", marbertv2_train_dataset), ("Test", marbertv2_test_dataset)):
    print(f"{split_label} dataset size: {len(split_dataset)}")

In [None]:
# Hold out part of the training data for per-epoch evaluation and best-checkpoint
# selection. Selecting the checkpoint on the test set would leak test
# information into the model and bias the final reported metrics.
marbertv2_holdout = marbertv2_train_dataset.train_test_split(test_size=0.1, seed=RANDOM_STATE)
marbertv2_fit_dataset = marbertv2_holdout['train']
marbertv2_val_dataset = marbertv2_holdout['test']

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed; use whichever one the
# installed version defines.
eval_strategy_key = (
    "eval_strategy" if hasattr(TrainingArguments, "eval_strategy") else "evaluation_strategy"
)

# Set up training arguments for MARBERT v2
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer steps
# on a small dataset — confirm against the dataset size.
training_args_v2 = TrainingArguments(
    output_dir='./results_marbertv2',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_marbertv2',
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=RANDOM_STATE,
    **{eval_strategy_key: "epoch"},
)

# Create Trainer.
# The datasets were tokenized with padding=False, so every batch must be padded
# on the fly; without an explicit collator the Trainer's default one cannot
# stack sequences of different lengths.
marbertv2_trainer = Trainer(
    model=marbertv2_model,
    args=training_args_v2,
    train_dataset=marbertv2_fit_dataset,
    eval_dataset=marbertv2_val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=marbertv2_tokenizer),
    compute_metrics=compute_metrics
)

# Train the model
print("Training MARBERT v2...")
marbertv2_trainer.train()
print("Training complete!")

## Evaluate MARBERT v2 Model

In [None]:
# Run the fine-tuned MARBERT v2 over the tokenized test split, then echo the
# headline numbers from the returned metrics dict
marbertv2_results = evaluate_model(
    trainer=marbertv2_trainer,
    test_dataset=marbertv2_test_dataset,
    model_name="MARBERT v2",
)
for line_prefix, metric_key in (("\nMARBERT v2 Accuracy", 'accuracy'), ("MARBERT v2 F1 Score", 'f1')):
    print(f"{line_prefix}: {marbertv2_results[metric_key]:.4f}")

## Compare Results: MARBERT vs MARBERT v2

In [None]:
# Collect the headline metrics of both models once; the table, the plots and
# the winner announcement all read from these lists.
models = ['MARBERT', 'MARBERT v2']
accuracy_scores = [marbert_results['accuracy'], marbertv2_results['accuracy']]
f1_scores = [marbert_results['f1'], marbertv2_results['f1']]

# Create comparison table
comparison_df = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracy_scores,
    'F1 Score': f1_scores
})

separator = "=" * 80
print("\n" + separator)
print("FINAL RESULTS COMPARISON")
print(separator)
print(comparison_df.to_string(index=False))
print(separator)

# Plot comparison: one bar-chart panel per metric, drawn by the same loop so
# both panels stay consistent.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
bar_colors = ['#3498db', '#e74c3c']
panels = [
    (ax1, 'Accuracy', accuracy_scores),
    (ax2, 'F1 Score', f1_scores),
]
for ax, metric_name, scores in panels:
    ax.bar(models, scores, color=bar_colors)
    ax.set_ylabel(metric_name)
    ax.set_title(f'Model {metric_name} Comparison')
    ax.set_ylim([0, 1])
    # Write each value just above its bar
    for i, v in enumerate(scores):
        ax.text(i, v + 0.01, f'{v:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Determine winner by F1; `improvement` is the F1 gap between the two models
marbert_f1, marbertv2_f1 = f1_scores
if marbert_f1 > marbertv2_f1:
    winner = "MARBERT"
    improvement = marbert_f1 - marbertv2_f1
elif marbertv2_f1 > marbert_f1:
    winner = "MARBERT v2"
    improvement = marbertv2_f1 - marbert_f1
else:
    winner = "TIE"
    improvement = 0

print(f"\n🏆 Winner: {winner}")
if winner != "TIE":
    print(f"   Improvement: +{improvement:.4f} F1 score")