<a href="https://colab.research.google.com/github/BootCamp-BMA/colabs/blob/main/MARABERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import time
import os
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from tqdm import tqdm
from google.colab import drive
# Mount Google Drive so that paths under /content/drive (used later for the
# dataset CSV and for saved results) are reachable from this Colab runtime.
drive.mount('/content/drive')

print("Imported all required libraries")

Imported all required libraries


In [None]:
def load_and_visualize_data(file_path):
    """Load the dataset from CSV and plot how many rows carry each label.

    Args:
        file_path: Path to a CSV file containing a ``label`` column whose
            values are 0 (not fake) and 1 (fake), matching the plot's ticks.

    Returns:
        pandas.DataFrame: The loaded dataset, unmodified.
    """
    # force_remount=True re-mounts Drive on every call to this function.
    drive.mount('/content/drive', force_remount=True)
    df = pd.read_csv(file_path)
    print(f"Loaded dataset with shape: {df.shape}")
    print("First 10 rows:\n", df.head(10))

    # value_counts() orders by frequency, not by label value, so the counts
    # must be aligned with the fixed tick labels explicitly; otherwise the
    # bars are swapped whenever class 1 is the majority. A class that is
    # absent from the data becomes an empty bar instead of a shape error.
    label_counts = df['label'].value_counts().reindex([0, 1], fill_value=0)
    plt.figure(figsize=(8, 6))
    plt.bar(['Not Fake (0)', 'Fake (1)'], label_counts.tolist(), color=['green', 'red'])
    plt.title('Distribution of Fake vs Not Fake News')
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.show()

    return df

print("Defined load_and_visualize_data function")

Defined load_and_visualize_data function


In [None]:
def load_and_visualize_data(file_path):
    """Load the dataset from CSV and plot how many rows carry each label.

    Args:
        file_path: Path to a CSV file containing a ``label`` column whose
            values are 0 (not fake) and 1 (fake), matching the plot's ticks.

    Returns:
        pandas.DataFrame: The loaded dataset, unmodified.
    """
    # force_remount=True re-mounts Drive on every call to this function.
    drive.mount('/content/drive', force_remount=True)
    df = pd.read_csv(file_path)
    print(f"Loaded dataset with shape: {df.shape}")
    print("First 10 rows:\n", df.head(10))

    # value_counts() orders by frequency, not by label value, so the counts
    # must be aligned with the fixed tick labels explicitly; otherwise the
    # bars are swapped whenever class 1 is the majority. A class that is
    # absent from the data becomes an empty bar instead of a shape error.
    label_counts = df['label'].value_counts().reindex([0, 1], fill_value=0)
    plt.figure(figsize=(8, 6))
    plt.bar(['Not Fake (0)', 'Fake (1)'], label_counts.tolist(), color=['green', 'red'])
    plt.title('Distribution of Fake vs Not Fake News')
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.show()

    return df

print("Defined load_and_visualize_data function")

Defined load_and_visualize_data function


In [None]:
def split_and_tokenize_data(df, tokenizer_name='UBC-NLP/MARBERT', max_length=128, test_size=0.2, random_state=42):
    """Produce tokenized train/test splits of the ``news`` / ``label`` columns.

    Args:
        df: DataFrame with a ``news`` text column and a ``label`` column.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: Every sequence is padded/truncated to this many tokens.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        tuple: ``(train_tokens, test_tokens, train_labels, test_labels)`` where
        the token objects are the tokenizer outputs (PyTorch tensors) and the
        labels are tensors.
    """
    all_texts = df['news'].tolist()
    all_labels = df['label'].tolist()
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        all_texts,
        all_labels,
        test_size=test_size,
        random_state=random_state,
    )
    print(f"Split data: Train size: {len(train_texts)}, Test size: {len(test_texts)}")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    print(f"Loaded tokenizer: {tokenizer_name}")

    # Both splits share the same encoding settings.
    encode_options = {
        'max_length': max_length,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    train_tokens = tokenizer(train_texts, **encode_options)
    test_tokens = tokenizer(test_texts, **encode_options)
    print("Tokenized train data. Input IDs shape:", train_tokens['input_ids'].shape)
    print("Tokenized test data. Input IDs shape:", test_tokens['input_ids'].shape)

    train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)
    print("Converted labels to tensors. Train shape:", train_labels.shape, "Test shape:", test_labels.shape)

    return train_tokens, test_tokens, train_labels, test_labels

print("Defined split_and_tokenize_data function")

Defined split_and_tokenize_data function


In [None]:
def create_dataloaders(train_tokens, test_tokens, train_labels, test_labels, batch_size=16):
    """Wrap tokenized splits in DataLoaders.

    Args:
        train_tokens: Mapping with ``input_ids`` and ``attention_mask`` tensors
            for the training split.
        test_tokens: Same mapping for the test split.
        train_labels: Label tensor for the training split.
        test_labels: Label tensor for the test split.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        tuple: ``(train_dataloader, test_dataloader)``; the training loader
        shuffles, the test loader keeps the original order.
    """
    def _to_dataset(tokens, labels):
        # Each sample is (input_ids, attention_mask, label).
        return TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)

    train_data = _to_dataset(train_tokens, train_labels)
    test_data = _to_dataset(test_tokens, test_labels)
    print("Created TensorDatasets. Train length:", len(train_data), "Test length:", len(test_data))

    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print(f"Created DataLoaders with batch size {batch_size}. Train batches:", len(train_dataloader), "Test batches:", len(test_dataloader))

    return train_dataloader, test_dataloader

print("Defined create_dataloaders function")

Defined create_dataloaders function


In [None]:
def setup_model_and_device(model_name='UBC-NLP/MARBERT', num_labels=2, layer_control=None):
    """Load the sequence-classification model and optionally freeze encoder layers.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.
        layer_control: Optional mapping ``{layer_index: bool}``. When it is
            non-empty, every encoder layer's parameters get
            ``requires_grad`` set to the mapped value; layers missing from the
            mapping are frozen. When it is ``None`` or empty nothing is changed.

    Returns:
        tuple: ``(model, device)`` with the model already moved to the device
        (CUDA when available, otherwise CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.to(device)
    print(f"Loaded {model_name} model with {num_labels} labels and moved to {device}")

    if not layer_control:
        return model, device

    encoder_layers = model.bert.encoder.layer
    for idx, block in enumerate(encoder_layers):
        for weight in block.parameters():
            weight.requires_grad = layer_control.get(idx, False)

    # Report the resulting trainable/frozen status of each encoder layer.
    print("----------------------------------------")
    print(f"Number of transformer layers: {len(encoder_layers)}")
    for idx, block in enumerate(encoder_layers):
        is_trainable = any(weight.requires_grad for weight in block.parameters())
        status = 'Trainable' if is_trainable else 'Frozen'
        print(f"Layer {idx}: {status}")
    print("----------------------------------------")

    return model, device

print("Defined setup_model_and_device function")

Defined setup_model_and_device function


In [None]:
def setup_training_components(model, train_dataloader, num_epochs=5, lr=2e-5, class_weights=(2.0, 1.0), device=None):
    """Build the optimizer, learning-rate scheduler and weighted loss.

    Args:
        model: Model whose parameters are optimized.
        train_dataloader: Training loader; its length (batches per epoch)
            determines the total number of scheduler steps.
        num_epochs: Number of training epochs the schedule should span.
        lr: Initial learning rate for AdamW.
        class_weights: Per-class loss weights (any sequence of numbers or a
            tensor), one entry per class.
        device: Device the class-weight tensor is moved to; ``None`` leaves it
            where it was created.

    Returns:
        tuple: ``(optimizer, lr_scheduler, criterion)``.
    """
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
    num_training_steps = len(train_dataloader) * num_epochs
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    print(f"Set up AdamW optimizer (lr={lr}) and linear scheduler. Total steps: {num_training_steps}")

    # Force float32: integer weights such as [1, 2] would otherwise become an
    # int64 tensor, which CrossEntropyLoss rejects against float logits.
    class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    print(f"Defined CrossEntropyLoss with class weights {class_weights.tolist()} on {device}")

    return optimizer, lr_scheduler, criterion

print("Defined setup_training_components function")

Defined setup_training_components function


In [None]:
def train_model(model, train_dataloader, optimizer, criterion, lr_scheduler, device, epochs=5):
    """Run the training loop for a fixed number of epochs.

    Args:
        model: Model to train; it is switched to training mode once up front.
        train_dataloader: Loader yielding ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer stepped after every batch.
        criterion: Loss function applied to ``(logits, labels)``.
        lr_scheduler: Scheduler stepped after every optimizer step.
        device: Device each batch is moved to.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        None. Progress, per-epoch mean loss and total time are printed.
    """
    print("Starting Training...\n")
    model.train()
    started_at = time.time()

    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1
        print(f"Epoch {epoch_no}/{epochs}")
        progress = tqdm(train_dataloader, desc=f"Training Epoch {epoch_no}")
        running_loss = 0
        for batch in progress:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=attention_mask).logits
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # Read the scalar once and reuse it for the total and the progress bar.
            loss_value = batch_loss.item()
            running_loss += loss_value
            progress.set_postfix(loss=loss_value)
        mean_loss = running_loss / len(train_dataloader)
        print(f"Average loss for Epoch {epoch_no}: {mean_loss:.4f}")

    total_seconds = time.time() - started_at
    print(f"\nTraining complete! Total time elapsed: {total_seconds:.2f} seconds")

print("Defined train_model function")

Defined train_model function


In [None]:
def evaluate_model(model, test_dataloader, device):
    """Evaluate the binary classifier on the test loader and print metrics.

    Accuracy and the classification report use the argmax predictions. The
    ROC-AUC uses the softmax probability of class 1 ("Fake"): computing it
    from hard 0/1 predictions yields only a single operating point and
    collapses to balanced accuracy.

    Args:
        model: Model returning an object with a ``logits`` attribute.
        test_dataloader: Loader yielding ``(input_ids, attention_mask, labels)``.
        device: Device each batch is moved to.

    Returns:
        tuple: ``(all_preds, all_labels)`` as lists of predicted and true
        class indices, in loader order.
    """
    model.eval()
    all_preds, all_labels, all_scores = [], [], []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            # Probability assigned to the positive class (index 1).
            scores = torch.softmax(logits, dim=1)[:, 1]
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_scores.extend(scores.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    class_report = classification_report(all_labels, all_preds, target_names=['Not Fake', 'Fake'])
    roc_auc = roc_auc_score(all_labels, all_scores)

    print("Evaluation completed on test set")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{class_report}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")

    return all_preds, all_labels

print("Defined evaluate_model function")

Defined evaluate_model function


In [None]:
def extract_and_save_embeddings(model, train_dataloader, test_dataloader, device, save_dir):
    """Extract first-token embeddings from ``model.bert`` and save them as CSV.

    For every sample the hidden state at sequence position 0 of the last layer
    is written as one row. Each row also carries the sample's label in a
    trailing ``label`` column: a shuffling loader yields samples in random
    order, so without the label the rows could not be matched back to their
    targets.

    Args:
        model: Model exposing a ``bert`` sub-module.
        train_dataloader: Loader yielding ``(input_ids, attention_mask, labels)``.
        test_dataloader: Same structure for the test split.
        device: Device each batch is moved to.
        save_dir: Directory for ``train_embeddings.csv`` and
            ``test_embeddings.csv``; created if missing.

    Returns:
        None. The first five test rows are printed after saving.
    """
    def extract(dataloader, desc):
        """Return a DataFrame of embeddings plus a ``label`` column."""
        model.eval()
        embeddings, labels = [], []
        with torch.no_grad():
            for batch in tqdm(dataloader, desc=desc):
                input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]
                outputs = model.bert(input_ids, attention_mask=attention_mask)
                # Position 0 of the last hidden state, one vector per sample.
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.extend(batch_embeddings)
                labels.extend(batch_labels.cpu().tolist())
        print(f"Extracted embeddings for {len(embeddings)} samples")
        emb_df = pd.DataFrame(embeddings)
        emb_df['label'] = labels
        return emb_df

    train_emb_df = extract(train_dataloader, "Extracting Train Embeddings")
    test_emb_df = extract(test_dataloader, "Extracting Test Embeddings")

    os.makedirs(save_dir, exist_ok=True)
    train_emb_path = os.path.join(save_dir, "train_embeddings.csv")
    test_emb_path = os.path.join(save_dir, "test_embeddings.csv")

    train_emb_df.to_csv(train_emb_path, index=False)
    test_emb_df.to_csv(test_emb_path, index=False)

    print("Embeddings saved successfully!")
    print("-------------------------------------")
    print("Top 5 Test Embeddings:")
    print(test_emb_df.head(5))

print("Defined extract_and_save_embeddings function")

Defined extract_and_save_embeddings function


In [None]:
# def main():
#     """Main function to run the entire pipeline."""
#     # Define parameters
#     file_path = '/content/drive/MyDrive/arabic_fake_news/FASSILA/cleaned_data.csv'
#     save_dir = '/content/drive/MyDrive/arabic_fake_news/FASSILA/'
#     batch_size = 16
#     num_epochs = 5
#     max_length = 128
#     lr = 2e-5
#     class_weights = [2.0, 1.0]
#     layer_control = {
#         0: True, 1: True, 2: True, 3: True, 4: True, 5: True,
#         6: True, 7: True, 8: True, 9: True, 10: True, 11: True
#     }

#     # Execute pipeline
#     df = load_and_visualize_data(file_path)
#     train_tokens, test_tokens, train_labels, test_labels = split_and_tokenize_data(df, max_length=max_length)
#     train_dataloader, test_dataloader = create_dataloaders(train_tokens, test_tokens, train_labels, test_labels, batch_size)
#     model, device = setup_model_and_device(layer_control=layer_control)
#     optimizer, lr_scheduler, criterion = setup_training_components(model, train_dataloader, num_epochs, lr, class_weights, device)
#     train_model(model, train_dataloader, optimizer, criterion, lr_scheduler, device, num_epochs)
#     evaluate_model(model, test_dataloader, device)
#     extract_and_save_embeddings(model, train_dataloader, test_dataloader, device, save_dir)

# # Run the pipeline
# main()
# print("Pipeline execution completed!")

In [None]:
import itertools
import pandas as pd
import time
import torch
import transformers
import os
import random

def main(seed=None):
    """Run a randomized search over hyperparameters and frozen encoder layers.

    Each iteration samples a batch size, epoch count, max sequence length,
    learning rate, class weights and a set of 1-6 encoder layers to freeze,
    skips configurations already tried, then trains and evaluates a fresh
    model. One result row is recorded per successful experiment. Results are
    written to CSV every ``save_interval`` successes and once more at the end.

    The search stops when ``num_experiments`` results exist, when the 12-hour
    budget is exceeded, or after ``max_consecutive_failures`` experiments fail
    in a row (so a systematic error does not spin until the time limit).

    NOTE: ``roc_auc`` in the result rows is computed from the hard class
    predictions returned by ``evaluate_model``, so for this binary task it is
    equivalent to balanced accuracy.

    Args:
        seed: Optional integer. When given, Python's ``random`` and PyTorch
            are seeded so the sampled configurations are repeatable. ``None``
            (default) leaves the random state untouched.

    Returns:
        pandas.DataFrame: One row per successful experiment (may be empty).
    """
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)

    # Define parameter lists
    file_path = '/content/drive/MyDrive/arabic_fake_news/FASSILA/cleaned_data.csv'
    save_dir = '/content/drive/MyDrive/arabic_fake_news/FASSILA/'
    results_path = os.path.join(save_dir, "experiment_results_MARABERT.csv")

    batch_sizes = [8, 16, 32]
    num_epochs_list = [3, 5, 7]
    max_lengths = [64, 128, 256]
    learning_rates = [1e-5, 2e-5, 5e-5]
    class_weights_list = [[1.0, 1.0], [2.0, 1.0], [1.0, 2.0]]

    # Load data once
    df = load_and_visualize_data(file_path)
    dataset_size = len(df)

    # System information recorded with every result row
    cuda_available = torch.cuda.is_available()
    torch_version = torch.__version__
    transformers_version = transformers.__version__

    # Results storage
    results = []
    save_interval = 10  # Save results every 10 experiments
    max_runtime = 12 * 3600  # 12 hours in seconds
    max_consecutive_failures = 10  # Abort if this many experiments fail back to back
    consecutive_failures = 0
    start_global_time = time.time()

    # Track unique experiment configurations
    tested_configs = set()

    # The split uses a fixed random_state, so the tokenized data depends only
    # on max_length; tokenize once per length instead of once per experiment.
    token_cache = {}

    num_layers = 12
    num_experiments = 1000  # Limit to 1000 randomized experiments
    layer_indices = list(range(num_layers))

    while len(results) < num_experiments:
        if time.time() - start_global_time > max_runtime:
            print("⏳ Time limit reached (12 hours). Stopping experiments.")
            break  # Stop if 12 hours exceeded

        # Randomly select hyperparameters
        batch_size = random.choice(batch_sizes)
        num_epochs = random.choice(num_epochs_list)
        max_length = random.choice(max_lengths)
        lr = random.choice(learning_rates)
        class_weights = random.choice(class_weights_list)

        # Randomly pick 1 to 6 encoder layers to FREEZE; the rest stay trainable.
        num_frozen = random.randint(1, 6)
        frozen_layers = tuple(sorted(random.sample(layer_indices, num_frozen)))  # tuple so it is hashable
        trainable_layers = tuple(i for i in layer_indices if i not in frozen_layers)

        # Create a unique key for this configuration
        config_key = (batch_size, num_epochs, max_length, lr, str(class_weights), frozen_layers)

        # Check if this configuration was tested before
        if config_key in tested_configs:
            continue  # Skip this iteration and try another random configuration

        # Mark this configuration as tested
        tested_configs.add(config_key)

        layer_control = {i: (i not in frozen_layers) for i in range(num_layers)}

        print("\n🔹 Starting new experiment configuration:")
        print(f"Batch: {batch_size}, Epochs: {num_epochs}, Max Length: {max_length}, LR: {lr}, Class Weights: {class_weights}")
        print(f"Trainable Layers: {layer_control}")

        start_time = time.time()

        # Pre-bind so the cleanup in `finally` works even if setup fails early.
        model = optimizer = lr_scheduler = criterion = None
        try:
            # Data processing (tokenization is cached per max_length)
            if max_length not in token_cache:
                token_cache[max_length] = split_and_tokenize_data(df, max_length=max_length)
            train_tokens, test_tokens, train_labels, test_labels = token_cache[max_length]
            train_dataloader, test_dataloader = create_dataloaders(train_tokens, test_tokens, train_labels, test_labels, batch_size)

            # Model setup
            model, device = setup_model_and_device(layer_control=layer_control)

            # Training setup
            optimizer, lr_scheduler, criterion = setup_training_components(model, train_dataloader, num_epochs, lr, class_weights, device)

            # Training
            train_model(model, train_dataloader, optimizer, criterion, lr_scheduler, device, num_epochs)

            # Evaluation
            preds, labels = evaluate_model(model, test_dataloader, device)
            accuracy = accuracy_score(labels, preds)
            roc_auc = roc_auc_score(labels, preds)

            elapsed_time = time.time() - start_time

            # Store results for this iteration. 'trainable_layers' now really
            # lists the trainable layers; the frozen set is recorded separately.
            result = {
                'dataset_path': file_path,
                'dataset_size': dataset_size,
                'batch_size': batch_size,
                'num_epochs': num_epochs,
                'max_length': max_length,
                'learning_rate': lr,
                'class_weights': str(class_weights),
                'trainable_layers': str(trainable_layers),
                'num_trainable_layers': len(trainable_layers),
                'frozen_layers': str(frozen_layers),
                'num_frozen_layers': num_frozen,
                'device': str(device),
                'cuda_available': cuda_available,
                'torch_version': torch_version,
                'transformers_version': transformers_version,
                'execution_time': elapsed_time,
                'accuracy': accuracy,
                'roc_auc': roc_auc
            }
            results.append(result)
            consecutive_failures = 0

            # Save periodically
            if len(results) % save_interval == 0:
                pd.DataFrame(results).to_csv(results_path, index=False)
                print(f" Saved intermediate results to {results_path}")
            print()
            print('############################################################################')
            print(result)
            print('############################################################################')
            print()
        except Exception as e:
            # Best effort: a failed configuration is reported and skipped.
            consecutive_failures += 1
            print(f" Error in iteration: {str(e)}")
        finally:
            # Drop this experiment's model/optimizer before the next one is
            # built; otherwise two models are resident on the GPU at once.
            model = optimizer = lr_scheduler = criterion = None
            if cuda_available:
                torch.cuda.empty_cache()

        if consecutive_failures >= max_consecutive_failures:
            print(f" Stopping: {consecutive_failures} experiments failed in a row.")
            break

    # Final Save
    results_df = pd.DataFrame(results)
    results_df.to_csv(results_path, index=False)
    print("\n All experiments completed!")
    print(f" Final results saved to {results_path}")
    if results_df.empty:
        # Sorting an empty frame by 'accuracy' would raise KeyError.
        print(" No experiment finished successfully; nothing to rank.")
    else:
        print(" Top 5 results sorted by accuracy:")
        print(results_df.sort_values(by='accuracy', ascending=False).head(5))

    return results_df

# Run the pipeline: launches the randomized search defined in main(), which
# trains and evaluates one model per sampled configuration and is bounded by
# the time limit set inside main(). The returned DataFrame holds one row per
# successful experiment.
results_df = main()
