<a href="https://colab.research.google.com/github/BootCamp-BMA/colabs/blob/main/dziriBertRandomSearch2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

drive/MyDrive/arabic_fake_news/FASSILA/cleaned_data.csv


In [None]:
# Path of the cleaned dataset CSV under the /content/drive mount point.
file_path = "/content/drive/My Drive/arabic_fake_news/FASSILA/cleaned_data.csv"


# NOTE(review): `pd` is only imported in a later cell of this notebook, so that
# cell has to run before this one — confirm the intended execution order.
df = pd.read_csv(file_path)

# Show the (rows, columns) shape of the loaded frame.
print(df.shape)

# Preview the first five rows (displayed as the cell's result).
df.head(5)




(9636, 2)


Unnamed: 0,news,label
0,فلقرن لواحد وعشرين لقاو الدوايات اللي ضد لفيروسات,1
1,عرف ردود الافعال عربيا وعالميا بعد واش صرا فغزة,0
2,راه معول مون فكوريا الجنوبية باش يتعاون مع روس...,0
3,تدعو ايرماراحشدا ل الواقعية المملكة المتحدة عل...,1
4,الذهب طلع بدعم من عوامل فنية,0


In [None]:
import torch
import pandas as pd
import random
import time
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from google.colab import drive
from tqdm import tqdm
import sys
from IPython.display import clear_output

# Mount Google Drive at /content/drive; the dataset and results paths used in
# main() live under this mount point.
drive.mount('/content/drive')

# Generic utility functions
def set_seed(seed=42):
    """Seed the Python and PyTorch (CPU and all CUDA devices) RNGs.

    Also switches on cuDNN benchmark mode.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    # NOTE(review): cuDNN benchmark mode trades run-to-run determinism for
    # speed — confirm this is intended alongside a fixed seed.
    torch.backends.cudnn.benchmark = True
    seeders = (random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)

def load_data(file_path, text_column='news', label_column='label'):
    """Read a CSV file and return two of its columns as Python lists.

    Args:
        file_path: Path of the CSV file to read.
        text_column: Name of the column holding the texts.
        label_column: Name of the column holding the labels.

    Returns:
        A ``(texts, labels)`` tuple of lists, in row order.
    """
    frame = pd.read_csv(file_path)
    wanted = (text_column, label_column)
    return tuple(frame[column].tolist() for column in wanted)

def tokenize_data(texts, tokenizer, max_length=128):
    """Call ``tokenizer`` on ``texts`` with truncation, padding and 'pt' tensors.

    Args:
        texts: The texts handed to the tokenizer as its positional argument.
        tokenizer: Callable accepting the keyword options set below.
        max_length: Value forwarded as the ``max_length`` option.

    Returns:
        Whatever ``tokenizer`` returns.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

def create_dataset(encodings, labels):
    """Bundle input ids, attention masks and labels into a ``TensorDataset``.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries.
        labels: Label values converted with ``torch.tensor``.

    Returns:
        A ``TensorDataset`` whose items are ``(input_ids, attention_mask, label)``.
    """
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    label_tensor = torch.tensor(labels)
    return TensorDataset(input_ids, attention_mask, label_tensor)

def split_data(dataset, batch_size, test_size=0.2, num_workers=8):
    """Split ``dataset`` into train/test parts and wrap each in a ``DataLoader``.

    The split uses ``train_test_split`` with a fixed ``random_state=42``.

    Args:
        dataset: The dataset to split.
        batch_size: Batch size used by both loaders.
        test_size: Fraction (or count) passed to ``train_test_split``.
        num_workers: Number of loader worker processes; ``0`` loads in the
            main process.

    Returns:
        A ``(train_loader, test_loader)`` tuple; only the train loader shuffles.
    """
    train_data, test_data = train_test_split(dataset, test_size=test_size, random_state=42)
    loader_options = {
        'batch_size': batch_size,
        'num_workers': num_workers,
        'pin_memory': True,
        # DataLoader rejects persistent_workers=True when there are no workers,
        # so only request persistence when worker processes actually exist.
        'persistent_workers': num_workers > 0,
    }
    train_loader = DataLoader(train_data, shuffle=True, **loader_options)
    test_loader = DataLoader(test_data, **loader_options)
    return train_loader, test_loader

def configure_model(model_name, trainable_layers=None, num_labels=2, dropout=0.1):
    """Load a sequence-classification model and optionally freeze most of it.

    Args:
        model_name: Checkpoint name passed to ``from_pretrained``.
        trainable_layers: Optional mapping ``{encoder_layer_index: bool}``.
            When truthy, encoder layers follow the mapping (missing indices are
            frozen), the classifier and pooler stay trainable, and every other
            parameter is frozen. When falsy, no ``requires_grad`` flag is touched.
        num_labels: Number of output classes.
        dropout: Probability for the ``Dropout`` module assigned to
            ``model.dropout``.

    Returns:
        The configured model.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    # NOTE(review): presumably `model.dropout` is the classification-head
    # dropout for BERT-style models — confirm for other architectures.
    model.dropout = torch.nn.Dropout(dropout)
    if trainable_layers:
        # The pooler is kept trainable with the classifier: a checkpoint that
        # ships without pooler weights gets them randomly initialised, and
        # freezing them would feed the classifier a fixed random projection.
        head_markers = ('classifier', 'pooler', 'dropout')
        for name, param in model.named_parameters():
            if 'bert.encoder.layer' in name:
                # Names look like "bert.encoder.layer.<index>...."; the fourth
                # dotted component is the layer index.
                layer_num = int(name.split('.')[3])
                param.requires_grad = trainable_layers.get(layer_num, False)
            else:
                param.requires_grad = any(marker in name for marker in head_markers)
    return model

def setup_training_components(model, train_loader, num_epochs, learning_rate, class_weights, device, weight_decay=0.01):
    """Build the optimizer, linear LR schedule and weighted loss for one run.

    Args:
        model: Model whose parameters are optimised.
        train_loader: Training loader; its length times ``num_epochs`` gives the
            number of scheduler steps.
        num_epochs: Number of training epochs.
        learning_rate: AdamW learning rate.
        class_weights: Per-class weights for the cross-entropy loss.
        device: Device on which the class-weight tensor is created.
        weight_decay: AdamW weight decay. The default of 0.01 is AdamW's own
            default, so callers that omit it get the previous behaviour.

    Returns:
        An ``(optimizer, lr_scheduler, criterion)`` tuple.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=weight_decay)
    num_training_steps = len(train_loader) * num_epochs
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    # Force a floating dtype so integer weights such as [2, 1] are accepted by
    # the loss as well.
    class_weights = torch.tensor(class_weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    return optimizer, lr_scheduler, criterion

def train_model(model, train_loader, optimizer, criterion, lr_scheduler, device, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs with a per-epoch progress bar.

    Mixed precision (autocast plus gradient scaling) is used only when
    ``device`` is a CUDA device; on any other device the loop runs in full
    precision instead of requesting CUDA AMP that is not available.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        lr_scheduler: Scheduler stepped once per batch.
        device: Target device for the model and batches.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The trained model (the same object, moved to ``device``).
    """
    use_amp = torch.device(device).type == 'cuda'
    model.to(device)
    # With enabled=False the scaler is a pass-through: scale() returns the loss
    # unchanged and step() simply calls optimizer.step().
    scaler = torch.amp.GradScaler('cuda', enabled=use_amp)
    for epoch in range(num_epochs):
        model.train()
        # Use dynamic_ncols and position to ensure proper updating
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", file=sys.stdout, leave=False, dynamic_ncols=True, position=0)
        for batch in loop:
            input_ids, attention_mask, labels = [x.to(device, non_blocking=True) for x in batch]
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast('cuda', enabled=use_amp):
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()
            loop.set_postfix(loss=loss.item())
        # Refresh the display after each epoch
        loop.close()
        print()  # Newline to separate epochs cleanly
    return model

def evaluate_model(model, test_loader, device):
    """Return the accuracy of ``model`` over all batches of ``test_loader``.

    Runs without gradient tracking. Autocast is enabled only when ``device`` is
    a CUDA device, so CPU evaluation no longer requests unavailable CUDA AMP.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The value of ``accuracy_score(true_labels, predictions)``.
    """
    use_amp = torch.device(device).type == 'cuda'
    model.eval()
    test_preds, test_labels = [], []
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=use_amp):
        for batch in test_loader:
            input_ids, attention_mask, labels = [x.to(device, non_blocking=True) for x in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit per row.
            preds = torch.argmax(outputs.logits, dim=1)
            test_preds.extend(preds.cpu().tolist())
            test_labels.extend(labels.cpu().tolist())
    return accuracy_score(test_labels, test_preds)

def load_or_create_results_df(file_path):
    """Load the results CSV if it exists, otherwise start an empty table.

    Args:
        file_path: Location of the results CSV.

    Returns:
        The DataFrame read from ``file_path``, or an empty DataFrame with the
        experiment-log columns when no such path exists.
    """
    if not os.path.exists(file_path):
        result_columns = [
            'id', 'execution_time', 'trainable_layers', 'accuracy', 'batch_size',
            'learning_rate', 'dropout', 'num_epochs', 'weight_decay',
        ]
        return pd.DataFrame(columns=result_columns)
    return pd.read_csv(file_path)

def check_existing_experiment(df, trainable_layers, batch_size, learning_rate, dropout, num_epochs, weight_decay):
    """Report whether ``df`` already holds a row for this configuration.

    Layers are compared through their string form (``"None"`` when
    ``trainable_layers`` is falsy), integer settings with ``==``, and float
    settings within a relative tolerance of 1e-9 so that values that went
    through a CSV round trip or float arithmetic still match.

    Args:
        df: Results table with the columns ``trainable_layers``, ``batch_size``,
            ``learning_rate``, ``dropout``, ``num_epochs`` and ``weight_decay``.
        trainable_layers: Layer configuration of the candidate run.
        batch_size: Candidate batch size.
        learning_rate: Candidate learning rate.
        dropout: Candidate dropout probability.
        num_epochs: Candidate epoch count.
        weight_decay: Candidate weight decay.

    Returns:
        ``True`` if at least one row matches on every field, else ``False``.
    """
    if df.empty:
        return False

    trainable_layers_str = str(trainable_layers) if trainable_layers else "None"

    def float_matches(column, value):
        # Relative tolerance; a value of 0 degrades to an exact comparison.
        recorded = df[column].astype(float)
        return (recorded - value).abs() <= 1e-9 * abs(value)

    mask = (df['trainable_layers'] == trainable_layers_str) & \
           (df['batch_size'] == batch_size) & \
           float_matches('learning_rate', learning_rate) & \
           float_matches('dropout', dropout) & \
           (df['num_epochs'] == num_epochs) & \
           float_matches('weight_decay', weight_decay)
    return bool(mask.any())

def save_results(df, file_path):
    """Write ``df`` to ``file_path`` as CSV without the index column.

    Args:
        df: Results table to persist.
        file_path: Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=file_path, index=False)

# Simplified initial evaluation without threading
# Simplified initial evaluation without threading
def initial_evaluation(model_name, train_loader, test_loader, device, initial_params):
    """Train and score the first four single-layer configurations.

    Each configuration unfreezes exactly one encoder layer (indices 0 to 3).

    Args:
        model_name: Checkpoint name passed to ``configure_model``.
        train_loader: Loader used for training each candidate.
        test_loader: Loader used for scoring each candidate.
        device: Device used for training and evaluation.
        initial_params: Mapping providing ``dropout``, ``num_epochs``,
            ``learning_rate`` and ``class_weights``.

    Returns:
        ``(best_layers, best_accuracy)`` for the highest-scoring configuration.
        ``best_layers`` is always one of the evaluated configurations, never
        ``None``, even if every candidate scores 0.0.
    """
    initial_configs = [{i: True} for i in range(12)]  # Single-layer configs
    best_accuracy = 0.0
    best_layers = None

    for layers in initial_configs[:4]:  # Limit to 4 to keep it simple
        model = configure_model(model_name, layers, dropout=initial_params['dropout'])
        optimizer, lr_scheduler, criterion = setup_training_components(model, train_loader, initial_params['num_epochs'],
                                                                      initial_params['learning_rate'], initial_params['class_weights'], device)
        trained_model = train_model(model, train_loader, optimizer, criterion, lr_scheduler, device, initial_params['num_epochs'])
        accuracy = evaluate_model(trained_model, test_loader, device)
        # Accept the first candidate unconditionally so the caller, which takes
        # len(best_layers), never receives None.
        if best_layers is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_layers = layers

    return best_layers, best_accuracy

# Simplified Hill climbing with enhanced output
def hill_climbing_layers(model_name, train_loader, test_loader, device, initial_params, max_time=7200, tolerance=0.01):
    """Hill-climb over which of the 12 encoder layers are trainable.

    Starting from the result of ``initial_evaluation``, each iteration builds
    the neighbours of the current configuration (one layer added or removed),
    trains and scores them in random order, and moves to the first neighbour
    that either scores higher or scores within ``tolerance`` while using fewer
    layers. If an iteration finds nothing better, the search stops during the
    first five iterations and restarts from a random single layer afterwards.

    The walk position is tracked separately from the overall best, so a random
    restart or a timeout can never replace the returned layers with a
    configuration that did not earn the returned accuracy.

    Args:
        model_name: Checkpoint name passed to ``configure_model``.
        train_loader: Loader used to train every candidate.
        test_loader: Loader used to score every candidate.
        device: Device used for training and evaluation.
        initial_params: Mapping providing ``dropout``, ``num_epochs``,
            ``learning_rate`` and ``class_weights``.
        max_time: Wall-clock budget in seconds, checked before each candidate.
        tolerance: Accuracy slack within which fewer layers are preferred.

    Returns:
        ``(best_layers, best_accuracy)`` of the best configuration evaluated.
    """
    start_time = time.time()
    num_epochs = initial_params['num_epochs']
    separator = "-----------------------------------------------------"

    def out_of_time():
        return time.time() - start_time >= max_time

    def beats(accuracy, layer_count, ref_accuracy, ref_count):
        # Better accuracy wins; near-equal accuracy wins with fewer layers.
        return (accuracy > ref_accuracy) or \
               (abs(accuracy - ref_accuracy) <= tolerance and layer_count < ref_count)

    def train_and_score(layers):
        # Build, train and evaluate one layer configuration.
        model = configure_model(model_name, layers, dropout=initial_params['dropout'])
        optimizer, lr_scheduler, criterion = setup_training_components(model, train_loader, num_epochs,
                                                                      initial_params['learning_rate'], initial_params['class_weights'], device)
        trained_model = train_model(model, train_loader, optimizer, criterion, lr_scheduler, device, num_epochs)
        return evaluate_model(trained_model, test_loader, device)

    # Initial sequential evaluation
    best_layers, best_accuracy = initial_evaluation(model_name, train_loader, test_loader, device, initial_params)
    best_layer_count = len(best_layers)
    print(f"Initial best layers: {best_layers}, Accuracy: {best_accuracy:.4f}, Layer count: {best_layer_count}")

    # Position of the walk; equals the overall best until a restart happens.
    current_layers, current_accuracy, current_count = best_layers, best_accuracy, best_layer_count

    iteration = 0
    while not out_of_time():
        iteration += 1
        neighbors = []
        for layer in range(12):
            if layer not in current_layers:
                neighbors.append({**current_layers, layer: True})
            elif len(current_layers) > 1:
                # Never remove the last remaining trainable layer.
                neighbors.append({k: v for k, v in current_layers.items() if k != layer})

        random.shuffle(neighbors)
        improved = False
        timed_out = False

        for neighbor in neighbors:
            if out_of_time():
                timed_out = True
                break
            exec_start = time.time()

            # Clear previous output and print header
            clear_output(wait=True)
            print("#######################################")
            print(f"Iteration {iteration}")
            print(separator)
            print(f"Latest Best: Layers={best_layers}, Acc={best_accuracy:.4f}, Count={best_layer_count}")
            print(separator)
            print(f"Trying: {neighbor}")
            print(separator)
            print("Training:")

            accuracy = train_and_score(neighbor)
            layer_count = len(neighbor)
            exec_time = time.time() - exec_start

            # Check for improvement over the current walk position
            if beats(accuracy, layer_count, current_accuracy, current_count):
                current_layers, current_accuracy, current_count = neighbor, accuracy, layer_count
                improved = True
                if beats(accuracy, layer_count, best_accuracy, best_layer_count):
                    best_layers, best_accuracy, best_layer_count = neighbor, accuracy, layer_count
                    print(f"\nIteration {iteration}: New best layers: {best_layers}, Accuracy: {best_accuracy:.4f}, Layer count: {best_layer_count}, Time: {exec_time:.2f}s")
                else:
                    print(f"\nIteration {iteration}: Moved to layers: {current_layers}, Accuracy: {current_accuracy:.4f}, Layer count: {current_count}, Time: {exec_time:.2f}s")
                break

        if improved:
            continue
        if timed_out or out_of_time():
            # Running out of time is not a failed iteration: keep the best as is.
            break
        if iteration > 5:
            # Restart the walk from a random single layer and score it, so the
            # walk state always pairs a configuration with its own accuracy.
            restart_layers = {random.randint(0, 11): True}
            print(f"\nIteration {iteration}: No improvement. Restarting with: {restart_layers}")
            restart_accuracy = train_and_score(restart_layers)
            current_layers, current_accuracy, current_count = restart_layers, restart_accuracy, len(restart_layers)
            if beats(current_accuracy, current_count, best_accuracy, best_layer_count):
                best_layers, best_accuracy, best_layer_count = current_layers, current_accuracy, current_count
            continue

        clear_output(wait=True)
        print("#######################################")
        print(f"Iteration {iteration}")
        print(separator)
        print(f"Final Best: Layers={best_layers}, Accuracy={best_accuracy:.4f}, Count={best_layer_count}")
        print(separator)
        print("No improvement found. Stopping.")
        break

    return best_layers, best_accuracy

# Random search for other parameters with enhanced output
def random_search_params(model_name, dataset, device, best_layers, param_space, max_time=3600):
    """Search the remaining hyperparameters for a fixed layer configuration.

    Every combination of ``batch_size``, ``learning_rate``, ``dropout``,
    ``num_epochs`` and ``weight_decay`` from ``param_space`` is visited at most
    once, in random order, until ``max_time`` seconds have elapsed or all
    combinations are used up. Combinations already present in the results CSV
    are skipped. Each finished run is appended to the CSV immediately.

    Args:
        model_name: Checkpoint name passed to ``configure_model``.
        dataset: Dataset re-split for every run via ``split_data``.
        device: Device used for training and evaluation.
        best_layers: Layer configuration used for every run.
        param_space: Mapping with a list of candidates for each searched key,
            plus ``class_weights`` and ``results_path``.
        max_time: Wall-clock budget in seconds, checked before each run.

    Returns:
        ``(best_params, best_accuracy)`` over the runs executed by this call;
        ``best_params`` is ``None`` if no run scored above 0.0.
    """
    import itertools  # function-scope import: not part of the file's import cell

    start_time = time.time()
    results_path = param_space['results_path']
    results_df = load_or_create_results_df(results_path)
    iteration = 0
    best_accuracy = 0.0
    best_params = None
    separator = "-----------------------------------------------------"

    # Enumerate the grid once and shuffle it: sampling with replacement would
    # spin without doing any work once every combination has been recorded.
    search_keys = ('batch_size', 'learning_rate', 'dropout', 'num_epochs', 'weight_decay')
    candidates = list(itertools.product(*(param_space[key] for key in search_keys)))
    random.shuffle(candidates)

    for combo in candidates:
        if time.time() - start_time >= max_time:
            break
        iteration += 1
        params = dict(zip(search_keys, combo))
        params['class_weights'] = param_space['class_weights']
        params['results_path'] = results_path

        if check_existing_experiment(results_df, best_layers, params['batch_size'], params['learning_rate'],
                                     params['dropout'], params['num_epochs'], params['weight_decay']):
            continue

        train_loader, test_loader = split_data(dataset, params['batch_size'])
        exec_start = time.time()

        # Clear previous output and print header
        clear_output(wait=True)
        print("#######################################")
        print(f"Random Search Iteration {iteration}")
        print(separator)
        if best_params:
            print(f"Latest Best: Acc={best_accuracy:.4f}, Params={best_params}")
        else:
            print("Latest Best: None yet")
        print(separator)
        print(f"Trying: Layers={best_layers}, Params={params}")
        print(separator)
        print("Training:")

        # Configure and train model with epoch-wise display
        model = configure_model(model_name, best_layers, dropout=params['dropout'])
        optimizer, lr_scheduler, criterion = setup_training_components(model, train_loader, params['num_epochs'],
                                                                      params['learning_rate'], params['class_weights'], device)
        # Apply the sampled weight decay; without this the value was logged in
        # the results but never reached the optimizer.
        for group in optimizer.param_groups:
            group['weight_decay'] = params['weight_decay']
        model = train_model(model, train_loader, optimizer, criterion, lr_scheduler, device, params['num_epochs'])

        # Evaluate model
        accuracy = evaluate_model(model, test_loader, device)
        execution_time = time.time() - exec_start

        # Update best if improved
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params.copy()
            print(f"\nIteration {iteration}: New best accuracy: {best_accuracy:.4f}, Time: {execution_time:.2f}s, Params={best_params}")
        else:
            print(f"\nIteration {iteration}: Accuracy: {accuracy:.4f}, Time: {execution_time:.2f}s, Params={params}")

        # Save results after every run so an interrupted session loses nothing
        new_row = {
            'id': len(results_df) + 1,
            'execution_time': execution_time,
            'trainable_layers': str(best_layers),
            'accuracy': accuracy,
            'batch_size': params['batch_size'],
            'learning_rate': params['learning_rate'],
            'dropout': params['dropout'],
            'num_epochs': params['num_epochs'],
            'weight_decay': params['weight_decay']
        }
        results_df = pd.concat([results_df, pd.DataFrame([new_row])], ignore_index=True)
        save_results(results_df, results_path)

    # Final summary
    clear_output(wait=True)
    print("#######################################")
    print("Random Search Completed")
    print(separator)
    print(f"Final Best: Layers={best_layers}, Accuracy={best_accuracy:.4f}, Params={best_params}")
    print(separator)
    print(f"Total iterations: {iteration}, Total time: {(time.time() - start_time):.2f}s")

    return best_params, best_accuracy

def main():
    """Run the full pipeline: load data, hill-climb layers, then random search.

    Reads the cleaned CSV, tokenizes it with the model's tokenizer, searches
    the trainable-layer configuration for up to one hour and spends the rest
    of a three-hour budget (at least one hour) on the other hyperparameters.
    """
    data_path = '/content/drive/My Drive/arabic_fake_news/FASSILA/cleaned_data.csv'
    results_path = '/content/drive/My Drive/arabic_fake_news/FASSILA/03_out.csv'
    model_name = "alger-ia/dziribert"
    gpu_available = torch.cuda.is_available()
    device = torch.device('cuda' if gpu_available else 'cpu')

    # Fixed settings used while searching over layers.
    initial_params = dict(
        batch_size=16,
        learning_rate=2e-5,
        dropout=0.1,
        num_epochs=2,
        weight_decay=0.01,
        class_weights=[2.0, 1.0],
        results_path=results_path,
    )

    # Candidate values explored by the random search.
    param_space = dict(
        batch_size=[8, 16, 32],
        learning_rate=[1e-5, 2e-5, 5e-5],
        dropout=[0.1, 0.2, 0.3],
        num_epochs=[2, 3, 4],
        weight_decay=[0.001, 0.01, 0.1],
        class_weights=[2.0, 1.0],
        results_path=results_path,
    )

    set_seed()
    texts, labels = load_data(data_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    dataset = create_dataset(tokenize_data(texts, tokenizer), labels)
    train_loader, test_loader = split_data(dataset, initial_params['batch_size'])

    print("Starting Hill Climbing for Layers...")
    hill_start = time.time()
    best_layers, best_accuracy = hill_climbing_layers(
        model_name, train_loader, test_loader, device, initial_params, max_time=3600,
    )
    hill_time = time.time() - hill_start
    print(f"Best layers found: {best_layers}, Accuracy: {best_accuracy:.4f}, Time: {hill_time:.2f}s")

    print("Starting Random Search for Other Parameters...")
    remaining_budget = max(3600, 10800 - hill_time)
    random_search_params(model_name, dataset, device, best_layers, param_space, max_time=remaining_budget)


if __name__ == "__main__":
    main()

#######################################
Random Search Iteration 23
-----------------------------------------------------
Latest Best: Acc=0.6805, Params={'batch_size': 8, 'learning_rate': 5e-05, 'dropout': 0.3, 'num_epochs': 3, 'weight_decay': 0.01, 'class_weights': [2.0, 1.0], 'results_path': '/content/drive/My Drive/arabic_fake_news/FASSILA/03_out.csv'}
-----------------------------------------------------
Trying: Layers={10: True}, Params={'batch_size': 32, 'learning_rate': 1e-05, 'dropout': 0.1, 'num_epochs': 4, 'weight_decay': 0.001, 'class_weights': [2.0, 1.0], 'results_path': '/content/drive/My Drive/arabic_fake_news/FASSILA/03_out.csv'}
-----------------------------------------------------
Training:


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                                        
                                                                        
                                                                        
Epoch 4/4:  37%|███▋      | 90/241 [00:03<00:05, 27.90it/s, loss=0.572]

In [None]:
# NOTE(review): in the code shown, `best_layers` is only assigned inside
# functions (e.g. main()), so this cell raises NameError unless the name is
# also defined at notebook level elsewhere — confirm.
best_layers