<a href="https://colab.research.google.com/github/BootCamp-BMA/colabs/blob/main/dziribert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV and experiment logs under MyDrive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import pandas as pd

# Load the cleaned news/label CSV from the mounted Drive folder.
data_path = 'drive/MyDrive/WithLora/cleaned_data.csv'
dataset = pd.read_csv(data_path)

# Sanity check: print the dimensions first, then the first ten rows.
for preview in (dataset.shape, dataset.head(10)):
    print(preview)

(9636, 2)
                                                news  label
0  فلقرن لواحد وعشرين لقاو الدوايات اللي ضد لفيروسات      1
1    عرف ردود الافعال عربيا وعالميا بعد واش صرا فغزة      0
2  راه معول مون فكوريا الجنوبية باش يتعاون مع روس...      0
3  تدعو ايرماراحشدا ل الواقعية المملكة المتحدة عل...      1
4                       الذهب طلع بدعم من عوامل فنية      0
5  السوق كي رحت ليه وساومت لقيت بلي سومات طاحو يا...      1
6  غير من السوق كي رحت ليه وساومت لقيت بلي سومات ...      0
7  بوتفليقة ماراحش لسويسرا باش يحوس راه يداوي خلو...      0
8  نجمات فالسينما الهندية يحكو حكايتهم علي التحرش...      0
9  فدارفور لي مزقتها الحرب ، شدد مسوول المساعدات ...      1


In [None]:
# !pip install -U transformers
# !pip install --upgrade transformers
# !pip install datasets
# !pip install -U datasets peft



In [None]:
import os
import json
import time
import random
import torch
import pandas as pd
from pathlib import Path
from datetime import timedelta
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import Dataset, DatasetDict

import numpy as np
from sklearn.metrics import accuracy_score


In [None]:

# Pick the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Using device: {device}")

# === LOAD DATA ===
data_path = 'drive/MyDrive/WithLora/cleaned_data.csv'
df = pd.read_csv(data_path)
print(df.shape)
print(df.head())

# === SPLIT INTO TRAIN / VALID / TEST ===
# First hold out 20% of the rows, then cut that hold-out in half to obtain
# validation and test sets; both splits are stratified on the 'label' column.
holdout_fraction = 0.2
split_seed = 42
train_df, temp_df = train_test_split(
    df,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=df['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=split_seed,
    stratify=temp_df['label'],
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# === CONVERT TO HuggingFace Dataset ===
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ('train', train_df),
        ('validation', val_df),
        ('test', test_df),
    )
})

# === TOKENIZATION ===
model_name = "alger-ia/dziribert"
tokenizer = BertTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the 'news' field of a batch.

    Every sequence is truncated and padded to exactly 512 tokens, as
    requested by the ``padding``/``truncation``/``max_length`` arguments.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example['news'], **tokenizer_options)


# Tokenize all three splits in batches, then expose only the columns listed
# below as torch tensors.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tensor_columns = ["input_ids", "attention_mask", "label"]
tokenized_dataset.set_format(type="torch", columns=tensor_columns)

# === METRICS FUNCTION ===
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class for each row is the arg-max over the last axis of
    the logits; the result is returned as ``{"accuracy": <score>}``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}

# Ready to train!


Using device: cuda
(9636, 2)
                                                news  label
0  فلقرن لواحد وعشرين لقاو الدوايات اللي ضد لفيروسات      1
1    عرف ردود الافعال عربيا وعالميا بعد واش صرا فغزة      0
2  راه معول مون فكوريا الجنوبية باش يتعاون مع روس...      0
3  تدعو ايرماراحشدا ل الواقعية المملكة المتحدة عل...      1
4                       الذهب طلع بدعم من عوامل فنية      0
Train: 7708, Val: 964, Test: 964


Map:   0%|          | 0/7708 [00:00<?, ? examples/s]

Map:   0%|          | 0/964 [00:00<?, ? examples/s]

Map:   0%|          | 0/964 [00:00<?, ? examples/s]

In [None]:

# === SETUP ===
model_name = "alger-ia/dziribert"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
experiments_path = "drive/MyDrive/WithLora/nonLoRAExprements.json"

# Hyperparameter search space (one value per list is drawn for each experiment)
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4]
batch_sizes = [8, 16, 32]
num_epochs = [3, 4, 5, 6]
weight_decays = [0.01, 0.02, 0.05]

# === Load or initialize experiment tracking file ===
# Start from an empty log and replace it with the saved one when it exists.
experiments = []
if os.path.exists(experiments_path):
    with open(experiments_path, "r") as f:
        experiments = json.load(f)

df_experiments = pd.DataFrame(experiments)
print(f"Loaded {len(df_experiments)} past experiments.")

# === Time & Experiment Budget ===
start_time = time.time()
MAX_TIME = 12 * 60 * 60  # seconds
MAX_EXPERIMENTS = 2
# Previously saved experiments count against MAX_EXPERIMENTS.
experiment_count = len(df_experiments)

# === Begin experiment loop ===
while True:
    # Stop once either the time budget or the experiment budget is used up.
    if (time.time() - start_time) >= MAX_TIME or experiment_count >= MAX_EXPERIMENTS:
        break

    # === Sample random config ===
    config = {
        "model_name": model_name,
        "learning_rate": random.choice(learning_rates),
        "batch_size": random.choice(batch_sizes),
        "num_epochs": random.choice(num_epochs),
        "weight_decay": random.choice(weight_decays),
    }

    # === Check for duplicates ===
    # A config is a duplicate when a logged experiment has the same four
    # tuned hyperparameters (model_name is not part of the comparison).
    is_duplicate = False
    if not df_experiments.empty:
        same_config = (
            (df_experiments["learning_rate"] == config["learning_rate"])
            & (df_experiments["batch_size"] == config["batch_size"])
            & (df_experiments["num_epochs"] == config["num_epochs"])
            & (df_experiments["weight_decay"] == config["weight_decay"])
        )
        is_duplicate = bool(same_config.any())
    if is_duplicate:
        print("Duplicate config found, skipping...")
        continue

    # === Start timer for this experiment ===
    exp_start = time.time()
    run_id = experiment_count + 1

    print(f"\nRunning Experiment #{run_id} with config: {config}")

    # === Load model ===
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model = model.to(device)

    # === Training Arguments ===
    training_args = TrainingArguments(
        output_dir=f"./results-full/exp_{run_id}",
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        num_train_epochs=config["num_epochs"],
        weight_decay=config["weight_decay"],
        logging_dir=f'./logs-full/exp_{run_id}',
        report_to="none",
    )

    # === Trainer ===
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )

    # === Train & Evaluate ===
    # The reported accuracy is measured on the test split.
    trainer.train()
    metrics = trainer.evaluate(tokenized_dataset["test"])

    # === Time taken ===
    exp_time = time.time() - exp_start

    # === Save Experiment Result ===
    result = {"#": run_id}
    result.update(config)
    result.update({
        "accuracy": metrics.get("eval_accuracy", None),
        "elapsed_time_sec": exp_time,
        "elapsed_time_readable": str(timedelta(seconds=int(exp_time))),
    })

    experiments.append(result)
    df_experiments = pd.DataFrame(experiments)

    # Save to disk after every experiment so an interrupted session keeps its log.
    log_dir = Path(os.path.dirname(experiments_path))
    log_dir.mkdir(parents=True, exist_ok=True)
    with open(experiments_path, "w") as f:
        json.dump(experiments, f, indent=2)

    print(f"Experiment #{run_id} completed and saved.")
    experiment_count += 1

print("=== Experiment session ended ===")


Loaded 0 past experiments.

Running Experiment #1 with config: {'model_name': 'alger-ia/dziribert', 'learning_rate': 3e-05, 'batch_size': 16, 'num_epochs': 5, 'weight_decay': 0.01}


config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6885
1000,0.6354
1500,0.5584
2000,0.4741


Experiment #1 completed and saved.

Running Experiment #2 with config: {'model_name': 'alger-ia/dziribert', 'learning_rate': 0.0002, 'batch_size': 8, 'num_epochs': 3, 'weight_decay': 0.05}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.7208
1000,0.7108
1500,0.7015
2000,0.6966
2500,0.6964


Experiment #2 completed and saved.
=== Experiment session ended ===


In [None]:
# !pip install --upgrade transformers

In [None]:

# === Constants ===
model_name = "alger-ia/dziribert"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
experiments_path = "drive/MyDrive/WithLora/LoRAExperiments.json"
MAX_TIME = 12 * 60 * 60  # wall-clock budget for this session (12 hours)
MAX_EXPERIMENTS = 3  # number of NEW experiments to run in this session
MAX_CONSECUTIVE_FAILURES = 3  # give up instead of retrying until MAX_TIME expires

# === Hyperparameter search space ===
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5, 1e-4, 2e-4]
batch_sizes = [8, 16, 32]
num_epochs = [3, 4, 5, 6]
r_values = [4, 8, 16]
lora_alpha_values = [8, 16, 32]
lora_dropout_values = [0.1, 0.2, 0.3]
weight_decay_values = [0.01, 0.02, 0.05]

# === Load or initialize experiment tracking ===
if os.path.exists(experiments_path):
    with open(experiments_path, "r") as f:
        experiments = json.load(f)
else:
    experiments = []

df_experiments = pd.DataFrame(experiments)
print(f"Loaded {len(df_experiments)} LoRA experiments.")

# === Begin loop ===
start_time = time.time()
# Per-session budget counter: it counts only experiments completed in this
# run. Experiment numbering is derived separately from the saved log (see
# `experiment_id` below) so "#" values and output directories never collide
# with experiments loaded from disk.
experiment_count = 0
consecutive_failures = 0

while (time.time() - start_time) < MAX_TIME and experiment_count < MAX_EXPERIMENTS:

    # === Sample random config ===
    config = {
        "model_name": model_name,
        "learning_rate": random.choice(learning_rates),
        "batch_size": random.choice(batch_sizes),
        "num_epochs": random.choice(num_epochs),
        "weight_decay": random.choice(weight_decay_values),
        "lora_r": random.choice(r_values),
        "lora_alpha": random.choice(lora_alpha_values),
        "lora_dropout": random.choice(lora_dropout_values),
    }

    # === Check for duplicates ===
    # Skip configs whose seven tuned hyperparameters already appear in the log.
    if not df_experiments.empty and (
        (df_experiments[
            (df_experiments["learning_rate"] == config["learning_rate"]) &
            (df_experiments["batch_size"] == config["batch_size"]) &
            (df_experiments["num_epochs"] == config["num_epochs"]) &
            (df_experiments["weight_decay"] == config["weight_decay"]) &
            (df_experiments["lora_r"] == config["lora_r"]) &
            (df_experiments["lora_alpha"] == config["lora_alpha"]) &
            (df_experiments["lora_dropout"] == config["lora_dropout"])
        ].shape[0]) > 0
    ):
        print("Duplicate config found, skipping...")
        continue

    # Continue numbering after everything already stored in the log.
    experiment_id = len(experiments) + 1

    print(f"\n[Experiment #{experiment_id}] Config: {config}")
    exp_start = time.time()

    # === Load base model ===
    base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    # === Apply LoRA ===
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        bias="none"
    )
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()

    # === Trainer Setup ===
    training_args = TrainingArguments(
        output_dir=f"./results-lora/exp_{experiment_id}",
        # NOTE(review): no evaluation strategy is set, so eval_steps appears to
        # have no effect (the training log shows no validation rows) -- confirm
        # whether periodic evaluation was intended.
        eval_steps=500,
        save_strategy="epoch",
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        num_train_epochs=config["num_epochs"],
        weight_decay=config["weight_decay"],
        logging_dir=f'./logs-lora/exp_{experiment_id}',
        report_to="none",
        # Trainer warns that it cannot infer label names for a PEFT-wrapped
        # model and falls back to an empty list; without labels at evaluation
        # time compute_metrics is not invoked and "eval_accuracy" is missing.
        label_names=["labels"],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )

    try:
        trainer.train()
        metrics = trainer.evaluate(tokenized_dataset["test"])
    except Exception as e:
        consecutive_failures += 1
        print(f"Experiment failed: {e}")
        # A failure does not consume the experiment budget, so cap the number
        # of back-to-back failures to avoid retrying until MAX_TIME runs out.
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            print(f"Stopping after {consecutive_failures} consecutive failures.")
            break
        continue
    consecutive_failures = 0

    # === Save results ===
    elapsed_time = time.time() - exp_start
    accuracy = metrics.get("eval_accuracy", None)
    if accuracy is None:
        print("Warning: 'eval_accuracy' missing from evaluation metrics; saving accuracy as null.")
    result = {
        "#": experiment_id,
        **config,
        "accuracy": accuracy,
        "elapsed_time_sec": elapsed_time,
        "elapsed_time_readable": str(timedelta(seconds=int(elapsed_time)))
    }

    experiments.append(result)
    df_experiments = pd.DataFrame(experiments)

    # Persist after every experiment so an interrupted session keeps its log.
    Path(os.path.dirname(experiments_path)).mkdir(parents=True, exist_ok=True)
    with open(experiments_path, "w") as f:
        json.dump(experiments, f, indent=2)

    print(f"Experiment #{experiment_id} completed and saved.")
    experiment_count += 1

print("=== LoRA Experiment session complete ===")


0
Loaded 3 LoRA experiments.
outside
0 3
inside

[Experiment #1] Config: {'model_name': 'alger-ia/dziribert', 'learning_rate': 2e-05, 'batch_size': 8, 'num_epochs': 3, 'weight_decay': 0.01, 'lora_r': 4, 'lora_alpha': 8, 'lora_dropout': 0.2}


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 148,994 || all params: 124,591,876 || trainable%: 0.1196


Step,Training Loss
500,0.6986
1000,0.693
1500,0.6855
2000,0.6872
2500,0.6835


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment #1 completed and saved.
inside
Duplicate config found, skipping...
inside
Duplicate config found, skipping...
inside
Duplicate config found, skipping...
inside

[Experiment #2] Config: {'model_name': 'alger-ia/dziribert', 'learning_rate': 0.0001, 'batch_size': 32, 'num_epochs': 3, 'weight_decay': 0.05, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.3}


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 148,994 || all params: 124,591,876 || trainable%: 0.1196


Step,Training Loss
500,0.6671


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment #2 completed and saved.
inside
Duplicate config found, skipping...
inside
Duplicate config found, skipping...
inside
Duplicate config found, skipping...
inside
Duplicate config found, skipping...
inside

[Experiment #3] Config: {'model_name': 'alger-ia/dziribert', 'learning_rate': 0.0002, 'batch_size': 32, 'num_epochs': 6, 'weight_decay': 0.01, 'lora_r': 8, 'lora_alpha': 32, 'lora_dropout': 0.2}


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 296,450 || all params: 124,739,332 || trainable%: 0.2377


Step,Training Loss
500,0.6234
1000,0.5136


Experiment #3 completed and saved.
=== LoRA Experiment session complete ===
