

# Libraries and Setup



In [None]:
!pip install -q -r /content/drive/MyDrive/Arabic-Dialect-Detector/requirements.txt

In [None]:
!pip install -q wandb

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorWithPadding
from peft import LoraConfig, TaskType, get_peft_model
from datasets import load_dataset, DatasetDict
from huggingface_hub import login
from dotenv import load_dotenv
from google.colab import userdata

import numpy as np
import pandas as pd
import wandb
import torch
import os

In [None]:
# Read both API secrets from Colab's secret store so no credential is hard-coded here.
hf_token = userdata.get('HF_TOKEN')
wandb_key = userdata.get('WANDB')

# Authenticate against the Hugging Face Hub (needed for the gated Llama checkpoint) ...
login(token=hf_token)
# ... and against Weights & Biases; its return value is shown as the cell output.
wandb.login(key=wandb_key)



True

# Model Design

 ### Quantization configs

In [None]:
# bitsandbytes settings: load the base weights as 4-bit NF4, run the matmuls in bfloat16,
# and skip the second (double) quantization pass.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)

# LoRA adapter settings for the quantized model.
# Adapters are attached to every attention and MLP projection. "lm_head" is intentionally
# not listed: the model is loaded with a sequence-classification head, which has no
# language-model head to adapt.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, # SEQ_CLS for dialect detection, SEQ_2_SEQ_LM for translation
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
)

### Model Setup

In [None]:
# Base checkpoint that gets fine-tuned.
model_name = "meta-llama/Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the quantized base model with a freshly initialised 4-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    num_labels=4,
    device_map="auto",
    use_cache=False,
    torch_dtype=torch.bfloat16,
)

# Trade compute for memory during training; the generation cache is disabled for training.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.config.use_cache = False

model.resize_token_embeddings(len(tokenizer))
model = get_peft_model(model, peft_config)

# Train the LoRA adapter weights AND the trainable copy of the classification head that
# PEFT keeps under "modules_to_save". The head is randomly initialised, so freezing it
# (as a plain "lora"-only filter would) leaves the classifier untrained.
for name, param in model.named_parameters():
    param.requires_grad = "lora" in name.lower() or "modules_to_save" in name

# One-line summary of trainable vs. total parameters.
model.print_trainable_parameters()

In [None]:
# Display the model configuration as the cell output for inspection.
model.config

# Data

In [None]:
# Read the labelled CSV from Drive; load_dataset puts everything into a single "train" split.
training_data_path = r"/content/drive/MyDrive/Arabic-Dialect-Detector/src/data/Arabic_dialect.csv"
dataset = load_dataset("csv", data_files=training_data_path)

# Rename the raw CSV columns to the names used by the rest of the notebook.
for old_name, new_name in (("result", "label"), ("t", "text")):
    dataset = dataset.rename_column(old_name, new_name)

# Only expose the text and label columns when rows are accessed.
dataset.set_format(type="torch", columns=["text", "label"])

In [None]:
# 60 / 20 / 20 split: hold out 40 % of the rows, then halve the held-out part into
# validation and test. Both splits use a fixed seed.
train_test = dataset["train"].train_test_split(test_size=0.4, seed=42)
test_val = train_test["test"].train_test_split(test_size=0.5, seed=42)

columns_to_remove = ['Date', 'User', 'Tweet']
# Assemble the final DatasetDict, dropping the columns listed above from every split.
dataset = DatasetDict({
    split_name: split.remove_columns(columns_to_remove)
    for split_name, split in (
        ("train", train_test["train"]),
        ("validation", test_val["train"]),
        ("test", test_val["test"]),
    )
})
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 23036
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 7679
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 7679
    })
})

In [None]:
# Reuse EOS as the padding token, but only when the tokenizer does not define one already.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Take the id from the tokenizer so the model pads with exactly the token the tokenizer
# emits (config.eos_token_id is not guaranteed to be a single integer on every checkpoint).
model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = "right"

# Build a deterministic label <-> id mapping from the labels seen in the training split.
unique_labels = set(dataset['train']['label'])
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id2label = {idx: label for label, idx in label2id.items()}

# The classification head was created with a fixed number of outputs; fail early if the
# data disagrees instead of failing later inside the loss.
assert len(label2id) == model.config.num_labels, (
    f"found {len(label2id)} labels in the data but the model head has "
    f"{model.config.num_labels} outputs"
)

# Store the mapping on the model config.
model.config.label2id = label2id
model.config.id2label = id2label

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach integer class ids.

    Args:
        examples: a batch with a "text" list and a "label" list (as passed by a batched
            ``Dataset.map``).

    Returns:
        The tokenizer output for the texts (padded / truncated to 128 tokens) with an
        extra "labels" entry holding ``label2id[label]`` for every example.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors=None,
    )

    # Translate each raw label into its integer id.
    label_ids = []
    for raw_label in examples["label"]:
        label_ids.append(label2id[raw_label])
    encoded["labels"] = label_ids

    return encoded

# Run the preprocessing over every split in batches, bypassing the on-disk cache, and drop
# the original columns so only the tokenizer outputs and "labels" remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
)
tokenized_datasets

Map:   0%|          | 0/23036 [00:00<?, ? examples/s]

Map:   0%|          | 0/7679 [00:00<?, ? examples/s]

Map:   0%|          | 0/7679 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23036
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7679
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7679
    })
})

In [None]:
# Sanity-check the padding configuration, then look at one encoded training example.
pad_token, pad_token_id = tokenizer.pad_token, tokenizer.pad_token_id
print(f"Padding Token: {pad_token}")
print(tokenizer.special_tokens_map)
print(f"Padding Token ID: {pad_token_id}")

first_train_example = tokenized_datasets['train'][0]
first_train_example

Padding Token: <|end_of_text|>
{'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>', 'pad_token': '<|end_of_text|>'}
Padding Token ID: 128001


{'input_ids': tensor([128000, 115633, 120038, 108452, 101581, 102680, 101397, 119450, 103238,
          28590,  71704, 101428,  46677, 101756,  50488,  78373, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001

# Train

In [None]:
# Batch collator: pads every batch to the same fixed length used at tokenization time and
# returns PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
    padding="max_length",
    max_length=128,
    pad_to_multiple_of=8,
)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch and restore the best
# checkpoint at the end of training.
training_args = TrainingArguments(
    output_dir="./results",
    # schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching (effective train batch per device = 8 * 2 accumulation steps)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # memory / precision
    bf16=True,
    gradient_checkpointing=True,
    # evaluation and checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=True,
)

# Release cached GPU blocks and ask the CUDA allocator for expandable segments.
# NOTE(review): the model is already on the GPU at this point, so this environment variable
# is probably read too late to take effect in this session - consider setting it before
# the first CUDA allocation. TODO confirm.
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Initialize the Trainer. No compute_metrics callback is passed, so evaluation reports
# the loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator
)

# Count individual weights (numel) of the tensors that require gradients; summing the
# requires_grad flags would only count how many tensors are trainable.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")
print()
print()
# Fine-tune the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Number of trainable parameters: 448




Epoch,Training Loss,Validation Loss
1,0.9865,0.425778
2,0.5184,0.41308
3,0.213,0.659081




TrainOutput(global_step=4320, training_loss=0.6294023275375367, metrics={'train_runtime': 4701.2615, 'train_samples_per_second': 14.7, 'train_steps_per_second': 0.919, 'total_flos': 3.726691109972214e+17, 'train_loss': 0.6294023275375367, 'epoch': 3.0})

## Save model

In [None]:
# Persist the model and the tokenizer side by side in a Google Drive folder.
save_directory = "/content/drive/MyDrive/dialect_model"

model.save_pretrained(save_directory=save_directory)
tokenizer.save_pretrained(save_directory=save_directory)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




('/content/drive/MyDrive/dialect_model/tokenizer_config.json',
 '/content/drive/MyDrive/dialect_model/special_tokens_map.json',
 '/content/drive/MyDrive/dialect_model/tokenizer.json')

In [None]:
# save locally, download from left-hand pannel
local_save_dir = "./dialect_model"
model.save_pretrained(local_save_dir)
tokenizer.save_pretrained(local_save_dir)

# Create a zip file
!zip -r dialect_model.zip dialect_model/

  adding: dialect_model/ (stored 0%)
  adding: dialect_model/tokenizer.json (deflated 85%)
  adding: dialect_model/adapter_config.json (deflated 56%)
  adding: dialect_model/README.md (deflated 66%)
  adding: dialect_model/adapter_model.safetensors (deflated 8%)
  adding: dialect_model/tokenizer_config.json (deflated 96%)
  adding: dialect_model/special_tokens_map.json (deflated 64%)


In [None]:
# How to reload the fine-tuned model after saving it
from peft import PeftModel, PeftConfig

peft_model_path = "/content/dialect_model"  # Either your Drive path or local path

# Load the tokenizer that was saved next to the adapter and make sure it can pad.
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model first
base_model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    quantization_config=bnb_config,
    num_labels=4,
    device_map="auto"
)
# The saved folder holds only the adapter and tokenizer files, not the base model config,
# so the padding id has to be set again here. Without it the classifier cannot tell
# padding from text and reads its logits from the last position of the input.
base_model.config.pad_token_id = tokenizer.pad_token_id

# Load the PEFT configuration and attach the trained adapter; switch to eval mode for inference.
config = PeftConfig.from_pretrained(peft_model_path)
model = PeftModel.from_pretrained(base_model, peft_model_path).eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluation

In [None]:
# Run the trainer's evaluation loop on the held-out test split and show the metrics dict.
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(results)


KeyboardInterrupt: 

### Prompting the model to test responsiveness


In [None]:
# Function to classify text
def classify_dialect(text):
    """Classify the dialect of a single text with the global ``model`` / ``tokenizer``.

    Args:
        text: the text to classify (one string).

    Returns:
        A tuple ``(predicted_label, probabilities)`` where ``predicted_label`` is one of
        'E', 'G', 'J', 'Y' and ``probabilities`` maps each of those labels to its
        probability formatted as a percentage string.
    """
    # Egypt, Gulf, Jordan, Yemen
    dialect_labels = {0: 'E', 1: 'G', 2: 'J', 3: 'Y'}

    # A single input needs no padding. Padding it to a fixed length would let the model
    # read its logits from a padding position whenever no pad_token_id is configured.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(model.device)

    # Inference only: eval mode (no dropout) and no gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Probabilities of the first (only) sequence, computed in float32.
    class_probs = logits.float().softmax(dim=-1)[0]
    predicted_class = int(class_probs.argmax())

    # Get probabilities for each class
    results = {dialect_labels[i]: f"{prob:.2%}" for i, prob in enumerate(class_probs.tolist())}

    return dialect_labels[predicted_class], results

# Example usage: classify one short text and print the per-dialect probabilities.
text = "ازيك"
predicted_dialect, probabilities = classify_dialect(text)

print(f"Predicted dialect: {predicted_dialect}")
print("\nProbabilities for each dialect:")
print("\n".join(f"{dialect}: {prob}" for dialect, prob in probabilities.items()))

Predicted dialect: J

Probabilities for each dialect:
E: 12.41%
G: 2.15%
J: 84.57%
Y: 0.88%
