

# Libraries and Setup



In [None]:
# !pip install -q -r /content/drive/MyDrive/Arabic-Dialect-Detector/requirements.txt

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict
from dotenv import load_dotenv
from google.colab import userdata
from huggingface_hub import login
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorWithPadding

In [None]:
# Local run: load variables from a .env file, then authenticate with the
# Hugging Face Hub using the HUGGINGFACE_TOKEN environment variable.
load_dotenv()
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
login(token=hf_token)

In [None]:
# # colab
# hf_token = userdata.get('HF_TOKEN')
# login(token=hf_token)

# Model Design

### Quantization configs

In [None]:
# bitsandbytes settings: load the base weights as 4-bit NF4, compute in
# float16, and leave double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# LoRA adapter settings applied on top of the quantized model.
# task_type: SEQ_CLS for dialect detection, SEQ_2_SEQ_LM for translation.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
)

### Model Setup

In [None]:
# Base checkpoint to fine-tune.
model_name = "meta-llama/Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the backbone with the 4-bit quantization config and a 4-way
# sequence-classification head; device_map="auto" lets the library place it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    num_labels=4,
    device_map="auto",
)

# Keep the embedding matrix the same size as the tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Quantized models must be prepared for training *before* the LoRA adapters are
# attached. Per the PEFT docs this freezes the base weights, upcasts the
# remaining half-precision parameters to fp32 and makes the embedding outputs
# require grad, so gradient checkpointing still propagates gradients to the
# adapters.
model = prepare_model_for_kbit_training(model)

# Wrap the prepared model with the LoRA adapters and report how much is trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# model.config

# Data

In [None]:
# Read the CSV and rename the raw columns to the names used downstream:
# "result" -> "label" and "t" -> "text".
training_data_path = r"/content/drive/MyDrive/Arabic-Dialect-Detector/src/data/Arabic_dialect.csv"
dataset = (
    load_dataset("csv", data_files=training_data_path)
    .rename_column("result", "label")
    .rename_column("t", "text")
)

# Expose only the text/label columns, formatted for torch.
dataset.set_format(type="torch", columns=["text", "label"])

In [None]:
# Hold out 40% of the rows, then halve that hold-out into validation and test
# (60/20/20 overall). Both splits use a fixed seed.
train_test = dataset["train"].train_test_split(test_size=0.4, seed=42)
test_val = train_test["test"].train_test_split(test_size=0.5, seed=42)

columns_to_remove = ['Date', 'User', 'Tweet']

# Assemble the three splits into a DatasetDict, dropping the Date/User/Tweet columns.
splits = {
    "train": train_test["train"],
    "validation": test_val["train"],
    "test": test_val["test"],
}
dataset = DatasetDict(
    {name: split.remove_columns(columns_to_remove) for name, split in splits.items()}
)
dataset

In [None]:
# Set the pad token only if the tokenizer does not already define one, reusing
# EOS as the padding symbol in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Take the id from the tokenizer so the model and tokenizer always agree on
# which token is padding (the classification head relies on it to locate the
# last real token of each sequence).
model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = "right"

# Create label mapping. `Dataset.unique` returns plain Python values without
# materialising the whole (torch-formatted) column; sorting keeps the ids stable.
unique_labels = set(dataset["train"].unique("label"))
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id2label = {idx: label for label, idx in label2id.items()}

# Fail early if the data does not match the size of the classification head;
# otherwise the mismatch only surfaces later as an opaque error during training.
if len(label2id) != model.config.num_labels:
    raise ValueError(
        f"Found {len(label2id)} distinct labels in the training split, "
        f"but the model was created with num_labels={model.config.num_labels}."
    )

# Update model config with label mapping
model.config.label2id = label2id
model.config.id2label = id2label

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of rows and attach integer class ids.

    Args:
        examples: A batch passed by ``dataset.map(..., batched=True)``; its
            "text" and "label" columns are read.

    Returns:
        The tokenizer output for the batch (truncated to 128 tokens, unpadded)
        with an added "labels" list holding ``label2id`` ids.

    Raises:
        KeyError: If a label is not present in ``label2id``.
    """
    tokenized = tokenizer(
        examples["text"],
        # No padding here: the DataCollatorWithPadding used by the Trainer pads
        # each batch to its own longest sequence, which is much cheaper than
        # padding every row to 128 tokens up front.
        padding=False,
        truncation=True,
        max_length=128,
        return_tensors=None
    )

    # The dataset is torch-formatted, so numeric labels may arrive as 0-d
    # tensors; unwrap them to plain Python values before the dict lookup.
    tokenized["labels"] = [
        label2id[label.item() if hasattr(label, "item") else label]
        for label in examples["label"]
    ]
    return tokenized

# Apply the tokenization to every split, keeping only the tokenizer outputs and
# "labels" (all original columns are dropped).
tokenized_datasets = dataset.map(preprocess_function,
                                 batched=True,
                                 load_from_cache_file=False,
                                 remove_columns=dataset["train"].column_names)
tokenized_datasets

In [None]:
# Sanity check: show the padding token, the special-token map and the padding id,
# then display the first tokenized training example.
print(
    f"Padding Token: {tokenizer.pad_token}",
    tokenizer.special_tokens_map,
    f"Padding Token ID: {tokenizer.pad_token_id}",
    sep="\n",
)

tokenized_datasets['train'][0]

# Train

In [None]:
# Collator that pads each batch with the tokenizer's pad token, rounds the
# padded length up to a multiple of 8, and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer,
    padding=True,
    max_length=128,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

In [None]:
# Define training arguments: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    # Effective train batch size per device = 8 * 2.
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    # Non-reentrant checkpointing still propagates gradients to the LoRA
    # adapters when the checkpointed inputs come from frozen (quantized) layers;
    # the default re-entrant mode can silently leave them without gradients.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    remove_unused_columns=True,
    load_best_model_at_end=True
)

# Memory-saving measures: release cached CUDA blocks and trade compute for
# memory via gradient checkpointing (same mode as configured for the Trainer).
torch.cuda.empty_cache()
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
# NOTE(review): PyTorch reads PYTORCH_CUDA_ALLOC_CONF when the CUDA allocator is
# first initialised. The model is already loaded at this point, so this
# assignment likely has no effect on the current process -- consider setting it
# in the first cell, before anything touches the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from Trainer evaluation output.

    Args:
        eval_pred: The ``(predictions, label_ids)`` pair the Trainer passes in.
            ``predictions`` holds the class logits (or a tuple whose first
            element does).

    Returns:
        Dict with ``"accuracy"`` and ``"macro_f1"`` as Python floats.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float((predictions == labels).mean())

    # Macro F1: unweighted mean of per-class F1 over every class that appears
    # in either the labels or the predictions.
    f1_scores = []
    for class_id in np.union1d(labels, predictions):
        true_pos = np.sum((predictions == class_id) & (labels == class_id))
        false_pos = np.sum((predictions == class_id) & (labels != class_id))
        false_neg = np.sum((predictions != class_id) & (labels == class_id))
        denominator = 2 * true_pos + false_pos + false_neg
        f1_scores.append(2 * true_pos / denominator if denominator else 0.0)

    return {"accuracy": accuracy, "macro_f1": float(np.mean(f1_scores))}


# Initialize the Trainer, adding the compute_metrics argument so evaluation
# reports classification metrics in addition to the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
# Fine-tune the model
trainer.train()

# Eval

In [None]:
# Score the trained model on the held-out test split and print the metrics dict.
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(results)
