

# Libraries and Setup



In [None]:
#%pip install -q -r /content/drive/MyDrive/Arabic-Dialect-Detector/requirements.txt

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict
from dotenv import load_dotenv
from google.colab import userdata
from huggingface_hub import login
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Load variables from a local .env file (if one exists) into the environment,
# then authenticate with the Hugging Face Hub using HUGGINGFACE_TOKEN.
load_dotenv()
hf_token = os.environ.get('HUGGINGFACE_TOKEN')  # None when the variable is not set
login(token=hf_token)

# Model Design

### Quantization configs

In [None]:
# 4-bit NF4 quantization settings (bitsandbytes), passed to `from_pretrained` below
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# LoRA adapter settings applied on top of the quantized model
peft_config = LoraConfig(
    # SEQ_CLS for dialect detection, SEQ_2_SEQ_LM for translation
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
)

### Model Setup

In [None]:
# define model
model_name = "meta-llama/Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base model in 4-bit (see bnb_config) with a classification head.
# num_labels=5: presumably one class per dialect in the dataset -- TODO confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    num_labels=5,
    device_map="auto",
)

model.resize_token_embeddings(len(tokenizer))

# Prepare the quantized model for training BEFORE attaching the LoRA adapters.
# Per the PEFT docs this freezes the base weights, upcasts the remaining
# half-precision parameters to fp32 and makes the inputs require grads, which
# gradient checkpointing (enabled later in this notebook) relies on.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

model = get_peft_model(model, peft_config)

model.print_trainable_parameters()

### Data

In [None]:
# load data
training_data_path = r"/content/drive/MyDrive/Arabic-Dialect-Detector/src/data/Arabic_dialect.csv"

# Read the CSV and rename the raw columns to the names used by the rest of the notebook
dataset = (
    load_dataset("csv", data_files=training_data_path)
    .rename_column("result", "label")
    .rename_column("t", "text")
)

# In-place: expose only "text" and "label", using the torch output format
dataset.set_format(type="torch", columns=["text", "label"])

In [None]:
# split data
# Metadata columns that are dropped from every split
columns_to_remove = ['Date', 'User', 'Tweet']

# 60% train / 40% held out, then the held-out part is halved into validation and test
train_test = dataset["train"].train_test_split(test_size=0.4, seed=42)
test_val = train_test["test"].train_test_split(test_size=0.5, seed=42)

# format data into DatasetDict
dataset = DatasetDict({
    split_name: split.remove_columns(columns_to_remove)
    for split_name, split in (
        ("train", train_test["train"]),
        ("validation", test_val["train"]),
        ("test", test_val["test"]),
    )
})
dataset

In [None]:
# Token counts of the training texts (useful for choosing a max sequence length)
lengths = [len(token_ids) for token_ids in map(tokenizer.encode, dataset['train']['text'])]

print(
    f"Average length: {np.mean(lengths):.2f}",
    f"95th percentile length: {np.percentile(lengths, 95):.2f}",
    f"Max length: {max(lengths)}",
    sep="\n",
)

In [None]:
# Set the pad token if it's missing: fall back to EOS only when the tokenizer
# does not define a padding token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Take the id from the tokenizer so model and tokenizer always agree on padding.
# (The config's eos_token_id is not guaranteed to be a single int, nor to match
# the tokenizer's pad token.)
model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = "right"


# Tokenization function
max_l = 128 #TODO fix based on stats
def preprocess_function(examples):
    """Tokenize `examples["text"]` and attach `examples["label"]` as "labels".

    Every sequence is truncated and padded to exactly `max_l` tokens. Returns
    the tokenizer output (plain Python lists, attention mask included) with an
    extra "labels" entry.
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_l,
        "return_tensors": None,
        "return_attention_mask": True,
    }
    encoded = tokenizer(examples["text"], **tokenizer_kwargs)
    encoded["labels"] = examples["label"]
    return encoded


# Tokenize every split in batches of 16. The original columns are dropped, so
# only the tokenizer outputs and "labels" remain afterwards.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=16,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
)

print(tokenized_datasets)
# tokenized_datasets['train'][0]

In [None]:
# The tokenization cell above already writes "labels" and drops the original
# columns (remove_columns=dataset["train"].column_names), so unconditionally
# reading "label" or removing "label"/"text" here would fail. This cell only
# normalizes whatever is still present, which makes it safe to run and re-run.
columns_to_remove = ['label', 'text']


def _finalize_split(split):
    """Return `split` with a "labels" column and without any of `columns_to_remove`."""
    if "labels" not in split.column_names:
        # Only derive "labels" from the raw "label" column when it is still missing
        split = split.map(lambda examples: {"labels": examples["label"]})
    leftover = [name for name in columns_to_remove if name in split.column_names]
    return split.remove_columns(leftover) if leftover else split


tokenized_datasets = DatasetDict({
    split_name: _finalize_split(tokenized_datasets[split_name])
    for split_name in ("train", "validation", "test")
})
tokenized_datasets

In [None]:
# Inspect the padding setup: pad token, the full special-token map, and the pad token id
print(
    f"Padding Token: {tokenizer.pad_token}",
    tokenizer.special_tokens_map,
    f"Padding Token ID: {tokenizer.pad_token_id}",
    sep="\n",
)

In [None]:
# Spot-check the first tokenized training example
first_train_example = tokenized_datasets["train"][0]
print(first_train_example)

# Train

### Params

In [None]:
# Batch collator: pads with `tokenizer`, rounding padded lengths up to a multiple of 8
data_collator = DataCollatorWithPadding(
    pad_to_multiple_of=8,
    tokenizer=tokenizer,
)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # schedule
    num_train_epochs=3,
    eval_strategy="epoch",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching: 16 per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    # memory savers
    fp16=True,
    gradient_checkpointing=True,
)

# Memory housekeeping before training
torch.cuda.empty_cache()
model.gradient_checkpointing_enable()
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is normally read when the CUDA allocator
# starts up. The model has presumably already been placed on the GPU by this
# point, so this assignment may have no effect here -- consider setting it in
# the first cell, before any CUDA work. TODO confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

### Train Model

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: the (predictions, label_ids) pair handed over by `Trainer`.
            Predictions are treated as per-class logits; if the model returns
            a tuple of outputs, its first element is used.

    Returns:
        dict with a single "accuracy" entry (fraction of correct predictions).
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predicted == labels))}


# Initialize the Trainer, adding the compute_metrics argument so that evaluation
# reports accuracy in addition to the loss
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Eval

In [None]:
# Evaluate on the test set: run the trained model on the held-out split and
# show the metrics dictionary returned by the Trainer
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(results)