In [None]:
!pip install transformers datasets accelerate evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
!pip install -q transformers datasets accelerate scikit-learn

## The Problem
- Fine-tune a model for Arabic end-of-utterance (EOU) detection
## Notebook Goal
- Get a dataset
- Make the dataset compatible with the problem
- Find and evaluate models for EOU
- Fine-tune the chosen model

### Dataset Preparation

In [None]:
import kagglehub
import pandas as pd
import os
from collections import Counter
import re
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import time
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

In [None]:
# Download latest version
path = kagglehub.dataset_download("nexdatafrank/arabic-real-world-speech-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'arabic-real-world-speech-dataset' dataset.
Path to dataset files: /kaggle/input/arabic-real-world-speech-dataset


In [None]:
for root, dirs, files in os.walk(path):
    for f in files:
        print(os.path.join(root, f))

/kaggle/input/arabic-real-world-speech-dataset/000098_2-0011.wav
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0010.wav
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0014.wav
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0011.txt
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0012.txt
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0013.wav
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0013.txt
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0012.wav
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0010.txt
/kaggle/input/arabic-real-world-speech-dataset/000098_2-0014.txt


In [None]:
# Collect the distinct (lower-cased) file extensions found anywhere under `path`.
extensions = {
    os.path.splitext(file_name)[1].lower()  # extension like .wav
    for _, _, file_names in os.walk(path)
    for file_name in file_names
}

print("File types found:", extensions)

File types found: {'.wav', '.txt'}


In [None]:
from collections import Counter

# Count how many files of each (lower-cased) extension exist under `path`.
counter = Counter(
    os.path.splitext(file_name)[1].lower()
    for _, _, file_names in os.walk(path)
    for file_name in file_names
)

print(counter)

Counter({'.wav': 5, '.txt': 5})


In [None]:
for root, dirs, files in os.walk(path):
    for f in files:
        if f.endswith(".txt"):
            txt_file = os.path.join(root, f)
            print("\n=== File:", txt_file, "===\n")
            with open(txt_file, "r", encoding="utf-8") as file:
                print(file.read())


=== File: /kaggle/input/arabic-real-world-speech-dataset/000098_2-0011.txt ===

يعني شفتوه انتم وسمعتوه، المذيعة تقول بما في ذلك النص القرآني رمي للأسلام، راحت تقول له طبعا،

=== File: /kaggle/input/arabic-real-world-speech-dataset/000098_2-0012.txt ===

طبعا، احكي شلون تغى تجدد الخطاب الديني؟ وش، تبغى تجدد الخطاب الديني، تبي تغير نصوص القرآن،

=== File: /kaggle/input/arabic-real-world-speech-dataset/000098_2-0013.txt ===

اكتشفت شئ يعني والعياذ بالله ما كان يعرفه النبي عليه الصلاة والسلام،

=== File: /kaggle/input/arabic-real-world-speech-dataset/000098_2-0010.txt ===

اليهودية، والمسحية، والهندوسية اللي ما عمرنا شفنا حد منهم ينتقدها او يتكلم عنها اصلا، الرمي كله للأسلام شفتوا الدليل،

=== File: /kaggle/input/arabic-real-world-speech-dataset/000098_2-0014.txt ===

ما ابغى اقول اكثر من كذا صراحة ما اقدر حتى أقولها كا كمثال وإلا كنكتة،


- Not enough data: the Kaggle dataset contains only 5 short transcripts
- After further research, the conversational datasets found were not in Saudi dialects, e.g. the [dialogue-arabic-dialects dataset](https://github.com/tareknaous/dialogue-arabic-dialects/blob/main/dataset.csv)

- Next approach: generate a synthetic dataset with ChatGPT using the prompt below
  - "i want 500, acrioss Najdi / Hijazi / Qasimmi and different region in saud, without timestamps, with automatic EOU labels, in csv format"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dataset_path = '/content/drive/MyDrive/Colab Notebooks/HamsAI/saudi_eou_dataset.csv'

In [None]:
saudi_eou_dataset = pd.read_csv(dataset_path)
saudi_eou_dataset.head()

Unnamed: 0,dialog_id,turn,dialect,speaker,text,eou
0,1,1,Najdi,A,ايه سمعت الخبر قبل شوي,1
1,1,2,Najdi,B,والله زين كذا,1
2,1,3,Najdi,A,اممم لحظة شوي…,0
3,1,4,Najdi,B,ترى الموضوع اللي قلت لك عنه…,0
4,1,5,Najdi,A,والله زين كذا,1


In [None]:
saudi_eou_dataset.shape

(5490, 6)

In [None]:
saudi_eou_dataset.tail()

Unnamed: 0,dialog_id,turn,dialect,speaker,text,eou
5485,500,9,Najdi,A,ايه تمام، فهمت عليك,1
5486,500,10,Najdi,B,كان ودي اسألك عن شيء…,0
5487,500,11,Najdi,A,ايه سمعت الخبر قبل شوي,1
5488,500,12,Najdi,B,ترى الموضوع اللي قلت لك عنه…,0
5489,500,13,Najdi,A,طيب خلاص نتفق بكرة,1


In [None]:
saudi_eou_dataset["dialect"].unique()

array(['Najdi', 'Hijazi', 'Qassimi'], dtype=object)

In [None]:
sample_id = saudi_eou_dataset["dialog_id"].sample(1).iloc[0]
sample_dialog = saudi_eou_dataset[saudi_eou_dataset["dialog_id"] == sample_id]

print("Dialog ID:", sample_id)
sample_dialog

Dialog ID: 297


Unnamed: 0,dialog_id,turn,dialect,speaker,text,eou
3248,297,1,Najdi,A,ايه سمعت الخبر قبل شوي,1
3249,297,2,Najdi,B,طيب خلاص نتفق بكرة,1
3250,297,3,Najdi,A,والله زين كذا,1
3251,297,4,Najdi,B,طيب خلاص نتفق بكرة,1
3252,297,5,Najdi,A,اممم لحظة شوي…,0
3253,297,6,Najdi,B,ايه تمام، فهمت عليك,1
3254,297,7,Najdi,A,والله زين كذا,1
3255,297,8,Najdi,B,طيب خلاص نتفق بكرة,1
3256,297,9,Najdi,A,طيب خلاص نتفق بكرة,1
3257,297,10,Najdi,B,طيب خلاص نتفق بكرة,1


- some issues found
  - the generated text seems to be repeated
  - there are extra unneeded columns

In [None]:
repetition_stats = (
    saudi_eou_dataset.groupby(["dialog_id", "text"])
    .size()
    .reset_index(name="count")
)

# Keep only texts that appear more than once in same dialog
repeated = repetition_stats[repetition_stats["count"] > 1]

repeated

Unnamed: 0,dialog_id,text,count
6,1,والله زين كذا,2
8,2,خلاص تمام كذا,2
9,2,كنت حابة أقولك…,2
10,2,كنت دحين بأشرح لك…,2
11,2,مافي مشكلة ابداً,2
...,...,...,...
3207,499,مافي مشكلة ابداً,2
3208,500,ايه تمام، فهمت عليك,2
3209,500,ايه سمعت الخبر قبل شوي,4
3212,500,تمام اجل نمشي على الخطة,2


In [None]:
repetition_stats["count"].max()

7

- regenerate again with ChatGPT

In [None]:
dataset_path = '/content/drive/MyDrive/Colab Notebooks/HamsAI/saudi_eou_dataset_v2.csv'

In [None]:
saudi_eou_dataset = pd.read_csv(dataset_path)
saudi_eou_dataset.head()

Unnamed: 0,dialog_id,turn,dialect,speaker,text,eou
0,1,1,Haili,A,تمام خلاص ارتبها,1
1,1,2,Haili,B,دقيقة بخبرك…,0
2,1,3,Haili,A,اذكر اني قلت لك…,0
3,1,4,Haili,B,مهوب كذا بس…,0
4,1,5,Haili,A,ايه طيب تمام,1


In [None]:
saudi_eou_dataset["dialect"].unique()

array(['Haili', 'Hijazi', 'Asiri', 'Jizani', 'Najdi', 'Qassimi'],
      dtype=object)

In [None]:
sample_id = saudi_eou_dataset["dialog_id"].sample(1).iloc[0]
sample_dialog = saudi_eou_dataset[saudi_eou_dataset["dialog_id"] == sample_id]

print("Dialog ID:", sample_id)
sample_dialog

Dialog ID: 349


Unnamed: 0,dialog_id,turn,dialect,speaker,text,eou
4494,349,1,Qassimi,A,ايه زين كذا,1
4495,349,2,Qassimi,B,كنت بأقول سالفة…,0
4496,349,3,Qassimi,A,واضح يابو فلان,1
4497,349,4,Qassimi,B,سمعتك واضح,1
4498,349,5,Qassimi,A,تمام كلمتك وصلت,1
4499,349,6,Qassimi,B,خلاص اتفقنا,1
4500,349,7,Qassimi,A,تمام يالله نتوكل,1
4501,349,8,Qassimi,B,الله يحييك,1
4502,349,9,Qassimi,A,تمام نرتبها بكرة,1
4503,349,10,Qassimi,B,ابغاك تسمعني زين…,0


In [None]:
repetition_stats = (
    saudi_eou_dataset.groupby(["dialog_id", "text"])
    .size()
    .reset_index(name="count")
)

# Keep only texts that appear more than once in the same dialog
repeated = repetition_stats[repetition_stats["count"] > 1]

# Sort by repetition count (descending)
repeated = repeated.sort_values(by="count", ascending=False)

repeated

Unnamed: 0,dialog_id,text,count
3886,358,يب تمام,5
4654,431,طيب خلاص نتفق بكرة,4
4342,402,خلاص تمام كذا,4
4309,399,ايه تمام، فهمت عليك,4
4328,401,ايوه فهمتك دحين,4
...,...,...,...
5323,492,تمام بس نتأكد اول,2
5322,492,ايوه مضبوط الكلام,2
5318,491,والله رأيك سليم,2
5312,491,سمعت السالفة,2


In [None]:
# Count how often each distinct text occurs across the whole dataset
# (grouping by "text" only, not by dialog).
repetition_stats = (
    saudi_eou_dataset.groupby(["text"])
    .size()
    .reset_index(name="count")
)

# Keep only texts that appear more than once anywhere in the dataset
repeated = repetition_stats[repetition_stats["count"] > 1]

# Sort by repetition count (descending)
repeated = repeated.sort_values(by="count", ascending=False)

repeated

Unnamed: 0,text,count
11,ايه طيب تمام,129
49,عندي خبر لك,125
35,خل نتفاهم عليها بكرة,123
73,يب تمام,123
12,ايه منيب ناسي,123
...,...,...
56,كنت بفكر في شيء…,46
20,تذكرت شيء دحين…,45
0,ابشرح لك…,43
22,ترى الموضوع ما خلص…,43


- Repetition within a single dialog is lower now, but the same texts are still repeated many times across the dataset
- Generate another version with the prompt below
  - "make another verstion of the data set, remove the dialog column
now it should be only text, dialect and eou, with no repetino across the text column
500 rows total"

In [None]:
dataset_path = '/content/drive/MyDrive/Colab Notebooks/HamsAI/saudi_eou_dataset_flat.csv'

In [None]:
saudi_eou_dataset = pd.read_csv(dataset_path)
saudi_eou_dataset.head()

Unnamed: 0,text,dialect,eou
0,واضح من كلامك,Asiri,1
1,تمام نتفق,Jizani,1
2,ايوه نشوف وش يصير,Jizani,1
3,يب شرحتها مضبوط,Najdi,1
4,تمام يالله نتوكل,Qassimi,1


In [None]:
# Count how often each distinct text occurs in the flat dataset
# (grouping by "text" only).
repetition_stats = (
    saudi_eou_dataset.groupby(["text"])
    .size()
    .reset_index(name="count")
)

# Keep only texts that appear more than once in the dataset
repeated = repetition_stats[repetition_stats["count"] > 1]

# Sort by repetition count (descending)
repeated = repeated.sort_values(by="count", ascending=False)

repeated

Unnamed: 0,text,count


- no repeated text

In [None]:
eou_stats = (
    saudi_eou_dataset.groupby(["eou"])
    .size()
    .reset_index(name="count")
)

eou_stats

Unnamed: 0,eou,count
0,0,128
1,1,372


- About 26% of the rows (128 of 500) are labelled unfinished (`eou = 0`). A 20–30% share of unfinished utterances seems realistic for spoken dialog data, because people generally speak in complete thoughts

In [None]:
saudi_eou_dataset['dialect'].unique()

array(['Asiri', 'Jizani', 'Najdi', 'Qassimi', 'Haili', 'Hijazi'],
      dtype=object)

In [None]:
saudi_eou_dataset.shape

(500, 3)

In [None]:
# Reassign instead of mutating in place, and tolerate an already-missing
# column, so that re-running this cell does not raise a KeyError.
saudi_eou_dataset = saudi_eou_dataset.drop(columns="dialect", errors="ignore")
saudi_eou_dataset.shape

(500, 2)

In [None]:
def normalize_text(text):
    """Strip diacritics and punctuation from ``text`` and collapse whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The text without Arabic diacritics (U+064B-U+0652), without Latin or
        Arabic punctuation, with every whitespace run replaced by a single
        space and with leading/trailing whitespace removed.
    """
    # Remove arabic diacritics
    text = re.sub(r"[\u064B-\u0652]", "", text)

    # Remove punctuation: . , ! ? " ' : and the ellipsis (U+2026), plus the
    # Arabic comma (U+060C), semicolon (U+061B) and question mark (U+061F).
    # The Arabic comma and question mark occur in the data and were previously
    # left in place.
    text = re.sub(r"[.,!?\"':\u2026\u060C\u061B\u061F]", "", text)

    # Replace multiple spaces with one
    text = re.sub(r"\s+", " ", text)

    return text.strip()

In [None]:
saudi_eou_dataset["text"] = saudi_eou_dataset["text"].apply(normalize_text)

In [None]:
saudi_eou_dataset = saudi_eou_dataset.rename(columns={"eou": "labels"})

In [None]:
saudi_eou_dataset.head()

Unnamed: 0,text,labels
0,واضح من كلامك,1
1,تمام نتفق,1
2,ايوه نشوف وش يصير,1
3,يب شرحتها مضبوط,1
4,تمام يالله نتوكل,1


### Model Selection

- To select a model, the criteria below should be taken into consideration
  - lightweight, so it can perform fast (in real time) and be cost-efficient
  - good accuracy across different Saudi dialects

In [None]:
models_to_test = {
    "xlm-roberta-base": "xlm-roberta-base",
    "arabert": "aubmindlab/bert-base-arabertv2",
    "arabicbert": "asafaya/bert-base-arabic"
}

In [None]:
dataset = Dataset.from_pandas(saudi_eou_dataset)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
dataset["train"].to_pandas()["labels"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
labels,Unnamed: 1_level_1
1,0.7425
0,0.2575


In [None]:
dataset["test"].to_pandas()["labels"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
labels,Unnamed: 1_level_1
1,0.75
0,0.25


In [None]:
def get_tokenizer(model_name):
    """Return the pretrained tokenizer that ``AutoTokenizer`` loads for ``model_name``."""
    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pretrained_tokenizer

# def tokenize(batch, tokenizer):
#     return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=64)

def tokenize(batch, tokenizer):
    """Tokenize ``batch["text"]`` and carry the labels over to the result.

    Args:
        batch: Mapping with a ``"text"`` entry (passed to the tokenizer) and a
            ``"labels"`` entry (copied unchanged into the result).
        tokenizer: Callable invoked with the texts plus truncation and
            ``max_length`` padding options fixed at 64 tokens.

    Returns:
        Whatever the tokenizer returned, with ``"labels"`` set on it.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 64,
    }
    encoded = tokenizer(batch["text"], **encode_options)
    encoded["labels"] = batch["labels"]
    return encoded

In [None]:
# import numpy as np
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 (positive class = 1) from model outputs.

    Args:
        eval_pred: Either a ``(predictions, label_ids)`` tuple or an object
            exposing ``predictions`` and ``label_ids`` attributes. When
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        dict with float entries ``"accuracy"`` and ``"f1"``. Precision, recall
        and F1 fall back to 0.0 whenever their denominator is zero.
    """
    def _safe_ratio(numerator, denominator):
        # Guard against division by zero (e.g. no positive predictions at all).
        return numerator / denominator if denominator > 0 else 0.0

    # Accept both the plain tuple and the EvalPrediction-style object.
    if isinstance(eval_pred, tuple):
        raw_scores, gold = eval_pred
    else:
        raw_scores = eval_pred.predictions
        gold = eval_pred.label_ids

    # Some models return (logits, ...); only the first entry is scored.
    if isinstance(raw_scores, tuple):
        raw_scores = raw_scores[0]

    # Predicted class per example, flattened to 1D alongside the labels.
    predicted = np.asarray(np.argmax(raw_scores, axis=-1)).ravel()
    gold = np.asarray(gold).ravel()

    accuracy = np.mean(predicted == gold)

    predicted_positive = predicted == 1
    actual_positive = gold == 1
    true_pos = np.sum(predicted_positive & actual_positive)
    false_pos = np.sum(predicted_positive & (gold == 0))
    false_neg = np.sum((predicted == 0) & actual_positive)

    precision = _safe_ratio(true_pos, true_pos + false_pos)
    recall = _safe_ratio(true_pos, true_pos + false_neg)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    return {
        "accuracy": float(accuracy),
        "f1": float(f1),
    }


def evaluate_model(model_name, dataset):
    """Fine-tune a checkpoint for one epoch and report accuracy, F1 and latency.

    Args:
        model_name: Model identifier handed to ``get_tokenizer`` and to
            ``AutoModelForSequenceClassification.from_pretrained``.
        dataset: Object whose ``map`` result exposes ``"train"`` and ``"test"``
            splits with ``text`` and ``labels`` columns (presumably the
            train/test split built earlier in the notebook).

    Returns:
        dict with keys ``model``, ``accuracy`` and ``f1`` (taken from
        ``trainer.evaluate()``) and ``latency_ms`` (mean wall-clock time of a
        single-example forward pass over 100 timed runs).
    """
    print(f"\n🔍 Evaluating: {model_name}")

    tokenizer = get_tokenizer(model_name)
    tokenized = dataset.map(lambda x: tokenize(x, tokenizer), batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"results-{model_name}",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        eval_strategy="epoch",
        logging_steps=10,
        save_strategy="no",
        load_best_model_at_end=False,
        # Disable experiment trackers so the run never stops at an
        # interactive login prompt (e.g. Weights & Biases).
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        compute_metrics=compute_metrics,
    )

    # Fine-tune
    trainer.train()

    # Compute metrics
    metrics = trainer.evaluate()
    print(f"Metrics: {metrics}")

    # --- Inference latency measurement ---
    sample = tokenized["test"][0]
    # The Trainer may have moved the model to an accelerator; the inputs must
    # live on the same device or the forward pass fails.
    device = next(model.parameters()).device
    # Build a batch of size 1
    inputs = {
        "input_ids": torch.tensor(sample["input_ids"]).unsqueeze(0).to(device),
        "attention_mask": torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device),
    }

    def _wait_for_device():
        # CUDA kernels run asynchronously; wait for them so the timer covers
        # the actual computation.
        if device.type == "cuda":
            torch.cuda.synchronize()

    n_warmup = 5
    n_runs = 100

    model.eval()
    with torch.no_grad():
        # Warm-up passes keep one-off initialisation cost out of the timing.
        for _ in range(n_warmup):
            model(**inputs)
        _wait_for_device()

        start = time.perf_counter()
        for _ in range(n_runs):
            model(**inputs)
        _wait_for_device()
        end = time.perf_counter()

    latency = (end - start) / n_runs * 1000  # ms
    print(f"Latency per inference ≈ {latency:.2f} ms")

    return {
        "model": model_name,
        "accuracy": metrics["eval_accuracy"],
        "f1": metrics["eval_f1"],
        "latency_ms": latency,
    }

In [None]:
results = []

for name, model_id in models_to_test.items():
    results.append(evaluate_model(model_id, dataset))


🔍 Evaluating: xlm-roberta-base


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5969,0.561109,0.75,0.857143


Metrics: {'eval_loss': 0.5611085295677185, 'eval_accuracy': 0.75, 'eval_f1': 0.8571428571428571, 'eval_runtime': 18.5554, 'eval_samples_per_second': 5.389, 'eval_steps_per_second': 0.377, 'epoch': 1.0}
Latency per inference ≈ 269.63 ms

🔍 Evaluating: aubmindlab/bert-base-arabertv2


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4441,0.363121,0.84,0.903614


Metrics: {'eval_loss': 0.3631209433078766, 'eval_accuracy': 0.84, 'eval_f1': 0.9036144578313253, 'eval_runtime': 18.649, 'eval_samples_per_second': 5.362, 'eval_steps_per_second': 0.375, 'epoch': 1.0}
Latency per inference ≈ 272.43 ms

🔍 Evaluating: asafaya/bert-base-arabic


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4667,0.419137,0.75,0.857143


Metrics: {'eval_loss': 0.4191370904445648, 'eval_accuracy': 0.75, 'eval_f1': 0.8571428571428571, 'eval_runtime': 20.0654, 'eval_samples_per_second': 4.984, 'eval_steps_per_second': 0.349, 'epoch': 1.0}
Latency per inference ≈ 279.32 ms


In [None]:
pd.DataFrame(results)

Unnamed: 0,model,accuracy,f1,latency_ms
0,xlm-roberta-base,0.75,0.857143,269.630718
1,aubmindlab/bert-base-arabertv2,0.84,0.903614,272.425878
2,asafaya/bert-base-arabic,0.75,0.857143,279.323332


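- `aubmindlab/bert-base-arabertv2` is the best of the three because
  - it has the highest accuracy (84%)
  - its F1 is also the highest at 0.90 (which is especially important for imbalanced datasets)
- Note: the other two models score exactly 0.75 accuracy and 0.857 F1, which is what a classifier that always predicts `1` would get on this test split (75% of its labels are `1`), so after one epoch they may simply be predicting the majority class
- Although its latency is not ideal, I intend to try the solutions below
  - Convert to ONNX
  - Apply int8 quantization
  - Reduce max_length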

### Model Fine-Tuning and Evaluation

- Even though `aubmindlab/bert-base-arabertv2` is not ideal (its latency is high), it is good enough given the limited time

In [None]:
train_df, test_df = train_test_split(
    saudi_eou_dataset,
    test_size=0.2,
    random_state=42,
    stratify=saudi_eou_dataset["labels"]
)

print("Train label counts:\n", train_df["labels"].value_counts())
print("\nTest label counts:\n", test_df["labels"].value_counts())

Train label counts:
 labels
1    298
0    102
Name: count, dtype: int64

Test label counts:
 labels
1    74
0    26
Name: count, dtype: int64


In [None]:
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset  = Dataset.from_pandas(test_df.reset_index(drop=True))

dataset = {"train": train_dataset, "test": test_dataset}

In [None]:
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def tokenize(batch, tokenizer=None):
    """Tokenize ``batch["text"]`` and carry the labels over to the result.

    This definition replaces the earlier two-argument ``tokenize``. The
    optional ``tokenizer`` parameter keeps that call form
    (``tokenize(batch, tokenizer)``) working, so the model-comparison cell can
    still be re-run after this cell has been executed.

    Args:
        batch: Mapping with a ``"text"`` entry (passed to the tokenizer) and a
            ``"labels"`` entry (copied unchanged into the result).
        tokenizer: Tokenizer to use. When omitted, the notebook-level
            ``tokenizer`` variable is used.

    Returns:
        The tokenizer output (truncated and padded to 64 tokens) with
        ``"labels"`` set on it.
    """
    if tokenizer is None:
        # The parameter shadows the notebook-level name, so look the global
        # up explicitly at call time.
        tokenizer = globals()["tokenizer"]

    tokens = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    tokens["labels"] = batch["labels"]
    return tokens

In [None]:
tokenized_train = dataset["train"].map(tokenize, batched=True)
tokenized_test  = dataset["test"].map(tokenize, batched=True)

tokenized_train = tokenized_train.remove_columns(
    [c for c in tokenized_train.column_names if c not in ["input_ids", "attention_mask", "labels"]]
)
tokenized_test = tokenized_test.remove_columns(
    [c for c in tokenized_test.column_names if c not in ["input_ids", "attention_mask", "labels"]]
)

tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Score predictions with accuracy, precision, recall and F1.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``. When the
            logits are a tuple, only the first element is used.

    Returns:
        dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``;
        the last three are computed for the positive class (label 1).
    """
    scores, gold = eval_pred
    scores = scores[0] if isinstance(scores, tuple) else scores

    # Predicted class = index of the largest logit.
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(gold, predicted)
    # The fourth returned value (support) is not needed here.
    prf = precision_recall_fscore_support(gold, predicted, average="binary", pos_label=1)

    return {
        "accuracy": accuracy,
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

In [None]:
# Hyper-parameters for the final fine-tuning run.
batch_size = 16
num_epochs = 3

training_args = TrainingArguments(
    output_dir="eou-arabertv2",
    eval_strategy="epoch",  # evaluate on the held-out split after every epoch
    save_strategy="no",  # no intermediate checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=False,
    # Disable experiment trackers: otherwise trainer.train() can stop at an
    # interactive Weights & Biases login prompt, which blocks "Run All".
    report_to="none",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mafnan-aldajani01[0m ([33mafnan-aldajani01-king-saud-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.343674,0.85,0.831461,1.0,0.907975
2,0.411300,0.148272,0.98,0.973684,1.0,0.986667
3,0.411300,0.136893,0.97,0.961039,1.0,0.980132




TrainOutput(global_step=75, training_loss=0.32096025784810384, metrics={'train_runtime': 935.888, 'train_samples_per_second': 1.282, 'train_steps_per_second': 0.08, 'total_flos': 39466658304000.0, 'train_loss': 0.32096025784810384, 'epoch': 3.0})

In [None]:
results = trainer.evaluate()
results



{'eval_loss': 0.1368926465511322,
 'eval_accuracy': 0.97,
 'eval_precision': 0.961038961038961,
 'eval_recall': 1.0,
 'eval_f1': 0.9801324503311258,
 'eval_runtime': 19.182,
 'eval_samples_per_second': 5.213,
 'eval_steps_per_second': 0.365,
 'epoch': 3.0}

In [None]:
predictions = trainer.predict(tokenized_test)
logits = predictions.predictions
labels = predictions.label_ids

y_pred = np.argmax(logits, axis=-1)
y_true = labels

cm = confusion_matrix(y_true, y_pred, labels=[0,1])
cm



array([[23,  3],
       [ 0, 74]])

From the training results we can see:
- validation loss steadily decreased
- accuracy increased and reached 97% at the final epoch (after peaking at 98% in epoch 2), which is good
- recall is 1, which is what we need: all positives are found (we want the model to correctly predict end of utterance)
- precision is good too (0.96); the confusion matrix shows that the only errors are 3 of the 26 unfinished utterances predicted as finished
- lastly, F1 shows an excellent balance between precision and recall at 0.98

###  Model Deployment

In [None]:
trainer.save_model("arabic-eou-bertv2")
tokenizer.save_pretrained("arabic-eou-bertv2")

('arabic-eou-bertv2/tokenizer_config.json',
 'arabic-eou-bertv2/special_tokens_map.json',
 'arabic-eou-bertv2/vocab.txt',
 'arabic-eou-bertv2/added_tokens.json',
 'arabic-eou-bertv2/tokenizer.json')

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi, create_repo

# exist_ok=True makes this cell safe to re-run once the repository exists;
# without it a second run fails because the repo is already there.
create_repo("arabic-eou-bertv2", repo_type="model", exist_ok=True)

RepoUrl('https://huggingface.co/AfnanSD/arabic-eou-bertv2', endpoint='https://huggingface.co', repo_type='model', repo_id='AfnanSD/arabic-eou-bertv2')

In [None]:
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="arabic-eou-bertv2",
    repo_id="AfnanSD/arabic-eou-bertv2",
    repo_type="model",
    create_pr= False
)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...-bertv2/model.safetensors:   0%|          |  555kB /  541MB            

  ...-bertv2/training_args.bin:  89%|########9 | 5.20kB / 5.84kB            

CommitInfo(commit_url='https://huggingface.co/AfnanSD/arabic-eou-bertv2/commit/1e4b33e4ef7b97b12978b435c27bb38b2b2378b4', commit_message='Upload folder using huggingface_hub', commit_description='', oid='1e4b33e4ef7b97b12978b435c27bb38b2b2378b4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AfnanSD/arabic-eou-bertv2', endpoint='https://huggingface.co', repo_type='model', repo_id='AfnanSD/arabic-eou-bertv2'), pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Smoke test: load the published model back from the Hub and score one utterance.
model = AutoModelForSequenceClassification.from_pretrained("AfnanSD/arabic-eou-bertv2")
tokenizer = AutoTokenizer.from_pretrained("AfnanSD/arabic-eou-bertv2")

# The training text was passed through normalize_text (punctuation removed),
# so the same preprocessing is applied here; otherwise the trailing "..."
# is input the model never saw during fine-tuning.
text = normalize_text("طيب لحظة شوي ...")
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)

# Inference only: no gradient tracking needed.
with torch.no_grad():
    logits = model(**inputs).logits

# Probability of class 1 (end of utterance).
prob = torch.softmax(logits, dim=-1)[0, 1]
print(prob)

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tensor(0.9348, grad_fn=<SelectBackward0>)
