In [None]:
!pip install transformers[torch] datasets arabic-reshaper python-bidi pandas scikit-learn



In [None]:
import pandas as pd
from datasets import Dataset

# 1. Install openpyxl to read Excel files
!pip install openpyxl

# 2. Load the Excel files
train_df = pd.read_excel('Train.xlsx')
test_df = pd.read_excel('Test.xlsx')

# 3. Rename columns to match what the model expects
# We map 'q_body' to 'question' and 'category' to 'specialty'
train_df = train_df.rename(columns={'q_body': 'question', 'category': 'specialty'})
test_df = test_df.rename(columns={'q_body': 'question', 'category': 'specialty'})

# 4. Convert text labels into numerical IDs
train_df['label'] = train_df['specialty'].astype('category').cat.codes
test_df['label'] = test_df['specialty'].astype('category').cat.codes

# 5. Save the mapping for later use
specialties = train_df['specialty'].astype('category').cat.categories.tolist()
id2label = {i: label for i, label in enumerate(specialties)}
label2id = {label: i for i, label in enumerate(specialties)}

print(f"Data Loaded Successfully!")
print(f"Found {len(specialties)} specialties: {specialties}")

Data Loaded Successfully!
Found 20 specialties: ['الامراض الجلدية', 'الامراض الجنسية', 'الاورام الخبيثة والحميدة', 'الطب العام', 'امراض الاطفال', 'امراض الجهاز التنفسي', 'امراض الجهاز الهضمي', 'امراض الدم', 'امراض العضلات والعظام و المفاصل', 'امراض العيون', 'امراض الغدد الصماء', 'امراض القلب و الشرايين', 'امراض المسالك البولية والتناسلية', 'امراض باطنية', 'امراض نسائية', 'امراض نفسية وعصبية', 'انف اذن وحنجرة', 'جراحة تجميل', 'جراحة عامة', 'طب الاسنان']


In [None]:
from transformers import AutoTokenizer

model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the AraBERT v2 model card describes a text pre-processing step
# for this checkpoint; it is not applied here - confirm whether it is needed.


def tokenize_function(examples):
    """Tokenize a batch of questions, truncating each one to at most 128 tokens.

    No padding is applied at this stage. The training cell builds a
    `DataCollatorWithPadding`, which pads every batch to the length of its
    longest member; padding all examples to 128 tokens here would make that
    dynamic padding a no-op and waste compute on padding tokens.

    Args:
        examples: A batch from `Dataset.map(batched=True)` with a "question" column.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(examples["question"], truncation=True, max_length=128)


# Convert to HuggingFace format
train_ds = Dataset.from_pandas(train_df[['question', 'label']])
test_ds = Dataset.from_pandas(test_df[['question', 'label']])

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_test = test_ds.map(tokenize_function, batched=True)

print("Ready for training!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/136000 [00:00<?, ? examples/s]

Map:   0%|          | 0/34000 [00:00<?, ? examples/s]

Ready for training!


In [None]:
!pip install -q transformers datasets accelerate evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Install dependencies (Colab)
!pip install -q transformers datasets accelerate evaluate

import numpy as np
import torch
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score

# =========================
# Report the available accelerator
# =========================
# Purely informational: prints the name of CUDA device 0 when one is visible.
if not torch.cuda.is_available():
    print("GPU not found!")
else:
    print("Using GPU:", torch.cuda.get_device_name(0))

# =========================
# Evaluation metrics
# =========================
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a (logits, labels) pair.

    Args:
        eval_pred: A two-item sequence unpacked as (logits, labels).

    Returns:
        dict with keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

# =========================
# Reproducibility
# =========================
from transformers import set_seed

SEED = 42
# Seed before the model is built so the newly initialized classifier head is
# the same on every run.
set_seed(SEED)

# =========================
# Train / validation split
# =========================
# Hold out 10% of the training data for per-epoch evaluation, early stopping
# and best-checkpoint selection. Using the test set for those decisions would
# leak it into model selection and make the final test score optimistic.
# `train_test_split` shuffles by default, so no separate shuffle is needed.
train_val = tokenized_train.train_test_split(test_size=0.1, seed=SEED)
train_split = train_val["train"]
val_split = train_val["test"]

# =========================
# Load Model
# =========================
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(specialties),
    id2label=id2label,
    label2id=label2id
)

# =========================
# Dynamic Padding
# =========================
data_collator = DataCollatorWithPadding(tokenizer)

# =========================
# Training Arguments (v5 compatible)
# =========================
# fp16 mixed precision needs a CUDA device; enable it only when one is present
# so the cell still runs (slowly) on a CPU-only runtime.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./medical_model_v2",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=use_fp16,
    logging_steps=200,
    save_total_limit=2,
    seed=SEED,
    report_to="none"
)

# =========================
# Trainer (tokenizer REMOVED)
# =========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=val_split,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# =========================
# Train
# =========================
print("Starting optimized training...")
trainer.train()

# =========================
# Final evaluation on the untouched test set
# =========================
# With load_best_model_at_end=True the trainer now holds the best checkpoint
# (by validation F1), so this is the score to report.
test_metrics = trainer.evaluate(eval_dataset=tokenized_test, metric_key_prefix="test")
print("Held-out test metrics:", test_metrics)

# =========================
# Save
# =========================
# The tokenizer is saved next to the model so the directory is self-contained.
trainer.save_model("./best_medical_model")
tokenizer.save_pretrained("./best_medical_model")

print("Training complete and model saved.")


Using GPU: Tesla T4


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: aubmindlab/bert-base-arabertv2
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the 

Starting optimized training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.90754,0.954801,0.712794,0.709532
2,0.762792,0.883377,0.729882,0.728587
3,0.629136,0.884393,0.731,0.729965


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Training complete and model saved.


In [None]:
from transformers import pipeline

model_path = "./best_medical_model"

# Load the tokenizer from the same directory as the fine-tuned weights (the
# training cell saved it there) so inference uses exactly the tokenizer the
# model was trained with and does not need to contact the Hub.
pipe = pipeline(
    "text-classification",
    model=model_path,
    tokenizer=model_path,
    top_k=3
)

texts = [
    "ضربات قلبي سريعة جدا وحاسس بنهجان وألم في صدري",
    "عندي بقع حمراء بتهرش في جسمي كله وبتزيد بالليل",
    "بقالي فترة بنسى كتير وعندي رعشة خفيفة في إيدي",
    "حاسس بحرقان شديد في البول وألم في جنبي اليمين",
    "بنتي عندها طفح جلدي وحرارة عالية ورافضة تاكل",
    "عندي ألم شديد في ضرسي ومحتاج خلع"
]

for text in texts:
    # Truncate to the 128-token limit used when tokenizing the training data,
    # so long inputs are handled the same way at inference time.
    results = pipe(text, truncation=True, max_length=128)
    print(f"\nالنص: {text}")
    print("-" * 30)

    # The pipeline may return either a flat list of {label, score} dicts or
    # that list wrapped in an outer list; accept both shapes.
    predictions = results[0] if isinstance(results[0], list) else results

    for res in predictions:
        print(f"- التخصص: {res['label']} ({res['score']:.2f})")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]


النص: ضربات قلبي سريعة جدا وحاسس بنهجان وألم في صدري
------------------------------
- التخصص: امراض القلب و الشرايين (0.96)
- التخصص: امراض نفسية وعصبية (0.01)
- التخصص: امراض الجهاز التنفسي (0.01)

النص: عندي بقع حمراء بتهرش في جسمي كله وبتزيد بالليل
------------------------------
- التخصص: الامراض الجلدية (0.82)
- التخصص: الطب العام (0.06)
- التخصص: جراحة تجميل (0.04)

النص: بقالي فترة بنسى كتير وعندي رعشة خفيفة في إيدي
------------------------------
- التخصص: امراض نفسية وعصبية (0.63)
- التخصص: امراض الدم (0.14)
- التخصص: امراض القلب و الشرايين (0.10)

النص: حاسس بحرقان شديد في البول وألم في جنبي اليمين
------------------------------
- التخصص: امراض المسالك البولية والتناسلية (0.88)
- التخصص: امراض باطنية (0.02)
- التخصص: جراحة عامة (0.02)

النص: بنتي عندها طفح جلدي وحرارة عالية ورافضة تاكل
------------------------------
- التخصص: جراحة تجميل (0.90)
- التخصص: امراض الدم (0.03)
- التخصص: امراض الغدد الصماء (0.02)

النص: عندي ألم شديد في ضرسي ومحتاج خلع
------------------------------

In [None]:
import shutil
# Bundle the saved model directory into my_medical_model.zip in the working directory.
shutil.make_archive(
    base_name="my_medical_model",
    format='zip',
    root_dir="./best_medical_model",
)
print("Done! Now download 'my_medical_model.zip' from the files tab.")

Done! Now download 'my_medical_model.zip' from the files tab.


In [None]:
import shutil
from google.colab import files
# Zip the saved model directory as my_best_model.zip, then hand the archive to
# the Colab file-download helper.
shutil.make_archive(
    base_name="my_best_model",
    format='zip',
    root_dir="./best_medical_model",
)
files.download("my_best_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Mount Google Drive inside the Colab runtime at the path below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive
