*Libraries*


In [None]:
import re, string
import numpy as np
import pandas as pd
from itertools import chain
from collections import Counter

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, ISRIStemmer
from nltk.stem import SnowballStemmer



# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, classification_report
)

# gensim
import gensim.downloader as api
from gensim.models import Word2Vec, FastText

# fastText LID
import fasttext

# PyTorch / Transformers
import torch
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification,
    pipeline, DataCollatorWithPadding
)
from datasets import Dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
import evaluate

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, Bidirectional, GRU, LSTM, Dropout, Dense
)

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


2025-06-10 16:42:09.776407: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

*Loading Data*


In [2]:
df= pd.read_csv('/home/ubuntu/NLP/altibbi_specialty_data.csv')

In [3]:
df

Unnamed: 0,specialty_id,name_ar,question_body
0,23,طب عيون,استشاره عيون
1,14,جراحة العظام والمفاصل,السلام عليكم ممكن دكتور مفاصل واعصاب
2,14,جراحة العظام والمفاصل,عندي نقص فيتامين د هل ممكن استخدم معه كالسيوم
3,23,طب عيون,عمليه الحول للكبار
4,14,جراحة العظام والمفاصل,ألم بالكتف الايسر من فترة
...,...,...,...
92554,18,طب اسنان,اريد التحدث مع طبيبب اسنان
92555,91,الطب النفسي,عندي قلق مابعد الولاده استشارات نفسيه
92556,14,جراحة العظام والمفاصل,هل ممكن يدكتور ان تتم عمليه اعاده الكسر بسبب ت...
92557,23,طب عيون,زوجتي تعاني من ضعف النظر درجة ٤.٥


***EDA***

In [4]:
df.groupby("specialty_id").apply(
    lambda x: x[['specialty_id', 'name_ar', 'question_body']].sample(3, random_state=42)
).reset_index(drop=True)


  df.groupby("specialty_id").apply(


Unnamed: 0,specialty_id,name_ar,question_body
0,14,جراحة العظام والمفاصل,الم بالجسد والدكتور منع المسكنات لانها عندها ج...
1,14,جراحة العظام والمفاصل,عندى كسر بالعضلة اليسار الى تحت القلب ماهى الج...
2,14,جراحة العظام والمفاصل,انا مريض سكر من الدرجه الثانيه ومن فتره اشعر ب...
3,18,طب اسنان,التهاب قوي بلثه والحلق
4,18,طب اسنان,عندى حشوه تجمليه على اسناني الاماميه بس بعد مد...
5,18,طب اسنان,انا عندي عشرون سنه وعايز اشيل درسي
6,23,طب عيون,عندي صداع قوي (شقيقه) اول مااقوم من نوم واحيان...
7,23,طب عيون,لو عاوزه اعمل نظاره حفظ نظر ده بيحتاج اني اروح...
8,23,طب عيون,حبة في جفن العين العلوي من الداحل
9,25,تغذية,انا اعاني من السمنه كتلة جسمي فوق ال30 وتعبت م...


In [5]:
# Check for empty strings in the question_body
empty_questions = df[df["question_body"].str.strip() == ""]
print(f"\nEmpty question_body entries: {len(empty_questions)}")


Empty question_body entries: 0


In [6]:
df.isnull().sum()

specialty_id     0
name_ar          0
question_body    0
dtype: int64

In [7]:
df = df.drop_duplicates()

*Checking for URLS*

In [8]:
# Regex pattern to detect URLs
url_pattern = r'http\S+|www\S+|https\S+'

# Filter rows that contain URLs
rows_with_urls = df[df['question_body'].str.contains(url_pattern, na=False)]

rows_with_urls

Unnamed: 0,specialty_id,name_ar,question_body
9028,14,جراحة العظام والمفاصل,https://altibbi.com/r/2ztbh8oj1y73
10249,91,الطب النفسي,https://altibbi.com/r/am3jz3k5g5c080
13738,14,جراحة العظام والمفاصل,https://altibbi.com/r/917i35ix94owkw
17794,25,تغذية,https://altib.bi/oehT
17838,91,الطب النفسي,https://altib.bi/K3j2
29569,14,جراحة العظام والمفاصل,ألم مستمر أسفل الظهر، منذ سنة تقريبا مرفقا صور...
29683,14,جراحة العظام والمفاصل,اصابه في الركبه https://argon.truecloudjo.com/...
32092,23,طب عيون,https://altibbi.com/r/1b4xr6jyzzr444k
34624,91,الطب النفسي,https://altibbi.com/r/b3jkjbum5pni
39675,91,الطب النفسي,https://altib.bi/iuic


In [9]:
#Funstion for removing URLs from the text
def remove_urls(text):
    url_pattern = r'http\S+|www\S+|https\S+'
    return re.sub(url_pattern, '', text)

# Apply the function to the 'question_body' column and save the result
df['question_body'] = df['question_body'].apply(remove_urls)

#check some cleaned results
print(df[['question_body']].head())


                                   question_body
0                                   استشاره عيون
1           السلام عليكم ممكن دكتور مفاصل واعصاب
2  عندي نقص فيتامين د هل ممكن استخدم معه كالسيوم
3                             عمليه الحول للكبار
4                      ألم بالكتف الايسر من فترة


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_body'] = df['question_body'].apply(remove_urls)


*Checking for any english words*

In [10]:
# Function to check if a question contains any English words
def contains_english_word(text):
    # Look for sequences of English letters (a word)
    pattern = r'\b[A-Za-z]+\b'
    return bool(re.search(pattern, text))

df['has_english'] = df['question_body'].apply(contains_english_word)

# See how many questions contain English words
print(df['has_english'].sum())

# See some examples
print(df[df['has_english']][['question_body']].head(20))

3267
                                         question_body
40   Tawuniya السلام عليكم اشعر باالالم وبروده في ا...
49   Tawuniya حاجة الى المناقشة مع دكتور عام بسبب و...
51   عندي الم بالضرس قوي ذهبت لدكتور الاسنان فيه خر...
76      Tawuniya لدى ابنتي انتفاخ في الجفن مثل الكورة.
92   لسلام عليكم انا بنت عندي ٢١ سنه سناني فيها spa...
117  وجود خراج أسفل ال crown وأدى الى تورم المنطقة ...
145  Tawuniya اريد طبيب اطفال لإبني عساف يكشف عليه ...
197  السلام عليكم احتاج استشاري طب وجراحه عيون . عن...
275  انا برضع ممكن اخد برشام او دواء شرب اسمو Nurax...
286  شخصني طبيب الجلديه قبل سنتين ب complex apthosi...
346  Tawuniya ابنتي الصغيره دخلت اصبعها في عيني واص...
360                                     Tawuniya اسنان
428      Tawuniya عندي الم بالضرس وشاكه بالحمل ايش اخذ
448  يوجد لدي اخوة لم يكملو تعليمهم بسبب ارسالهم ال...
472  تكلمت مع دكتوره قبل يومين في الاستشارة المجاني...
535  Tawuniya اريد استشارة دكتور عظام بخصوص عملية ك...
605         Tawuniya احمرار وحكه بالعين من ثلاث ايام..
607  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['has_english'] = df['question_body'].apply(contains_english_word)


In [11]:
# Function to extract English words from text
def extract_english_words_better(text):
    # Remove most punctuation (except dash and slash)
    text = re.sub(rf"[{re.escape(string.punctuation.replace('-', '').replace('/', ''))}]", " ", text)

    # Extract candidate words that are mostly A-Z or a-z
    words = re.findall(r'\b[A-Za-z][A-Za-z\-\/]{1,}\b', text)

    # Filter out all-numeric or very short items
    filtered = [w for w in words if not w.lower().isdigit() and len(w) > 1]
    return filtered


df['english_words'] = df['question_body'].apply(extract_english_words_better)

# Show rows with extracted English
df_with_english = df[df['english_words'].apply(len) > 0]
print(df_with_english[['question_body', 'english_words']].head(50))

                                          question_body  \
40    Tawuniya السلام عليكم اشعر باالالم وبروده في ا...   
49    Tawuniya حاجة الى المناقشة مع دكتور عام بسبب و...   
51    عندي الم بالضرس قوي ذهبت لدكتور الاسنان فيه خر...   
76       Tawuniya لدى ابنتي انتفاخ في الجفن مثل الكورة.   
92    لسلام عليكم انا بنت عندي ٢١ سنه سناني فيها spa...   
117   وجود خراج أسفل ال crown وأدى الى تورم المنطقة ...   
145   Tawuniya اريد طبيب اطفال لإبني عساف يكشف عليه ...   
197   السلام عليكم احتاج استشاري طب وجراحه عيون . عن...   
275   انا برضع ممكن اخد برشام او دواء شرب اسمو Nurax...   
286   شخصني طبيب الجلديه قبل سنتين ب complex apthosi...   
346   Tawuniya ابنتي الصغيره دخلت اصبعها في عيني واص...   
360                                      Tawuniya اسنان   
428       Tawuniya عندي الم بالضرس وشاكه بالحمل ايش اخذ   
448   يوجد لدي اخوة لم يكملو تعليمهم بسبب ارسالهم ال...   
472   تكلمت مع دكتوره قبل يومين في الاستشارة المجاني...   
535   Tawuniya اريد استشارة دكتور عظام بخصوص عملية ك... 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['english_words'] = df['question_body'].apply(extract_english_words_better)


In [12]:
all_english_words = list(chain.from_iterable(df['english_words']))
print(f"Total extracted English words: {len(all_english_words)}")
print(f"Unique English words: {len(set(all_english_words))}")


Total extracted English words: 6043
Unique English words: 2176


In [13]:
from collections import Counter

word_freq = Counter(w.lower() for w in all_english_words)
print(word_freq.most_common(10000))

[('tawuniya', 1987), ('the', 66), ('of', 63), ('and', 58), ('mg', 57), ('mri', 42), ('in', 40), ('to', 37), ('adhd', 35), ('pain', 34), ('my', 34), ('is', 33), ('for', 32), ('with', 30), ('consultation', 28), ('doctor', 27), ('have', 24), ('it', 24), ('eye', 23), ('from', 20), ('cyl', 19), ('mild', 16), ('prk', 16), ('crp', 14), ('medial', 14), ('no', 13), ('cdm', 13), ('expert', 13), ('af', 13), ('axis', 13), ('knee', 13), ('posterior', 13), ('left', 13), ('teeth', 13), ('counselling', 12), ('sph', 12), ('joint', 12), ('am', 12), ('tsh', 12), ('but', 11), ('me', 11), ('cbc', 11), ('right', 10), ('disc', 10), ('xr', 10), ('on', 9), ('general', 9), ('taa', 9), ('changes', 9), ('can', 9), ('back', 9), ('cervical', 9), ('systane', 9), ('hiv', 9), ('horn', 9), ('tear', 9), ('years', 8), ('weeks', 8), ('signal', 8), ('normal', 8), ('lumbar', 8), ('x-ray', 8), ('acl', 8), ('meniscus', 8), ('lower', 8), ('ml', 8), ('at', 8), ('esr', 8), ('days', 8), ('prozac', 8), ('gel', 8), ('get', 7), ('wh

**Machine Traslation Cleaning (MarianMT)**

In [None]:
df["question_body"] = df["question_body"].astype(str)

#using FastText LID to Find English Runs
lid = fasttext.load_model("lid.176.bin")        

def extract_english_phrases(text: str) -> list[str]:
    # rough split on whitespace / punctuation
    raw_tokens = re.findall(r"\b\w+\b", text)
    phrases    = []
    buf        = []

    for tok in raw_tokens:
        is_ascii = tok.encode("ascii", "ignore").decode("ascii") == tok
        if not is_ascii:
            flush = True
        else:
            # fastText predicts labels like "__label__en"
            lang, conf = lid.predict(tok.lower())
            flush = lang[0] != "__label__en" or conf[0] < 0.80    # 80 % conf threshold

        if flush and buf:
            phrases.append(" ".join(buf))
            buf = []
        if not flush: 
            buf.append(tok)

    if buf:
        phrases.append(" ".join(buf))
    return phrases

phrase_docs = [" ".join(extract_english_phrases(t)) for t in df["question_body"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["question_body"] = df["question_body"].astype(str)


In [None]:
#Rank by TF-IDF, Select top-K phrases
tfidf = TfidfVectorizer(
    tokenizer=str.split,    # treat each phrase as a single token
    lowercase=False,
)
X          = tfidf.fit_transform(phrase_docs)
tfidf_sum  = X.sum(axis=0).A1                # global importance score
phrases    = np.array(tfidf.get_feature_names_out())
phrase2idf = dict(zip(phrases, tfidf_sum))

TOP_K = 5000         
top_phrases = sorted(phrase2idf, key=phrase2idf.get, reverse=True)[:TOP_K]
print(f"Selected {len(top_phrases)} TF-IDF-ranked phrases for MT")

#Glosary of Common Terms
glossary = {

    # ─── Organisations & Proper Names ──────────────────────────
    "Tawuniya": "التعاونية للتأمين",

    # ─── Medications / Brand-names ─────────────────────────────
    "Voltfast": "فولتفاست (ديكلوفيناك بوتاسيوم)",
    "Primalan": "بريمالان (ميكليزين)",
    "Nurax":    "نوركس",
    "Depojoy":  "ديبوجوي",                
    "Systane Ultra": "سايستان ألترا (قطرة مرطبة للعين)",
    "Systane":       "سايستان",
    "Fluca":   "فلوكا (فلوكونازول)",
    "Regimax": "ريجيماكس (مكمل غذائي)",
    "Chromax": "كرومكس (مكمل الكروميوم)",

    # ─── Dental / Ophthalmology / Radiology Terms ─────────────
    "crown":          "تاج الأسنان",
    "spacing":        "تباعد الأسنان",
    "Deep scaling":   "تنظيف جذور عميق",
    "OCT":            "التصوير المقطعي البصري (OCT)",

    # ─── Clinical Conditions & Concepts ───────────────────────
    "stroke":               "سكتة دماغية",
    "insulin resistance":   "مقاومة الإنسولين",
    "beta cell function":   "وظيفة خلايا بيتا",
    "complex":              "معقدة",                     
    "Depression":           "الاكتئاب",
    "eye twitching":        "رعشة العين",
    "Right Eye twitching":  "رعشة العين اليمنى",
    "open minded":          "منفتح",

    # ─── Laboratory Abbreviations ─────────────────────────────
    "WBC":  "خلايا الدم البيضاء (WBC)",
    "PLT":  "الصفائح الدموية (PLT)",
    "NEUT": "العدلات (NEUT)",

    # ─── Catch-alls / Misc. ───────────────────────────────────
    "iq": "معدل الذكاء",
}


Selected 435 TF-IDF-ranked phrases for MT




In [16]:
tok_mt   = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
model_mt = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ar").half().eval()
translator = pipeline("translation",
                      model=model_mt, tokenizer=tok_mt,
                      device=0)              

# translate in manageable batches
def batch_translate(strings, batch_size=64):
    out = []
    for i in range(0, len(strings), batch_size):
        batch = strings[i:i+batch_size]
        out.extend(t["translation_text"] for t in translator(batch, max_length=60))
    return out

translations = batch_translate(top_phrases)    
en2ar = dict(zip(top_phrases, translations))  
en2ar.update(glossary) 

Device set to use cuda:0


In [None]:
# Function to replace phrases in text using a mapping dictionary
def replace_phrases(text: str, mapping: dict[str, str]) -> str:
    # sort by length so longer phrases replace first
    for en in sorted(mapping, key=len, reverse=True):
        pattern = re.compile(rf"\b{re.escape(en)}\b", flags=re.IGNORECASE)
        text    = pattern.sub(mapping[en], text)
    return text

df["question_body_mt"] = [
    replace_phrases(txt, en2ar) for txt in tqdm(df["question_body"], desc="Replacing")
]

Replacing: 100%|██████████| 82809/82809 [01:41<00:00, 814.51it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["question_body_mt"] = [


In [None]:
# Function to calculate the ratio of English words in a text
def english_ratio(text):
    ascii_tokens = re.findall(r"\b[a-zA-Z]+\b", text)
    ar_tokens    = re.findall(r"\b[ء-ي]+\b", text)
    total = len(ascii_tokens) + len(ar_tokens) or 1
    return len(ascii_tokens) / total

print(f"{df['question_body_mt'].apply(english_ratio).mean() * 100:.2f}")

0.24


In [None]:
# Check for leftover English tokens
leftover = set()

for txt in df["question_body_mt"]:          
    leftover.update(re.findall(r"\b[a-zA-Z][a-zA-Z]+\b", txt))

print("Left-over unique tokens:", len(leftover)) 

Left-over unique tokens: 1711


In [None]:
#Residual Cleanup
auto = [w for w in leftover if w.islower() or w.istitle()]
manual = [w for w in leftover if w.isupper()]

# MarianMT single-token pass
new_ar = batch_translate(auto, batch_size=128)   
en2ar.update(dict(zip(auto, new_ar)))

# quick manual mapping examples
glossary = {
    "WBC": "خلايا الدم البيضاء (WBC)",
    "MRI": "التصوير بالرنين المغناطيسي (MRI)",
    "IQ":  "معدل الذكاء",
}
en2ar.update(glossary)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# Final pass to replace any remaining English phrases
mask_dirty = df["question_body_mt"].str.contains(r"[a-zA-Z]")
df.loc[mask_dirty, "question_body_mt"] = (
    df.loc[mask_dirty, "question_body_mt"]
      .apply(lambda t: replace_phrases(t, en2ar))
)


In [None]:
# Calculate the residual English ratio after cleanup
residual = df["question_body_mt"].apply(english_ratio).mean() * 100
print(f"Residual English now: {residual:.2f} %")


Residual English now: 0.05 %


In [None]:
# Function to remove any English words or phrases from the text left after translation
def remove_english(text):
    # Remove anything that contains Latin characters (even part of a word)
    return re.sub(r'[a-zA-Z0-9]+', '', text)

# Apply to your DataFrame
df['question_body_mt'] = df['question_body_mt'].apply(remove_english)

# Clean up extra whitespace and invisible characters
df['question_body_mt'] = df['question_body_mt'].str.replace(r'\s+', ' ', regex=True).str.strip()

print(df['question_body_mt'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_body_mt'] = df['question_body_mt'].apply(remove_english)


0                                             استشاره عيون
1                     السلام عليكم ممكن دكتور مفاصل واعصاب
2            عندي نقص فيتامين د هل ممكن استخدم معه كالسيوم
3                                       عمليه الحول للكبار
4                                ألم بالكتف الايسر من فترة
                               ...                        
92554                           اريد التحدث مع طبيبب اسنان
92555                عندي قلق مابعد الولاده استشارات نفسيه
92556    هل ممكن يدكتور ان تتم عمليه اعاده الكسر بسبب ت...
92557                    زوجتي تعاني من ضعف النظر درجة ٤.٥
92558                  ابي استفسر عن النظاره الطبيه للعيون
Name: question_body_mt, Length: 82809, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_body_mt'] = df['question_body_mt'].str.replace(r'\s+', ' ', regex=True).str.strip()


*Preprocessing and using ISRI Stemmer (ISRI Stemmer Pipline)*

In [None]:
# Function to preprocess text using ISRI stemmer
def preprocess_text_Isri(text):
    # Normalize
    text = re.sub(r'[إأآ]', 'ا', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ء', '', text)
    text = re.sub(r'ة', 'ه', text)

    # Remove diacritics
    text = re.sub(r'[\u064B-\u0652]', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)

    # Apply tokenization
    words = nltk.word_tokenize(text)

    # Remove stopwords
    arabic_stopwords = set(stopwords.words('arabic'))
    words = [word for word in words if word not in arabic_stopwords]

    #Apply stemming
    stemmer = ISRIStemmer()
    words = [stemmer.stem(word) for word in words]

    return words

In [25]:
df["isri_txt"] = df["question_body_mt"].apply(preprocess_text_Isri)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["isri_txt"] = df["question_body_mt"].apply(preprocess_text_Isri)


*Preprocessing & Using PorterStemmer (PorterStemmer Pipline)*


In [None]:
# Function to preprocess text using Porter stemmer
def preprocess_text_Porter(text):
    # Normalize
    text = re.sub(r'[إأآ]', 'ا', text)
    text = re.sub(r'ؤ', 'و', text)
    text = re.sub(r'ئ', 'ي', text)
    text = re.sub(r'ء', '', text)
    text = re.sub(r'ة', 'ه', text)

    # Remove diacritics
    text = re.sub(r'[\u064B-\u0652]', '', text)

    # Remove punctuation, numbers, English letters
    text = re.sub(r'[^\w\s]', ' ', text)

    # Apply tokenization
    words = nltk.word_tokenize(text)

    # Remove stopwords
    arabic_stopwords = set(stopwords.words('arabic'))
    words = [word for word in words if word not in arabic_stopwords]

    #Apply stemming
    porter = PorterStemmer()
    words = [porter.stem(word) for word in words]

    return words

In [27]:
df["porter_txt"]  = df["question_body_mt"].apply(preprocess_text_Porter)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["porter_txt"]  = df["question_body_mt"].apply(preprocess_text_Porter)


*Preprocessing & Using Snowball Stemmer (SnowballStemmer Pipline)*

In [None]:
# Function to preprocess text using Snowball stemmer
snowball = SnowballStemmer("arabic")

def preprocess_text_Snowball(text: str) -> list[str]:
    # 1) Normalize & strip diacritics/punctuation
    txt = re.sub(r'[إأآ]', 'ا', text)
    txt = re.sub(r'ؤ', 'و', txt)
    txt = re.sub(r'ئ', 'ي', txt)
    txt = re.sub(r'ء',  '' , txt)
    txt = re.sub(r'ة', 'ه', txt)
    txt = re.sub(r'[\u064B-\u0652]', '', txt)   # remove tashkeel
    txt = re.sub(r'[^\w\s]', ' ', txt)           # remove punctuation
    
    # 2) Tokenize & remove stopwords
    tokens = nltk.word_tokenize(txt)
    arabic_stopwords = set(stopwords.words('arabic'))
    tokens = [w for w in tokens if w not in arabic_stopwords]
    
    # 3) Snowball stemming
    stems = [snowball.stem(w) for w in tokens]
    return stems


In [68]:
df["snowball_txt"] = df["question_body_mt"].apply(preprocess_text_Snowball)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["snowball_txt"] = df["question_body_mt"].apply(preprocess_text_Snowball)


*Base Model For Testing Stemmers (NB)*

In [28]:
#Base model for testing Stemmers
def NB_Stemming_Test(col_name):
    X_train, X_test, y_train, y_test = train_test_split(
        df[col_name], df["specialty_id"], test_size=0.3,
        random_state=42, stratify=df["specialty_id"]
    )

    X_train = X_train.apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
    X_test  = X_test.apply( lambda x: " ".join(x) if isinstance(x, list) else str(x))


    tfidf = TfidfVectorizer(min_df=1, ngram_range=(1, 2))
    Xtr = tfidf.fit_transform(X_train)
    Xte = tfidf.transform(X_test)

    nb = MultinomialNB()
    nb.fit(Xtr, y_train)
    y_pred = nb.predict(Xte)

    acc  = accuracy_score(y_test,     y_pred)
    f1   = f1_score    (y_test,     y_pred, average="weighted")
    prec = precision_score(y_test,  y_pred, average="weighted")
    rec  = recall_score   (y_test,  y_pred, average="weighted")

    print(f"\n=== {col_name} | MultinomialNB ===")
    print(f"Accuracy : {acc:.3f}")
    print(f"F1       : {f1:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall   : {rec:.3f}")

In [70]:
for col in ["porter_txt", "isri_txt","snowball_txt"]:
    NB_Stemming_Test(col)


=== porter_txt | MultinomialNB ===
Accuracy : 0.869
F1       : 0.870
Precision: 0.882
Recall   : 0.869

=== isri_txt | MultinomialNB ===
Accuracy : 0.879
F1       : 0.879
Precision: 0.889
Recall   : 0.879

=== snowball_txt | MultinomialNB ===
Accuracy : 0.879
F1       : 0.879
Precision: 0.889
Recall   : 0.879


**Checking for mislabled data and correcting it using Semi-Supervised learning**

In [None]:
# Ensure all values in question_body are strings
df['question_body_mt'] = df['question_body_mt'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)


# Prepare data
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X = vectorizer.fit_transform(df['question_body_mt'])
y = df['specialty_id']

# Split into a small trusted training set and the rest
X_train, X_check, y_train, y_check, idx_train, idx_check = train_test_split(
    X, y, df.index, test_size=0.8, random_state=42
)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_body_mt'] = df['question_body_mt'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [31]:
# Add predictions and confidence
probs = model.predict_proba(X_check)
preds = model.predict(X_check)
confidence = np.max(probs, axis=1)

# Slice original DataFrame correctly
df_check = df.loc[idx_check].copy()

df_check['predicted_label'] = preds
df_check['confidence'] = confidence
df_check['mismatch'] = df_check['predicted_label'] != df_check['specialty_id']

# Show likely mislabeled rows
likely_wrong = df_check[(df_check['mismatch']) & (df_check['confidence'] > 0.9)]
print(likely_wrong[['question_body_mt', 'specialty_id', 'predicted_label', 'confidence']].head(10))


                                        question_body_mt  specialty_id  \
65088                                         دكتور نفسي            25   
81893                 عندي صداع حاد والم في عيوني ورقبتي            14   
62372                                علاج بروده في القدم            18   
7578                                      كسر في الترقوة            91   
87720  السلام عليكم. عندي ولد عمره سنوات فيه خوف من ا...            25   
58983                                  اريد ازيد من وزني            91   
38509                                         الم الرقبه            18   
83898                         الضغط واللام الظهر الشديده            25   
79620                                          طبيب عيون            91   
87543            انا انسه وعندي سنه ومحتاجه اخس وزن طولي            91   

       predicted_label  confidence  
65088               91    0.970963  
81893               23    0.941356  
62372               14    0.974270  
7578                14    0.950165  


In [32]:
# Automatically fix highly confident wrong labels
df.loc[likely_wrong.index, 'specialty_id'] = likely_wrong['predicted_label']

# Optional: save for review
likely_wrong.to_csv("review_mislabeled_questions.csv", index=False)

df.head(15)

Unnamed: 0,specialty_id,name_ar,question_body,has_english,english_words,question_body_mt,isri_txt,porter_txt
0,23,طب عيون,استشاره عيون,False,[],استشاره عيون,"[شار, عين]","[استشاره, عيون]"
1,14,جراحة العظام والمفاصل,السلام عليكم ممكن دكتور مفاصل واعصاب,False,[],السلام عليكم ممكن دكتور مفاصل واعصاب,"[سلم, علي, مكن, دكتور, فصل, عصب]","[السلام, عليكم, ممكن, دكتور, مفاصل, واعصاب]"
2,14,جراحة العظام والمفاصل,عندي نقص فيتامين د هل ممكن استخدم معه كالسيوم,False,[],عندي نقص فيتامين د هل ممكن استخدم معه كالسيوم,"[عند, نقص, يتم, مكن, خدم, معه, سيم]","[عندي, نقص, فيتامين, ممكن, استخدم, معه, كالسيوم]"
3,23,طب عيون,عمليه الحول للكبار,False,[],عمليه الحول للكبار,"[عمل, حول, كبر]","[عمليه, الحول, للكبار]"
4,14,جراحة العظام والمفاصل,ألم بالكتف الايسر من فترة,False,[],ألم بالكتف الايسر من فترة,"[الم, كتف, يسر, فتر]","[الم, بالكتف, الايسر, فتره]"
5,18,طب اسنان,تخدير في الخد شمال نزولا الى الفم,False,[],تخدير في الخد شمال نزولا الى الفم,"[خدر, لخد, نزل, الى, لفم]","[تخدير, الخد, نزولا, الى, الفم]"
6,14,جراحة العظام والمفاصل,استيقظت من النوم قدمي من تحت زر الكعب تؤلم الم...,False,[],استيقظت من النوم قدمي من تحت زر الكعب تؤلم الم...,"[يقظ, نوم, قدم, زر, كعب, ولم, الم, شدد, الم, ز...","[استيقظت, النوم, قدمي, زر, الكعب, تولم, الم, ش..."
7,23,طب عيون,ابني عمره ٦ سنوات عينه حمراء و مليئه بالغمز اه...,False,[],ابني عمره ٦ سنوات عينه حمراء و مليئه بالغمز اه...,"[ابن, عمر, ٦, سنو, عين, حمر, ليه, غمز, هرض, رم...","[ابني, عمره, ٦, سنوات, عينه, حمرا, ملييه, بالغ..."
8,25,تغذية,كيف أخسر وزن,False,[],كيف أخسر وزن,"[خسر, وزن]","[اخسر, وزن]"
9,18,طب اسنان,لدى حبوب في اخر لساني من جهات حلق لدي حبوب في حلق,False,[],لدى حبوب في اخر لساني من جهات حلق لدي حبوب في حلق,"[حبب, اخر, لسا, جهت, حلق, لدي, حبب, حلق]","[حبوب, اخر, لساني, جهات, حلق, لدي, حبوب, حلق]"


In [33]:
print(f"Corrected labels: {len(likely_wrong)}")

Corrected labels: 182


**Word Embediing Comparison**

TF-IDF

In [34]:
tfidf_vec = TfidfVectorizer(min_df=1, ngram_range=(1,2))
X_tfidf  = tfidf_vec.fit_transform(df['isri_txt'].astype(str))
print('TF-IDF:', X_tfidf.shape)

TF-IDF: (82809, 332612)


BOW

In [35]:
bow_vec = CountVectorizer(min_df=1)
X_bow = bow_vec.fit_transform(df['isri_txt'].astype(str))
print('BoW:', X_bow.shape)

BoW: (82809, 17476)


Word2Vec

In [36]:
sentences = df['isri_txt'].tolist()
w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=2, epochs=20)
X_w2v = np.vstack([
    np.mean([w2v.wv[w] for w in s if w in w2v.wv] or [np.zeros(100)], axis=0)
    for s in sentences])
print('Word2Vec:', X_w2v.shape)

Word2Vec: (82809, 100)


FastText

In [37]:
ft = FastText(sentences, vector_size=100, window=5, min_count=1, workers=2, epochs=20)
X_ft = np.vstack([
    np.mean([ft.wv[w] for w in s if w in ft.wv] or [np.zeros(100)], axis=0)
    for s in sentences])
print('FastText:', X_ft.shape)

FastText: (82809, 100)


In [38]:
Y = df['specialty_id'] 

*First Base model for Word Embedding Comparison (Naive Bayes)*

In [None]:
# Function to evaluate word embeddings using SVM model
def svm_eval(X, name):
    # 1) train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y,
        test_size=0.3,
        random_state=42,
        stratify=Y
    )

    # 2) build & train the SVM
    clf = LinearSVC(C=1.0,max_iter=10_000)
    clf.fit(X_train, y_train)

    # 3) predict
    y_pred = clf.predict(X_test)

    # 4) compute metrics
    acc  = accuracy_score(y_test,    y_pred)
    f1   = f1_score    (y_test,    y_pred, average="weighted")
    prec = precision_score(y_test, y_pred, average="weighted")
    rec  = recall_score   (y_test, y_pred, average="weighted")

    # 5) report
    print(
        f"[SVM] {name}: "
        f"Acc {acc:.3f} | "
        f"F1 {f1:.3f} | "
        f"Precision {prec:.3f} | "
        f"Recall {rec:.3f}"
    )


In [91]:
for name, X in {'TF‑IDF':X_tfidf,'BoW':X_bow,'Word2Vec':X_w2v,'FastText':X_ft}.items():
    svm_eval(X,name)

[SVM] TF‑IDF: Acc 0.907 | F1 0.907 | Precision 0.909 | Recall 0.907
[SVM] BoW: Acc 0.896 | F1 0.896 | Precision 0.897 | Recall 0.896
[SVM] Word2Vec: Acc 0.891 | F1 0.891 | Precision 0.893 | Recall 0.891
[SVM] FastText: Acc 0.890 | F1 0.890 | Precision 0.891 | Recall 0.890


Tokenization for NN model (BidirectionalGRU)

In [None]:
# Tokenization and Vocabulary Preparation
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['isri_txt'])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [None]:
# Convert texts to sequences and pad them
seqs = tokenizer.texts_to_sequences(df['isri_txt'])
MAX_LEN = 50                             
X_pad  = pad_sequences(seqs, maxlen=MAX_LEN, padding="post")

# one-hot labels (specialty_id is the target column)
y_cat = to_categorical(df['specialty_id'])

In [None]:
# Word Embedding Matrices
EMB_DIM = 100
VOCAB   = vocab_size 
def make_matrix(model):
    mat = np.zeros((VOCAB, EMB_DIM))
    for word, idx in word_index.items():
        if idx < VOCAB and word in model.wv:
            mat[idx] = model.wv[word]
    return mat

w2v_mat = make_matrix(w2v) 
ft_mat  = make_matrix(ft)

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y_cat, test_size=0.2,
    stratify=df['specialty_id'], random_state=42
)

*Second Base model for Word Embedding Comparison (BIdirectionalGRU)*
this model only tests Word2vec and Fasttext because Rnn's work on sequentional word embeddings only

In [None]:
# Build and compile the BiGRU model
def build_bigru(emb_matrix, name):
    emb_matrix = emb_matrix.astype("float32")

    model = Sequential(name=name)
    model.add(Embedding(input_dim=VOCAB,output_dim=EMB_DIM,input_length=MAX_LEN,weights=[emb_matrix],trainable=False,dtype="float32",mask_zero=False))

    
    model.add(Bidirectional(
        GRU( 128,return_sequences=False,reset_after=False,recurrent_activation="sigmoid",implementation=1)))

    model.add(Dropout(0.3))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(y_cat.shape[1], activation="softmax"))

    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model


BiDirectionalGRU with Word2Vec 

In [None]:
# Train and evaluate the BiGRU model with Word2Vec embeddings
gru_w2v = build_bigru(w2v_mat, "GRU_W2V")
gru_w2v.fit(
    X_train, y_train,
    epochs=4, batch_size=256,
    validation_split=0.1, verbose=1
)

y_pred = np.argmax(gru_w2v.predict(X_test, verbose=0), axis=1)
y_true = np.argmax(y_test,    axis=1)

acc  = accuracy_score(y_true, y_pred)
f1   = f1_score    (y_true, y_pred, average='weighted')
prec = precision_score(y_true, y_pred, average='weighted')
rec  = recall_score   (y_true, y_pred, average='weighted')

print(
    f"[GRU + Word2Vec]  "
    f"Acc={acc:.3f}  "
    f"F1={f1:.3f}  "
    f"Precision={prec:.3f}  "
    f"Recall={rec:.3f}"
)


Epoch 1/4




[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 116ms/step - accuracy: 0.6906 - loss: 1.3631 - val_accuracy: 0.8992 - val_loss: 0.3040
Epoch 2/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 117ms/step - accuracy: 0.9020 - loss: 0.2960 - val_accuracy: 0.9034 - val_loss: 0.2904
Epoch 3/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 116ms/step - accuracy: 0.9097 - loss: 0.2737 - val_accuracy: 0.9037 - val_loss: 0.2835
Epoch 4/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 115ms/step - accuracy: 0.9123 - loss: 0.2634 - val_accuracy: 0.9067 - val_loss: 0.2823
[GRU + Word2Vec]  Acc=0.905  F1=0.905  Precision=0.906  Recall=0.905


Bidirectional with FastText

In [None]:
# Train and evaluate the BiGRU model with FastText embeddings
gru_ft = build_bigru(ft_mat, "GRU_FT")
gru_ft.fit(X_train, y_train, epochs=4, batch_size=256,
           validation_split=0.1, verbose=1)

y_pred = np.argmax(gru_ft.predict(X_test, verbose=0), axis=1)

acc  = accuracy_score(y_true, y_pred)
f1   = f1_score    (y_true, y_pred, average='weighted')
prec = precision_score(y_true, y_pred, average='weighted')
rec  = recall_score   (y_true, y_pred, average='weighted')

print(
    f"[GRU + FastText]  "
    f"Acc={acc:.3f}  "
    f"F1={f1:.3f}  "
    f"Precision={prec:.3f}  "
    f"Recall={rec:.3f}"
)

Epoch 1/4




[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 117ms/step - accuracy: 0.7002 - loss: 1.3475 - val_accuracy: 0.9017 - val_loss: 0.3093
Epoch 2/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 114ms/step - accuracy: 0.9003 - loss: 0.3068 - val_accuracy: 0.9004 - val_loss: 0.2981
Epoch 3/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 114ms/step - accuracy: 0.9067 - loss: 0.2814 - val_accuracy: 0.9032 - val_loss: 0.2906
Epoch 4/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 114ms/step - accuracy: 0.9149 - loss: 0.2562 - val_accuracy: 0.9035 - val_loss: 0.2855
[GRU + FastText]  Acc=0.906  F1=0.906  Precision=0.907  Recall=0.906


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


**Testing the Best Model**

*Train Test Split For NB* 

In [None]:
# Prepare FastText embeddings for classification
sentences = df['isri_txt'].tolist()
FT_DIM    = ft.vector_size
doc_vecs  = np.vstack([
    np.mean([ft.wv[w] for w in sent] or [np.zeros(FT_DIM)], axis=0)
    for sent in sentences
])

# Split the data into training and testing sets for GNB
X_tr_FT, X_te_FT, y_tr_lbl, y_te_lbl = train_test_split(
    doc_vecs, df['specialty_id'],
    test_size=0.2, stratify=df['specialty_id'], random_state=42
)

*Train Test Split For RNN's*

In [None]:
# Train and evaluate For RNN's
X_tr_seq, X_te_seq, y_tr_cat, y_te_cat = train_test_split(
    X_pad, y_cat,
    test_size=0.2, stratify=df['specialty_id'], random_state=42
)

*Gussian Naive Bayes*

In [None]:
# Train Gaussian Naive Bayes on FastText embeddings
gnb = GaussianNB()
gnb.fit(X_tr_FT, y_tr_lbl)
y_pred = gnb.predict(X_te_FT)

acc  = accuracy_score(y_te_lbl, y_pred)
f1   = f1_score    (y_te_lbl, y_pred, average='weighted')
prec = precision_score(y_te_lbl, y_pred, average='weighted')
rec  = recall_score   (y_te_lbl, y_pred, average='weighted')

print(
    f"[FastText + GNB]  "
    f"Acc={acc:.3f}  "
    f"F1={f1:.3f}  "
    f"Precision={prec:.3f}  "
    f"Recall={rec:.3f}"
)

[FastText + GNB]  Acc=0.854  F1=0.856  Precision=0.860  Recall=0.854


In [None]:
# Prepare the FastText embedding matrix for Keras
EMB_DIM = 100
VOCAB   = vocab_size 
def make_matrix(model):
    mat = np.zeros((VOCAB, EMB_DIM))
    for word, idx in word_index.items():
        if idx < VOCAB and word in model.wv:
            mat[idx] = model.wv[word]
    return mat
 
ft_mat  = make_matrix(ft)

*Base Structure for both GRU and LSTM*

In [None]:
# Function to build a BiGRU and LSTM model
def build_bigru_or_lstm(cell_class, name):
    model = Sequential(name=name)
    
    model.add(Embedding(
        input_dim=VOCAB, output_dim=EMB_DIM, input_length=X_pad.shape[1],
        weights=[ft_mat], trainable=False))
    
    model.add(Bidirectional(cell_class(128)))
    
    model.add(Dropout(0.3))
    
    model.add(Dense(64, activation='relu'))
    
    model.add(Dense(y_cat.shape[1], activation='softmax'))
    
    model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

*GRU*

In [None]:
# Train and evaluate the BiGRU model with FastText embeddings
gru_ft = build_bigru_or_lstm(GRU, "GRU_FT")
gru_ft.fit(X_tr_seq, y_tr_cat, epochs=4, batch_size=256,
           validation_split=0.1, verbose=1)

y_pred = np.argmax(gru_ft.predict(X_te_seq, verbose=0), axis=1)
y_true = np.argmax(y_te_cat, axis=1)

acc  = accuracy_score(y_true, y_pred)
f1   = f1_score    (y_true, y_pred, average='weighted')
prec = precision_score(y_true, y_pred, average='weighted')
rec  = recall_score   (y_true, y_pred, average='weighted')

print(
    f"[FastText + BiGRU]  "
    f"Acc={acc:.3f}  "
    f"F1={f1:.3f}  "
    f"Precision={prec:.3f}  "
    f"Recall={rec:.3f}"
)

Epoch 1/4




[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 119ms/step - accuracy: 0.6776 - loss: 1.3624 - val_accuracy: 0.8951 - val_loss: 0.3071
Epoch 2/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 117ms/step - accuracy: 0.9008 - loss: 0.3001 - val_accuracy: 0.9045 - val_loss: 0.2945
Epoch 3/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 117ms/step - accuracy: 0.9065 - loss: 0.2855 - val_accuracy: 0.9048 - val_loss: 0.2901
Epoch 4/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 117ms/step - accuracy: 0.9121 - loss: 0.2639 - val_accuracy: 0.9075 - val_loss: 0.2820
[FastText + BiGRU]  Acc=0.906  F1=0.907  Precision=0.909  Recall=0.906


*LSTM*

In [None]:
# Train and evaluate the LSTM model with FastText embeddings
lstm_ft = build_bigru_or_lstm(LSTM, "LSTM_FT")
lstm_ft.fit(X_tr_seq, y_tr_cat, epochs=4, batch_size=256,
            validation_split=0.1, verbose=1)

y_pred = np.argmax(lstm_ft.predict(X_te_seq, verbose=0), axis=1)

acc   = accuracy_score(y_true,     y_pred)
f1    = f1_score    (y_true,     y_pred, average="weighted")
prec  = precision_score(y_true,  y_pred, average="weighted")
rec   = recall_score   (y_true,  y_pred, average="weighted")

print(f"[FastText + BiLSTM]  "
      f"Acc={acc:.3f}  "
      f"F1={f1:.3f}  "
      f"Precision={prec:.3f}  "
      f"Recall={rec:.3f}")

Epoch 1/4




[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 125ms/step - accuracy: 0.6998 - loss: 1.2992 - val_accuracy: 0.9010 - val_loss: 0.3074
Epoch 2/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 122ms/step - accuracy: 0.9012 - loss: 0.3039 - val_accuracy: 0.9048 - val_loss: 0.2912
Epoch 3/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 123ms/step - accuracy: 0.9091 - loss: 0.2738 - val_accuracy: 0.9045 - val_loss: 0.2906
Epoch 4/4
[1m233/233[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 122ms/step - accuracy: 0.9119 - loss: 0.2609 - val_accuracy: 0.9063 - val_loss: 0.2903
[FastText + BiLSTM]  Acc=0.906  F1=0.907  Precision=0.908  Recall=0.906


**Text Classification (Bert & GPT)**

In [71]:
# Convert specialty_id to a categorical label
df["label"] = df["specialty_id"].astype("category").cat.codes
num_labels  = df["label"].nunique()

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(
    df[["question_body_mt", "label"]],
    test_size=0.2, stratify=df["label"], random_state=42
)

# Convert DataFrames to Hugging Face Datasets
ds_train = Dataset.from_pandas(train_df.reset_index(drop=True))
ds_test  = Dataset.from_pandas(test_df .reset_index(drop=True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["specialty_id"].astype("category").cat.codes


**Arabert**

In [None]:
# Set the device for PyTorch
MODEL_ID = "aubmindlab/bert-base-arabertv02"
tok   = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID, num_labels=num_labels).to(device)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenization function for the dataset
def tok_fn(batch):
    return tok(batch["question_body_mt"],
               truncation=True, padding="max_length", max_length=128)

ds_train = ds_train.map(tok_fn, batched=True,
                        remove_columns=["question_body_mt"])
ds_test  = ds_test .map(tok_fn, batched=True,
                        remove_columns=["question_body_mt"])

ds_train.set_format(type="torch")
ds_test .set_format(type="torch")

Map: 100%|██████████| 66247/66247 [00:07<00:00, 8615.03 examples/s]
Map: 100%|██████████| 16562/16562 [00:01<00:00, 9655.41 examples/s]


In [None]:
# Create DataLoaders for training and testing
collator = DataCollatorWithPadding(tok)
train_loader = DataLoader(ds_train, batch_size=16, shuffle=True,
                          collate_fn=collator, pin_memory=(device=="cuda"))
test_loader  = DataLoader(ds_test,  batch_size=32, shuffle=False,
                          collate_fn=collator, pin_memory=(device=="cuda"))

print("DataLoaders ready.")

DataLoaders ready.


In [None]:
# Training the model
optimizer  = AdamW(model.parameters(), lr=2e-5)
grad_accum = 2
scaler      = torch.cuda.amp.GradScaler(enabled=(device=="cuda"))

for epoch in range(1, 4):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.cuda.amp.autocast(enabled=(device=="cuda")):
            out  = model(**batch)
            loss = out.loss / grad_accum
        scaler.scale(loss).backward()
        if step % grad_accum == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        total_loss += loss.item() * grad_accum
    avg_loss = total_loss / len(train_loader)
    print(f"✓ Epoch {epoch} | avg loss = {avg_loss:.4f}")


  scaler      = torch.cuda.amp.GradScaler(enabled=(device=="cuda"))
  with torch.cuda.amp.autocast(enabled=(device=="cuda")):


✓ Epoch 1 | avg loss = 0.3019
✓ Epoch 2 | avg loss = 0.2193
✓ Epoch 3 | avg loss = 0.1825


In [None]:
# Evaluation on the test set
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        preds  = logits.argmax(dim=-1).cpu().numpy()
        labels = batch["labels"].cpu().numpy()
        all_preds.append(preds)
        all_labels.append(labels)

preds  = np.concatenate(all_preds)
labels = np.concatenate(all_labels)

#Compute metrics
metric_acc  = evaluate.load("accuracy")
metric_f1   = evaluate.load("f1")
metric_prec = evaluate.load("precision")
metric_rec  = evaluate.load("recall")

metric_acc.add_batch(predictions=preds, references=labels)
metric_f1 .add_batch(predictions=preds, references=labels)
metric_prec.add_batch(predictions=preds, references=labels)
metric_rec .add_batch(predictions=preds, references=labels)

results = {
    "Accuracy" : metric_acc.compute()["accuracy"],
    "F1"       : metric_f1.compute(average="weighted")["f1"],
    "Precision": metric_prec.compute(average="weighted")["precision"],
    "Recall"   : metric_rec.compute(average="weighted")["recall"],
}
rounded_results = {k: round(v, 3) for k, v in results.items()}
print(rounded_results)


{'Accuracy': 0.918, 'F1': 0.918, 'Precision': 0.918, 'Recall': 0.918}


**GPT**

In [None]:
# Load the pre-trained model and tokenizer
MODEL_ID = "aubmindlab/aragpt2-base"
tok = AutoTokenizer.from_pretrained(MODEL_ID)

# Ensure the pad token is set
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Set the device for PyTorch
model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID, num_labels=num_labels,
            pad_token_id=tok.pad_token_id).to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at aubmindlab/aragpt2-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenization function for the dataset
def tok_fn(batch):
    return tok(batch["question_body_mt"],
               truncation=True, padding="max_length", max_length=128)

ds_train = ds_train.map(tok_fn, batched=True,
                        remove_columns=["question_body_mt"])
ds_test  = ds_test .map(tok_fn, batched=True,
                        remove_columns=["question_body_mt"])
ds_train.set_format("torch"); ds_test.set_format("torch")

Map: 100%|██████████| 66247/66247 [00:07<00:00, 9138.72 examples/s] 
Map: 100%|██████████| 16562/16562 [00:01<00:00, 10247.07 examples/s]


In [None]:
# Create DataLoaders for training and testing
collator = DataCollatorWithPadding(tok)
train_loader = DataLoader(ds_train, batch_size=16, shuffle=True,
                          collate_fn=collator, pin_memory=(device=="cuda"))
test_loader  = DataLoader(ds_test , batch_size=32, shuffle=False,
                          collate_fn=collator, pin_memory=(device=="cuda"))

In [None]:
# Training the model
optimizer  = AdamW(model.parameters(), lr=5e-5)   # GPT often likes 5e-5
grad_accum = 2
scaler = torch.cuda.amp.GradScaler(enabled=(device=="cuda"))

for epoch in range(3):
    model.train(); tot_loss = 0
    for step, batch in enumerate(train_loader, 1):
        batch = {k:v.to(device) for k,v in batch.items()}
        with torch.cuda.amp.autocast(enabled=(device=="cuda")):
            out = model(**batch)
            loss = out.loss / grad_accum
        scaler.scale(loss).backward()
        if step % grad_accum == 0:
            scaler.step(optimizer); scaler.update()
            optimizer.zero_grad()
        tot_loss += loss.item() * grad_accum
    print(f"✓ Epoch {epoch+1} | avg loss {tot_loss/len(train_loader):.4f}")

  scaler = torch.cuda.amp.GradScaler(enabled=(device=="cuda"))
  with torch.cuda.amp.autocast(enabled=(device=="cuda")):


✓ Epoch 1 | avg loss 0.4465
✓ Epoch 2 | avg loss 0.2873
✓ Epoch 3 | avg loss 0.2433


In [None]:
# Evaluation on the test set
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        preds  = logits.argmax(dim=-1).cpu().numpy()
        labels = batch["labels"].cpu().numpy()
        all_preds.append(preds)
        all_labels.append(labels)

preds  = np.concatenate(all_preds)
labels = np.concatenate(all_labels)

#Compute metrics

metric_acc  = evaluate.load("accuracy")
metric_f1   = evaluate.load("f1")
metric_prec = evaluate.load("precision")
metric_rec  = evaluate.load("recall")

metric_acc.add_batch(predictions=preds, references=labels)
metric_f1 .add_batch(predictions=preds, references=labels)
metric_prec.add_batch(predictions=preds, references=labels)
metric_rec .add_batch(predictions=preds, references=labels)

results = {
    "Accuracy" : metric_acc.compute()["accuracy"],
    "F1"       : metric_f1.compute(average="weighted")["f1"],
    "Precision": metric_prec.compute(average="weighted")["precision"],
    "Recall"   : metric_rec.compute(average="weighted")["recall"],
}
rounded_results = {k: round(v, 3) for k, v in results.items()}
print(rounded_results)


{'Accuracy': 0.905, 'F1': 0.906, 'Precision': 0.908, 'Recall': 0.905}
