# Import Important Libraries & Load dataset

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib

In [2]:
Train_df = pd.read_csv('preprocessed_train_with_stopwords.csv')
Train_df.drop(columns='Unnamed: 0', inplace=True)
Train_df

Unnamed: 0,text,label
0,مازالا كاتمشي لتامازيرت ولا ماعند باباها وجه,MA
1,سستيني صافي قلتها ليك البارح,MA
2,معلش يااشرف للاسف الرؤية مش واضحة عند كتير من ...,EG
3,لبشوف فرحة لاعبين السعودية بقول متأهلين عالنها...,LB
4,تشليع الجمعه وبيان فك الإرتباط أقرأ الرساله دى...,SD
...,...,...
118175,صحابى مستغربين ان اختى بتقولى اقلعى الحجاب وان...,EG
118176,لما توقع وما حدا يشوفك صدقني أن الفرحة أكبر من...,LB
118177,مش من حقن شو السفارة لبيت بيون انا كنت مثلك ما...,LB
118178,الشعب ماخمل الشعب قرف متل مارضى العيش مع الزبا...,LB


In [3]:

Test_df = pd.read_csv('preprocessed_test_with_stopwords.csv')
Test_df.drop(columns='Unnamed: 0', inplace=True)
Test_df

Unnamed: 0,text,label
0,اخير ليك ما اخير ليك كيفك بس ما من حق تسب السو...,SD
1,الله يبرد عليها ويجمعها معاه في جنات عدن,SD
2,الكلمتين دول فيهم الزيتونة الحقيقة يعني,EG
3,سلاام لكل يا حبايبنا جئت اقول مساء الخير واشوف...,SD
4,أنا ما بدي تجي وتقول اعذريني مشغول لفتة من بعي...,LB
...,...,...
29540,بالعكس حبيبى مودى تحويلت سمافرو كرزاز مية فى ا...,LY
29541,طب العهد مين ده الزمالك,EG
29542,هو موش هايتنحي ولا ايه ياجودعاان راح نيويورك م...,EG
29543,بس انا ماكنتش ف البيت وماشوفتش الحلقة الاعادة ...,EG


# TF-IDF


In [4]:
print(Train_df['text'].isna().sum())
print(Test_df['text'].isna().sum())

157
37


In [5]:
print("Rows with NaN values in Train_df:")
Train_df[Train_df['text'].isna()]

Rows with NaN values in Train_df:


Unnamed: 0,text,label
670,,LY
908,,LY
1310,,MA
1375,,LB
1513,,LY
...,...,...
113370,,MA
115084,,MA
115632,,LY
116706,,EG


In [6]:
print("Rows with NaN values in Test_df:")
Test_df[Test_df['text'].isna()]

Rows with NaN values in Test_df:


Unnamed: 0,text,label
616,,EG
925,,EG
1342,,LB
2061,,MA
2952,,EG
3798,,SD
4782,,SD
4876,,SD
5078,,LY
5233,,EG


In [7]:
# Drop rows with NaN values in the 'text' column of Train_df
Train_df = Train_df.dropna(subset=['text'])

# Drop rows with NaN values in the 'text' column of Test_df
Test_df = Test_df.dropna(subset=['text'])
    

In [8]:
tfidf = TfidfVectorizer(use_idf=True)
X_train_counts = tfidf.fit_transform(Train_df['text'])
X_test_counts = tfidf.transform(Test_df['text'])


# ML Models


In [9]:
clf_balance = LogisticRegression(random_state=42,class_weight='balanced',solver='newton-cg',C=10).fit(X_train_counts, Train_df['label'])
y_pred=clf_balance.predict(X_test_counts)
print(confusion_matrix(Test_df['label'],y_pred))
print(classification_report(Test_df['label'],y_pred))

[[10104   283   557   162   407]
 [  260  4727   309    89   135]
 [  531   341  5923   263   235]
 [  163   112   212  1707   108]
 [  409   153   251    66  2001]]
              precision    recall  f1-score   support

          EG       0.88      0.88      0.88     11513
          LB       0.84      0.86      0.85      5520
          LY       0.82      0.81      0.81      7293
          MA       0.75      0.74      0.74      2302
          SD       0.69      0.69      0.69      2880

    accuracy                           0.83     29508
   macro avg       0.80      0.80      0.80     29508
weighted avg       0.83      0.83      0.83     29508



In [10]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree= dtree.fit(X_train_counts,Train_df['label'])
predictions = dtree.predict(X_test_counts)
print(confusion_matrix(Test_df['label'], predictions))
print(classification_report(Test_df['label'], predictions))

[[8903  576 1315  198  521]
 [ 804 3668  721  142  185]
 [1568  680 4418  350  277]
 [ 410  215  526 1020  131]
 [ 899  279  477  107 1118]]
              precision    recall  f1-score   support

          EG       0.71      0.77      0.74     11513
          LB       0.68      0.66      0.67      5520
          LY       0.59      0.61      0.60      7293
          MA       0.56      0.44      0.50      2302
          SD       0.50      0.39      0.44      2880

    accuracy                           0.65     29508
   macro avg       0.61      0.57      0.59     29508
weighted avg       0.64      0.65      0.64     29508



In [11]:
from sklearn.ensemble import StackingClassifier

level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('dtree', DecisionTreeClassifier()))
level1 =LogisticRegression()
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=3)
model.fit(X_train_counts,Train_df['label'])
yhat = model.predict(X_test_counts)
print(confusion_matrix(Test_df['label'],yhat))

print(classification_report(Test_df['label'],yhat))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[[10392   259   575    66   221]
 [  320  4674   369    61    96]
 [  698   320  5973   160   142]
 [  241   120   298  1573    70]
 [  556   155   311    48  1810]]
              precision    recall  f1-score   support

          EG       0.85      0.90      0.88     11513
          LB       0.85      0.85      0.85      5520
          LY       0.79      0.82      0.81      7293
          MA       0.82      0.68      0.75      2302
          SD       0.77      0.63      0.69      2880

    accuracy                           0.83     29508
   macro avg       0.82      0.78      0.79     29508
weighted avg       0.83      0.83      0.83     29508



# save the best model


In [12]:

joblib.dump(clf_balance, 'logistic_regression_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

# reload the model and test it


In [3]:
import tnkeeh as tn
import re

def predict_label(text):

    # text preprocessing
    cleander = tn.Tnkeeh(remove_diacritics=True,
                     remove_html_elements=True,
                     remove_twitter_meta=True,
                     remove_links=True,
                     remove_english=True,
                     remove_repeated_chars=True,
                     remove_long_words=True,
                     normalize=True
                     )

    text = cleander.clean_raw_text(text)
    text = text[0]

    text = text.replace(r'[0-9٠-٩]', '')
    text = text.replace("؟", "")
    text = text.replace("@", "")
    text = text.replace("_", "")
    text = text.replace("-", "")

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    arabic_punctuation_pattern = r'[^\w\s\u0621-\u063A\u0641-\u064A]'
    text = re.sub(arabic_punctuation_pattern,'',text)

    text = re.sub(r'\s+', ' ', text).strip()

    # Load the model and the vectorizer
    clf_balance = joblib.load('logistic_regression_model.pkl')
    tfidf = joblib.load('tfidf_vectorizer.pkl')

    # Transform the input text
    text_transformed = tfidf.transform([text])

    # Predict the label
    predicted_label = clf_balance.predict(text_transformed)

    return predicted_label[0]

# Example prediction
text = "يازول"
predicted_label = predict_label(text)
print(f"The predicted label for '{text}' is: {predicted_label}")

The predicted label for 'يازول' is: SD
