In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load Dataset
file_path = 'filtered_dataset_filtered.xls'  # Update this with your actual file path
data = pd.read_csv(file_path, encoding='utf-8')

# Clean Text
def clean_text(text):
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Keep Arabic and spaces only
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

data['Cleaned_Title'] = data['Title'].apply(clean_text)

# Encode Categories
label_encoder = LabelEncoder()
data['Category_Label'] = label_encoder.fit_transform(data['Category'])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2), stop_words=None)
X = tfidf_vectorizer.fit_transform(data['Cleaned_Title'])
y = data['Category_Label']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Evaluate Naive Bayes
y_pred_nb = nb_classifier.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
report_nb = classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_)

print("Naive Bayes Accuracy:", accuracy_nb)
print("Naive Bayes Classification Report:\n", report_nb)


Naive Bayes Accuracy: 0.6981132075471698
Naive Bayes Classification Report:
               precision    recall  f1-score   support

      اقتصاد       0.55      0.64      0.59        28
       رياضه       0.92      0.85      0.88        27
       سياحة       0.67      0.85      0.75        26
       سياسه       0.73      0.44      0.55        25

    accuracy                           0.70       106
   macro avg       0.72      0.70      0.69       106
weighted avg       0.71      0.70      0.69       106



In [4]:
from sklearn.svm import LinearSVC

# Train SVM
svm_classifier = LinearSVC(random_state=42)
svm_classifier.fit(X_train, y_train)

# Evaluate SVM
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_)

print("SVM Accuracy:", accuracy_svm)
print("SVM Classification Report:\n", report_svm)


SVM Accuracy: 0.6792452830188679
SVM Classification Report:
               precision    recall  f1-score   support

      اقتصاد       0.61      0.68      0.64        28
       رياضه       0.81      0.81      0.81        27
       سياحة       0.64      0.69      0.67        26
       سياسه       0.65      0.52      0.58        25

    accuracy                           0.68       106
   macro avg       0.68      0.68      0.68       106
weighted avg       0.68      0.68      0.68       106





In [5]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Evaluate Decision Tree
y_pred_dt = dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
report_dt = classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_)

print("Decision Tree Accuracy:", accuracy_dt)
print("Decision Tree Classification Report:\n", report_dt)


Decision Tree Accuracy: 0.4811320754716981
Decision Tree Classification Report:
               precision    recall  f1-score   support

      اقتصاد       0.43      0.43      0.43        28
       رياضه       0.75      0.67      0.71        27
       سياحة       0.35      0.58      0.43        26
       سياسه       0.55      0.24      0.33        25

    accuracy                           0.48       106
   macro avg       0.52      0.48      0.48       106
weighted avg       0.52      0.48      0.48       106



In [6]:
# Imports
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset
file_path = 'filtered_dataset_filtered.xls'  # Replace with your file path
data = pd.read_csv(file_path, encoding='utf-8')

# Clean Text
def clean_text(text):
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Keep Arabic and spaces only
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

data['Cleaned_Title'] = data['Title'].apply(clean_text)

# Encode Categories
label_encoder = LabelEncoder()
data['Category_Label'] = label_encoder.fit_transform(data['Category'])

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2), stop_words=None)
X = tfidf_vectorizer.fit_transform(data['Cleaned_Title'])
y = data['Category_Label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Parameter grids for tuning
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Random Forest
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
y_pred_rf_tuned = best_rf.predict(X_test)
accuracy_rf_tuned = accuracy_score(y_test, y_pred_rf_tuned)
report_rf_tuned = classification_report(y_test, y_pred_rf_tuned, target_names=label_encoder.classes_)

# Gradient Boosting
grid_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=3, scoring='accuracy', n_jobs=-1)
grid_gb.fit(X_train, y_train)
best_gb = grid_gb.best_estimator_
y_pred_gb_tuned = best_gb.predict(X_test)
accuracy_gb_tuned = accuracy_score(y_test, y_pred_gb_tuned)
report_gb_tuned = classification_report(y_test, y_pred_gb_tuned, target_names=label_encoder.classes_)

# Results
print("Random Forest (Tuned) Accuracy:", accuracy_rf_tuned)
print("Random Forest (Tuned) Report:\n", report_rf_tuned)
print("Gradient Boosting (Tuned) Accuracy:", accuracy_gb_tuned)
print("Gradient Boosting (Tuned) Report:\n", report_gb_tuned)


Random Forest (Tuned) Accuracy: 0.5849056603773585
Random Forest (Tuned) Report:
               precision    recall  f1-score   support

      اقتصاد       0.48      0.43      0.45        28
       رياضه       0.87      0.74      0.80        27
       سياحة       0.54      0.73      0.62        26
       سياسه       0.48      0.44      0.46        25

    accuracy                           0.58       106
   macro avg       0.59      0.59      0.58       106
weighted avg       0.59      0.58      0.58       106

Gradient Boosting (Tuned) Accuracy: 0.6037735849056604
Gradient Boosting (Tuned) Report:
               precision    recall  f1-score   support

      اقتصاد       0.58      0.54      0.56        28
       رياضه       0.87      0.74      0.80        27
       سياحة       0.55      0.62      0.58        26
       سياسه       0.46      0.52      0.49        25

    accuracy                           0.60       106
   macro avg       0.62      0.60      0.61       106
weighted avg 

In [7]:
# Required Libraries
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors

# Load Dataset
file_path = 'filtered_dataset_filtered.xls'  # Replace with your file path
data = pd.read_csv(file_path, encoding='utf-8')

# Preprocessing: Remove diacritics
def remove_diacritics(text):
    arabic_diacritics = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ """, re.VERBOSE)
    return re.sub(arabic_diacritics, '', text)

data['Cleaned_Title'] = data['Title'].apply(remove_diacritics)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Cleaned_Title'])
sequences = tokenizer.texts_to_sequences(data['Cleaned_Title'])

# Padding sequences
max_len = 100  # Maximum sequence length
X_padded = pad_sequences(sequences, maxlen=max_len, padding='post')

# Load pretrained FastText embeddings using gensim
embedding_file = 'cc.ar.300.vec'  # Replace with your downloaded embeddings file path
embedding_index = KeyedVectors.load_word2vec_format(embedding_file, binary=False)

# Create Embedding Matrix
vocab_size = len(tokenizer.word_index) + 1  # Including zero for padding
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in embedding_index:
        embedding_matrix[i] = embedding_index[word]

# Encode Categories
label_encoder = {category: idx for idx, category in enumerate(data['Category'].unique())}
data['Category_Label'] = data['Category'].map(label_encoder)
y_one_hot = to_categorical(data['Category_Label'])

# Train-test Split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_one_hot, test_size=0.2, random_state=42)

# Build CNN Model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_one_hot.shape[1], activation='softmax'))

# Compile Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the Model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)





Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.6839859485626221
Test Accuracy: 0.7452830076217651


In [8]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Get predictions from the model
y_pred = model.predict(X_test)

# Convert one-hot encoded predictions and true labels to their class indices
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test_classes, y_pred_classes, target_names=label_encoder.keys()))

# Optionally, print accuracy
accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f"Accuracy: {accuracy:.4f}")


Classification Report:
              precision    recall  f1-score   support

       سياسه       0.70      0.62      0.65        26
      اقتصاد       0.56      0.75      0.64        24
       سياحة       0.80      0.74      0.77        27
       رياضه       0.96      0.86      0.91        29

    accuracy                           0.75       106
   macro avg       0.75      0.74      0.74       106
weighted avg       0.76      0.75      0.75       106

Accuracy: 0.7453


In [9]:
def preprocess_input_text(text):
    """
    Preprocess user input by removing diacritics and tokenizing the text.
    """
    # Remove diacritics
    def remove_diacritics(text):
        arabic_diacritics = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ """, re.VERBOSE)
        return re.sub(arabic_diacritics, '', text)

    text = remove_diacritics(text)
    
    # Tokenize and pad the input text
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    return padded_sequence

def predict_category(input_text):
    """
    Predict the category of the user-entered text.
    """
    # Preprocess the input text
    processed_text = preprocess_input_text(input_text)
    
    # Get model predictions
    prediction = model.predict(processed_text)
    predicted_class = np.argmax(prediction, axis=1)[0]
    
    # Map the class index back to the category name
    category_name = list(label_encoder.keys())[list(label_encoder.values()).index(predicted_class)]
    return category_name

# Interactive loop for user input
while True:
    user_input = input("Enter a text to classify (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        print("Exiting...")
        break
    predicted_category = predict_category(user_input)
    print(f"Predicted Category: {predicted_category}")


Enter a text to classify (or type 'exit' to quit): انهيار الليرة أبرز التداعيات الاقتصادية على سكان دمشق
Predicted Category: اقتصاد
Enter a text to classify (or type 'exit' to quit): تعرف على عمليات الاحتيال المالي عند السفر
Predicted Category: سياحة
Enter a text to classify (or type 'exit' to quit): مدينة ألمانية في أحضان رومانية
Predicted Category: سياحة
Enter a text to classify (or type 'exit' to quit): نتائج قرعة كأس العالم للأندية لكرة القدم 2025
Predicted Category: رياضه
Enter a text to classify (or type 'exit' to quit): خطة نجوم مانشستر سيتي لمواجهة الشكوك والانتقادات
Predicted Category: رياضه
Enter a text to classify (or type 'exit' to quit): هل ساهم تغيّر خطاب هيئة تحرير الشام في نتائج "ردع العدوان"؟
Predicted Category: اقتصاد
Enter a text to classify (or type 'exit' to quit): بوليتيكو: 4 حالات عفو رئاسية أكثر إثارة للجدل من عفو بايدن
Predicted Category: اقتصاد
Enter a text to classify (or type 'exit' to quit): دعوة لبايدن بمنتدى "أسبن" للاعتراف بدولة فلسطين قبل بدء حكم ترامب
