In [None]:
# Install necessary libraries
!pip install transformers openpyxl scikit-learn torch tensorflow

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import  accuracy_score, precision_score, recall_score, f1_score
import time

In [None]:

# Read the Excel workbook holding the dataset into a DataFrame
file_path = "Dataset of Arabic Spam and Ham Tweets.xlsx"
df = pd.read_excel(file_path)

# Load the pretrained AraBERT tokenizer and encoder used for textual features.
# Both are loaded from the same checkpoint, so its name is kept in one place.
ARABERT_CHECKPOINT = "aubmindlab/bert-base-arabert"
tokenizer = AutoTokenizer.from_pretrained(ARABERT_CHECKPOINT)
model = AutoModel.from_pretrained(ARABERT_CHECKPOINT)

def extract_textual_features(texts):
    """Embed one or more raw texts with the module-level AraBERT model.

    Each text is tokenized (padded to the longest text in the batch and
    truncated to 512 tokens), passed through ``model`` without gradient
    tracking, and mean-pooled over its real (non-padding) tokens.

    Parameters
    ----------
    texts : list of str
        Raw sentences to embed; every element is treated as its own sequence.

    Returns
    -------
    numpy.ndarray
        The pooled hidden states with singleton dimensions squeezed out:
        a 1-D vector when a single text is given, otherwise one row per text.
    """
    # Do NOT pass is_split_into_words=True here: with that flag a list of strings
    # is interpreted as the words of ONE sequence, so a batch of several texts
    # would be merged into a single embedding.
    tokens = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)

    hidden_states = outputs.last_hidden_state
    # Weight every token by its attention mask so padding positions do not
    # dilute the average when texts of different lengths share a batch.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    features = (summed / token_counts).squeeze().numpy()
    return features

# Compute one AraBERT feature array per row: every cleaned text is cast to str
# and handed to the extractor as a single-element list.
cleaned_texts = df['Cleaned Text']
df['AraBERT Features'] = cleaned_texts.apply(
    lambda text: extract_textual_features([str(text)])
)



In [None]:
# Supplementary statistical features: the 'Retweet Count' and 'Favorite Count' columns
statistical_features = df[['Retweet Count', 'Favorite Count']].to_numpy()

# Place the per-row AraBERT vectors side by side with the statistical columns
arabert_vectors = df['AraBERT Features'].tolist()
combined_features = np.concatenate((arabert_vectors, statistical_features), axis=1)

# Missing values: replace every NaN with the NaN-ignoring mean of its column
column_means = np.nanmean(combined_features, axis=0)
combined_features = np.nan_to_num(combined_features, nan=column_means)

# Standardize features.
# NOTE(review): the column means above and this scaler are fitted on the full
# feature matrix, i.e. before any train/test split produced by `cv` below, so
# held-out rows influence the preprocessing — confirm this is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(combined_features)

# Encode the target variable as integers (overwrites the 'Label' column in place)
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

# Cross-validation splitter: 5 shuffled folds with a fixed seed
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [None]:
# Seed shared by every estimator that accepts `random_state`
RANDOM_STATE = 42

# Classifier registry: display name -> unfitted estimator.
# Insertion order determines the row order of the result tables.
classifiers = dict([
    ("Random Forest", RandomForestClassifier(random_state=RANDOM_STATE)),
    ("SVM", SVC(random_state=RANDOM_STATE)),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Decision Tree", DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ("Logistic Regression", LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
    ("K Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ("Neural Network", MLPClassifier(random_state=RANDOM_STATE)),
    ("AdaBoost", AdaBoostClassifier(random_state=RANDOM_STATE)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ("XGBoost", XGBClassifier(random_state=RANDOM_STATE)),
])

# Metric registry: display name -> scoring function called as func(y_true, y_pred)
metric_names = ("Accuracy", "Precision", "Recall", "F1 Score")
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
metrics = dict(zip(metric_names, metric_functions))

# Result stores, keyed by classifier name:
#   results[name]      -> mean value of every metric, in the order of `metrics`
#   time_results[name] -> [mean train time, mean prediction time, their sum]
results = {}
time_results = {}

# Target vector as a plain array so it can be indexed with the fold indices
y_all = df['Label'].to_numpy()

# Perform cross-validation and calculate metrics for each classifier
for clf_name, clf in classifiers.items():
    # Derive the per-fold score lists from `metrics` itself, so adding or
    # removing a metric there cannot get out of sync with this loop.
    fold_metrics = {metric_name: [] for metric_name in metrics}
    train_times = []
    predict_times = []

    # `y_all` is passed to split() so that `cv` may also be a splitter that
    # needs the labels (e.g. a stratified one); splitters that do not need it ignore it.
    for train_idx, test_idx in cv.split(X_scaled, y_all):
        X_train_fold, X_test_fold = X_scaled[train_idx], X_scaled[test_idx]
        y_train_fold, y_test_fold = y_all[train_idx], y_all[test_idx]

        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short intervals; time.time() can be too coarse for the
        # millisecond-scale prediction times measured here.
        start_time = time.perf_counter()
        clf.fit(X_train_fold, y_train_fold)
        mid_time = time.perf_counter()
        y_pred_fold = clf.predict(X_test_fold)
        end_time = time.perf_counter()

        train_times.append(mid_time - start_time)
        predict_times.append(end_time - mid_time)

        for metric_name, metric_func in metrics.items():
            fold_metrics[metric_name].append(metric_func(y_test_fold, y_pred_fold))

    # Average every metric and both timings across the folds
    results[clf_name] = [np.mean(fold_values) for fold_values in fold_metrics.values()]

    mean_train_time = np.mean(train_times)
    mean_predict_time = np.mean(predict_times)
    time_results[clf_name] = [
        mean_train_time,
        mean_predict_time,
        mean_train_time + mean_predict_time,
    ]

# Build one table of averaged metrics: one row per classifier, one column per metric
metrics_keys = list(metrics.keys())
results_df = pd.DataFrame.from_dict(results, orient='index', columns=metrics_keys)
results_df.index.name = 'Classifier'

# Build the matching table of averaged timings.
# The third column header was previously misspelled as "Totoal Time (s)".
time_names = ["Train Time (s)", "Prediction Time (s)", "Total Time (s)"]
time_results_df = pd.DataFrame.from_dict(time_results, orient='index', columns=time_names)
time_results_df.index.name = 'Classifier'

# Show both tables, separated by a blank line
print(results_df)
print()
print(time_results_df)


                      Accuracy  Precision    Recall  F1 Score
Classifier                                                   
Random Forest         0.943673   0.966667  0.854960  0.905275
SVM                   0.951837   0.948095  0.906944  0.925752
Gaussian Naive Bayes  0.895429   0.838254  0.839087  0.834492
Decision Tree         0.899592   0.823271  0.868651  0.844387
Logistic Regression   0.963837   0.958095  0.929167  0.942151
K Nearest Neighbors   0.939592   0.922374  0.887698  0.901567
Neural Network        0.963918   0.928640  0.968452  0.947231
AdaBoost              0.975918   0.964706  0.963690  0.961908
Gradient Boosting     0.907673   0.863791  0.848016  0.853884
XGBoost               0.959755   0.946184  0.928571  0.935643

                      Train Time (s)  Prediction Time (s)  Totoal Time (s)
Classifier                                                                
Random Forest               0.339593             0.005676         0.345269
SVM                         0.