<a href="https://colab.research.google.com/github/Ammar11232/Detection-AI-Generated-Arabic-Text/blob/main/Data%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Phase 1.2
import pandas as pd

In [None]:
#Phase 1.2
!pip install datasets

In [None]:
#Phase 1.2
from datasets import load_dataset

In [None]:
#Phase 1.2
# Load the "KFUPM-JRCAI/arabic-generated-abstracts" dataset through `datasets.load_dataset`.
dataset = load_dataset("KFUPM-JRCAI/arabic-generated-abstracts")

In [None]:
#Phase 1.3
# Print the loaded dataset object to inspect what it contains.
print(dataset)

In [None]:
#Phase 1.3
# Convert the 'by_polishing' entry of the dataset into a pandas DataFrame.
df1 = pd.DataFrame(dataset['by_polishing'])

In [None]:
#Phase 1.3
# Convert the 'from_title' entry of the dataset into a pandas DataFrame.
df2 = pd.DataFrame(dataset['from_title'])

In [None]:
#Phase 1.3
# Convert the 'from_title_and_content' entry of the dataset into a pandas DataFrame.
df3 = pd.DataFrame(dataset['from_title_and_content'])

In [None]:
#Phase 1.3
# Stack the three frames row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
#Phase 1.3
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

In [None]:
#Phase 1.3
# List the column labels of the combined frame.
print(df.columns)

In [None]:
#Phase 1.3
# Show the dtype of every column of the combined frame.
print(df.dtypes)

In [None]:
#Phase 1.3
# Print the text representation of the whole combined frame.
print(df)

In [None]:
#Phase 1.3
# Preview the first 10 rows of the combined frame.
print(df.head(10))

In [None]:
#Phase 1.3
# Column labels once more, as a reference for the column selections that follow.
print(df.columns)

In [None]:
#Phase 1.3
# Human-written samples: the original abstracts under a unified "abstract"
# column, every row tagged with the constant label "Human".
# NOTE(review): `df` stacks three dataset entries; if they share the same
# original abstracts, human rows repeat here — check the duplicate counts
# printed further down before relying on a random split.
human_df = (
    df[["original_abstract"]]
    .rename(columns={"original_abstract": "abstract"})
    .assign(label="Human")
)


In [None]:
#Phase 1.3
# Pool the generated abstracts of the four generator columns (allam, jais,
# llama, openai) into one Series, stacked in that order with a fresh index.
ai_texts = pd.concat(
    [
        df[f"{generator}_generated_abstract"]
        for generator in ("allam", "jais", "llama", "openai")
    ],
    ignore_index=True,
)


In [None]:
#Phase 1.3
# AI-generated samples in the same two-column layout as the human frame:
# the pooled texts under "abstract", every row labelled "AI".
ai_df = ai_texts.to_frame(name="abstract").assign(label="AI")

In [None]:
#Phase 1.3
# Combine human and AI samples into one labelled frame with a fresh 0..n-1 index.
full_df = pd.concat([human_df, ai_df], ignore_index=True)

# Quick look at the first rows and at the number of rows per label.
print(full_df.head())
print(full_df["label"].value_counts())

In [None]:
#Phase 1.3
# For each text column, print how many rows repeat a value already seen in an
# earlier row of that column (one count per line, in the order listed).
print(
    *(
        df.duplicated([column]).sum()
        for column in (
            'original_abstract',
            'allam_generated_abstract',
            'jais_generated_abstract',
            'llama_generated_abstract',
            'openai_generated_abstract',
        )
    ),
    sep="\n",
)

In [None]:
#Phase 1.3
# Number of missing values per column of the labelled frame.
print(full_df.isnull().sum())

In [None]:
#Phase 2.1
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from datasets import load_dataset
nltk.download('stopwords')

In [None]:
#Phase 2.1
# Preview the labelled frame before preprocessing.
print(full_df.head())

In [None]:
#Phase 2.1
def remove_diacritics(text):
    """Return `text` with Arabic diacritic marks deleted.

    Every code point in the ranges U+0617–U+061A and U+064B–U+0652 is
    removed; all other characters are left untouched.
    """
    return re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)

def normalize_arabic(text):
    """Unify common Arabic letter variants and blank out foreign characters.

    Applied in order: alef variants become bare alef, alef maqsura becomes
    yeh, waw-hamza becomes waw, yeh-hamza becomes yeh, ta marbuta becomes heh.
    Finally every run of characters that is neither a space nor inside the
    range used by the last pattern is replaced by a single space.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ئ", "ي"),
        ("ة", "ه"),
        ("[^؀-ۿ ]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# preprocess_text filters tokens *after* remove_diacritics + normalize_arabic,
# so the stop-word list has to be available in that same normalised spelling.
# With the raw list alone, any stop word written with a hamza-carrying alef,
# alef maqsura or ta marbuta (e.g. "إلى", which normalises to "الي") can never
# match a token. The raw spellings are kept as well, so the set is a superset
# of the original one.
_raw_arabic_stopwords = stopwords.words("arabic")
arabic_stopwords = set(_raw_arabic_stopwords) | {
    normalize_arabic(remove_diacritics(word)).strip()
    for word in _raw_arabic_stopwords
}
arabic_stopwords.discard("")  # an entry may normalise to nothing
stemmer = ISRIStemmer()

def preprocess_text(text):
    """Clean one abstract and return it as a space-joined string of stems.

    Steps: coerce to ``str``, strip diacritics, normalise letter variants and
    blank out characters rejected by ``normalize_arabic``, split on
    whitespace, drop stop words, then stem every remaining token with the
    ISRI stemmer.
    """
    text = str(text)
    text = remove_diacritics(text)
    text = normalize_arabic(text)
    tokens = text.split()
    tokens = [w for w in tokens if w not in arabic_stopwords]
    tokens = [stemmer.stem(w) for w in tokens]
    return " ".join(tokens)

In [None]:
#Phase 2.1
# Run the preprocessing pipeline on every abstract and store the result in a new column.
full_df["abstract_clean"] = full_df["abstract"].apply(preprocess_text)
# Show the first two rows to compare raw and cleaned text.
full_df.head(2)




In [None]:
#Phase 2.2
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import seaborn as sns
import numpy as np

In [None]:
#Phase 2.2
def text_stats(texts):
    """Compute corpus-level statistics over an iterable of texts.

    Parameters
    ----------
    texts : iterable of str
        Whitespace-tokenised on the fly. Non-string entries (e.g. None/NaN)
        are skipped. The iterable is consumed only once, so generators work.

    Returns
    -------
    tuple
        ``(avg_word_len, avg_sent_len, ttr)`` where ``avg_word_len`` is the
        mean number of characters per word, ``avg_sent_len`` the mean number
        of words per text, and ``ttr`` the type-token ratio (distinct words /
        total words). ``(0.0, 0.0, 0.0)`` is returned when there are no words,
        instead of dividing by zero.
    """
    # Tokenise once; both averages and the vocabulary are derived from this.
    token_lists = [txt.split() for txt in texts if isinstance(txt, str)]
    words = [w for tokens in token_lists for w in tokens]
    if not words:
        return 0.0, 0.0, 0.0
    avg_word_len = np.mean([len(w) for w in words])
    avg_sent_len = np.mean([len(tokens) for tokens in token_lists])
    ttr = len(set(words)) / len(words)
    return avg_word_len, avg_sent_len, ttr

In [None]:
#Phase 2.2
# Corpus-level statistics for the human abstracts and for the pooled AI abstracts.
# Each result is the 3-tuple returned by text_stats, indexed below as
# [0] average word length, [1] average words per text, [2] type-token ratio.
stats_human = text_stats(df["original_abstract"])
stats_ai = text_stats(ai_texts)
print("\n Statistical Summary:")
print(f"Human-written: Avg word len={stats_human[0]:.2f}, Avg sent len={stats_human[1]:.2f}, TTR={stats_human[2]:.3f}")
print(f"AI-generated : Avg word len={stats_ai[0]:.2f}, Avg sent len={stats_ai[1]:.2f}, TTR={stats_ai[2]:.3f}")

In [None]:
#Phase 2.2
import matplotlib.pyplot as plt

# Word count of every human abstract, stored next to its source row.
df["human_length"] = df["original_abstract"].apply(lambda x: len(x.split()))
# `ai_texts` stacks four generator columns, so it has four times the rows of
# `df`. Assigning it to a `df` column would align on the index and silently
# keep only the first len(df) values (the first generator's abstracts), so the
# AI lengths are kept as a standalone Series covering all generators.
ai_length = ai_texts.apply(lambda x: len(x.split()))

plt.figure(figsize=(8,5))
plt.hist(df["human_length"], bins=30, alpha=0.6, label="Human-written", color='blue')
plt.hist(ai_length, bins=30, alpha=0.6, label="AI-generated", color='orange')
plt.xlabel("Sentence Length (words)")
plt.ylabel("Frequency")
plt.title("Sentence Length Distribution")
plt.legend()
plt.show()

In [None]:
#Phase 2.2
def type_token_ratio(text):
    """Return distinct words / total words for one whitespace-split text.

    A text without any words yields 0.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

# Per-abstract type-token ratio for the human texts, stored next to its row.
df["human_ttr"] = df["original_abstract"].apply(type_token_ratio)
# `ai_texts` has four times the rows of `df`; storing it as a `df` column
# would align on the index and drop every generator but the first. Keep the AI
# scores as their own Series so the boxplot covers all generated abstracts.
ai_ttr = ai_texts.apply(type_token_ratio)

plt.figure(figsize=(6,5))
plt.boxplot([df["human_ttr"], ai_ttr], labels=["Human", "AI"])
plt.title("Vocabulary Richness (Type–Token Ratio)")
plt.ylabel("TTR Score")
plt.show()

In [None]:
#Phase 2.2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

def plot_top_ngrams(texts, n=2, top_k=10, label="Dataset"):
    """Plot the `top_k` most frequent word n-grams of a text Series.

    Parameters
    ----------
    texts : pandas.Series
        Documents to analyse; missing values are dropped and the rest cast
        to ``str``.
    n : int
        N-gram size (``ngram_range=(n, n)``).
    top_k : int
        Number of n-grams shown in the bar chart.
    label : str
        Name used in the chart title and in the "nothing found" message.

    Returns
    -------
    None
        Draws a horizontal bar chart, or prints a message when no n-gram of
        the requested size exists.
    """
    texts = texts.dropna().astype(str)

    vec = CountVectorizer(ngram_range=(n, n))
    try:
        bag = vec.fit_transform(texts)
    except ValueError as err:
        # CountVectorizer raises "empty vocabulary ..." instead of returning a
        # zero-column matrix, so the shape check below alone never triggers.
        if "empty vocabulary" not in str(err):
            raise
        print(f"No {n}-grams found for {label}.")
        return

    if bag.shape[1] == 0:
        print(f"No {n}-grams found for {label}.")
        return

    # Column sums give the corpus-wide count of every n-gram.
    sum_words = bag.sum(axis=0)
    freqs = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    freqs = sorted(freqs, key=lambda x: x[1], reverse=True)[:top_k]

    words, counts = zip(*freqs)

    plt.figure(figsize=(10, 4))
    sns.barplot(x=list(counts), y=list(words))
    plt.title(f"Top {top_k} {n}-grams for {label} abstracts")
    plt.tight_layout()
    plt.show()

In [None]:
#Phase 2.2
# Bar chart of the most frequent bigrams (n=2) in the human-written abstracts.
plot_top_ngrams(df["original_abstract"], n=2, label="Human")


In [None]:
#Phase 2.2
from collections import Counter
import pandas as pd

# Flatten each corpus into one list of whitespace-separated words.
human_words = " ".join(df["original_abstract"]).split()
ai_words = " ".join(ai_texts).split()

human_freq = Counter(human_words)
ai_freq = Counter(ai_words)

# Counter.keys() is in first-seen order, not frequency order, so slicing it
# selects whatever words happen to appear first. most_common(100) returns the
# 100 most frequent words, which is what "top words" requires.
top_human_words = {word for word, _ in human_freq.most_common(100)}
top_ai_words = {word for word, _ in ai_freq.most_common(100)}
common_words = top_human_words & top_ai_words

# One row per shared word: (word, count in human corpus, count in AI corpus).
data = [(word, human_freq[word], ai_freq[word]) for word in common_words]

# Keep the 15 shared words that are most frequent in the human corpus.
freq_df = pd.DataFrame(data, columns=["word", "Human", "Ai"]).sort_values("Human", ascending=False)[:15]

freq_df.plot(x="word", kind="bar", figsize=(20,4), title="Top Words: Human vs AI", rot=45)
plt.ylabel("Frequency")
plt.show()

In [None]:
#Phase 3.1
import re
import math
import numpy as np
import pandas as pd
import unicodedata
from collections import Counter
from datasets import load_dataset
import regex as re2

In [None]:
#Phase 3.1
def simple_word_tokenize(text):
    """Split `text` into a flat list of tokens.

    The pattern tries, in order: a run of Arabic-script characters, a run of
    word characters, or a single character that is neither whitespace nor a
    word character. Whitespace itself never becomes a token.
    """
    token_pattern = r"\p{Arabic}+|\w+|[^\s\w]"
    return re2.findall(token_pattern, text, flags=re2.VERSION1)

def sentence_tokenize(text):
    """Break `text` into sentences at whitespace that follows end punctuation.

    A boundary is any whitespace run preceded by ``.``, ``?``, ``!``, U+061F
    or U+061B; the punctuation stays attached to its sentence. Non-string
    input gives an empty list, and blank pieces are dropped.
    """
    if not isinstance(text, str):
        return []
    sentences = []
    for piece in re.split(r'(?<=[\.\?\!\u061F\u061B])\s+', text):
        piece = piece.strip()
        if piece:
            sentences.append(piece)
    return sentences

def paragraph_tokenize(text):
    """Break `text` into paragraphs separated by a blank line.

    The text is stripped first, then split wherever two line breaks occur
    with only whitespace around and between them. Non-string input gives an
    empty list, and blank pieces are dropped.
    """
    if not isinstance(text, str):
        return []
    chunks = re.split(r'\s*\n\s*\n\s*|\s*\r\n\s*\r\n\s*', text.strip())
    stripped = (chunk.strip() for chunk in chunks)
    return [chunk for chunk in stripped if chunk]

In [None]:
#Phase 3.1
# Column holding the raw abstract and column holding its preprocessed form.
original_text_col = "abstract"
clean_text_col = "abstract_clean"


def _clean_tokens(text):
    """Tokenize one cleaned abstract; non-string values give an empty list."""
    if not isinstance(text, str):
        return []
    return [tok for tok in simple_word_tokenize(text) if tok.strip()]


def _word_tokens(tokens):
    """Keep only the tokens that contain at least one word character."""
    return [tok for tok in tokens if re.search(r'\w', tok)]


# Tokens and words come from the cleaned text ...
full_df["tokens"] = full_df[clean_text_col].apply(_clean_tokens)
full_df["words"] = full_df["tokens"].apply(_word_tokens)

# ... while sentences and paragraphs are taken from the raw text.
full_df["sentences"] = full_df[original_text_col].apply(sentence_tokenize)
full_df["paragraphs"] = full_df[original_text_col].apply(paragraph_tokenize)

print("Feature engineering completed! Columns now:")
print(full_df.columns)
full_df.head(2)

In [None]:
#Phase3.1 Feature 16: Number of words with repeated letters
# Column name for this feature, prefixed with the name of the cleaned-text column.
feature_name = f'{clean_text_col}_f016_words_with_repeated_letters'

def _words_with_repeated_letters(words):
    """Return how many words contain two identical adjacent characters.

    An empty or missing word list counts as 0.
    """
    if not words:
        return 0
    doubled_char = re.compile(r'(.)\1')
    return len([word for word in words if doubled_char.search(word)])

# Apply the counter to every row's "words" list.
full_df[feature_name] = full_df["words"].apply(_words_with_repeated_letters)

In [None]:
#Phase3.1 Feature 34: Total number of sentences (S)
# Number of entries in each row's "sentences" list.
full_df['f034_Total_number_of_sentences_(S)'] = full_df["sentences"].apply(len)


In [None]:
#Phase 3.1 Feature 39: Average number of words/ S
def _average_words_per_sentence(row):
    """Length of the row's "words" list divided by its sentence count (0 if no sentences)."""
    sentence_count = row['f034_Total_number_of_sentences_(S)']
    if sentence_count > 0:
        return len(row['words']) / sentence_count
    return 0


# NOTE(review): "words" is derived from the cleaned text while the sentence
# count comes from the raw text — confirm this mix is intended.
full_df['f039_Average_words_per_sentence'] = full_df.apply(
    _average_words_per_sentence, axis=1
)

In [None]:
#Phase3.1 Feature 62: Number of imperfective
col = original_text_col
morph_features_col = f'{col}_morph_features'

if morph_features_col not in full_df.columns:
    # No morphological-features column present: fall back to a constant 0.
    full_df[f'{col}_f062_num_imperfective'] = 0
else:
    # Count the falsy entries of each row's 'is_perfective' list
    # (a missing key counts as an empty list).
    full_df[f'{col}_f062_num_imperfective'] = full_df[morph_features_col].apply(
        lambda d: sum(not perf for perf in d.get('is_perfective', []))
    )

In [None]:
#Phase 3.1 Feature 85: Sentence Length Variance: Variance in the number of words per sentence.
def sentence_length_variance(text):
    """Population variance of the per-sentence word counts of `text`.

    Sentences come from ``sentence_tokenize`` and words from whitespace
    splitting. Texts with at most one sentence return 0.
    """
    word_counts = [len(sentence.split()) for sentence in sentence_tokenize(text)]
    n_sentences = len(word_counts)
    if n_sentences <= 1:
        return 0
    mean_len = sum(word_counts) / n_sentences
    squared_deviations = [(count - mean_len) ** 2 for count in word_counts]
    return sum(squared_deviations) / n_sentences

# Compute the feature on the raw-text column and store it under a column name
# prefixed with that column's name.
col = original_text_col
feature_name = f"{col}_f085_sentence_length_variance"

full_df[feature_name] = full_df[col].apply(sentence_length_variance)

In [None]:
#Phase 3.1 Feature 108: Politeness Score: Measures politeness.
# Polite expressions searched for in the raw text (matched as written,
# including their diacritics).
polite_words = [
    "من فضلك", "شكراً", "لو سمحت", "عفواً",
    "من فضل حضرتك", "تكرماً", "فضلاً", "شاكراً لك",
    "مع الشكر", "أكون لك من الشاكرين", "متفضلاً",
    "أرجوك", "لطفاً", "إذا سمحت", "لو تكرمت",
    "شكراً جزيلاً", "جزاك الله خيراً", "بارك الله فيك",
    "شكراً لك", "أشكرك", "ممتن لك", "أكون ممتناً",
    "تفضّل", "من لطفك", "يسعدني", "أتمنى منك",
    "لو سمحتم", "لو تفضلتم", "بكل لطف", "كل الاحترام"
]

# Several entries are prefixes of longer entries in the same list. Counting
# each entry independently scores such an expression twice. One alternation
# with the longest phrases first makes every stretch of text count at most
# once, for the longest phrase that matches there.
_polite_pattern = re.compile(
    "|".join(
        re.escape(phrase)
        for phrase in sorted(polite_words, key=len, reverse=True)
    )
)

def politeness_score(text):
    """Return polite-expression occurrences per token of `text`.

    Parameters
    ----------
    text : str
        Raw text; anything that is not a non-empty string scores 0.0.

    Returns
    -------
    float
        Number of non-overlapping matches of any entry of ``polite_words``
        divided by the number of tokens from ``simple_word_tokenize``;
        0.0 when the text has no tokens.
    """
    if not isinstance(text, str) or len(text) == 0:
        return 0.0

    # The input is a non-empty str at this point, so no blanket
    # try/except is kept: it would only hide genuine programming errors.
    count = len(_polite_pattern.findall(text))
    total_words = len(simple_word_tokenize(text))
    return (count / total_words) if total_words > 0 else 0.0

# Score the raw-text column and store the result under a column name prefixed
# with that column's name.
col = original_text_col
feature_name = f'{col}_f108_politeness_score'

full_df[feature_name] = full_df[col].apply(politeness_score)

In [None]:
#Phase 3.2
from sklearn.model_selection import train_test_split
# 70 % train / 30 % hold-out. The four generator columns are pooled into the
# "AI" class against one human column, so the labels are imbalanced;
# stratifying keeps the Human/AI proportion identical in every split.
train_df, temp_df = train_test_split(
    full_df,
    test_size=0.30,
    random_state=42,
    shuffle=True,
    stratify=full_df["label"]
)

# Split the hold-out in half: 15 % validation / 15 % test of the full data.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    shuffle=True,
    stratify=temp_df["label"]
)


In [None]:
#Phase 3.2
# Row counts of the full frame and of each split, one per line.
print(
    *(
        f"{split_name}: {len(frame)}"
        for split_name, frame in (
            ("TOTAL", full_df),
            ("TRAIN", train_df),
            ("VAL", val_df),
            ("TEST", test_df),
        )
    ),
    sep="\n",
)

In [None]:
#Phase 3.2
# Preview the first five rows of the feature-enriched frame.
full_df.head(5)

In [None]:
pip install xlsxwriter

In [None]:
# Export the processed frame to an Excel workbook (row index omitted).
# `full_df.head` without parentheses is a bound method and has no `to_excel`
# attribute, so the export has to be called on the DataFrame itself.
output_file = "processed data1.xlsx"
full_df.to_excel(output_file, index=False, engine='xlsxwriter')


In [None]:
#Phase 4

from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams and bigrams, capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    max_features=5000
)

# Vocabulary and IDF weights are learned from the training split only; the
# validation and test splits are merely transformed with them.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["abstract_clean"])
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split["abstract_clean"])
    for split in (val_df, test_df)
)

print("TF-IDF shapes:")
print(f"Train: {X_train_tfidf.shape}")
print(f"Validation: {X_val_tfidf.shape}")
print(f"Test: {X_test_tfidf.shape}")

In [None]:
#Phase 4

from scipy.sparse import hstack
# Columns that are the target, raw/cleaned text, or token lists — never features.
EXCLUDED_COLS = [
    'label',
    'abstract',
    'abstract_clean',
    'tokens',
    'words',
    'sentences',
    'paragraphs'
]

# Every remaining numeric column of the training frame is a hand-crafted feature.
numeric_cols = [
    name
    for name in train_df.select_dtypes(include=np.number).columns
    if name not in EXCLUDED_COLS
]

X_train_num_array, X_val_num_array, X_test_num_array = (
    split[numeric_cols].values for split in (train_df, val_df, test_df)
)

# Final design matrices: TF-IDF columns followed by the numeric features.
X_train = hstack((X_train_tfidf, X_train_num_array))
X_val = hstack((X_val_tfidf, X_val_num_array))
X_test = hstack((X_test_tfidf, X_test_num_array))

y_train, y_val, y_test = (
    split["label"] for split in (train_df, val_df, test_df)
)

In [None]:
#Phase 4

# Feature-matrix and target shapes of every split.
print("X and y are ready for ML models.")
print(f"Train: {X_train.shape} {y_train.shape}")
print(f"Validation: {X_val.shape} {y_val.shape}")
print(f"Test: {X_test.shape} {y_test.shape}")

In [None]:
#Phase 4.1

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline: logistic regression on TF-IDF + numeric features
# (iteration cap raised to 3000, fixed seed).
lr_model = LogisticRegression(random_state=42, max_iter=3000).fit(X_train, y_train)

# Evaluate on the validation split.
y_val_pred = lr_model.predict(X_val)

print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred))

In [None]:
#Phase 4.1

# Predict the held-out test split with the fitted logistic regression.
y_test_pred = lr_model.predict(X_test)

# Overall accuracy followed by the per-class classification report.
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred))

import seaborn as sns
import matplotlib.pyplot as plt

# Pin rows/columns to the model's own class order and print the class names on
# both axes; without tick labels the heatmap only shows positions 0 and 1 and
# the reader cannot tell which cell belongs to which class.
class_names = lr_model.classes_
cm = confusion_matrix(y_test, y_test_pred, labels=class_names)

sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

In [None]:
#Phase 4.2
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report


In [None]:
#Phase 4.2
# Registry of fitted classical models, keyed by a display name.
models = {}

# Support vector classifier with a linear kernel (C=1.0, fixed seed),
# fitted on the combined training features.
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)

In [None]:
#Phase 4.2
# Validation-split predictions, accuracy and per-class report for the SVM.
y_val_pred_svm = svm_model.predict(X_val)
print("SVM Validation Accuracy:", accuracy_score(y_val, y_val_pred_svm))
print(classification_report(y_val, y_val_pred_svm))

# Register the fitted model under the name 'SVM'.
models['SVM'] = svm_model

In [None]:
#Phase 4.2
# Random forest: 200 trees, no depth limit, fixed seed, all CPU cores (n_jobs=-1).
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
# Fit on the combined training features.
rf_model.fit(X_train, y_train)




In [None]:
#Phase 4.2
# Validation-split predictions, accuracy and per-class report for the random forest.
y_val_pred_rf = rf_model.predict(X_val)
print("Random Forest Validation Accuracy:", accuracy_score(y_val, y_val_pred_rf))
print(classification_report(y_val, y_val_pred_rf))

# Register the fitted model under the name 'RandomForest'.
models['RandomForest'] = rf_model

In [None]:
#Phase 4.2
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Models to evaluate on the test split, looked up in the `models` registry.
model_names = ['SVM', 'RandomForest']

for name in model_names:
    model = models[name]

    y_test_pred = model.predict(X_test)

    print(f"\n===== {name} Test Evaluation =====")
    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))

    # Pin rows/columns to the estimator's class order and label both axes with
    # the class names; otherwise the heatmap only shows positions 0 and 1.
    class_names = model.classes_
    cm = confusion_matrix(y_test, y_test_pred, labels=class_names)
    plt.figure(figsize=(5,4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

In [None]:
#Phase 4.3
from sentence_transformers import SentenceTransformer
import numpy as np

# Multilingual MiniLM sentence encoder.
bert_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


def _encode_split(frame):
    """Embed the "abstract_clean" texts of one split as a NumPy array."""
    return bert_model.encode(
        frame["abstract_clean"].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True
    )


# NOTE(review): the encoder receives the stemmed, stop-word-filtered text, not
# the raw abstracts — confirm this is the intended input.
X_train_emb, X_val_emb, X_test_emb = (
    _encode_split(split) for split in (train_df, val_df, test_df)
)

# Targets as plain NumPy arrays (this rebinds y_train / y_val / y_test).
y_train, y_val, y_test = (
    split["label"].values for split in (train_df, val_df, test_df)
)

print("Train embedding shape:", X_train_emb.shape)

In [None]:
#Phase 4.3
# The Keras `models` module is imported under an alias: a plain
# `import ... models` would rebind the notebook-level name `models`, which the
# Phase 4.2 cells use as the dict of fitted scikit-learn models, and re-running
# any of those cells afterwards would fail.
from tensorflow.keras import layers, models as keras_models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Feed-forward classifier on top of the sentence embeddings:
# two Dense -> BatchNorm -> Dropout(0.3) stages (256 and 128 units), then a
# single sigmoid unit for the binary decision.
ffnn_model = keras_models.Sequential([
    layers.Input(shape=(X_train_emb.shape[1],)),
    layers.Dense(256, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(128, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(1, activation="sigmoid")
])

ffnn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

ffnn_model.summary()

In [None]:
#Phase 4.3
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
# Turn XLA JIT compilation off.
tf.config.optimizer.set_jit(False)

# Embeddings as float32 (idempotent: re-running leaves them unchanged).
X_train_emb = X_train_emb.astype("float32")
X_val_emb   = X_val_emb.astype("float32")
X_test_emb  = X_test_emb.astype("float32")

label_encoder = LabelEncoder()

# Encode straight from the split frames instead of from the current
# y_train / y_val / y_test. Those names are overwritten with the encoded
# integers below, so encoding from them would, on a second run of this cell,
# fit the encoder on 0/1 and lose the original class names in `classes_`.
# The result is cast to int32 for Keras.
y_train = label_encoder.fit_transform(train_df["label"]).astype("int32")
y_val   = label_encoder.transform(val_df["label"]).astype("int32")
y_test  = label_encoder.transform(test_df["label"]).astype("int32")

# Position i in this array is the class encoded as integer i.
print("Encoded classes:", label_encoder.classes_)




In [None]:
#Phase 4.3
# Stop when the validation loss has not improved for 3 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
    verbose=1
)

# Write the model to "best_ffnn_bert.h5" whenever the validation loss improves.
checkpoint = ModelCheckpoint(
    "best_ffnn_bert.h5",
    monitor="val_loss",
    save_best_only=True,
    verbose=1
)

# Train for at most 20 epochs with mini-batches of 32, validating on the
# validation split after every epoch.
history = ffnn_model.fit(
    X_train_emb, y_train,
    validation_data=(X_val_emb, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[early_stop, checkpoint],
    verbose=2
)



In [None]:
#Phase 4.3
from sklearn.metrics import accuracy_score, classification_report

# Sigmoid outputs for the test embeddings, thresholded at 0.5 and flattened
# into a 1-D array of 0/1 predictions.
y_test_pred_prob = ffnn_model.predict(X_test_emb)
y_test_pred = (y_test_pred_prob > 0.5).astype("int32").ravel()

# Accuracy and per-class report against the integer-encoded test labels.
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
#Phase 4.4
import os
import joblib
from tensorflow.keras.models import Model as KerasModel

def save_all_models(models_dict, save_dir="models"):
    """Persist every model in `models_dict` inside `save_dir`.

    Parameters
    ----------
    models_dict : dict
        Maps a file stem to a fitted model object.
    save_dir : str
        Target directory; created if it does not exist.

    Keras models are written with their own ``save`` method to
    ``<name>.h5``; every other object is dumped with joblib to
    ``<name>.pkl``. A line is printed for each file written.
    """
    os.makedirs(save_dir, exist_ok=True)

    for model_name, model_obj in models_dict.items():
        is_keras = isinstance(model_obj, KerasModel)
        extension, kind = (".h5", "Keras") if is_keras else (".pkl", "Pickle")
        file_path = os.path.join(save_dir, model_name + extension)

        if is_keras:
            model_obj.save(file_path)
        else:
            joblib.dump(model_obj, file_path)
        print(f"[Saved] {kind} model → {file_path}")

    print("\nAll models saved successfully!")

In [None]:
#Phase 4.4
import os
# Make sure the output directory exists.
os.makedirs("models", exist_ok=True)
# File stem -> fitted model, covering all four trained models.
models_dict = {
    "lr_model": lr_model,
    "svm": svm_model,
    "random_forest": rf_model,
    "ffnn": ffnn_model
}

# Write every model to the default "models" directory.
save_all_models(models_dict)
