Import Libraries (Phase One)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #for ploting  
from datasets import load_dataset

Load dataset from Hugging Face

In [3]:
# Load the dataset identified by this Hugging Face hub id; the resulting
# object is what later cells index by subset name (e.g. dataset["by_polishing"]).
dataset = load_dataset("KFUPM-JRCAI/arabic-generated-abstracts")

In [5]:
# Show the loaded object's summary: the available subsets, their feature
# columns, and the number of rows in each.
print(dataset)

DatasetDict({
    by_polishing: Dataset({
        features: ['original_abstract', 'allam_generated_abstract', 'jais_generated_abstract', 'llama_generated_abstract', 'openai_generated_abstract'],
        num_rows: 2851
    })
    from_title: Dataset({
        features: ['original_abstract', 'allam_generated_abstract', 'jais_generated_abstract', 'llama_generated_abstract', 'openai_generated_abstract'],
        num_rows: 2963
    })
    from_title_and_content: Dataset({
        features: ['original_abstract', 'allam_generated_abstract', 'jais_generated_abstract', 'llama_generated_abstract', 'openai_generated_abstract'],
        num_rows: 2574
    })
})


Prepare Subset Function 

In [14]:
def prepare_subset(name, source=None):
    """Reshape one subset into a long table of labelled texts.

    Each row of the subset holds one human-written abstract and four
    model-generated versions of it. The result stacks those five columns
    into a single ``text`` column: the human abstracts first, followed by
    the allam, jais, llama and openai outputs, in that order.

    Parameters
    ----------
    name : str
        Key of the subset to read (for example ``"by_polishing"``).
    source : mapping, optional
        Object that maps subset names to items exposing ``to_pandas()``.
        When omitted, the notebook-level ``dataset`` variable is used, so
        existing ``prepare_subset(name)`` calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``label`` (``"human"`` or ``"AI"``) and ``subset``
        (always ``name``), with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``name`` is not in the source or an expected column is missing.
    """
    # Fall back to the global only when no explicit source was supplied, so
    # the function can be exercised without the real Hugging Face dataset.
    splits = dataset if source is None else source
    df = splits[name].to_pandas()

    # (column, label) pairs in the order they must appear in the output.
    labelled_columns = [
        ("original_abstract", "human"),
        ("allam_generated_abstract", "AI"),
        ("jais_generated_abstract", "AI"),
        ("llama_generated_abstract", "AI"),
        ("openai_generated_abstract", "AI"),
    ]

    frames = [
        pd.DataFrame({"text": df[column], "label": label, "subset": name})
        for column, label in labelled_columns
    ]

    # A single concat with ignore_index=True gives the same rows and index as
    # merging the AI frames first and then appending them to the human frame.
    return pd.concat(frames, ignore_index=True)


# Confirmation message
print(" Function prepare_subset()is Defined")

 Function prepare_subset()is Defined


Data Exploration

In [27]:
def explore_subset(df, name):
    """Print a quick exploratory summary of one prepared subset.

    The report contains the row count, the number of ``"human"`` and ``"AI"``
    labelled rows, the first five rows, the ``DataFrame.info()`` summary,
    descriptive statistics, per-column missing-value counts and the number
    of fully duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe; must contain a ``label`` column.
    name : str
        Subset name, used only in the report header.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"\n📊 Subset: {name}")
    print("Total rows:", df.shape[0])

    # Count Human vs AI
    human_count = (df["label"] == "human").sum()
    ai_count = (df["label"] == "AI").sum()
    print("Human texts:", human_count)
    print("AI texts:", ai_count)

    # First 5 rows
    print("\n📝 First 5 rows:")
    print(df.head())

    # DataFrame info
    print("\nℹ️ DataFrame info:")
    # info() writes its report to stdout itself and returns None; wrapping it
    # in print() would append a stray "None" line to the output.
    df.info()

    # Descriptive statistics
    print("\n📊 Descriptive statistics:")
    print(df.describe(include="all"))

    # Missing values
    print("\n❓ Missing values:")
    print(df.isnull().sum())

    # Duplicates
    print("\n🔁 Number of duplicate rows:", df.duplicated().sum())

print(" Function explore_subset() is ready to use")

 Function explore_subset() is ready to use


Process Subsets

In [28]:
# Build and explore each subset in turn, keeping the prepared frames keyed by
# subset name so the individual variables can be bound afterwards.
prepared_frames = {}
for subset_key in ("by_polishing", "from_title", "from_title_and_content"):
    prepared_frames[subset_key] = prepare_subset(subset_key)
    explore_subset(prepared_frames[subset_key], subset_key)

# Expose the per-subset frames under the names the later cells expect.
df_by_polishing = prepared_frames["by_polishing"]
df_from_title = prepared_frames["from_title"]
df_from_title_and_content = prepared_frames["from_title_and_content"]


📊 Subset: by_polishing
Total rows: 14255
Human texts: 2851
AI texts: 11404

📝 First 5 rows:
                                                text  label        subset
0  كثيرا ما ارتبطت المصادر التاريخية في الأندلس خ...  human  by_polishing
1  يعد العامل الثقافي احد ابرز الاسباب التي يعزى ...  human  by_polishing
2  شكلت تلك الجهود والمساعي الرائدة التي قام بها ...  human  by_polishing
3  يقوم المقال على اشكالية الضرائب الغير شرعية في...  human  by_polishing
4  تتفق المصادر التاريخية المتوفرة حول موضوع تطور...  human  by_polishing

ℹ️ DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14255 entries, 0 to 14254
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14255 non-null  object
 1   label   14255 non-null  object
 2   subset  14255 non-null  object
dtypes: object(3)
memory usage: 334.2+ KB
None

📊 Descriptive statistics:
                                                     text  label        subset
co

Merge All Data

In [29]:
# Stack the three prepared subsets into one frame with a fresh 0..n-1 index.
frames_to_merge = [df_by_polishing, df_from_title, df_from_title_and_content]
df_all = pd.concat(frames_to_merge, ignore_index=True)

# Compute the class and subset tallies before printing the report.
label_counts = df_all["label"].value_counts()
subset_counts = df_all["subset"].value_counts()

print("\n✅ Final merged DataFrame")
print("Total texts:", len(df_all))
print(label_counts)
print("Subsets distribution:", subset_counts)


✅ Final merged DataFrame
Total texts: 41940
label
AI       33552
human     8388
Name: count, dtype: int64
Subsets distribution: subset
from_title                14815
by_polishing              14255
from_title_and_content    12870
Name: count, dtype: int64
