In [12]:
import os
import numpy as np
import cv2
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [16]:
# Step 1: Loading IAM dataset

# Path to the IAM dataset folder
iam_dataset_path = r"C:\Users\Carine\Desktop\data\human"

# Get all subfolders (writers) and load images
image_paths = []
human_labels = []

for subfolder in os.listdir(iam_dataset_path):
    subfolder_path = os.path.join(iam_dataset_path, subfolder)
    if os.path.isdir(subfolder_path):
        for img_file in os.listdir(subfolder_path):
            if img_file.endswith(".png"):
                image_paths.append(os.path.join(subfolder_path, img_file))
                human_labels.append(0)  # 0 = human

# Read images in grayscale, resize, normalize
human_images = []
for img_path in image_paths:
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (128, 128))  # or any size you prefer
    human_images.append(img)

human_images = np.array(human_images, dtype="float32") / 255.0
human_labels = np.array(human_labels)

In [None]:
# Step 2: Processing DeepWriting Dataset

# Paths to DeepWriting datasets
deepwriting_train_path = r"C:\Users\Carine\Desktop\data\AI\deepwriting_training.npz"
deepwriting_val_path = r"C:\Users\Carine\Desktop\data\AI\deepwriting_validation.npz"

# Output folders for processed images
train_output_path = r"C:\Users\Carine\Desktop\data\AI\deepwriting_training_images"
val_output_path = r"C:\Users\Carine\Desktop\data\AI\deepwriting_validation_images"
os.makedirs(train_output_path, exist_ok=True)
os.makedirs(val_output_path, exist_ok=True)

# Function to load, convert strokes to images, and save them
def process_deepwriting(npz_path, output_folder):
    data = np.load(npz_path, allow_pickle=True)
    strokes = data["strokes"]  # Extract stroke sequences

    def strokes_to_image(stroke_data, img_size=128):
        x = np.cumsum(stroke_data[:, 0])
        y = np.cumsum(stroke_data[:, 1])

        x = ((x - x.min()) / (x.max() - x.min()) * (img_size - 1)).astype(np.int32)
        y = ((y - y.min()) / (y.max() - y.min()) * (img_size - 1)).astype(np.int32)

        img = np.ones((img_size, img_size), dtype=np.uint8) * 255  # White background

        for i in range(len(x) - 1):
            cv2.line(img, (x[i], y[i]), (x[i+1], y[i+1]), 0, thickness=2)

        return img

    for idx, stroke_data in enumerate(strokes):
        img = strokes_to_image(stroke_data)
        cv2.imwrite(os.path.join(output_folder, f"deepwriting_{idx}.png"), img)

    print(f"Processed {len(strokes)} images and saved to {output_folder}")

# Process training and validation datasets
process_deepwriting(deepwriting_train_path, train_output_path)
process_deepwriting(deepwriting_val_path, val_output_path)


Processed 34577 images and saved to C:\Users\Carine\Desktop\data\AI\deepwriting_training_images
Processed 705 images and saved to C:\Users\Carine\Desktop\data\AI\deepwriting_validation_images


In [17]:
# Step 3: Loading DeepWriting Dataset

# Paths to converted DeepWriting images
deepwriting_train_path = r"C:\Users\Carine\Desktop\data\AI\deepwriting_training_images"
deepwriting_val_path = r"C:\Users\Carine\Desktop\data\AI\deepwriting_validation_images"

deepwriting_image_paths = []
ai_labels = []

# Load training images
for img_file in os.listdir(deepwriting_train_path):
    if img_file.endswith(".png"):
        deepwriting_image_paths.append(os.path.join(deepwriting_train_path, img_file))
        ai_labels.append(1)  # Label 1 = AI-generated handwriting

# Load validation images
for img_file in os.listdir(deepwriting_val_path):
    if img_file.endswith(".png"):
        deepwriting_image_paths.append(os.path.join(deepwriting_val_path, img_file))
        ai_labels.append(1)  # AI label

# Read, resize, normalize AI images
ai_images = []
for img_path in deepwriting_image_paths:
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (128, 128))  # Standardizing size
    ai_images.append(img)

ai_images = np.array(ai_images, dtype="float32") / 255.0
ai_labels = np.array(ai_labels)


In [21]:
# Step 4: Merge Before Undersampling

# Merge human and AI images & labels
all_images = np.concatenate((human_images, ai_images), axis=0)
all_labels = np.concatenate((human_labels, ai_labels), axis=0)

# Flatten image data for undersampling
all_images_flat = all_images.reshape(all_images.shape[0], -1)

# Print original dataset size
print("Before Undersampling:")
print("Total samples:", all_images_flat.shape[0])
print("Human samples:", sum(all_labels == 0))
print("AI samples:", sum(all_labels == 1))

Before Undersampling:
Total samples: 36821
Human samples: 1539
AI samples: 35282


In [22]:
# Step 5: Undersampling to Balance Data 

# Set undersampling strategy to match human sample count
human_sample_count = sum(all_labels == 0)
rus = RandomUnderSampler(sampling_strategy={0: human_sample_count, 1: human_sample_count}, random_state=42)

X_balanced_flat, y_balanced = rus.fit_resample(all_images_flat, all_labels)

# Reshape back to (128,128) images
X_balanced = X_balanced_flat.reshape(-1, 128, 128)

# Print new dataset size after undersampling
print("\nAfter Undersampling:")
print("Balanced Total samples:", X_balanced.shape[0])
print("Balanced Human samples:", sum(y_balanced == 0))
print("Balanced AI samples:", sum(y_balanced == 1))


After Undersampling:
Balanced Total samples: 3078
Balanced Human samples: 1539
Balanced AI samples: 1539


In [24]:
# Step 5: Split into 80% Training, 20% Validation

X_train, X_val, y_train, y_val = train_test_split(
    X_balanced, y_balanced,
    test_size=0.2,       # 20% validation
    random_state=42,
    shuffle=True,
    stratify=y_balanced  # Preserves class balance
)

print("\nFinal Split:")
print("Training set:", X_train.shape)
print("Validation set:", X_val.shape)
print("Human in train:", sum(y_train == 0), "AI in train:", sum(y_train == 1))
print("Human in val:", sum(y_val == 0), "AI in val:", sum(y_val == 1))


Final Split:
Training set: (2462, 128, 128)
Validation set: (616, 128, 128)
Human in train: 1231 AI in train: 1231
Human in val: 308 AI in val: 308


In [25]:
# Step 6: Save Final Dataset
# Set path for saving the dataset
save_path = r"C:\Users\Carine\Desktop\data\human_vs_ai_dataset.npz"

# Save data as NPZ format
np.savez_compressed(save_path, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

print(f"\n✅ Final balanced dataset saved at: {save_path}")


✅ Final balanced dataset saved at: C:\Users\Carine\Desktop\data\human_vs_ai_dataset.npz


In [26]:
import hashlib

# Compute hash of each image to check for duplicates
train_hashes = {hashlib.md5(x.tobytes()).hexdigest() for x in X_train}
val_hashes = {hashlib.md5(x.tobytes()).hexdigest() for x in X_val}

# Intersection should be zero (i.e., no duplicates)
data_leakage_check = train_hashes.intersection(val_hashes)
print(f"Number of overlapping samples: {len(data_leakage_check)}")

Number of overlapping samples: 0


In [27]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring="accuracy")
print(f"Cross-validation Accuracy: {np.mean(cv_scores) * 100:.2f}%")

NameError: name 'best_rf' is not defined