# Load In

In [None]:
import os
import kagglehub

# Kaggle dataset identifier passed to kagglehub.
dataset_name = "alessandrasala79/ai-vs-human-generated-dataset"

# Fetch the dataset and remember the local directory it was placed in.
dataset_path = kagglehub.dataset_download(dataset_name)

print(f"Dataset downloaded to: {dataset_path}")


Downloading from https://www.kaggle.com/api/v1/datasets/download/alessandrasala79/ai-vs-human-generated-dataset?dataset_version_number=4...


100%|██████████| 9.76G/9.76G [03:55<00:00, 44.5MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4


In [None]:
# prompt: print files names in dataset

import os

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# printed listing stable from one run to the next.
for filename in sorted(os.listdir(dataset_path)):
    print(filename)


test_data_v2
train_data
train.csv
test.csv


In [None]:
import os
import pandas as pd

# ----- Dataset locations -----
# Reuse the directory returned by kagglehub.dataset_download() in the download
# cell instead of a hard-coded cache path, so this cell keeps working when the
# dataset version or the cache location is different.
BASE_PATH = dataset_path
TRAIN_IMG_PATH = os.path.join(BASE_PATH, "train_data")
TEST_IMG_PATH = os.path.join(BASE_PATH, "test_data_v2")

# Define paths for CSV files
TRAIN_CSV = os.path.join(BASE_PATH, "train.csv")
TEST_CSV = os.path.join(BASE_PATH, "test.csv")

# Load CSV files
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Display first few rows of train and test data
print("Train Data Sample:")
display(train_df.head())

print("\nTest Data Sample:")
display(test_df.head())

# Count the number of entries in the train and test image folders
num_train_images = len(os.listdir(TRAIN_IMG_PATH))
num_test_images = len(os.listdir(TEST_IMG_PATH))

print(f"Number of Train Images: {num_train_images}")
print(f"Number of Test Images: {num_test_images}")

# Map each image path from train.csv to its label (ground truth)
true_labels_dict = dict(zip(train_df["file_name"], train_df["label"]))

# Print a sample of ground truth labels (label 1 is reported as AI-generated,
# anything else as Human-generated)
print("Sample Ground Truth Labels:")
for img, label in list(true_labels_dict.items())[:5]:  # Show first 5 entries
    print(f"{img}: {'AI-generated' if label == 1 else 'Human-generated'}")


Train Data Sample:


Unnamed: 0.1,Unnamed: 0,file_name,label
0,0,train_data/a6dcb93f596a43249135678dfcfc17ea.jpg,1
1,1,train_data/041be3153810433ab146bc97d5af505c.jpg,0
2,2,train_data/615df26ce9494e5db2f70e57ce7a3a4f.jpg,1
3,3,train_data/8542fe161d9147be8e835e50c0de39cd.jpg,0
4,4,train_data/5d81fa12bc3b4cea8c94a6700a477cf2.jpg,1



Test Data Sample:


Unnamed: 0,id
0,test_data_v2/1a2d9fd3e21b4266aea1f66b30aed157.jpg
1,test_data_v2/ab5df8f441fe4fbf9dc9c6baae699dc7.jpg
2,test_data_v2/eb364dd2dfe34feda0e52466b7ce7956.jpg
3,test_data_v2/f76c2580e9644d85a741a42c6f6b39c0.jpg
4,test_data_v2/a16495c578b7494683805484ca27cf9f.jpg


Number of Train Images: 79950
Number of Test Images: 5540
Sample Ground Truth Labels:
train_data/a6dcb93f596a43249135678dfcfc17ea.jpg: AI-generated
train_data/041be3153810433ab146bc97d5af505c.jpg: Human-generated
train_data/615df26ce9494e5db2f70e57ce7a3a4f.jpg: AI-generated
train_data/8542fe161d9147be8e835e50c0de39cd.jpg: Human-generated
train_data/5d81fa12bc3b4cea8c94a6700a477cf2.jpg: AI-generated


# Xception

In [None]:
import os
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# -------------------------
# 1. Download the dataset
# -------------------------
dataset_name = "alessandrasala79/ai-vs-human-generated-dataset"
dataset_path = kagglehub.dataset_download(dataset_name)
print(f"Dataset downloaded to: {dataset_path}")

# -------------------------
# 2. Set paths and check directory structure
# -------------------------
BASE_PATH = dataset_path  # Root folder of the downloaded dataset (.../versions/4)
print("\nFiles in BASE_PATH:")
print(os.listdir(BASE_PATH))

TRAIN_CSV, TEST_CSV = (
    os.path.join(BASE_PATH, csv_name) for csv_name in ("train.csv", "test.csv")
)
TRAIN_IMG_PATH, TEST_IMG_PATH = (
    os.path.join(BASE_PATH, folder) for folder in ("train_data", "test_data_v2")
)

# -------------------------
# 3. Read CSV files
# -------------------------
train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

for title, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\n{title} Data (head):")
    print(frame.head())

# -------------------------
# 4. Fix file names in train_df
# -------------------------
# Drop the "train_data/" folder prefix so the column holds bare file names,
# which is what the on-disk comparison below works with.
train_df["file_name"] = [
    name.replace("train_data/", "").strip() for name in train_df["file_name"]
]

# -------------------------
# 5. Validate that file names exist in TRAIN_IMG_PATH
# -------------------------
train_files_on_disk = set(os.listdir(TRAIN_IMG_PATH))
train_on_disk_mask = train_df["file_name"].isin(train_files_on_disk)
valid_train_df = train_df.loc[train_on_disk_mask].copy()
print(f"\nNumber of valid train images: {len(valid_train_df)}")
print("Sample of valid train_df:")
print(valid_train_df.head())

# -------------------------
# 6. Prepare test_df for submission predictions
# -------------------------
# The test CSV names its path column 'id'; rename it to 'file_name' and drop
# the "test_data_v2/" folder prefix, mirroring the train frame.
test_df = test_df.rename(columns={"id": "file_name"})
test_df["file_name"] = [
    name.replace("test_data_v2/", "").strip() for name in test_df["file_name"]
]

test_files_on_disk = set(os.listdir(TEST_IMG_PATH))
test_on_disk_mask = test_df["file_name"].isin(test_files_on_disk)
valid_test_df = test_df.loc[test_on_disk_mask].copy()
print(f"\nNumber of valid test images (for submission): {len(valid_test_df)}")
# The test frame carries no labels; it is only used to produce the submission.

# -------------------------
# 7. Create Training Dataset Using All Valid Images (~80k)
# -------------------------
# Use every training row whose image was found on disk.
full_train_df = valid_train_df.copy()
# Split into 90% training and 10% validation. The split is stratified on the
# label (as the RegNet cell in this notebook already does) so both parts keep
# the same class proportions as the full set.
train_df_full, val_df_full = train_test_split(
    full_train_df,
    test_size=0.1,
    random_state=42,
    stratify=full_train_df["label"],
)
print("\nFull Training Dataset Sizes:")
print("Train set size:", len(train_df_full))
print("Validation set size:", len(val_df_full))

# -------------------------
# 8. Build ImageDataGenerators
# -------------------------
IMG_SIZE = (299, 299)
BATCH_SIZE = 4

# Random augmentations applied to training images only.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    shear_range=0.15,
    zoom_range=0.15,
    brightness_range=[0.8, 1.2],
)
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input, **augmentation_options)
# Validation and test (submission) images only go through preprocess_input.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Settings shared by the two labelled generators (train and validation).
labelled_flow_options = dict(
    directory=TRAIN_IMG_PATH,
    x_col="file_name",
    y_col="label",
    target_size=IMG_SIZE,
    class_mode="raw",  # Labels are passed through as-is (0 or 1)
    batch_size=BATCH_SIZE,
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df_full, shuffle=True, **labelled_flow_options
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df_full, shuffle=False, **labelled_flow_options
)

# Unlabelled generator over the official test images; shuffle stays off so the
# predictions line up with the generator's file names.
competition_test_generator = test_datagen.flow_from_dataframe(
    dataframe=valid_test_df,
    directory=TEST_IMG_PATH,
    x_col="file_name",
    y_col=None,
    target_size=IMG_SIZE,
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# -------------------------
# 9. Compute class weights to address potential imbalance
# -------------------------
class_labels = np.unique(train_df_full["label"])
class_weights = compute_class_weight('balanced', classes=class_labels, y=train_df_full["label"])
# Key the weights by plain Python ints for use as fit(class_weight=...).
class_weight_dict = dict(zip(map(int, class_labels), class_weights))
print("\nComputed class weights:", class_weight_dict)

# -------------------------
# 10. Build and compile the Xception model
# -------------------------
# Xception convolutional base with ImageNet weights and no classification top.
base_model = Xception(weights='imagenet', include_top=False, input_shape=(299, 299, 3))
pooled_features = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.5)(pooled_features)
# Single sigmoid unit: the model outputs one probability per image.
predictions = Dense(1, activation='sigmoid')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Small learning rate; no layer is frozen, so the pre-trained base is updated too.
optimizer = Adam(learning_rate=1e-4)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

# -------------------------
# 11. Set up callbacks: EarlyStopping, ReduceLROnPlateau, and ModelCheckpoint
# -------------------------
# All three callbacks watch the validation loss.
monitored_metric = 'val_loss'
checkpoint = ModelCheckpoint(
    "best_model.h5", monitor=monitored_metric, save_best_only=True, verbose=1
)
early_stopping = EarlyStopping(
    monitor=monitored_metric, patience=5, restore_best_weights=True, verbose=1
)
reduce_lr = ReduceLROnPlateau(
    monitor=monitored_metric, factor=0.5, patience=2, min_lr=1e-6, verbose=1
)

callbacks = [checkpoint, early_stopping, reduce_lr]

# -------------------------
# 12. Train the model for 10 epochs using the full training dataset
# -------------------------
EPOCHS = 10

fit_options = dict(
    validation_data=val_generator,
    epochs=EPOCHS,
    class_weight=class_weight_dict,
    callbacks=callbacks,
)
history = model.fit(train_generator, **fit_options)

# After training, you can load the best model if needed:
# model.load_weights("best_model.h5")

# -------------------------
# 13. Evaluate the best model on the validation set and generate a detailed report
# -------------------------
# val_generator was created with shuffle=False, so predictions follow the row
# order of val_df_full.
val_preds = model.predict(val_generator)
y_true = val_df_full["label"].values
# The model has a single sigmoid output, so predictions come back with a
# trailing axis of length 1; flatten them so y_pred has the same 1-D shape as
# y_true before thresholding at 0.5.
y_pred = (val_preds.ravel() >= 0.5).astype(int)

# Compute metrics
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# Pin the label order so the matrix is always 2x2 (rows = true 0/1,
# columns = predicted 0/1), even if one class never shows up; otherwise the
# TP/FP/TN/FN lookup below could fail on a smaller matrix.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()

print("\nFinal Model Metrics on Validation Set:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {recall:.4f}")
print("Confusion Matrix:")
print(cm)
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"True Negatives (TN): {TN}")
print(f"False Negatives (FN): {FN}")

# -------------------------
# 14. Predict on Official Test Set for Kaggle Submission
# -------------------------
submission_preds = model.predict(competition_test_generator)
# File names as held by the generator; it was built with shuffle=False, so
# they are in the same order as the predictions.
submission_filenames = competition_test_generator.filenames

# One row per test image: file name, predicted probability, and the
# probability thresholded at 0.5 as a 0/1 label.
probabilities = submission_preds.flatten()
submission_df = pd.DataFrame({
    "id": submission_filenames,
    "prediction": probabilities,
    "prediction_label": (probabilities >= 0.5).astype(int),
})

submission_df.to_csv("submission.csv", index=False)
print("\nSubmission file 'submission.csv' created with predictions on the official test set.")


Dataset downloaded to: /root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4

Files in BASE_PATH:
['test_data_v2', 'train.csv', 'train_data', 'test.csv']

Train Data (head):
   Unnamed: 0                                        file_name  label
0           0  train_data/a6dcb93f596a43249135678dfcfc17ea.jpg      1
1           1  train_data/041be3153810433ab146bc97d5af505c.jpg      0
2           2  train_data/615df26ce9494e5db2f70e57ce7a3a4f.jpg      1
3           3  train_data/8542fe161d9147be8e835e50c0de39cd.jpg      0
4           4  train_data/5d81fa12bc3b4cea8c94a6700a477cf2.jpg      1

Test Data (head):
                                                  id
0  test_data_v2/1a2d9fd3e21b4266aea1f66b30aed157.jpg
1  test_data_v2/ab5df8f441fe4fbf9dc9c6baae699dc7.jpg
2  test_data_v2/eb364dd2dfe34feda0e52466b7ce7956.jpg
3  test_data_v2/f76c2580e9644d85a741a42c6f6b39c0.jpg
4  test_data_v2/a16495c578b7494683805484ca27cf9f.jpg

Number of valid train images: 79

  self._warn_if_super_not_called()


Epoch 1/10
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.9146 - loss: 0.2095
Epoch 1: val_loss improved from inf to 0.02832, saving model to best_model.h5




[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1782s[0m 97ms/step - accuracy: 0.9146 - loss: 0.2095 - val_accuracy: 0.9921 - val_loss: 0.0283 - learning_rate: 1.0000e-04
Epoch 2/10
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.9766 - loss: 0.0654
Epoch 2: val_loss did not improve from 0.02832
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1626s[0m 90ms/step - accuracy: 0.9766 - loss: 0.0654 - val_accuracy: 0.9851 - val_loss: 0.0460 - learning_rate: 1.0000e-04
Epoch 3/10
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.9831 - loss: 0.0484
Epoch 3: val_loss did not improve from 0.02832

Epoch 3: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1633s[0m 91ms/step - accuracy: 0.9831 - loss: 0.0484 - val_ac



[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1639s[0m 91ms/step - accuracy: 0.9949 - loss: 0.0141 - val_accuracy: 0.9949 - val_loss: 0.0163 - learning_rate: 2.5000e-05
Epoch 7/10
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.9961 - loss: 0.0108
Epoch 7: val_loss improved from 0.01628 to 0.01573, saving model to best_model.h5




[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1633s[0m 91ms/step - accuracy: 0.9961 - loss: 0.0108 - val_accuracy: 0.9951 - val_loss: 0.0157 - learning_rate: 2.5000e-05
Epoch 8/10
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.9969 - loss: 0.0092
Epoch 8: val_loss did not improve from 0.01573
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1625s[0m 90ms/step - accuracy: 0.9969 - loss: 0.0092 - val_accuracy: 0.9939 - val_loss: 0.0168 - learning_rate: 2.5000e-05
Epoch 9/10
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.9975 - loss: 0.0068
Epoch 9: val_loss did not improve from 0.01573

Epoch 9: ReduceLROnPlateau reducing learning rate to 1.249999968422344e-05.
[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1622s[0m 90ms/step - accuracy: 0.9975 - loss: 0.0068 - val_ac



[1m17989/17989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1620s[0m 90ms/step - accuracy: 0.9977 - loss: 0.0060 - val_accuracy: 0.9965 - val_loss: 0.0097 - learning_rate: 1.2500e-05
Restoring model weights from the end of the best epoch: 10.
[1m1999/1999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 14ms/step

Final Model Metrics on Validation Set:
Accuracy: 0.9965
F1 Score: 0.9965
Precision: 0.9947
Recall: 0.9982
Confusion Matrix:
[[3992   21]
 [   7 3975]]
True Positives (TP): 3975
False Positives (FP): 21
True Negatives (TN): 3992
False Negatives (FN): 7


  self._warn_if_super_not_called()


[1m1385/1385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 75ms/step

Submission file 'submission.csv' created with predictions on the official test set.


In [None]:
# Write the current in-memory `model` to its own HDF5 file, separate from the
# "best_model.h5" checkpoint written by the ModelCheckpoint callback.
model.save("best_model_manual.h5")



In [None]:
# Restore the "test_data_v2/" prefix that was stripped from the ids before
# prediction. Only ids that do not already start with the prefix are changed,
# so re-running this cell cannot produce "test_data_v2/test_data_v2/..." ids.
ID_PREFIX = "test_data_v2/"
submission_ids = submission_df["id"].astype(str)
already_prefixed = submission_ids.str.startswith(ID_PREFIX)

# Keep only the necessary columns: 'id' and 'prediction_label'.
# assign() builds a new frame instead of mutating the existing one in place.
submission_df = submission_df.assign(
    id=submission_ids.where(already_prefixed, ID_PREFIX + submission_ids)
)[["id", "prediction_label"]]

# Save the submission file
submission_df.to_csv("whole.csv", index=False)

print("\nSubmission file 'whole.csv' created with predictions on the official test set.")



Submission file 'whole.csv' created with predictions on the official test set.


# RegNet

In [None]:
import os
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.imagenet_utils import preprocess_input
from tensorflow.keras.layers import (GlobalAveragePooling2D, Dense, Dropout, Conv2D,
                                     BatchNormalization, Activation, Add)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# =============================
# Custom RegNet Implementation
# =============================

def regnet_block(x, filters, stride=1, group_width=1):
    """
    Simplified RegNet-style residual bottleneck block.

    Main path: 1x1 conv to `filters` channels, then a 3x3 grouped conv that
    carries `stride`, then a 1x1 conv to `filters * 4` channels. Every conv is
    bias-free and followed by batch normalization; ReLU follows the first two.
    The block input is added back to the main path. When its channel count is
    not `filters * 4`, or when `stride` is not 1, it first goes through a
    strided 1x1 conv plus batch normalization. A final ReLU is applied to the
    sum.

    Args:
        x: Input feature-map tensor; its last axis is read as the channel count.
        filters: Channel count of the two inner convolutions.
        stride: Stride of the 3x3 convolution and of the projection shortcut.
        group_width: Passed unchanged as the `groups` argument of the 3x3
            convolution (i.e. it is used as the number of groups).

    Returns:
        The block's output tensor, with `filters * 4` channels.
    """
    expanded_filters = filters * 4
    residual = x

    def _conv_bn(tensor, channels, kernel_size, strides=1, groups=1):
        # Bias-free convolution followed by batch normalization.
        tensor = Conv2D(channels, kernel_size=kernel_size, strides=strides, padding='same',
                        groups=groups, use_bias=False)(tensor)
        return BatchNormalization()(tensor)

    # 1x1 reduction.
    out = _conv_bn(x, filters, 1)
    out = Activation('relu')(out)

    # 3x3 grouped convolution; this is where spatial downsampling happens.
    out = _conv_bn(out, filters, 3, strides=stride, groups=group_width)
    out = Activation('relu')(out)

    # 1x1 expansion (factor 4); no activation until after the residual sum.
    out = _conv_bn(out, expanded_filters, 1)

    # Project the shortcut whenever its shape would not match the main path.
    shapes_already_match = residual.shape[-1] == expanded_filters and stride == 1
    if not shapes_already_match:
        residual = _conv_bn(residual, expanded_filters, 1, strides=stride)

    out = Add()([out, residual])
    return Activation('relu')(out)

def build_regnet_backbone(input_shape=(299, 299, 3)):
    """
    Assemble the custom RegNet-like feature extractor.

    Layout: a stride-2 3x3 convolution stem (32 channels, batch norm, ReLU)
    followed by three stages of `regnet_block`s. The first block of every
    stage uses stride 2; the remaining blocks use stride 1.

    Args:
        input_shape: Shape of one input image, without the batch axis.

    Returns:
        A Keras `Model` named "CustomRegNetBackbone" mapping the input image
        to the output of the last block.
    """
    # One entry per stage: (number of blocks, filters, group_width).
    stage_specs = (
        (3, 32, 8),
        (4, 64, 16),
        (6, 128, 32),
    )

    inputs = tf.keras.Input(shape=input_shape)

    # Stem.
    features = Conv2D(32, kernel_size=3, strides=2, padding='same', use_bias=False)(inputs)
    features = BatchNormalization()(features)
    features = Activation('relu')(features)

    for num_blocks, stage_filters, stage_group_width in stage_specs:
        for block_index in range(num_blocks):
            # Only the first block of a stage downsamples.
            block_stride = 2 if block_index == 0 else 1
            features = regnet_block(features, filters=stage_filters, stride=block_stride,
                                    group_width=stage_group_width)

    return Model(inputs=inputs, outputs=features, name="CustomRegNetBackbone")

# =============================
# Main Training Pipeline
# =============================

# 0. Debug: report whether TensorFlow is executing eagerly.
print(f"Eager execution enabled: {tf.executing_eagerly()}")

# 1. Download the dataset.
dataset_name = "alessandrasala79/ai-vs-human-generated-dataset"
dataset_path = kagglehub.dataset_download(dataset_name)
print(f"Dataset downloaded to: {dataset_path}")

# 2. Set paths and check directory structure.
BASE_PATH = dataset_path  # Root folder of the downloaded dataset (.../versions/4)
print("\nFiles in BASE_PATH:")
print(os.listdir(BASE_PATH))

TRAIN_CSV, TEST_CSV = (
    os.path.join(BASE_PATH, csv_name) for csv_name in ("train.csv", "test.csv")
)
TRAIN_IMG_PATH, TEST_IMG_PATH = (
    os.path.join(BASE_PATH, folder) for folder in ("train_data", "test_data_v2")
)

# 3. Read CSV files.
train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

for title, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\n{title} Data (head):")
    print(frame.head())

# 4. Fix file names in train_df.
# Drop the "train_data/" folder prefix so the column holds bare file names.
train_df["file_name"] = [
    name.replace("train_data/", "").strip() for name in train_df["file_name"]
]

# 5. Validate that file names exist in TRAIN_IMG_PATH.
train_files_on_disk = set(os.listdir(TRAIN_IMG_PATH))
train_on_disk_mask = train_df["file_name"].isin(train_files_on_disk)
valid_train_df = train_df.loc[train_on_disk_mask].copy()
print(f"\nNumber of valid train images: {len(valid_train_df)}")
print("Sample of valid train_df:")
print(valid_train_df.head())

# 6. Prepare test_df for submission predictions.
# The test CSV names its path column 'id'; rename it to 'file_name' and drop
# the "test_data_v2/" folder prefix, mirroring the train frame.
test_df = test_df.rename(columns={"id": "file_name"})
test_df["file_name"] = [
    name.replace("test_data_v2/", "").strip() for name in test_df["file_name"]
]

test_files_on_disk = set(os.listdir(TEST_IMG_PATH))
test_on_disk_mask = test_df["file_name"].isin(test_files_on_disk)
valid_test_df = test_df.loc[test_on_disk_mask].copy()
print(f"\nNumber of valid test images (for submission): {len(valid_test_df)}")
# The official test frame carries no labels.

# 7. Build a demo subset from the first 5000 valid training rows and split it
#    90/10 into train/validation, stratified on the label.
demo_df = valid_train_df.iloc[:5000].reset_index(drop=True)
split_options = dict(test_size=0.1, random_state=42, stratify=demo_df["label"])
train_df_full, val_df_full = train_test_split(demo_df, **split_options)
print("\nDemo Dataset Sizes:")
print(f"Train set size: {len(train_df_full)}")
print(f"Validation set size: {len(val_df_full)}")

# 8. Build ImageDataGenerators.
IMG_SIZE = (299, 299)
BATCH_SIZE = 4

# Random augmentations applied to training images only.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    shear_range=0.15,
    zoom_range=0.15,
    brightness_range=[0.8, 1.2],
)
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input, **augmentation_options)
# Validation and test images only go through preprocess_input.
val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Settings shared by the two labelled generators (train and validation).
labelled_flow_options = dict(
    directory=TRAIN_IMG_PATH,
    x_col="file_name",
    y_col="label",
    target_size=IMG_SIZE,
    class_mode="raw",  # Labels are passed through as-is (0 or 1)
    batch_size=BATCH_SIZE,
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df_full, shuffle=True, **labelled_flow_options
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df_full, shuffle=False, **labelled_flow_options
)

# Unlabelled generator over the official test images; shuffle stays off so the
# predictions line up with the generator's file names.
competition_test_generator = test_datagen.flow_from_dataframe(
    dataframe=valid_test_df,
    directory=TEST_IMG_PATH,
    x_col="file_name",
    y_col=None,
    target_size=IMG_SIZE,
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# 9. Compute class weights to address potential imbalance.
class_labels = np.unique(train_df_full["label"])
class_weights = compute_class_weight('balanced', classes=class_labels, y=train_df_full["label"])
# Key the weights by plain Python ints for use as fit(class_weight=...).
class_weight_dict = dict(zip(map(int, class_labels), class_weights))
print("\nComputed class weights:", class_weight_dict)

# 10. Build and compile the RegNet model.
# Prefer a packaged RegNet backbone when one can be imported; otherwise fall
# back to the custom backbone defined earlier in this cell.
backbone_input_shape = (299, 299, 3)
try:
    from keras_regnet.models import RegNetX002  # Hypothetical package.
    backbone = RegNetX002(weights='imagenet', include_top=False, input_shape=backbone_input_shape)
    print("Using RegNetX002 as backbone.")
except ImportError:
    print("keras-regnet package not found. Building custom RegNet backbone.")
    backbone = build_regnet_backbone(input_shape=backbone_input_shape)

# Classification head: global average pooling -> dropout -> one sigmoid unit.
pooled_features = GlobalAveragePooling2D()(backbone.output)
x = Dropout(0.5)(pooled_features)
predictions = Dense(1, activation='sigmoid')(x)
model = Model(inputs=backbone.input, outputs=predictions)

optimizer = Adam(learning_rate=1e-4)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()  # Print the architecture and parameter count.

# 11. Set up callbacks: EarlyStopping, ReduceLROnPlateau, and ModelCheckpoint.
checkpoint_dir = "./checkpoints_stage1"
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, "checkpoint_latest.h5")

# Validation loss of a resumed checkpoint; stays None when training starts
# from scratch.
resumed_val_loss = None
if os.path.exists(checkpoint_path):
    try:
        model = tf.keras.models.load_model(checkpoint_path)
        print(f"Resuming training from checkpoint: {checkpoint_path}")
    except Exception as e:
        # Best effort: keep the freshly built model if the file cannot be loaded.
        print(f"Checkpoint loading error: {e}\nProceeding with training from scratch.")
    else:
        # Measure how good the resumed model already is, so the checkpoint
        # callback below only overwrites the file when an epoch beats it.
        resumed_val_loss = model.evaluate(val_generator, verbose=0, return_dict=True)["loss"]
else:
    print("No checkpoint found. Starting training from scratch.")

# Without initial_value_threshold the callback has no previous best after a
# resume, so the first epoch would overwrite the stored checkpoint even if its
# validation loss were worse. None (fresh start) keeps the default behavior.
checkpoint_cb = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True,
                                save_weights_only=False, verbose=1,
                                initial_value_threshold=resumed_val_loss)
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)
reduce_lr_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)
callbacks = [checkpoint_cb, early_stopping_cb, reduce_lr_cb]

# 12. Train the model for 10 epochs.
EPOCHS = 10
fit_options = dict(
    validation_data=val_generator,
    epochs=EPOCHS,
    class_weight=class_weight_dict,
    callbacks=callbacks,
)
history = model.fit(train_generator, **fit_options)

# 13. Save the complete model as it stands after training (HDF5 format).
final_model_path = "final_model.h5"
model.save(final_model_path)
print(f"Final model saved to '{final_model_path}'.")

# 14. Evaluate the best model on the validation set.
# val_generator was created with shuffle=False, so predictions follow the row
# order of val_df_full.
val_preds = model.predict(val_generator)
y_true = val_df_full["label"].values
# The model has a single sigmoid output, so predictions come back with a
# trailing axis of length 1; flatten them so y_pred has the same 1-D shape as
# y_true before thresholding at 0.5.
y_pred = (val_preds.ravel() >= 0.5).astype(int)

acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
prec = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# Pin the label order so the matrix is always 2x2 (rows = true 0/1,
# columns = predicted 0/1), even if one class never shows up; otherwise the
# TP/FP/TN/FN lookup below could fail on a smaller matrix.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()

print("\nFinal Model Metrics on Validation Set:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {recall:.4f}")
print("Confusion Matrix:")
print(cm)
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"True Negatives (TN): {TN}")
print(f"False Negatives (FN): {FN}")

# 15. Predict on Official Test Set for Kaggle Submission.
submission_preds = model.predict(competition_test_generator)
# The generator was built with shuffle=False, so its file names are in the
# same order as the predictions.
submission_filenames = competition_test_generator.filenames

# One row per test image: file name, predicted probability, and the
# probability thresholded at 0.5 as a 0/1 label.
probabilities = submission_preds.flatten()
submission_df = pd.DataFrame({
    "id": submission_filenames,
    "prediction": probabilities,
    "prediction_label": (probabilities >= 0.5).astype(int),
})
submission_df.to_csv("submission.csv", index=False)
print("\nSubmission file 'submission.csv' created with predictions on the official test set.")


Eager execution enabled: True
Dataset downloaded to: /root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4

Files in BASE_PATH:
['test_data_v2', 'train_data', 'train.csv', 'test.csv']

Train Data (head):
   Unnamed: 0                                        file_name  label
0           0  train_data/a6dcb93f596a43249135678dfcfc17ea.jpg      1
1           1  train_data/041be3153810433ab146bc97d5af505c.jpg      0
2           2  train_data/615df26ce9494e5db2f70e57ce7a3a4f.jpg      1
3           3  train_data/8542fe161d9147be8e835e50c0de39cd.jpg      0
4           4  train_data/5d81fa12bc3b4cea8c94a6700a477cf2.jpg      1

Test Data (head):
                                                  id
0  test_data_v2/1a2d9fd3e21b4266aea1f66b30aed157.jpg
1  test_data_v2/ab5df8f441fe4fbf9dc9c6baae699dc7.jpg
2  test_data_v2/eb364dd2dfe34feda0e52466b7ce7956.jpg
3  test_data_v2/f76c2580e9644d85a741a42c6f6b39c0.jpg
4  test_data_v2/a16495c578b7494683805484ca27cf9f.jpg

Nu

No checkpoint found. Starting training from scratch.


  self._warn_if_super_not_called()


Epoch 1/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.6908 - loss: 0.8112
Epoch 1: val_loss improved from inf to 0.92357, saving model to ./checkpoints_stage1/checkpoint_latest.h5




[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 99ms/step - accuracy: 0.6908 - loss: 0.8111 - val_accuracy: 0.7720 - val_loss: 0.9236 - learning_rate: 1.0000e-04
Epoch 2/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.7536 - loss: 0.6145
Epoch 2: val_loss improved from 0.92357 to 0.36799, saving model to ./checkpoints_stage1/checkpoint_latest.h5




[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 98ms/step - accuracy: 0.7536 - loss: 0.6145 - val_accuracy: 0.8600 - val_loss: 0.3680 - learning_rate: 1.0000e-04
Epoch 3/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.7717 - loss: 0.5294
Epoch 3: val_loss improved from 0.36799 to 0.33081, saving model to ./checkpoints_stage1/checkpoint_latest.h5




[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 97ms/step - accuracy: 0.7717 - loss: 0.5294 - val_accuracy: 0.8680 - val_loss: 0.3308 - learning_rate: 1.0000e-04
Epoch 4/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.7639 - loss: 0.5144
Epoch 4: val_loss improved from 0.33081 to 0.31439, saving model to ./checkpoints_stage1/checkpoint_latest.h5




[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 96ms/step - accuracy: 0.7639 - loss: 0.5144 - val_accuracy: 0.8860 - val_loss: 0.3144 - learning_rate: 1.0000e-04
Epoch 5/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.8046 - loss: 0.4454
Epoch 5: val_loss did not improve from 0.31439
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 96ms/step - accuracy: 0.8046 - loss: 0.4454 - val_accuracy: 0.8400 - val_loss: 0.3742 - learning_rate: 1.0000e-04
Epoch 6/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.8029 - loss: 0.4444
Epoch 6: val_loss did not improve from 0.31439

Epoch 6: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 96ms/step - accuracy: 0.8030 - loss: 0.4444 - val_accuracy: 0.8380 



[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 98ms/step - accuracy: 0.8344 - loss: 0.3710 - val_accuracy: 0.9040 - val_loss: 0.2580 - learning_rate: 5.0000e-05
Epoch 9/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.8473 - loss: 0.3487
Epoch 9: val_loss improved from 0.25803 to 0.25408, saving model to ./checkpoints_stage1/checkpoint_latest.h5




[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 96ms/step - accuracy: 0.8474 - loss: 0.3487 - val_accuracy: 0.9060 - val_loss: 0.2541 - learning_rate: 5.0000e-05
Epoch 10/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 0.8589 - loss: 0.3268
Epoch 10: val_loss did not improve from 0.25408
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 97ms/step - accuracy: 0.8589 - loss: 0.3268 - val_accuracy: 0.8640 - val_loss: 0.6083 - learning_rate: 5.0000e-05
Restoring model weights from the end of the best epoch: 9.




Final model saved to 'final_model.h5'.
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step

Final Model Metrics on Validation Set:
Accuracy: 0.9060
F1 Score: 0.9054
Precision: 0.9109
Recall: 0.9000
Confusion Matrix:
[[228  22]
 [ 25 225]]
True Positives (TP): 225
False Positives (FP): 22
True Negatives (TN): 228
False Negatives (FN): 25


  self._warn_if_super_not_called()


[1m1385/1385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 82ms/step

Submission file 'submission.csv' created with predictions on the official test set.


In [None]:
# Write the current in-memory `model` to "regnet.h5", in addition to the
# "final_model.h5" file saved right after training.
model.save("regnet.h5")



In [None]:
# Restore the "test_data_v2/" prefix that was stripped from the ids before
# prediction. Only ids that do not already start with the prefix are changed,
# so re-running this cell cannot produce "test_data_v2/test_data_v2/..." ids.
ID_PREFIX = "test_data_v2/"
submission_ids = submission_df["id"].astype(str)
already_prefixed = submission_ids.str.startswith(ID_PREFIX)

# Keep only the necessary columns: 'id' and 'prediction_label'.
# assign() builds a new frame instead of mutating the existing one in place.
submission_df = submission_df.assign(
    id=submission_ids.where(already_prefixed, ID_PREFIX + submission_ids)
)[["id", "prediction_label"]]

# Save the submission file
submission_df.to_csv("see_regnet_20000.csv", index=False)

print("\nSubmission file 'see_regnet_20000.csv' created with predictions on the official test set.")



Submission file 'see_regnet_20000.csv' created with predictions on the official test set.


# Check File Match

In [None]:
# Display the path of the original test CSV that the check below compares against.
TEST_CSV

'/root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4/test.csv'

In [None]:
import pandas as pd

# Specify file paths (adjust these paths as necessary)
original_test_csv_path = TEST_CSV       # original test CSV path
submission_csv_path = "see_regnet_20000.csv"             # submission CSV generated earlier

# Load the CSV files
original_test_df = pd.read_csv(original_test_csv_path)
submission_df = pd.read_csv(submission_csv_path)

# Check if the header (column names) is identical for the first column
original_first_col = original_test_df.columns[0]
submission_first_col = submission_df.columns[0]

if original_first_col == submission_first_col:
    print(f"Header match: Both have '{original_first_col}' as the first column.")
else:
    print(f"Header mismatch: Original first column is '{original_first_col}' while submission first column is '{submission_first_col}'.")

# Now check if the values in the first column are exactly identical (order and content)
original_ids = list(original_test_df[original_first_col])
submission_ids = list(submission_df[submission_first_col])

if original_ids == submission_ids:
    # Report the row count that was actually compared instead of a fixed number.
    print(f"All {len(original_ids)} rows in the first column are exactly identical between the two files.")
else:
    print("Mismatch found in the first column values!")
    # List every position where the two files disagree. zip() stops at the
    # shorter list, so a length difference is reported separately below.
    for i, (orig, sub) in enumerate(zip(original_ids, submission_ids)):
        if orig != sub:
            print(f"Row {i}: Original = {orig}, Submission = {sub}")
    # Also check if the number of rows is different.
    if len(original_ids) != len(submission_ids):
        print(f"Number of rows differ: Original has {len(original_ids)} rows, Submission has {len(submission_ids)} rows.")


Header match: Both have 'id' as the first column.
All 5540 rows in the first column are exactly identical between the two files.
