<a href="https://colab.research.google.com/github/AnonyBOSS/detect-AI/blob/main/InceptionresnetV2%20final%20project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
import os
import tensorflow as tf
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Helper that fetches a Kaggle dataset through kagglehub
def download_dataset(dataset_name):
    """Download ``dataset_name`` with kagglehub and return the local path.

    Args:
        dataset_name: Dataset handle passed straight to
            ``kagglehub.dataset_download``.

    Returns:
        Whatever ``kagglehub.dataset_download`` returns (printed below as the
        download location).
    """
    local_dir = kagglehub.dataset_download(dataset_name)
    print(f"Downloaded {dataset_name} to {local_dir}")
    return local_dir

# Fetch both datasets (main competition data first, then CIFAKE)
main_dataset_path, cifake_path = (
    download_dataset(handle)
    for handle in (
        "alessandrasala79/ai-vs-human-generated-dataset",
        "birdy654/cifake-real-and-ai-generated-synthetic-images",
    )
)

# Function to load dataset from CSV
def load_csv_dataset(base_dir, train_csv, test_csv):
    """Read the train/test CSV manifests and resolve image paths against ``base_dir``.

    Args:
        base_dir: Directory containing the CSV files and the image folders.
        train_csv: File name of the training CSV, relative to ``base_dir``.
        test_csv: File name of the test CSV, relative to ``base_dir``.

    Returns:
        ``(train_frame, test_frame)``. Each frame has exactly two columns:
        the image path (joined onto ``base_dir``) followed by the label.
        When a CSV carries no label column distinct from its file column
        (e.g. a test CSV that only lists ids), the second column is named
        ``'label'`` and is entirely missing (``pd.NA``) instead of being a
        copy of the file column.
    """
    df_train = pd.read_csv(os.path.join(base_dir, train_csv))
    df_test = pd.read_csv(os.path.join(base_dir, test_csv))

    print("Train CSV Columns:", df_train.columns)
    print("Test CSV Columns:", df_test.columns)

    def _pick_file_column(frame, preferred, fallback_position):
        # First preferred name that exists wins; otherwise fall back by position.
        for candidate in preferred:
            if candidate in frame.columns:
                return candidate
        return frame.columns[fallback_position]

    def _pick_label_column(frame, file_col):
        # Prefer an explicit 'label' column, else the last column -- but never
        # the file column itself, which would silently turn paths into labels.
        if 'label' in frame.columns:
            return 'label'
        last_col = frame.columns[-1]
        return None if last_col == file_col else last_col

    def _build(frame, file_col, label_col):
        # Build a fresh two-column frame rather than mutating the CSV frame.
        resolved = frame[file_col].apply(lambda rel: os.path.join(base_dir, rel))
        result = pd.DataFrame({file_col: resolved})
        if label_col is None:
            result['label'] = pd.NA  # unlabeled split: make that explicit
        else:
            result[label_col] = frame[label_col]
        return result

    # Train defaults to the second column, test to the first (after 'file_name'/'id').
    file_col_train = _pick_file_column(df_train, ('file_name',), 1)
    file_col_test = _pick_file_column(df_test, ('file_name', 'id'), 0)

    return (
        _build(df_train, file_col_train, _pick_label_column(df_train, file_col_train)),
        _build(df_test, file_col_test, _pick_label_column(df_test, file_col_test)),
    )

# Read the main dataset's CSV manifests (image paths + labels)
df_train, df_test = load_csv_dataset(
    base_dir=main_dataset_path,
    train_csv='train.csv',
    test_csv='test.csv',
)

# Function to load images from folder structure
def load_folder_dataset(base_dir, train_folder, test_folder):
    """Collect image paths and binary labels from ``<split>/REAL`` and ``<split>/FAKE``.

    Args:
        base_dir: Root directory of the dataset.
        train_folder: Name of the training split folder under ``base_dir``.
        test_folder: Name of the test split folder under ``base_dir``.

    Returns:
        ``(train_images, train_labels, test_images, test_labels)``. Within
        each split the REAL images come first, followed by the FAKE images;
        labels are ``1`` for REAL and ``0`` for FAKE.

    Raises:
        OSError: If one of the four class directories cannot be listed.
    """
    # NOTE(review): REAL=1 / FAKE=0 is this function's own convention; confirm
    # it matches the label encoding of any dataset these labels are merged with.

    def _list_class(split_folder, class_name):
        # Build paths from the same directory that is listed, so the two can
        # never drift apart. Sorting makes the order independent of the
        # filesystem, which keeps downstream seeded splits reproducible.
        class_dir = os.path.join(base_dir, split_folder, class_name)
        return [os.path.join(class_dir, img) for img in sorted(os.listdir(class_dir))]

    train_real = _list_class(train_folder, "REAL")
    train_fake = _list_class(train_folder, "FAKE")
    test_real = _list_class(test_folder, "REAL")
    test_fake = _list_class(test_folder, "FAKE")

    train_images = train_real + train_fake
    train_labels = [1] * len(train_real) + [0] * len(train_fake)
    test_images = test_real + test_fake
    test_labels = [1] * len(test_real) + [0] * len(test_fake)

    return train_images, train_labels, test_images, test_labels

# Load CIFAKE dataset
cifake_train_paths, cifake_train_labels, cifake_test_paths, cifake_test_labels = load_folder_dataset(cifake_path, 'train', 'test')

# Merge the labelled training pools of both datasets.
# NOTE(review): the CSV labels and the CIFAKE folder labels (REAL=1, FAKE=0)
# are concatenated as-is; confirm both sources encode "real" vs "AI-generated"
# the same way, otherwise one of them needs to be flipped before merging.
train_paths = list(df_train.iloc[:, 0]) + cifake_train_paths
train_labels = list(df_train.iloc[:, 1]) + cifake_train_labels

# Validation data comes only from the stratified hold-out below. The main
# dataset's test CSV has no label column (see the printed columns), so it
# cannot be used for validation; an earlier merge of it into val_paths /
# val_labels was immediately overwritten by this split and has been removed.
# cifake_test_paths / cifake_test_labels remain available but are not used here.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    train_paths, train_labels, test_size=0.1, stratify=train_labels, random_state=42)

print(f"Total Training Images: {len(train_paths)}")
print(f"Total Validation Images: {len(val_paths)}")

# Per-example image loading / preprocessing
def preprocess_image(image_path, label, is_training=True):
    """Load one image file and turn it into a model-ready tensor.

    The file is read, decoded as a 3-channel JPEG, resized to 224x224 and
    divided by 255.0. When ``is_training`` is true a random left-right flip
    is applied as augmentation.

    Args:
        image_path: Path of the image file to read.
        label: Label associated with the image; returned unchanged.
        is_training: Whether to apply the random horizontal flip.

    Returns:
        ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    pixels = tf.cast(tf.image.resize(pixels, [224, 224]), tf.float32) / 255.0
    if not is_training:
        return pixels, label
    # Augmentation is only applied on the training path.
    return tf.image.random_flip_left_right(pixels), label

# Build the tf.data input pipelines
batch_size = 32
AUTOTUNE = tf.data.AUTOTUNE

# Training: decode + augment in parallel, shuffle with a 1000-element buffer, then batch.
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
    .map(lambda path, target: preprocess_image(path, target, True), num_parallel_calls=AUTOTUNE)
    .shuffle(1000)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)

# Validation: same decoding without augmentation and without shuffling.
val_ds = (
    tf.data.Dataset.from_tensor_slices((val_paths, val_labels))
    .map(lambda path, target: preprocess_image(path, target, False), num_parallel_calls=AUTOTUNE)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)

print("Dataset is ready for training!")

Downloaded alessandrasala79/ai-vs-human-generated-dataset to /root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4
Downloaded birdy654/cifake-real-and-ai-generated-synthetic-images to /root/.cache/kagglehub/datasets/birdy654/cifake-real-and-ai-generated-synthetic-images/versions/3
Train CSV Columns: Index(['Unnamed: 0', 'file_name', 'label'], dtype='object')
Test CSV Columns: Index(['id'], dtype='object')
Total Training Images: 161955
Total Validation Images: 17995
Dataset is ready for training!


In [None]:
from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D

In [None]:
# Pretrained InceptionResNetV2 feature extractor (ImageNet weights, no classifier head)
base_model = InceptionResNetV2(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

# Freeze the base model (optional)
base_model.trainable = False

# Build the full model
model = Sequential([
    keras.Input(shape=(224, 224, 3)),
    # The input pipeline yields pixels in [0, 1], but the ImageNet weights of
    # InceptionResNetV2 were trained on inputs scaled to [-1, 1]
    # (inception_resnet_v2.preprocess_input). Map [0, 1] -> [-1, 1] inside the
    # model so training, validation and inference all get the same treatment.
    layers.Rescaling(scale=2.0, offset=-1.0),
    base_model,
    GlobalAveragePooling2D(),  # Converts feature maps to a vector
    Dense(128, activation='relu'),  # Fully connected layer
    Dropout(0.5),  # Prevent overfitting
    Dense(1, activation="sigmoid")  # Single probability output for binary classification
])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_resnet_v2/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m219055592/219055592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 0us/step


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, MaxPooling2D, Conv2D, BatchNormalization

In [None]:
# Plain CNN: three Conv -> BatchNorm -> MaxPool -> Dropout stages followed by a dense head.
# NOTE: this rebinds the name `model`; whichever model-definition cell ran last
# is the one the later compile/fit cells operate on.
model = Sequential([
    # Stage 1
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(224, 224, 3)),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 2
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Stage 3
    Conv2D(128, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Classifier head with a single sigmoid output
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [None]:
import tensorflow as tf
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras import optimizers

# Binary classification set-up: Adam at 1e-4, binary cross-entropy, accuracy metric.
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=[
        BinaryAccuracy()
    ]
)

# Training callbacks. They only take effect when supplied to
# model.fit(..., callbacks=call_backs).
#
# A LearningRateScheduler(1e-3 * 0.9 ** epoch) used to be part of this list.
# It was removed because it resets the learning rate at the start of every
# epoch, which both overrides the 1e-4 configured on the optimizer above and
# undoes any reduction made by ReduceLROnPlateau. ReduceLROnPlateau is now the
# only learning-rate controller.
call_backs = [
    # Stop when val_loss has not improved for 3 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
    # Keep the best model (by val_loss) on disk.
    tf.keras.callbacks.ModelCheckpoint(
        filepath='best_model.h5',
        monitor='val_loss',
        save_best_only=True
    ),
    # Multiply the learning rate by 0.2 after 2 stagnant epochs, never below 1e-6.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=2,
        min_lr=1e-6
    ),
    tf.keras.callbacks.TensorBoard(
        log_dir='logs',
        histogram_freq=1
    ),
    tf.keras.callbacks.TerminateOnNaN(),
    tf.keras.callbacks.CSVLogger('training.log'),
    tf.keras.callbacks.History(),
    tf.keras.callbacks.LambdaCallback(
        on_epoch_begin=lambda epoch, logs: print(f"Epoch {epoch} started")
    )
]
model.summary()

In [None]:
# Train the compiled model. The callback list built in the compile cell must be
# passed explicitly -- without `callbacks=` none of the early stopping,
# checkpointing or logging callbacks would run.
epochs = 5
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=call_backs
)


Epoch 1/5
[1m5062/5062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m960s[0m 182ms/step - binary_accuracy: 0.7102 - loss: 0.5529 - val_binary_accuracy: 0.8331 - val_loss: 0.3759
Epoch 2/5
[1m5062/5062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m888s[0m 166ms/step - binary_accuracy: 0.8245 - loss: 0.3894 - val_binary_accuracy: 0.8554 - val_loss: 0.3319
Epoch 3/5
[1m5062/5062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m839s[0m 165ms/step - binary_accuracy: 0.8441 - loss: 0.3530 - val_binary_accuracy: 0.8628 - val_loss: 0.3147
Epoch 4/5
[1m5062/5062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m838s[0m 165ms/step - binary_accuracy: 0.8560 - loss: 0.3313 - val_binary_accuracy: 0.8655 - val_loss: 0.3021
Epoch 5/5
[1m5062/5062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m838s[0m 165ms/step - binary_accuracy: 0.8635 - loss: 0.3162 - val_binary_accuracy: 0.8744 - val_loss: 0.2878


In [None]:
path = kagglehub.dataset_download("alessandrasala79/ai-vs-human-generated-dataset")
base_dir = path
print("Path to dataset files:", path)


def preprocess_val(image):
    """Resize and scale a decoded image the same way the training pipeline does.

    Training images are resized to 224x224 with the default resize method and
    divided by 255.0, with no mean/std standardisation (see
    ``preprocess_image``). Inference must feed the model the same input
    distribution, so this function does exactly that and nothing more.

    Args:
        image: Decoded image tensor with 3 channels.

    Returns:
        The image resized to 224x224 and divided by 255.0.
    """
    image = tf.image.resize(image, [224, 224])  # same (default) method as training
    image = tf.cast(image, tf.float32) / 255.0  # Normalize to [0,1]
    return image


def load_test_image(image_path):
    """Read, decode (3-channel JPEG) and preprocess one test image."""
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = preprocess_val(image)
    return image


train_csv_path = os.path.join(base_dir, 'train.csv')
test_csv_path = os.path.join(base_dir, 'test.csv')
testf_csv_path = test_csv_path

# Reading the training CSV file
# Example of a row: file_name="train_data/041be3153810...", label=0 or 1
# (not needed for inference; kept so the name stays defined as before)
df_train = pd.read_csv(train_csv_path)

# Reading the testing CSV file (once)
# Example: df_test['id'] = "test_data/e25323c62af644fba97afb846261b05b.jpg", etc.
df_test = pd.read_csv(test_csv_path)

# Untouched copy holding the original relative ids, used for the submission file.
dff_test = df_test.copy()

# Adding the full path to the file name instead of just "<train|test>_data/xxx.jpg"
df_test['id'] = df_test['id'].apply(lambda x: os.path.join(base_dir, x))
df_train['file_name'] = df_train['file_name'].apply(lambda x: os.path.join(base_dir, x))

# Order is preserved (no shuffling), so predictions line up with dff_test rows.
test_ds = tf.data.Dataset.from_tensor_slices(df_test['id'].values)
test_ds = (
    test_ds
    .map(load_test_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Sigmoid probabilities -> hard 0/1 labels at a 0.5 threshold.
y_test_preds = model.predict(test_ds).flatten()
y_test_preds = (y_test_preds > 0.5).astype(int)

submission = pd.DataFrame({
    'id': dff_test['id'],
    'label': y_test_preds
})
submission.to_csv('submission.csv', index=False)
print('Submission file saved!')

Path to dataset files: /root/.cache/kagglehub/datasets/alessandrasala79/ai-vs-human-generated-dataset/versions/4
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 719ms/step
Submission file saved!
