In [18]:
"""
Team Name: agamstudy2005
File: train_model.py

This script performs the following steps:
  1. Scans the dataset directory (which contains 'fake' and 'real' subdirectories) for images.
  2. Assigns labels (0 for fake, 1 for real) to the images.
  3. Splits the data into training, validation, and test sets (60/20/20 split) using train_test_split.
  4. Creates TensorFlow datasets for all splits.
  5. Builds and trains an improved CNN model (using EfficientNetB0 with data augmentation) to classify the images.
  6. Evaluates and prints the model accuracy on training, validation, and test data.
  7. Computes and prints the F1 score on the test set.
"""
import os
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def get_image_paths_and_labels(dataset_path):
    """
    Scans the dataset directory for images in the subfolders 'fake' and 'real'
    and returns lists of file paths and corresponding labels.

    Expected directory structure:
        dataset/
            fake/   --> Contains fake images.
            real/   --> Contains real images.

    Labels:
      - 0 for fake
      - 1 for real
    """
    categories = ["fake", "real"]
    file_paths = []
    labels = []

    for label, category in enumerate(categories):
        category_path = os.path.join(dataset_path, category)
        if not os.path.isdir(category_path):
            print(f"Directory {category_path} does not exist. Skipping.")
            continue

        for file in os.listdir(category_path):
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                full_path = os.path.join(category_path, file)
                file_paths.append(full_path)
                labels.append(label)

    return file_paths, labels

def load_image(image_path, target_size=(224, 224)):
    """
    Reads and preprocesses an image:
      - Reads the image from disk.
      - Decodes it (assuming it's a JPEG/PNG image).
      - Resizes it to the target size.

    Note:
      We do NOT scale the image to [0, 1] because the EfficientNet preprocessing function
      expects pixel values in the range [0, 255].
    """
    image = tf.io.read_file(image_path)
    image = tf.image.decode_image(image, channels=3, expand_animations=False)
    image = tf.image.resize(image, target_size)
    image = tf.cast(image, tf.float32)  # keep values in [0, 255]
    return image

def create_dataset(image_paths, labels, batch_size=32, target_size=(224, 224)):
    """
    Creates a tf.data.Dataset from image file paths and corresponding labels.
    """
    image_paths_tensor = tf.constant(image_paths)
    labels_tensor = tf.constant(labels, dtype=tf.int32)

    def _load_image_and_label(path, label):
        image = load_image(path, target_size)
        return image, label

    dataset = tf.data.Dataset.from_tensor_slices((image_paths_tensor, labels_tensor))
    dataset = dataset.map(_load_image_and_label, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=len(image_paths))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

def build_transfer_learning_model(input_shape=(224, 224, 3), num_classes=2):
    """
    Builds an improved model using transfer learning with EfficientNetB0.

    This version includes data augmentation and uses the EfficientNetB0 preprocessing function.
    The base model is not forced to training mode, so layers like BatchNormalization and Dropout
    behave correctly during inference.
    """
    # Load the EfficientNetB0 base model without the top classifier layers.
    base_model = tf.keras.applications.EfficientNetB0(
        input_shape=input_shape,
        include_top=False,
        weights='imagenet'
    )
    base_model.trainable = True  # allow fine-tuning of the entire network

    inputs = tf.keras.Input(shape=input_shape)

    # Data augmentation (active only during training)
    x = tf.keras.layers.RandomFlip("horizontal")(inputs)
    x = tf.keras.layers.RandomRotation(0.1)(x)

    # Preprocess the input using EfficientNet's dedicated preprocessing function.
    # This converts pixel values from [0, 255] to the expected range (typically [-1, 1]).
    x = tf.keras.applications.efficientnet.preprocess_input(x)

    # Pass the preprocessed inputs through the base model.
    x = base_model(x)

    # Add global average pooling and extra Dense layers.
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)

    # Compile the model with a moderate learning rate.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def main():
    # -------------------------
    # 1. Set the Dataset Path
    # -------------------------
    dataset_path = "/content/drive/MyDrive/dataset"  # Ensure this directory contains 'fake' and 'real' subfolders.

    # -------------------------
    # 2. Get Image Paths and Labels
    # -------------------------
    file_paths, labels = get_image_paths_and_labels(dataset_path)
    print(f"Total images found: {len(file_paths)}")

    if len(file_paths) == 0:
        print("No images found. Exiting.")
        return

    # -------------------------
    # 3. Split Data into Training, Validation, and Test Sets
    # -------------------------
    train_paths, temp_paths, train_labels, temp_labels = train_test_split(
        file_paths, labels, test_size=0.4, random_state=42, stratify=labels
    )
    val_paths, test_paths, val_labels, test_labels = train_test_split(
        temp_paths, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
    )

    print("Number of training images:", len(train_paths))
    print("Number of validation images:", len(val_paths))
    print("Number of test images:", len(test_paths))

    # -------------------------
    # 4. Create TensorFlow Datasets
    # -------------------------
    batch_size = 32
    target_size = (224, 224)  # Input size for EfficientNetB0
    train_dataset = create_dataset(train_paths, train_labels, batch_size, target_size)
    val_dataset = create_dataset(val_paths, val_labels, batch_size, target_size)
    test_dataset = create_dataset(test_paths, test_labels, batch_size, target_size)

    # -------------------------
    # 5. Build and Compile the Improved Model.
    # -------------------------
    model = build_transfer_learning_model(input_shape=(224, 224, 3), num_classes=2)
    model.summary()

    # -------------------------
    # 6. Train the Model
    # -------------------------
    epochs = 25  # Increase the number of epochs as needed.
    history = model.fit(train_dataset, validation_data=val_dataset, epochs=epochs)

    # -------------------------
    # 7. Evaluate Model Accuracy on Training, Validation, and Test Data
    # -------------------------
    train_loss, train_accuracy = model.evaluate(train_dataset)
    print("Training Accuracy: {:.2f}%".format(train_accuracy * 100))

    val_loss, val_accuracy = model.evaluate(val_dataset)
    print("Validation Accuracy: {:.2f}%".format(val_accuracy * 100))

    test_loss, test_accuracy = model.evaluate(test_dataset)
    print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

    # -------------------------
    # 8. Compute F1 Score on the Test Set
    # -------------------------
    y_true = []
    y_pred = []
    for images, labels in test_dataset:
        predictions = model.predict(images)
        predictions = np.argmax(predictions, axis=1)
        y_pred.extend(predictions)
        y_true.extend(labels.numpy())

    f1 = f1_score(y_true, y_pred, average='weighted')
    print("Test F1 Score: {:.4f}".format(f1))

    model.save('my_trained_model.h5')
    print("Model saved as 'my_trained_model.h5'")

if __name__ == "__main__":
    main()

Total images found: 2010
Number of training images: 1206
Number of validation images: 402
Number of test images: 402


Epoch 1/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 582ms/step - accuracy: 0.6057 - loss: 0.6454 - val_accuracy: 0.7015 - val_loss: 0.5771
Epoch 2/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 395ms/step - accuracy: 0.8136 - loss: 0.4126 - val_accuracy: 0.7512 - val_loss: 0.5131
Epoch 3/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 394ms/step - accuracy: 0.8605 - loss: 0.3100 - val_accuracy: 0.7985 - val_loss: 0.4727
Epoch 4/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 394ms/step - accuracy: 0.9003 - loss: 0.2531 - val_accuracy: 0.8234 - val_loss: 0.4351
Epoch 5/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 390ms/step - accuracy: 0.9250 - loss: 0.1946 - val_accuracy: 0.8184 - val_loss: 0.4258
Epoch 6/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 399ms/step - accuracy: 0.9487 - loss: 0.1580 - val_accuracy: 0.8159 - val_loss: 0.4017
Epoch 7/25
[1m38/38[



Test F1 Score: 0.8855
Model saved as 'my_trained_model.h5'


In [15]:
import os
import json
import numpy as np
import tensorflow as tf

# Define the folder with test images and the output JSON filename.
TEST_FOLDER = "/content/drive/MyDrive/test"         # Ensure this folder exists in your Colab environment.
OUTPUT_JSON = "predictions.json"

# Load the saved model.
# Make sure the file 'my_trained_model.h5' exists in your current working directory.
model = tf.keras.models.load_model('my_trained_model.h5')
print("Model loaded successfully.")

def load_image_for_prediction(image_path, target_size=(224, 224)):
    """
    Loads and preprocesses an image for prediction.

    Note: This function assumes your model uses EfficientNet preprocessing.
    """
    image = tf.io.read_file(image_path)
    image = tf.image.decode_image(image, channels=3, expand_animations=False)
    image = tf.image.resize(image, target_size)
    image = tf.cast(image, tf.float32)  # Keep pixel values in [0, 255]
    # Preprocess the image (this converts the pixel values to the expected range)
    image = tf.keras.applications.efficientnet.preprocess_input(image)
    return image

# Get a sorted list of test image filenames (assuming names like "1.png", "2.png", …).
image_files = sorted(
    [f for f in os.listdir(TEST_FOLDER) if f.lower().endswith(('.png', '.jpg', '.jpeg'))],
    key=lambda x: int(os.path.splitext(x)[0])
)

predictions_list = []

# Loop over each test image, make a prediction, and store the result.
for file_name in image_files:
    image_path = os.path.join(TEST_FOLDER, file_name)
    image = load_image_for_prediction(image_path, target_size=(224, 224))
    image = tf.expand_dims(image, axis=0)  # Add a batch dimension.
    preds = model.predict(image)
    pred_class = np.argmax(preds, axis=1)[0]

    # Map the prediction to a label.
    label = "fake" if pred_class == 0 else "real"

    # Extract the numeric index from the filename (e.g., "4.png" -> 4).
    index_val = int(os.path.splitext(file_name)[0])

    predictions_list.append({
        "index": index_val,
        "prediction": label
    })

# Sort the predictions by index (optional).
predictions_list.sort(key=lambda x: x["index"])

# Write the predictions to a JSON file.
with open(OUTPUT_JSON, "w") as f:
    json.dump(predictions_list, f, indent=4)

print(f"Predictions saved to {OUTPUT_JSON}")




Model loaded successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━