In [None]:
import os
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Paths to the uploaded files
train_zip_path = '/content/train_imageDL.zip'
test_zip_path = '/content/test_imageDL.zip'
train_csv_path = '/content/train_image.csv'
test_csv_path = '/content/test_image.csv'
train_extracted_path = '/content/train_imageDL_extracted'
test_extracted_path = '/content/test_imageDL_extracted'

In [None]:
# Function to unzip files
def unzip_file(zip_path, extract_to):
    if os.path.exists(zip_path):
        try:
            os.makedirs(extract_to, exist_ok=True)
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
            print(f"Extracted: {zip_path} to {extract_to}")
        except Exception as e:
            print(f"Error extracting {zip_path}: {e}")
            raise
    else:
        raise FileNotFoundError(f"{zip_path} does not exist.")



In [None]:
# Unzip train and test datasets
unzip_file(train_zip_path, train_extracted_path)
unzip_file(test_zip_path, test_extracted_path)

Extracted: /content/train_imageDL.zip to /content/train_imageDL_extracted
Extracted: /content/test_imageDL.zip to /content/test_imageDL_extracted


In [None]:
# Verify extracted files
print("Contents of train directory:", os.listdir(train_extracted_path))
print("Contents of test directory:", os.listdir(test_extracted_path))

Contents of train directory: ['train']
Contents of test directory: ['test']


In [None]:
# Load CSV files
def load_csv(csv_path):
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)
    else:
        raise FileNotFoundError(f"{csv_path} does not exist.")

train_csv = load_csv(train_csv_path)
test_csv = load_csv(test_csv_path)


In [None]:
# Update file paths based on labels
real_path = os.path.join(train_extracted_path, "train/training_real")
fake_path = os.path.join(train_extracted_path, "train/training_fake")

train_csv['file_path'] = train_csv.apply(
    lambda row: os.path.join(real_path, f"{row['file_id']}.jpg") if row['label'] == 1 else os.path.join(fake_path, f"{row['file_id']}.jpg"),
    axis=1
)

In [None]:
# Split data into training and validation sets
train_df, val_df = train_test_split(train_csv, test_size=0.2, stratify=train_csv['label'], random_state=42)


In [None]:
# Verify file existence
print("\nVerifying file paths in train_df:")
print(train_df['file_path'].apply(lambda x: os.path.exists(x)).value_counts())

print("\nVerifying file paths in val_df:")
print(val_df['file_path'].apply(lambda x: os.path.exists(x)).value_counts())


Verifying file paths in train_df:
file_path
True    1367
Name: count, dtype: int64

Verifying file paths in val_df:
file_path
True    342
Name: count, dtype: int64


In [None]:
# Constants
IMG_HEIGHT, IMG_WIDTH = 32, 32
BATCH_SIZE = 32

In [None]:
#data generator
# Ensure labels are strings
train_df['label'] = train_df['label'].astype(str)
val_df['label'] = val_df['label'].astype(str)

# Data Generators
def create_data_generator(dataframe, datagen, target_size, batch_size, mode):
    return datagen.flow_from_dataframe(
        dataframe,
        x_col='file_path',
        y_col='label' if mode != 'test' else None,
        target_size=target_size,
        batch_size=batch_size,
        class_mode='binary' if mode != 'test' else None,
        shuffle=(mode != 'test')
    )

train_datagen = ImageDataGenerator(rescale=1./255, horizontal_flip=True, rotation_range=10, zoom_range=0.2)
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = create_data_generator(train_df, train_datagen, (IMG_HEIGHT, IMG_WIDTH), BATCH_SIZE, mode='train')
val_generator = create_data_generator(val_df, val_datagen, (IMG_HEIGHT, IMG_WIDTH), BATCH_SIZE, mode='val')


Found 1367 validated image filenames belonging to 2 classes.
Found 342 validated image filenames belonging to 2 classes.


In [None]:
# Callbacks
checkpoint_path = '/content/best_model.keras'
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True)


In [None]:
# Model 1: Built from Scratch
def build_model_from_scratch():
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
        MaxPooling2D((2, 2)),
        BatchNormalization(),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        BatchNormalization(),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Model 2: Pre-trained ResNet50
def build_pretrained_model():
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))
    for layer in base_model.layers:
        layer.trainable = False
    x = Flatten()(base_model.output)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=base_model.input, outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Train models
scratch_model = build_model_from_scratch()
pretrained_model = build_pretrained_model()

print("Training model from scratch...")
scratch_model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=20,
    callbacks=[early_stopping, model_checkpoint]
)

print("Training pre-trained model...")
pretrained_model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=20,
    callbacks=[early_stopping, model_checkpoint]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model from scratch...
Epoch 1/20


  self._warn_if_super_not_called()


[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 473ms/step - accuracy: 0.5352 - loss: 1.0678 - val_accuracy: 0.5497 - val_loss: 0.6833
Epoch 2/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 412ms/step - accuracy: 0.5686 - loss: 0.7709 - val_accuracy: 0.5556 - val_loss: 0.6856
Epoch 3/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 439ms/step - accuracy: 0.5851 - loss: 0.7015 - val_accuracy: 0.5556 - val_loss: 0.7261
Epoch 4/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 431ms/step - accuracy: 0.6116 - loss: 0.6660 - val_accuracy: 0.5585 - val_loss: 0.6848
Epoch 5/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 437ms/step - accuracy: 0.6191 - loss: 0.6614 - val_accuracy: 0.5292 - val_loss: 0.6887
Epoch 6/20
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 418ms/step - accuracy: 0.6381 - loss: 0.6293 - val_accuracy: 0.5585 - val_loss: 0.6825
Epoch 7/20
[1m43/43[0m [32m━━━

<keras.src.callbacks.history.History at 0x7bee3625d810>

In [174]:
def generate_test_predictions(model, output_path):
    # Ensure file_id is treated as a string
    test_csv['file_id'] = test_csv['file_id'].astype(str)

    # Rebuild file paths and validate
    #test_csv['file_path'] = test_csv['file_id'].apply(lambda x: os.path.join(test_extracted_path, f"{x}.jpg"))
    test_csv['file_path'] = test_csv['file_id'].apply(lambda x: os.path.join('/content/test_imageDL_extracted/test', f"{x}.jpg"))


    # Verify file existence
    missing_files = test_csv['file_path'].apply(lambda x: not os.path.exists(x)).sum()
    if missing_files > 0:
        print(f"Warning: {missing_files} files are missing. Fix the file paths.")
        print(test_csv[test_csv['file_path'].apply(lambda x: not os.path.exists(x))].head())  # Show some missing paths
        return

    # Create test data generator
    test_datagen = ImageDataGenerator(rescale=1./255)
    test_generator = create_data_generator(test_csv, test_datagen, (IMG_HEIGHT, IMG_WIDTH), 1, mode='test')

    # Generate predictions
    predictions = model.predict(test_generator)
    predictions = (predictions > 0.5).astype(int).flatten()

    # Save predictions
    submission = pd.DataFrame({
        'file_id': test_csv['file_id'],
        'label': predictions
    })
    submission.to_csv(output_path, index=False)
    print(f"Submission file created at {output_path}")


In [176]:
print("Verifying file paths in test_csv:")
print(test_csv['file_path'].head())  # Print a few file paths
print("File existence check:")
print(test_csv['file_path'].apply(lambda x: os.path.exists(x)).value_counts())


Verifying file paths in test_csv:
0    /content/test_imageDL_extracted/test/0.jpg
1    /content/test_imageDL_extracted/test/1.jpg
2    /content/test_imageDL_extracted/test/2.jpg
3    /content/test_imageDL_extracted/test/3.jpg
4    /content/test_imageDL_extracted/test/4.jpg
Name: file_path, dtype: object
File existence check:
file_path
True    332
Name: count, dtype: int64


In [175]:
generate_test_predictions(scratch_model, '/content/submission_scratch.csv')
generate_test_predictions(pretrained_model, '/content/submission_pretrained.csv')


Found 332 validated image filenames.
[1m  6/332[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 10ms/step  

  self._warn_if_super_not_called()


[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step
Submission file created at /content/submission_scratch.csv
Found 332 validated image filenames.


  self._warn_if_super_not_called()


[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 33ms/step
Submission file created at /content/submission_pretrained.csv


In [177]:
from sklearn.metrics import accuracy_score

# Generate and calculate accuracy
def evaluate_model_accuracy(model, val_generator):
    # Predict on the validation data
    val_predictions = model.predict(val_generator)
    val_predictions = (val_predictions > 0.5).astype(int).flatten()  # Convert probabilities to binary predictions

    # Get true labels
    val_labels = val_generator.labels  # The actual labels for the validation data

    # Calculate accuracy
    accuracy = accuracy_score(val_labels, val_predictions)
    print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Now, evaluate both models
print("Evaluating scratch model on validation data...")
evaluate_model_accuracy(scratch_model, val_generator)

print("Evaluating pre-trained model on validation data...")
evaluate_model_accuracy(pretrained_model, val_generator)


Evaluating scratch model on validation data...
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 392ms/step
Validation Accuracy: 47.37%
Evaluating pre-trained model on validation data...
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 667ms/step
Validation Accuracy: 55.56%


In [187]:
import os
import pandas as pd

# Define the directory where test images are located
test_images_dir = '/content/test_imageDL_extracted/test'

# Get the list of image filenames
image_files = os.listdir(test_images_dir)

# Create a DataFrame with 'file_id' and 'file_path'
test_csv = pd.DataFrame({
    'file_id': [i for i in range(len(image_files))],
    'file_path': [os.path.join(test_images_dir, f) for f in image_files]
})

# Save this DataFrame to a CSV file
test_csv.to_csv('/content/test_csv.csv', index=False)

print("Test CSV file has been generated and saved as '/content/test_csv.csv'")


Test CSV file has been generated and saved as '/content/test_csv.csv'


In [192]:
# Assuming predictions are made using your model and stored in 'predictions'
# Here 'test_csv' contains the 'file_id' and 'file_path' for the test data

# Predict labels using your model (assuming the model output is probabilities or class indices)
predictions = model.predict(test_generator, verbose=1)

# For multi-class classification, you might need to take the argmax of the probabilities
predicted_labels = np.argmax(predictions, axis=-1)

# If the task is binary classification, predicted_labels can be a 0/1 prediction
# If it is multi-class, the predicted labels will be indices of the predicted class

# Add the predictions to the test_csv
submission_df = test_csv.copy()
submission_df['label'] = predicted_labels  # Add predicted labels to the dataframe

# Save the submission CSV in the required format (file_id, label)
submission_df[['file_id', 'label']].to_csv('/content/submission.csv', index=False)

print("Submission file saved as '/content/submission.csv'")


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 322ms/step
Submission file saved as '/content/submission.csv'


In [197]:
# Define the path to your true labels file
file_path = '/path_to_true_labels.csv'  # Replace with your actual path

# Check if the file exists
if os.path.exists(file_path):
    print("True labels file found. Loading the data...")
    # Assuming the true labels are in the 'label' column of the CSV
    true_labels = pd.read_csv(file_path)['label'].values  # Adjust if column name is different
else:
    print(f"True labels file not found at {file_path}")

# Assuming predictions are made using your model
# Make sure that your model and test_generator are defined correctly
predictions = model.predict(test_generator, verbose=1)

# If your model outputs probabilities, convert to class labels
predicted_labels = np.argmax(predictions, axis=-1)  # This is for multi-class classification

# Calculate accuracy if true labels and predicted labels are available
if 'true_labels' in locals() and 'predicted_labels' in locals():
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f"Accuracy: {accuracy:.4f}")
else:
    print("Could not compute accuracy due to missing data.")


True labels file not found at /path_to_true_labels.csv
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 303ms/step
Could not compute accuracy due to missing data.


In [199]:
import os
import pandas as pd

# Define the correct path to your true labels file
true_labels_file_path = '/content/true_labels.csv'  # Replace with your actual file path

# Check if the file exists and load it
if os.path.exists(true_labels_file_path):
    print("True labels file found. Loading data...")
    true_labels_df = pd.read_csv(true_labels_file_path)
    print(true_labels_df.head())  # Print first few rows to verify the column names
    true_labels = true_labels_df['label'].values  # Make sure 'label' is the correct column name
else:
    print(f"True labels file not found at {true_labels_file_path}")



True labels file not found at /content/true_labels.csv


In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing import image
from sklearn.metrics import accuracy_score  # For calculating accuracy

# Assuming you have the model loaded (scratch_model or pretrained_model)
# Define the test data folder path
test_folder_path = '/content/test_imageDL_extracted/test'  # Adjust path if necessary

# Load the test CSV if you already have it
test_csv = pd.read_csv('/content/test_csv.csv')  # Adjust path if necessary

# Prepare the data generator for test data
test_datagen = image.ImageDataGenerator(rescale=1./255)

# Create the generator for test data (no labels are needed)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_csv,
    directory=None,  # Since the file paths are already complete
    x_col='file_path',
    y_col=None,  # No labels for test data
    class_mode=None,
    target_size=(IMG_HEIGHT, IMG_WIDTH),  # Adjust image size if necessary
    batch_size=BATCH_SIZE,  # Adjust batch size as needed
    shuffle=False  # Do not shuffle, to match filenames correctly
)

# Use your model to predict on the test set
predictions = model.predict(test_generator, verbose=1)

# If your model outputs probabilities (e.g., for a classification task), take the class with the highest probability
predicted_labels = np.argmax(predictions, axis=1)  # Change this depending on your task

# OPTIONAL: If ground truth is available (for evaluation purposes)
# You can compare predicted labels with the actual labels
# test_labels = test_csv['actual_labels']  # Replace this with your actual labels column (if available)
# accuracy = accuracy_score(test_labels, predicted_labels)

# Now create the submission DataFrame
submission_df = pd.DataFrame({
    'file_id': test_csv['file_id'],
    'label': predicted_labels
})

# If ground truth is available, you can add accuracy to the CSV
# submission_df['accuracy'] = accuracy  # Only if you want to include the accuracy in the CSV file

# Save the DataFrame to a CSV file
submission_df.to_csv('/content/submission_dl.csv', index=False)

# If you want to print accuracy and other metrics:
print("Accuracy of the model on test data:", accuracy)  # If accuracy is calculated

print("Submission CSV has been saved as '/content/submission.csv'")


FileNotFoundError: [Errno 2] No such file or directory: '/content/test_csv.csv'