In [11]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import pandas as pd

dataset_folders = ["Celeb-real", "Youtube-real", "Celeb-Youtube-fake", "test"]
output_directory = "preprocessed_data"
os.makedirs(output_directory, exist_ok=True)

def preprocess_videos(dataset_folder, label, output_directory):
    for video_file in os.listdir(dataset_folder):
        video_path = os.path.join(dataset_folder, video_file)
        
        # Create output directory for the current video
        video_output_directory = os.path.join(output_directory, f"{label}_{video_file.split('.')[0]}")
        os.makedirs(video_output_directory, exist_ok=True)
        
        # Open video file
        cap = cv2.VideoCapture(video_path)
        success, image = cap.read()
        count = 0
        
        # Loop through frames and save them as images
        while success:
            # Define output file path for the current frame
            frame_path = os.path.join(video_output_directory, f"frame_{count}.jpg")
            
            # Resize frame to desired dimensions (e.g., 224x224) and save as JPEG
            resized_image = cv2.resize(image, (224, 224))
            cv2.imwrite(frame_path, resized_image)
            
            success, image = cap.read()
            count += 1
        
        cap.release()

for i, folder in enumerate(dataset_folders):
    if i < 2:
        label = "real"
    elif i == 2:
        label = "fake"
    else:
        label = "test"
    
    dataset_folder_path = os.path.join(".", folder)
    preprocess_videos(dataset_folder_path, label, output_directory)

preprocessed_data_path = "preprocessed_data"
real_folders = [folder for folder in os.listdir(preprocessed_data_path) if folder.startswith('real')]
fake_folders = [folder for folder in os.listdir(preprocessed_data_path) if folder.startswith('fake')]
test_folders = [folder for folder in os.listdir(preprocessed_data_path) if folder.startswith('test')]


# Define function to load data batch by batch using a generator
def data_generator(data_folders, batch_size=32):
    while True:
        X = []
        y = []
        for folder in data_folders:
            folder_path = os.path.join(preprocessed_data_path, folder)
            for file in os.listdir(folder_path):
                image_path = os.path.join(folder_path, file)
                # Read image using OpenCV
                image = cv2.imread(image_path)
                # Resize image to target size
                image = cv2.resize(image, (224, 224))
                # Normalize pixel values to range [0, 1]
                image = image.astype('float32') / 255.0
                X.append(image)
                # Append label to y
                y.append(int(folder.startswith('real')))
                # Yield batch if it reaches batch size
                if len(X) == batch_size:
                    yield np.array(X), np.array(y)
                    X = []
                    y = []

# Split data folders into training, validation, and test sets

train_real_folders, val_real_folders = train_test_split(real_folders, test_size=0.2, random_state=42)
train_fake_folders, val_fake_folders = train_test_split(fake_folders, test_size=0.2, random_state=42)

# Create data generators for training, validation, and test sets
train_generator = data_generator(train_real_folders + train_fake_folders)
val_generator = data_generator(val_real_folders + val_fake_folders)
test_generator = data_generator(test_folders)

# Define the model architecture
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using generators
history = model.fit(train_generator, steps_per_epoch=len(train_real_folders + train_fake_folders),
                    epochs=10, validation_data=val_generator, validation_steps=len(val_real_folders + val_fake_folders))

# Evaluate the model on the test set
# test_loss, test_accuracy = model.evaluate(test_generator, steps=len(test_data_folders))
# print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# # Predict classes on the test set
# y_pred = model.predict(test_generator, steps=len(test_data_folders)) > 0.5  # Thresholding at 0.5 for binary classification

# # Flatten the predictions and true labels
# y_pred_flat = y_pred.flatten()
# y_true_flat = np.array([int(folder.startswith('real')) for folder in test_data_folders])

# # Calculate F1 score
# f1 = f1_score(y_true_flat, y_pred_flat, average='binary')
# precision = precision_score(y_true_flat, y_pred_flat, average='binary')
# recall = recall_score(y_true_flat, y_pred_flat, average='binary')

# # Print F1 score to console
# print(f'F1 Score: {f1}')
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')

# # Save predictions to CSV file
# results_df = pd.DataFrame({'Filename': test_data_folders, 'True_Label': y_true_flat, 'Predicted_Label': y_pred_flat})
# results_df.to_csv('predictions.csv', index=False)

# # Plot training history
# plt.plot(history.history['accuracy'], label='Training Accuracy')
# plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.savefig('accuracy_plot.pdf')
# plt.show()

# # Create a PDF file with the F1 score
# with open('f1_score.pdf', 'w') as f:
#     f.write(f'F1 Score: {f1}\n')
#     f.write(f'Precision: {precision}\n')
#     f.write(f'Recall: {recall}\n')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.0, Test Accuracy: 1.0


ValueError: Found input variables with inconsistent numbers of samples: [26, 832]

In [15]:
import os
import pandas as pd
from tensorflow.keras.models import load_model
import cv2
import numpy as np

# Load the trained model
model = load_model('deepfake_detection_model.h5')

# Directory containing test data
test_data_dir = 'test'  # Assuming the "test" folder is in the same directory as the script

# Get list of video files in the test data directory
test_videos = os.listdir(test_data_dir)

# Initialize lists to store filenames and predictions
filenames = []
predictions = []

# Loop through each video file in the test data directory
for video_file in test_videos:
    # Read video file
    video_path = os.path.join(test_data_dir, video_file)
    cap = cv2.VideoCapture(video_path)
    
    # Preprocess video frames and make predictions
    frames = []
    while True:
        success, image = cap.read()
        if not success:
            break
        resized_image = cv2.resize(image, (224, 224))
        resized_image = resized_image.astype('float32') / 255.0
        frames.append(resized_image)
    
    # Make predictions on the video frames
    frames = np.array(frames)
    predictions.append(model.predict(frames).mean())  # Use mean prediction for video
    
    # Append filename to list
    filenames.append(video_file)

# Convert predictions to binary labels
predicted_labels = ['real' if pred >= 0.5 else 'fake' for pred in predictions]

# Create DataFrame
submission_df = pd.DataFrame({'Filename': filenames, 'Predicted_Label': predicted_labels})

# Save DataFrame to CSV
submission_df.to_csv('submission.csv', index=False)




In [None]:
# Evaluate the model on the test set
try:
    test_loss, test_accuracy = model.evaluate(test_generator, steps=None, verbose=0)
    print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')
except StopIteration:
    print("Test generator reached the end of the dataset.")

# Predict classes on the test set
test_data_generator = data_generator(test_folders)
y_pred = []
try:
    while True:
        X_batch, _ = next(test_data_generator)
        y_pred_batch = model.predict(X_batch) > 0.5  # Thresholding at 0.5 for binary classification
        y_pred.extend(y_pred_batch)
except StopIteration:
    pass

# Flatten the predictions and true labels
y_pred_flat = np.array(y_pred).flatten()
y_true_flat = np.array([int(folder.startswith('real')) for folder in test_folders])

# Calculate F1 score
f1 = f1_score(y_true_flat, y_pred_flat, average='binary')
precision = precision_score(y_true_flat, y_pred_flat, average='binary')
recall = recall_score(y_true_flat, y_pred_flat, average='binary')

# Print F1 score to console
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

# Save predictions to CSV file
results_df = pd.DataFrame({'Filename': test_folders, 'True_Label': y_true_flat, 'Predicted_Label': y_pred_flat})
results_df.to_csv('predictions.csv', index=False)

# Plot training history
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('accuracy_plot.pdf')
plt.show()

# Create a PDF file with the F1 score
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle

pdf_filename = "f1_score.pdf"
doc = SimpleDocTemplate(pdf_filename)

data = [["F1 Score", "Precision", "Recall"],
        [f"{f1}", f"{precision}", f"{recall}"]]

table = Table(data, colWidths=100, rowHeights=30)
table.setStyle(TableStyle([('BACKGROUND', (0, 0), (-1, 0), colors.gray),
                           ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                           ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                           ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                           ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                           ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
                           ('GRID', (0, 0), (-1, -1), 1, colors.black)]))

doc.build([table])

print(f"F1 score saved to {pdf_filename}")


Test generator reached the end of the dataset.
