In [1]:
import cv2
import numpy as np
import torch
import os
import torchvision.models as models
from torchvision import transforms
import torch.nn as nn

# Define constants
BASE_DIR = "/Users/ananyapurkait/Study Files and Folders/Semester Study/Sem VII/CS435/Project/working/code"
MODEL_PATH = os.path.join(BASE_DIR, "convnext_scenario_1.h5")
VIDEO_FOLDER = os.path.join(BASE_DIR, "deepfake_vids")
VIDEO_NAME = "deepfake_vid2.mp4"
VIDEO_PATH = os.path.join(VIDEO_FOLDER, VIDEO_NAME)

# Pre-trained ConvNeXt Tiny model structure
try:
    model = models.convnext_tiny(pretrained=False)  # Load an empty ConvNeXt model
    model.load_state_dict(torch.load(MODEL_PATH, weights_only=True))  # Load trained weights
    model.classifier[2] = nn.Linear(model.classifier[2].in_features, 1)  # Modify output layer
    model.eval()  # Set the model to evaluation mode
except Exception as e:
    print(f"Error loading model: {e}")
    exit()

# Define preprocessing transformations
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to preprocess a frame
def preprocess_frame(frame):
    try:
        frame_tensor = preprocess(frame)
        return frame_tensor.unsqueeze(0)  # Add batch dimension
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return None

# Function to generate a saliency map and draw bounding boxes
def generate_saliency_map_with_bboxes(frame, model):
    try:
        input_tensor = preprocess_frame(frame)
        input_tensor.requires_grad = True  # Enable gradient computation
        
        # Forward pass
        output = model(input_tensor)
        prediction = torch.sigmoid(output).squeeze()

        # Backward pass
        model.zero_grad()
        prediction.backward()

        # Compute saliency map
        saliency = input_tensor.grad.abs().squeeze().cpu().numpy()
        saliency = np.max(saliency, axis=0)  # Aggregate across channels
        saliency = cv2.normalize(saliency, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # Threshold saliency map to find regions of interest
        _, thresh = cv2.threshold(saliency, 100, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Generate heatmap
        heatmap = cv2.applyColorMap(saliency, cv2.COLORMAP_JET)
        overlay = cv2.addWeighted(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), 0.6, heatmap, 0.4, 0)

        # Draw bounding boxes around salient regions
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 255, 0), 2)

        return overlay, saliency
    except Exception as e:
        print(f"Error generating saliency map with bounding boxes: {e}")
        return frame, None

# Modified deepfake detection function
def detect_deepfake_with_bboxes(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count, fake_count, real_count = 0, 0, 0

    if not cap.isOpened():
        print("Error: Could not open video.")
        return None

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_count += 1
        if frame_count % 10 == 0:  # Process every 10th frame
            processed_frame = preprocess_frame(frame)
            if processed_frame is None:
                continue
            with torch.no_grad():
                outputs = model(processed_frame)
                prediction = torch.sigmoid(outputs).item()
            if prediction > 0.5:
                fake_count += 1
            else:
                real_count += 1
            
            # Display saliency map with bounding boxes
            overlay_frame, _ = generate_saliency_map_with_bboxes(frame, model)
            cv2.imshow("Deepfake Detection with Bounding Boxes", overlay_frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    cap.release()
    cv2.destroyAllWindows()

    total_frames = fake_count + real_count
    accuracy = (real_count / total_frames) * 100 if total_frames > 0 else 0
    print(f"Real frames: {real_count}, Fake frames: {fake_count}")
    print(f"Accuracy: {accuracy:.2f}%")
    return "Fake" if fake_count > real_count else "Real", accuracy

# Run the modified detection
result, accuracy = detect_deepfake_with_bboxes(VIDEO_PATH)
print(f"Video is classified as: {result} with {accuracy:.2f}% accuracy")





Error generating saliency map with bounding boxes: OpenCV(4.10.0) /Users/xperience/GHA-Actions-OpenCV/_work/opencv-python/opencv-python/opencv/modules/core/src/arithm.cpp:661: error: (-209:Sizes of input arguments do not match) The operation is neither 'array op array' (where arrays have the same size and the same number of channels), nor 'array op scalar', nor 'scalar op array' in function 'arithm_op'





Error generating saliency map with bounding boxes: OpenCV(4.10.0) /Users/xperience/GHA-Actions-OpenCV/_work/opencv-python/opencv-python/opencv/modules/core/src/arithm.cpp:661: error: (-209:Sizes of input arguments do not match) The operation is neither 'array op array' (where arrays have the same size and the same number of channels), nor 'array op scalar', nor 'scalar op array' in function 'arithm_op'

Error generating saliency map with bounding boxes: OpenCV(4.10.0) /Users/xperience/GHA-Actions-OpenCV/_work/opencv-python/opencv-python/opencv/modules/core/src/arithm.cpp:661: error: (-209:Sizes of input arguments do not match) The operation is neither 'array op array' (where arrays have the same size and the same number of channels), nor 'array op scalar', nor 'scalar op array' in function 'arithm_op'

Error generating saliency map with bounding boxes: OpenCV(4.10.0) /Users/xperience/GHA-Actions-OpenCV/_work/opencv-python/opencv-python/opencv/modules/core/src/arithm.cpp:661: error: (-2

In [2]:
import cv2
import numpy as np
import torch
import os
import torchvision.models as models
from torchvision import transforms
import torch.nn as nn

# Define constants
BASE_DIR = "/Users/ananyapurkait/Study Files and Folders/Semester Study/Sem VII/CS435/Project/working/code"
MODEL_PATH = os.path.join(BASE_DIR, "convnext_scenario_1.h5")
VIDEO_FOLDER = os.path.join(BASE_DIR, "deepfake_vids")
VIDEO_NAME = "deepfake_vid2.mp4"
VIDEO_PATH = os.path.join(VIDEO_FOLDER, VIDEO_NAME)

# Pre-trained ConvNeXt Tiny model structure
try:
    model = models.convnext_tiny(pretrained=False)  # Load an empty ConvNeXt model
    model.load_state_dict(torch.load(MODEL_PATH, weights_only=True))  # Load trained weights
    model.classifier[2] = nn.Linear(model.classifier[2].in_features, 1)  # Modify output layer
    model.eval()  # Set the model to evaluation mode
except Exception as e:
    print(f"Error loading model: {e}")
    exit()

# Define preprocessing transformations
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to preprocess a frame
def preprocess_frame(frame):
    try:
        frame_tensor = preprocess(frame)
        return frame_tensor.unsqueeze(0)  # Add batch dimension
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return None

def generate_saliency_map_with_bboxes(frame, model):
    try:
        input_tensor = preprocess_frame(frame)
        input_tensor.requires_grad = True  # Enable gradient computation
        
        # Forward pass
        output = model(input_tensor)
        prediction = torch.sigmoid(output).squeeze()

        # Backward pass
        model.zero_grad()
        prediction.backward()

        # Compute saliency map
        saliency = input_tensor.grad.abs().squeeze().cpu().numpy()
        saliency = np.max(saliency, axis=0)  # Aggregate across channels
        saliency = cv2.normalize(saliency, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

        # Threshold saliency map to find regions of interest
        _, thresh = cv2.threshold(saliency, 100, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Generate heatmap
        heatmap = cv2.applyColorMap(saliency, cv2.COLORMAP_JET)

        # Resize heatmap to match frame size
        heatmap_resized = cv2.resize(heatmap, (frame.shape[1], frame.shape[0]))

        # Convert the frame to RGB (if it is not already)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Overlay heatmap on the frame
        overlay = cv2.addWeighted(frame_rgb, 0.6, heatmap_resized, 0.4, 0)

        # Draw bounding boxes around salient regions
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            cv2.rectangle(overlay, (x, y), (x + w, y + h), (0, 255, 0), 2)

        return overlay, saliency
    except Exception as e:
        print(f"Error generating saliency map with bounding boxes: {e}")
        return frame, None


# Modified deepfake detection function
def detect_deepfake_with_bboxes(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count, fake_count, real_count = 0, 0, 0

    if not cap.isOpened():
        print("Error: Could not open video.")
        return None

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_count += 1
        if frame_count % 10 == 0:  # Process every 10th frame
            processed_frame = preprocess_frame(frame)
            if processed_frame is None:
                continue
            with torch.no_grad():
                outputs = model(processed_frame)
                prediction = torch.sigmoid(outputs).item()
            if prediction > 0.5:
                fake_count += 1
            else:
                real_count += 1
            
            # Display saliency map with bounding boxes
            overlay_frame, _ = generate_saliency_map_with_bboxes(frame, model)
            cv2.imshow("Deepfake Detection with Bounding Boxes", overlay_frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    cap.release()
    cv2.destroyAllWindows()

    total_frames = fake_count + real_count
    accuracy = (real_count / total_frames) * 100 if total_frames > 0 else 0
    print(f"Real frames: {real_count}, Fake frames: {fake_count}")
    print(f"Accuracy: {accuracy:.2f}%")
    return "Fake" if fake_count > real_count else "Real", accuracy

# Run the modified detection
result, accuracy = detect_deepfake_with_bboxes(VIDEO_PATH)
print(f"Video is classified as: {result} with {accuracy:.2f}% accuracy")


Real frames: 13, Fake frames: 3
Accuracy: 81.25%
Video is classified as: Real with 81.25% accuracy
