In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import cv2
import librosa
from moviepy.editor import VideoFileClip

# Image preprocessing
def preprocess_image(image_path, target_size=(224, 224)):
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, target_size)
    img = img / 255.0  # Normalize
    return img

# Video preprocessing
def preprocess_video(video_path, target_size=(224, 224), max_frames=30):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while len(frames) < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, target_size)
        frame = frame / 255.0  # Normalize
        frames.append(frame)
    cap.release()
    
    # Pad if fewer than max_frames
    while len(frames) < max_frames:
        frames.append(np.zeros_like(frames[0]))
    
    return np.array(frames)

# Audio preprocessing
def preprocess_audio(audio_path, max_length=5*22050):  # 5 seconds at 22050 Hz
    audio, _ = librosa.load(audio_path, sr=22050)
    if len(audio) > max_length:
        audio = audio[:max_length]
    else:
        audio = np.pad(audio, (0, max_length - len(audio)))
    
    # Extract mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=22050)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

# CNN Model for Image and Video
def create_image_model(input_shape=(224, 224, 3)):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

# CNN Model for Audio
def create_audio_model(input_shape=(128, 87, 1)):  # Adjust based on your mel spectrogram shape
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

# Combined model
def create_combined_model(image_shape=(224, 224, 3), video_shape=(30, 224, 224, 3), audio_shape=(128, 87, 1)):
    # Image input
    image_input = layers.Input(shape=image_shape)
    image_model = create_image_model(image_shape)(image_input)
    
    # Video input
    video_input = layers.Input(shape=video_shape)
    video_model = layers.TimeDistributed(create_image_model(image_shape[1:]))(video_input)
    video_model = layers.GlobalAveragePooling3D()(video_model)
    
    # Audio input
    audio_input = layers.Input(shape=audio_shape)
    audio_model = create_audio_model(audio_shape)(audio_input)
    
    # Combine all inputs
    combined = layers.concatenate([image_model, video_model, audio_model])
    output = layers.Dense(1, activation='sigmoid')(combined)
    
    model = models.Model(inputs=[image_input, video_input, audio_input], outputs=output)
    return model

# Create and compile the model
combined_model = create_combined_model()
combined_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Example usage
def analyze_media(image_path, video_path, audio_path):
    # Preprocess inputs
    image = preprocess_image(image_path)
    video = preprocess_video(video_path)
    audio = preprocess_audio(audio_path)
    
    # Reshape inputs for model
    image = np.expand_dims(image, axis=0)
    video = np.expand_dims(video, axis=0)
    audio = np.expand_dims(audio, axis=0)
    audio = np.expand_dims(audio, axis=-1)
    
    # Make prediction
    prediction = combined_model.predict([image, video, audio])
    
    return prediction[0][0]

# Example call
# result = analyze_media('path/to/image.jpg', 'path/to/video.mp4', 'path/to/audio.wav')
# print(f"Deepfake probability: {result:.2f}")

ValueError: Input 0 of layer "conv2d_3" is incompatible with the layer: expected min_ndim=4, found ndim=3. Full shape received: (None, 224, 3)