<a href="https://colab.research.google.com/github/Abhi-10000/Multimodal-Deepfake-Detection-HackVortex/blob/main/version1%262.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install moviepy face_recognition pydub librosa python_speech_features mediapipe scikit-learn




New section

In [None]:
# Install dependencies (run this in Colab before running the code block below)
# !pip install moviepy librosa opencv-python scikit-learn

import os
import cv2
import numpy as np
import librosa
from moviepy.editor import VideoFileClip
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier
import joblib
import warnings

warnings.filterwarnings("ignore")

def extract_audio_and_frames(video_path, audio_out='extracted_audio.wav', frame_dir='frames', fps=5):
    print("[Step 1] Extracting audio and frames from video...")
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_out, logger=None)

    if not os.path.exists(frame_dir):
        os.makedirs(frame_dir)

    duration = clip.duration
    for i, t in enumerate(np.arange(0, duration, 1.0 / fps)):
        frame = clip.get_frame(t)
        frame_path = os.path.join(frame_dir, f"frame_{i:03d}.jpg")
        cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

    print(f"Extracted {i+1} frames and saved audio as '{audio_out}'")
    return audio_out, frame_dir

def extract_lip_movement_features(frame_dir):
    print("[Step 2] Extracting lip movement features from frames...")
    mouth_movements = []
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    for file in sorted(os.listdir(frame_dir)):
        if file.endswith(".jpg"):
            path = os.path.join(frame_dir, file)
            img = cv2.imread(path)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, 1.3, 5)

            mouth_area = 0
            for (x, y, w, h) in faces:
                mouth_y = y + int(0.6 * h)
                mouth_h = int(0.2 * h)
                mouth = gray[mouth_y:mouth_y + mouth_h, x:x + w]
                mouth_area = cv2.countNonZero(mouth)

            mouth_movements.append(mouth_area)

    print(f"Extracted mouth movement data from {len(mouth_movements)} frames")
    return mouth_movements

def extract_audio_features(audio_path, sr=16000):
    print("[Step 3] Extracting MFCCs and audio energy...")
    y, sr = librosa.load(audio_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    energy = np.sum(librosa.feature.rms(y=y), axis=1)
    print(f"Extracted MFCC shape: {mfccs.shape}, Energy length: {len(energy)}")
    return mfccs, energy

def calculate_lipsync_inconsistency(mouth_movements, audio_energy):
    print("[Step 4] Calculating lip-sync inconsistency...")
    length = min(len(mouth_movements), len(audio_energy))
    if length < 2:
        print("Insufficient data for correlation calculation")
        return 1.0  # High suspicion if insufficient data

    mouth_movements = np.array(mouth_movements[:length])
    audio_energy = np.array(audio_energy[:length])
    corr, _ = pearsonr(mouth_movements, audio_energy)
    print(f"Lip-sync Pearson correlation: {corr:.3f}")
    return 1 - corr  # 1 = no sync, 0 = perfect sync

def synthetic_voice_detection(mfccs):
    print("[Step 5] Running synthetic voice detection...")
    mean_mfcc = np.mean(mfccs, axis=1)
    std_mfcc = np.std(mfccs, axis=1)
    features = np.concatenate((mean_mfcc, std_mfcc)).reshape(1, -1)

    if os.path.exists("voice_model.pkl"):
        clf = joblib.load("voice_model.pkl")
    else:
        # Mock classifier (for MVP demonstration)
        X_dummy = np.random.rand(20, 26)
        y_dummy = [0]*10 + [1]*10
        clf = RandomForestClassifier()
        clf.fit(X_dummy, y_dummy)
        joblib.dump(clf, "voice_model.pkl")
        print("Trained mock voice detection model")

    prob = clf.predict_proba(features)[0][1]
    print(f"Voice synthetic probability score: {prob:.3f}")
    return prob  # 0 = likely real, 1 = likely synthetic

def calculate_overall_score(lipsync_score, voice_score, w1=0.6, w2=0.4):
    print("[Step 6] Calculating weighted suspicion score...")
    print(f"Lip-sync score: {lipsync_score:.3f}, Voice score: {voice_score:.3f}")
    return w1 * lipsync_score + w2 * voice_score

def run_inference(video_path):
    print("⏳ Starting inference pipeline...")
    audio, frames = extract_audio_and_frames(video_path)

    lips = extract_lip_movement_features(frames)
    mfccs, energy = extract_audio_features(audio)

    lip_score = calculate_lipsync_inconsistency(lips, energy)
    voice_score = synthetic_voice_detection(mfccs)

    final_score = calculate_overall_score(lip_score, voice_score)

    print(f"\n🔎 Deepfake Suspicion Score: {final_score:.2f} (0-1 scale)")
    if final_score > 0.7:
        print("⚠ Likely deepfake.")
    elif final_score > 0.4:
        print("⚠ Possibly suspicious.")
    else:
        print("✅ Likely authentic.")

  if event.key is 'enter':



In [None]:
run_inference("/veo3.mp4")

⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 36 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 36 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 223), Energy length: 1
[Step 4] Calculating lip-sync inconsistency...
Insufficient data for correlation calculation
[Step 5] Running synthetic voice detection...
Trained mock voice detection model
Voice synthetic probability score: 0.580
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 1.000, Voice score: 0.580

🔎 Deepfake Suspicion Score: 0.83 (0-1 scale)
⚠ Likely deepfake.


In [None]:
run_inference("/WIN_20250526_14_50_09_Pro.mp4")

⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 36 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 36 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 222), Energy length: 1
[Step 4] Calculating lip-sync inconsistency...
Insufficient data for correlation calculation
[Step 5] Running synthetic voice detection...
Voice synthetic probability score: 0.600
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 1.000, Voice score: 0.600

🔎 Deepfake Suspicion Score: 0.84 (0-1 scale)
⚠ Likely deepfake.


In [None]:
run_inference("//News wrap in 30 Seconds, Full video follows at 9 PM on YouTube. Stay Tuned !.mp4")

⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 175 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 175 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 1088), Energy length: 1
[Step 4] Calculating lip-sync inconsistency...
Insufficient data for correlation calculation
[Step 5] Running synthetic voice detection...
Voice synthetic probability score: 0.560
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 1.000, Voice score: 0.560

🔎 Deepfake Suspicion Score: 0.82 (0-1 scale)
⚠ Likely deepfake.


In [None]:
run_inference("/videoplayback.mp4")

⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 177 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 177 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 1101), Energy length: 1
[Step 4] Calculating lip-sync inconsistency...
Insufficient data for correlation calculation
[Step 5] Running synthetic voice detection...
Voice synthetic probability score: 0.580
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 1.000, Voice score: 0.580

🔎 Deepfake Suspicion Score: 0.83 (0-1 scale)
⚠ Likely deepfake.


In [None]:
# deepfake_detector_v2.py

import os
import cv2
import numpy as np
import librosa
from moviepy.editor import VideoFileClip
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier
import joblib
import warnings
import sys

warnings.filterwarnings("ignore")

def safe_print(*args):
    try:
        print(*[str(a).encode('utf-8', 'replace').decode('utf-8') for a in args])
    except UnicodeEncodeError:
        print("⚠️ Skipped unsafe output due to encoding issue.")

def extract_audio_and_frames(video_path, audio_out='extracted_audio.wav', frame_dir='frames', fps=5):
    safe_print("[Step 1] Extracting audio and frames from video...")
    clip = VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_out, logger=None)

    if not os.path.exists(frame_dir):
        os.makedirs(frame_dir)

    duration = clip.duration
    frame_count = 0
    for i, t in enumerate(np.arange(0, duration, 1.0 / fps)):
        frame = clip.get_frame(t)
        frame_path = os.path.join(frame_dir, f"frame_{i:03d}.jpg")
        cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        frame_count += 1

    safe_print(f"Extracted {frame_count} frames and saved audio as '{audio_out}'")
    return audio_out, frame_dir

def extract_lip_movement_features(frame_dir):
    safe_print("[Step 2] Extracting lip movement features from frames...")
    mouth_movements = []
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    for file in sorted(os.listdir(frame_dir)):
        if file.endswith(".jpg"):
            path = os.path.join(frame_dir, file)
            img = cv2.imread(path)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, 1.3, 5)

            mouth_area = 0
            for (x, y, w, h) in faces:
                mouth_y = y + int(0.6 * h)
                mouth_h = int(0.2 * h)
                mouth = gray[mouth_y:mouth_y + mouth_h, x:x + w]
                mouth_area = cv2.countNonZero(mouth)

            mouth_movements.append(mouth_area)

    safe_print(f"Extracted mouth movement data from {len(mouth_movements)} frames")
    return mouth_movements

def extract_audio_features(audio_path, sr=16000, fps=5):
    safe_print("[Step 3] Extracting MFCCs and audio energy...")
    y, sr = librosa.load(audio_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    hop_length = int(sr / fps)
    frame_energy = [
        np.sum(np.square(y[i:i + hop_length]))
        for i in range(0, len(y), hop_length)
    ]

    safe_print(f"Extracted MFCC shape: {mfccs.shape}, Energy length: {len(frame_energy)}")
    return mfccs, frame_energy

def calculate_lipsync_inconsistency(mouth_movements, audio_energy):
    safe_print("[Step 4] Calculating lip-sync inconsistency...")
    length = min(len(mouth_movements), len(audio_energy))
    if length < 2:
        safe_print("Insufficient data for correlation calculation")
        return 1.0

    mouth_movements = np.array(mouth_movements[:length])
    audio_energy = np.array(audio_energy[:length])
    corr, _ = pearsonr(mouth_movements, audio_energy)
    return 1 - corr

def synthetic_voice_detection(mfccs):
    safe_print("[Step 5] Running synthetic voice detection...")
    mean_mfcc = np.mean(mfccs, axis=1)
    std_mfcc = np.std(mfccs, axis=1)
    features = np.concatenate((mean_mfcc, std_mfcc)).reshape(1, -1)

    if os.path.exists("voice_model.pkl"):
        clf = joblib.load("voice_model.pkl")
    else:
        safe_print("Trained mock voice detection model")
        X_dummy = np.random.rand(20, 26)
        y_dummy = [0]*10 + [1]*10
        clf = RandomForestClassifier()
        clf.fit(X_dummy, y_dummy)
        joblib.dump(clf, "voice_model.pkl")

    prob = clf.predict_proba(features)[0][1]
    safe_print(f"Voice synthetic probability score: {prob:.3f}")
    return prob

def calculate_overall_score(lipsync_score, voice_score, w1=0.6, w2=0.4):
    safe_print("[Step 6] Calculating weighted suspicion score...")
    safe_print(f"Lip-sync score: {lipsync_score:.3f}, Voice score: {voice_score:.3f}")
    return w1 * lipsync_score + w2 * voice_score

def run_inference(video_path):
    safe_print("\n⏳ Starting inference pipeline...")
    audio, frames = extract_audio_and_frames(video_path)
    lips = extract_lip_movement_features(frames)
    mfccs, energy = extract_audio_features(audio)
    lip_score = calculate_lipsync_inconsistency(lips, energy)
    voice_score = synthetic_voice_detection(mfccs)
    final_score = calculate_overall_score(lip_score, voice_score)

    safe_print(f"\n🔎 Deepfake Suspicion Score: {final_score:.2f} (0-1 scale)")
    if final_score > 0.7:
        safe_print("⚠️ Likely deepfake.")
    elif final_score > 0.4:
        safe_print("⚠️ Possibly suspicious.")
    else:
        safe_print("✅ Likely authentic.")


In [None]:
run_inference("/videoplayback.mp4")


⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 177 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 177 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 1101), Energy length: 177
[Step 4] Calculating lip-sync inconsistency...
[Step 5] Running synthetic voice detection...
Voice synthetic probability score: 0.580
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 0.971, Voice score: 0.580

🔎 Deepfake Suspicion Score: 0.81 (0-1 scale)
⚠️ Likely deepfake.


In [None]:
run_inference("//News wrap in 30 Seconds, Full video follows at 9 PM on YouTube. Stay Tuned !.mp4")


⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 175 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 177 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 1088), Energy length: 175
[Step 4] Calculating lip-sync inconsistency...
[Step 5] Running synthetic voice detection...
Voice synthetic probability score: 0.560
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 0.918, Voice score: 0.560

🔎 Deepfake Suspicion Score: 0.77 (0-1 scale)
⚠️ Likely deepfake.


In [None]:
run_inference("/WIN_20250526_14_50_09_Pro.mp4")


⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 36 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 177 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 222), Energy length: 222
[Step 4] Calculating lip-sync inconsistency...
Lip-sync correlation: -0.631, Inconsistency Score: 1.631
[Step 5] Running synthetic voice detection...
Voice synthetic probability score: 0.600
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 1.631, Voice score: 0.600

🔎 Deepfake Suspicion Score: 1.22 (0-1 scale)
⚠ Likely deepfake.


In [None]:
run_inference("/veo3.mp4")


⏳ Starting inference pipeline...
[Step 1] Extracting audio and frames from video...
Extracted 36 frames and saved audio as 'extracted_audio.wav'
[Step 2] Extracting lip movement features from frames...
Extracted mouth movement data from 177 frames
[Step 3] Extracting MFCCs and audio energy...
Extracted MFCC shape: (13, 223), Energy length: 223
[Step 4] Calculating lip-sync inconsistency...
Lip-sync correlation: -0.179, Inconsistency Score: 1.179
[Step 5] Running synthetic voice detection...
Voice synthetic probability score: 0.580
[Step 6] Calculating weighted suspicion score...
Lip-sync score: 1.179, Voice score: 0.580

🔎 Deepfake Suspicion Score: 0.94 (0-1 scale)
⚠ Likely deepfake.
