In [None]:
import cv2
import numpy as np
import librosa
import soundfile as sf
import face_recognition
from pydub import AudioSegment
from pydub.silence import split_on_silence
import argparse
import os
from typing import List, Tuple
import tkinter as tk
from tkinter import filedialog


In [None]:

class AudioSegmentExtractor:
    """Keep only the audio that coincides with a target person being on screen.

    Intended call order:
    ``load_reference_image`` -> ``detect_person_in_video`` ->
    ``extract_audio_segments``.
    """

    def __init__(self):
        # Encodings of every face found in the reference image.
        self.face_encodings = []
        # (start_seconds, end_seconds) pairs from the most recent video analysis.
        self.person_segments = []

    def load_reference_image(self, image_path: str) -> bool:
        """Load the reference image and store the face encodings found in it.

        Args:
            image_path: Path to an image showing the person to look for.

        Returns:
            True if at least one face was encoded and stored, False if the
            image contains no detectable face or could not be loaded.
        """
        try:
            reference_image = face_recognition.load_image_file(image_path)
            face_encodings = face_recognition.face_encodings(reference_image)

            if not face_encodings:
                print("No face found in reference image!")
                return False

            self.face_encodings = face_encodings
            print(f"Loaded reference face from {image_path}")
            return True
        except Exception as e:
            # Best-effort loader: report the problem and let the caller decide.
            print(f"Error loading reference image: {e}")
            return False

    def detect_person_in_video(self, video_path: str, tolerance: float = 0.6) -> List[Tuple[float, float]]:
        """Find the time ranges in which the reference person is visible.

        Roughly two frames per second are sampled; each sampled frame is
        compared against the stored reference encodings.

        Args:
            video_path: Path to the video to analyse.
            tolerance: Maximum face distance that still counts as a match,
                forwarded to ``face_recognition.compare_faces``.

        Returns:
            A list of (start_seconds, end_seconds) tuples, also stored in
            ``self.person_segments``. Empty when the frame rate cannot be
            determined or the person is never found.
        """
        cap = cv2.VideoCapture(video_path)
        person_present = []
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            print(f"Processing video: {frame_count} frames at {fps} FPS")

            # Timestamps are derived from fps, so a missing/zero/NaN frame
            # rate would otherwise surface as a ZeroDivisionError mid-scan.
            if not fps > 0:
                print("Could not determine video frame rate; skipping person detection")
                self.person_segments = []
                return []

            # Process every nth frame to speed up detection (~2 per second).
            skip_frames = max(1, int(fps // 2))
            sample_interval = skip_frames / fps

            frame_number = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_number % skip_frames == 0:
                    # OpenCV delivers BGR; face_recognition expects RGB.
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    face_locations = face_recognition.face_locations(rgb_frame)
                    frame_encodings = face_recognition.face_encodings(rgb_frame, face_locations)

                    # A frame counts as a hit if any face matches any reference encoding.
                    person_found = any(
                        any(face_recognition.compare_faces(
                            self.face_encodings, encoding, tolerance=tolerance))
                        for encoding in frame_encodings
                    )

                    person_present.append((frame_number / fps, person_found))

                    # Progress update every ~15 seconds of video. Some containers
                    # report no frame count, in which case a percentage is
                    # meaningless (and would divide by zero).
                    if frame_count > 0 and frame_number % (skip_frames * 30) == 0:
                        progress = (frame_number / frame_count) * 100
                        print(f"Progress: {progress:.1f}%")

                frame_number += 1
        finally:
            # Release the capture even if face recognition raised.
            cap.release()

        # Convert detection points to segments
        segments = self._create_segments(person_present, fps, sample_interval)
        self.person_segments = segments

        print(f"Found {len(segments)} segments where person is present")
        return segments

    def _create_segments(self, detections: List[Tuple[float, bool]], fps: float,
                         sample_interval: float = None) -> List[Tuple[float, float]]:
        """Convert per-sample detections into merged continuous segments.

        A segment starts at the first hit and ends at the first following
        miss, i.e. one sampling interval after its last hit. A segment that
        is still open at the end of the detections is closed the same way,
        one sampling interval after the final hit, so it is never zero-length.

        Args:
            detections: (timestamp_seconds, person_present) pairs in time order.
            fps: Video frame rate; used as a last-resort sampling interval
                (one frame) when nothing better is known.
            sample_interval: Seconds between consecutive samples. When omitted
                it is inferred from the spacing of the last two detections.

        Returns:
            (start_seconds, end_seconds) tuples; segments separated by less
            than one second are merged.
        """
        if not detections:
            return []

        if sample_interval is None:
            if len(detections) >= 2:
                sample_interval = detections[-1][0] - detections[-2][0]
            elif fps and fps > 0:
                sample_interval = 1.0 / fps
            else:
                sample_interval = 0.0
        sample_interval = max(0.0, sample_interval)

        segments = []
        current_start = None

        for timestamp, person_present in detections:
            if person_present and current_start is None:
                current_start = timestamp
            elif not person_present and current_start is not None:
                segments.append((current_start, timestamp))
                current_start = None

        # Person still present at the last sample: close the segment where the
        # next sample would have been taken.
        if current_start is not None:
            segments.append((current_start, detections[-1][0] + sample_interval))

        # Merge nearby segments (gap shorter than 1 second)
        merged_segments = []
        for start, end in segments:
            if merged_segments and start - merged_segments[-1][1] < 1.0:
                merged_segments[-1] = (merged_segments[-1][0], end)
            else:
                merged_segments.append((start, end))

        return merged_segments

    def extract_audio_segments(self, source_audio_path: str, output_path: str,
                             total_duration: float = None) -> str:
        """Write a WAV that is silent except during ``self.person_segments``.

        Args:
            source_audio_path: Audio file to copy the segments from.
            output_path: Destination path; always exported as WAV.
            total_duration: Length of the output in seconds. When omitted or
                not positive, the length of the source audio is used.

        Returns:
            ``output_path`` on success, or None if anything failed (the error
            is printed).
        """
        try:
            print(f"Loading source audio: {source_audio_path}")
            source_audio = AudioSegment.from_file(source_audio_path)
            source_len_ms = len(source_audio)

            # Fall back to the source length when no usable duration was given.
            if total_duration is None or not total_duration > 0:
                total_duration = source_len_ms / 1000.0

            total_ms = int(total_duration * 1000)
            output_audio = AudioSegment.silent(duration=total_ms)

            print(f"Creating audio file of {total_duration:.2f} seconds")
            print(f"Extracting {len(self.person_segments)} segments")

            for i, (start_time, end_time) in enumerate(self.person_segments):
                print(f"Processing segment {i+1}: {start_time:.2f}s - {end_time:.2f}s")

                # Convert to milliseconds, clamped to the output timeline.
                start_ms = max(0, int(start_time * 1000))
                end_ms = min(int(end_time * 1000), total_ms)

                # Nothing to copy for empty segments or ones that start after
                # the source audio ends; the base track is already silent there.
                if end_ms <= start_ms or start_ms >= source_len_ms:
                    continue

                # Slicing past the end of the source yields just the audio that
                # exists, so no explicit silence padding is needed.
                clip = source_audio[start_ms:end_ms]
                output_audio = output_audio.overlay(clip, position=start_ms)

            print(f"Exporting final audio to: {output_path}")
            output_audio.export(output_path, format="wav")

            return output_path

        except Exception as e:
            # Best-effort: report and signal failure through the return value.
            print(f"Error extracting audio segments: {e}")
            return None

    def get_video_duration(self, video_path: str) -> float:
        """Return the video duration in seconds.

        Returns 0.0 when the frame rate is unavailable (for example when the
        file could not be opened) instead of dividing by zero.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        finally:
            cap.release()

        if not fps > 0:
            return 0.0
        return max(0.0, frame_count / fps)

def select_file(title: str, filetypes: List[Tuple[str, str]]) -> str:
    """Show a native "open file" dialog and return the chosen path.

    Args:
        title: Caption of the dialog window.
        filetypes: (label, pattern) pairs offered in the file-type filter.

    Returns:
        The selected path, or an empty string if the dialog was dismissed
        without a selection.
    """
    root = tk.Tk()
    try:
        root.withdraw()  # Only the dialog should be visible, not the root window
        file_path = filedialog.askopenfilename(
            title=title,
            filetypes=filetypes,
        )
    finally:
        # Tear the hidden root window down even if the dialog raised.
        root.destroy()

    # Normalise any empty result to "" so the return type is always str.
    return file_path or ""

def _resolve_input_file(cli_value, prompt, title, filetypes, error_message):
    """Return an existing path from the CLI value or a file dialog, else None."""
    path = cli_value
    if not path:
        print(prompt)
        path = select_file(title, filetypes)

    if not path or not os.path.exists(path):
        print(error_message)
        return None
    return path


def main():
    """Run the full pipeline: pick inputs, detect the person, export the audio.

    Each input (video, reference image, source audio) is taken from the command
    line when given, otherwise requested through a file dialog. The function
    returns early, after printing a message, as soon as a step fails.
    """
    parser = argparse.ArgumentParser(description="Extract audio segments based on person detection in video")
    parser.add_argument("--video", help="Path to input video file")
    parser.add_argument("--reference", help="Path to reference image of person")
    parser.add_argument("--audio", help="Path to source audio file")
    parser.add_argument("--output", help="Path to output audio file", default="extracted_audio.wav")
    parser.add_argument("--tolerance", type=float, default=0.6, help="Face recognition tolerance (0.0-1.0)")

    # parse_known_args tolerates arguments that belong to the host process
    # (e.g. a notebook kernel's "-f <connection file>") instead of exiting.
    args, unknown = parser.parse_known_args()
    if unknown:
        print(f"Ignoring unrecognized arguments: {unknown}")

    extractor = AudioSegmentExtractor()

    video_path = _resolve_input_file(
        args.video,
        "Select video file:",
        "Select Video File",
        [
            ("Video files", "*.mp4 *.avi *.mov *.mkv *.wmv"),
            ("All files", "*.*"),
        ],
        "Invalid video file selected!",
    )
    if video_path is None:
        return

    reference_path = _resolve_input_file(
        args.reference,
        "Select reference image of the person:",
        "Select Reference Image",
        [
            ("Image files", "*.jpg *.jpeg *.png *.bmp"),
            ("All files", "*.*"),
        ],
        "Invalid reference image selected!",
    )
    if reference_path is None:
        return

    # load_reference_image prints its own diagnostics on failure.
    if not extractor.load_reference_image(reference_path):
        return

    print("Analyzing video for person detection...")
    segments = extractor.detect_person_in_video(video_path, tolerance=args.tolerance)

    if not segments:
        print("No segments found where the person is present!")
        return

    print("\nDetected segments:")
    for i, (start, end) in enumerate(segments):
        print(f"  Segment {i+1}: {start:.2f}s - {end:.2f}s (duration: {end - start:.2f}s)")

    total_segment_time = sum(end - start for start, end in segments)
    print(f"Total time with person present: {total_segment_time:.2f}s")

    audio_path = _resolve_input_file(
        args.audio,
        "\nSelect source audio file:",
        "Select Source Audio File",
        [
            ("Audio files", "*.wav *.mp3 *.m4a *.flac *.aac"),
            ("All files", "*.*"),
        ],
        "Invalid audio file selected!",
    )
    if audio_path is None:
        return

    video_duration = extractor.get_video_duration(video_path)
    print(f"Video duration: {video_duration:.2f}s")

    print("\nExtracting audio segments...")
    # An unknown (non-positive) duration is passed as None so the extractor
    # falls back to the length of the source audio.
    output_path = extractor.extract_audio_segments(
        audio_path,
        args.output,
        video_duration if video_duration > 0 else None,
    )

    if output_path:
        print(f"\nSuccess! Extracted audio saved to: {output_path}")
        print(f"The output audio will have sound only during the {len(segments)} segments where the person is visible.")
    else:
        print("Failed to extract audio segments!")