In [2]:
import cv2
import numpy as np

def get_valid_time_intervals(video_path, threshold=10, min_duration=0.1):
    """Return the time spans of a video during which the picture is not black.

    Every frame is converted to grayscale and its mean intensity is compared
    with ``threshold``. Consecutive frames above the threshold form a
    segment; a segment is reported only if it lasts at least
    ``min_duration`` seconds. That rule is applied to every segment,
    including one that is still open when the video ends.

    Timestamps are computed as ``frame_index / fps`` from the frame rate
    reported by the capture, so they do not accumulate floating-point error
    and they assume a constant frame rate.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        threshold: A frame counts as "not black" when its mean grayscale
            intensity is strictly greater than this value.
        min_duration: Minimum segment length in seconds; shorter segments
            (brief flashes) are dropped.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples in chronological order,
        e.g. ``[(2.5, 8.3), (12.1, 15.7)]``. ``end_sec`` is the timestamp of
        the first black frame after the segment, or the end of the video.

    Raises:
        IOError: If the video cannot be opened.
        ValueError: If the reported frame rate is not a positive number.
    """
    cap = cv2.VideoCapture(video_path)
    valid_intervals = []

    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path!r}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN, which `fps <= 0` would let through.
        if not fps > 0:
            raise ValueError(
                f"Invalid frame rate {fps!r} reported for video: {video_path!r}"
            )

        def close_segment(first_frame, end_frame):
            # Measure the duration in whole frames so that the comparison
            # with min_duration is not affected by float subtraction error.
            if (end_frame - first_frame) / fps >= min_duration:
                valid_intervals.append((first_frame / fps, end_frame / fps))

        segment_start = None  # index of the first frame of the open segment
        frame_index = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            if np.mean(gray) > threshold:  # Not black
                if segment_start is None:
                    segment_start = frame_index
            elif segment_start is not None:  # Black frame ends the segment
                close_segment(segment_start, frame_index)
                segment_start = None

            frame_index += 1

        # The video ended while a segment was still open: close it under the
        # same min_duration rule as every other segment.
        if segment_start is not None:
            close_segment(segment_start, frame_index)
    finally:
        # Release the capture even if frame processing raised.
        cap.release()

    return valid_intervals

In [None]:
from pydub import AudioSegment

# Load the full audio track.
# NOTE(review): assumes this WAV shares the video's timeline (t=0 aligned) — confirm.
audio = AudioSegment.from_wav("full_audio.wav")

# Time spans (in seconds) during which the video picture is not black.
valid_intervals = get_valid_time_intervals("your_video.mp4")

# Seed the accumulator with a zero-length slice of the source instead of
# AudioSegment.silent(duration=0). silent() carries its own default format
# (11025 Hz, 16-bit, mono), and pydub syncs both operands of `+` to the larger
# of each parameter, so a source below 11025 Hz or an 8-bit source would be
# silently resampled/widened. A slice keeps the source's own format, including
# when no interval is found.
target_audio = audio[:0]

for start_sec, end_sec in valid_intervals:
    # pydub slices are indexed in milliseconds.
    start_ms = int(start_sec * 1000)
    end_ms = int(end_sec * 1000)
    target_audio += audio[start_ms:end_ms]

target_audio.export("target_person_audio.wav", format="wav")

In [None]:
import whisper

# Load Whisper's "small" checkpoint (original author's note: runs fine on a Mac CPU).
model = whisper.load_model("small")

# Transcribe the audio file written by the extraction step above.
result = model.transcribe("target_person_audio.wav")

# Print the value stored under the "text" key of the transcription result.
transcript_text = result["text"]
print(transcript_text)