## Import libraries

In [None]:
import os
import cv2
from pydub import AudioSegment
import numpy as np

### Function: `block_difference`
- Compares two frames by dividing them into blocks
- Calculates the mean absolute difference between corresponding blocks 
- Returns the average block difference across the whole frame

In [3]:
def block_difference(frame1, frame2, block_size=16):
    """Return the mean of per-block mean absolute differences between two frames.

    Both frames are walked in ``block_size`` x ``block_size`` tiles (tiles on
    the right/bottom edge may be smaller). A tile contributes only when the
    slices taken from both frames have the same shape.

    Args:
        frame1: 2-D array; its shape defines the tiling grid.
        frame2: 2-D array compared against ``frame1``.
        block_size: Edge length of each tile in pixels.

    Returns:
        The average of the per-tile mean absolute differences, or ``0`` when
        no tile could be compared.
    """
    rows, cols = frame1.shape
    block_means = []

    for top in range(0, rows, block_size):
        bottom = top + block_size
        for left in range(0, cols, block_size):
            right = left + block_size
            patch_a = frame1[top:bottom, left:right]
            patch_b = frame2[top:bottom, left:right]
            if patch_a.shape != patch_b.shape:
                continue
            # Widen to int first so unsigned pixel types cannot wrap around
            # when subtracting.
            delta = patch_a.astype(int) - patch_b.astype(int)
            block_means.append(np.abs(delta).mean())

    if not block_means:
        return 0
    return sum(block_means) / len(block_means)

### Function: `extract_keyframes_from_shot`
- Extracts keyframes from a list of frames using block difference
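- Always keeps the first frame and uses it as the fixed reference; each later frame is added when its block difference from that first frame exceeds the threshold
- Resizes keyframes to 224x224
- Returns exactly 15 keyframes: stops once 15 have been collected and pads with copies of the last keyframe if fewer were found (an empty shot returns an empty list)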

In [5]:
def extract_keyframes_from_shot(frames, fps, block_threshold=20):
    """Select keyframes from one shot and return them resized to 224x224.

    The first frame is always kept and acts as the fixed reference. Every
    later frame is converted to grayscale (treating it as BGR) and compared
    with the first frame via ``block_difference``; frames scoring above
    ``block_threshold`` are kept. Collection stops at 15 keyframes, and the
    result is padded with copies of the last keyframe when fewer were found.

    Args:
        frames: Sequence of BGR images belonging to one shot.
        fps: Unused; retained so existing callers keep working.
        block_threshold: Minimum block difference (exclusive) from the first
            frame for a frame to be kept.

    Returns:
        An empty list when ``frames`` is empty, otherwise a list of exactly
        15 images of size 224x224.
    """
    target_num_keyframes = 15

    # Check emptiness before doing anything else; no fps arithmetic happens
    # here, so an fps of 0 can no longer raise ZeroDivisionError.
    if len(frames) == 0:
        return []

    ref_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
    keyframes = [cv2.resize(frames[0], (224, 224))]

    for frame in frames[1:]:
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # The reference is never updated: every frame is measured against
        # the first frame of the shot.
        if block_difference(ref_gray, gray) > block_threshold:
            keyframes.append(cv2.resize(frame, (224, 224)))

        if len(keyframes) >= target_num_keyframes:
            break

    # Pad so downstream consumers always receive a fixed-length list.
    while len(keyframes) < target_num_keyframes:
        keyframes.append(keyframes[-1].copy())

    return keyframes


### Function: `extract_audio_shot`
- Extracts a segment of audio from a given audio file
- Takes `start_time` and `end_time` in seconds
- Saves the extracted audio segment as a `.wav` file

In [1]:
def extract_audio_shot(audio_path, start_time, end_time, save_path):
    """Cut ``[start_time, end_time)`` out of an audio file and save it as WAV.

    Args:
        audio_path: Path of the source audio file, loaded with
            ``AudioSegment.from_file``.
        start_time: Start of the segment in seconds.
        end_time: End of the segment in seconds.
        save_path: Destination path for the exported ``.wav`` file.
    """
    full_track = AudioSegment.from_file(audio_path)
    # The segment is sliced in milliseconds, so convert from seconds
    # (truncating).
    window_ms = slice(int(start_time * 1000), int(end_time * 1000))
    full_track[window_ms].export(save_path, format="wav")

### Function: `extract_audio_temp`
- Extracts the entire audio track from a video
- Saves it temporarily as a `.wav` file
- Returns the path of the temporary audio file

In [1]:
def extract_audio_temp(video_path):
    """Write a video's audio track to a temporary ``.wav`` file.

    The caller owns the returned file and is responsible for deleting it.

    Args:
        video_path: Path of the video whose audio should be extracted.

    Returns:
        Path of the temporary ``.wav`` file containing the audio track.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Imported locally because the notebook's import cell does not bring
    # tempfile into scope.
    import tempfile

    # NOTE(review): VideoFileClip is not imported in the visible import cell
    # either; presumably it is moviepy's VideoFileClip -- confirm it is
    # imported before this cell runs.
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")

        # Reserve a unique path, then close the handle so the writer can
        # open the file itself.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
            temp_audio_path = temp_audio_file.name

        try:
            video.audio.write_audiofile(temp_audio_path)
        except Exception:
            # Do not leave an orphaned temp file behind when the write fails.
            if os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)
            raise
    finally:
        # Release the clip's underlying reader resources in every case.
        video.close()

    return temp_audio_path

### Function: `extract_shots_and_keyframes_with_audio`
- Splits a video into shots based on histogram difference
- Extracts **keyframes** from each shot using block difference
- Saves the **corresponding audio segment** of each shot as `.wav`
- Returns a list of samples, where each sample contains:  
  - keyframes  
  - audio file path  
  - start time and end time of the shot  

In [2]:
def _build_shot_sample(video_path, audio_path, frames, shot_start_frame, fps, block_threshold, save_audio_dir):
    """Build the sample dict for one shot, or return None if it has no keyframes."""
    if not fps > 0:
        raise ValueError(f"Cannot compute shot times: invalid fps ({fps}) for video: {video_path}")

    start_time = shot_start_frame / fps
    end_time = (shot_start_frame + len(frames)) / fps

    keyframes = extract_keyframes_from_shot(frames, fps, block_threshold)
    if not keyframes:
        return None

    audio_filename = f"{os.path.basename(video_path)}_{int(start_time*1000)}_{int(end_time*1000)}.wav"
    audio_save_path = os.path.join(save_audio_dir, audio_filename)
    extract_audio_shot(audio_path, start_time, end_time, audio_save_path)

    return {
        "keyframes": keyframes,
        "audio_path": audio_save_path,
        "start_time": start_time,
        "end_time": end_time
    }


def extract_shots_and_keyframes_with_audio(video_path, hist_threshold=30, block_threshold=20, fps_cap=30, save_audio_dir="audio_shots"):
    """Split a video into shots and collect keyframes plus audio for each shot.

    A shot boundary is declared when the Bhattacharyya distance between the
    normalized grayscale histograms of consecutive frames exceeds
    ``hist_threshold / 100``. For every shot, including the final one that
    runs to the end of the video, keyframes are extracted and the matching
    audio range is exported as a ``.wav`` file into ``save_audio_dir``.

    Args:
        video_path: Path of the input video.
        hist_threshold: Shot-cut threshold, in hundredths of the
            Bhattacharyya distance.
        block_threshold: Passed to ``extract_keyframes_from_shot``.
        fps_cap: Upper bound applied to the frame rate reported by OpenCV
            before frame indices are converted to seconds.
        save_audio_dir: Directory (created if missing) for per-shot audio.

    Returns:
        A list of dicts with keys ``"keyframes"``, ``"audio_path"``,
        ``"start_time"`` and ``"end_time"`` (times in seconds).

    Raises:
        ValueError: If a shot must be timed but the frame rate is not
            positive.
    """
    os.makedirs(save_audio_dir, exist_ok=True)

    audio_path = extract_audio_temp(video_path)

    all_samples = []
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        # NOTE(review): every decoded frame is counted below, so when the
        # native frame rate exceeds fps_cap the computed times are stretched
        # relative to the audio -- confirm this is intended.
        fps = min(cap.get(cv2.CAP_PROP_FPS), fps_cap)

        prev_hist = None
        frames = []
        shot_start_frame = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
            hist = cv2.normalize(hist, hist).flatten()

            if prev_hist is not None:
                diff = cv2.compareHist(prev_hist, hist, cv2.HISTCMP_BHATTACHARYYA)
                if diff > hist_threshold / 100:
                    # The current frame opens a new shot; flush the one
                    # buffered so far.
                    sample = _build_shot_sample(
                        video_path, audio_path, frames, shot_start_frame,
                        fps, block_threshold, save_audio_dir,
                    )
                    if sample is not None:
                        all_samples.append(sample)

                    shot_start_frame += len(frames)
                    frames = []

            frames.append(frame.copy())
            prev_hist = hist

        # Flush the trailing shot; without this the frames after the last
        # cut (or the whole video when no cut is detected) would be dropped.
        if frames:
            sample = _build_shot_sample(
                video_path, audio_path, frames, shot_start_frame,
                fps, block_threshold, save_audio_dir,
            )
            if sample is not None:
                all_samples.append(sample)
    finally:
        # Release the capture and delete the temp audio even when an error
        # interrupts processing.
        if cap is not None:
            cap.release()
        if os.path.exists(audio_path):
            os.remove(audio_path)

    return all_samples