In [1]:
# IMPORTS

import os, re, json, tempfile, traceback, textwrap, asyncio, nest_asyncio, shutil
from pathlib import Path
import torch
import whisper
import pysrt
import pandas as pd
import numpy as np
import cv2 
import clip
import edge_tts
from tqdm.auto import tqdm
from whisper.utils import get_writer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from PIL import Image
from __future__ import annotations
import math, shlex, subprocess
from typing import List
from moviepy.editor import VideoFileClip, concatenate_videoclips, concatenate_audioclips, AudioFileClip, vfx



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PROJECT SETTINGS

PROJECT_ROOT = Path("/home/jovyan")
MOVIE_TITLE  = "Below the Deadline (1936)"

VIDEO_FILE = PROJECT_ROOT / "ProjectVideos" / f"{MOVIE_TITLE}.mp4"
TRANS_DIR  = PROJECT_ROOT / "transcripts"

VIDEO_STEM   = VIDEO_FILE.stem
TXT_PATH     = TRANS_DIR / f"{VIDEO_STEM}.txt"
SRT_PATH     = TRANS_DIR / f"{VIDEO_STEM}.srt"
JSON_PATH = TRANS_DIR / f"{VIDEO_STEM}.json"
MOVIE_SUM_TXT= PROJECT_ROOT / "movie_summary.txt"
AUDIO_DIR = PROJECT_ROOT / "narration" / VIDEO_STEM.lower().replace(" ", "_")
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
OUT_VIDEO    = PROJECT_ROOT / f"{VIDEO_STEM.lower().replace(' ','_')}_reel.mp4"



In [3]:
# 1. TRANSCRIPTION

model_size = "large-v3" # Selecting the large model for maximum accuracy.
device     = "cuda" if torch.cuda.is_available() else "cpu" # Prefer the GPU and fall back to the CPU.
use_fp16   = device == "cuda" # Half precision is only requested when running on the GPU.
print(f"[Whisper] model={model_size}  device={device}")

TRANS_DIR.mkdir(exist_ok=True, parents=True)

if TXT_PATH.exists() and SRT_PATH.exists(): # Skip the work when the transcript for the film has already been generated.
    print("Transcript already exists - skipping.")
else:
    # The model is loaded only when a transcription is actually needed, so a skipped run
    # no longer pays for loading (and immediately discarding) the large checkpoint.
    model = whisper.load_model(model_size, device=device)
    try:
        res = model.transcribe(str(VIDEO_FILE), language="en", fp16=use_fp16)
        TXT_PATH.write_text(res["text"], encoding="utf-8") # Text, SRT and JSON versions of the transcript are stored for later use.
        get_writer("srt", TRANS_DIR)(res, VIDEO_STEM)
        with JSON_PATH.open("w", encoding="utf-8") as f:
            json.dump(res, f, ensure_ascii=False, indent=2)
        print("Transcription complete:", SRT_PATH.name, JSON_PATH.name)
    except Exception as e:
        # Best effort, as before: report the failure and let the notebook continue.
        print("Transcription failed:", e)
        traceback.print_exc()
    finally:
        # Release the Whisper weights whether or not transcription succeeded;
        # the summarization model loaded afterwards needs the GPU memory.
        del model
        torch.cuda.empty_cache()



[Whisper] model=large-v3  device=cuda
Transcript already exists - skipping.


In [4]:
# 2. SUMMARY MODEL INITIALIZATION

# Author's note: Qwen was preferred over Mistral, Mixtral, Llama and Yi for accuracy,
# and its 32k context window fits the full transcript.
model_id = "Qwen/Qwen2.5-72B-Instruct"

# 4-bit NF4 quantization with bfloat16 compute, to cut memory usage while keeping respectable precision.
_quant_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_cfg = BitsAndBytesConfig(**_quant_options)

# Options for loading the weights: automatic device placement plus the quantization config above.
_load_options = {"device_map": "auto", "quantization_config": bnb_cfg}

# The tokenizer is loaded with trust_remote_code=True, i.e. custom code in the repo may be executed.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, **_load_options)



E0000 00:00:1755626130.121164  128932 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755626130.129871  128932 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755626130.244512  128932 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755626130.244535  128932 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755626130.244537  128932 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755626130.244539  128932 computation_placer.cc:177] computation placer already registered. Please check linka

In [5]:
# 3. SUMMARY GENERATION

transcript_text= TXT_PATH.read_text(encoding="utf-8") # The full transcript that is embedded into the prompt below.

# The prompt asks for a 20-sentence, chronological synopsis built only from the transcript.

prompt = textwrap.dedent(f"""
### ROLE
You are an professional film critic and narrative analyst.

### GOAL
Craft a *chronological* plot synopsis that:
1. **Covers the five classic beats**  
   • Setting & characters • Inciting incident • Rising action & turning points  
   • Climax • Resolution  
2. **Fits in exactly 20 sentences**, each on its own line.  
3. Uses *present-tense, active voice*, vivid but precise language.  
4. Contains **no dialogue quotes** and **no details not found in the transcript**.  
5. Mentions each major character by name the first time they appear, then uses pronouns sparingly.

### THOUGHT PROCESS  (*think step, hidden*)
First: Read the transcript and silently extract  
· main characters · time/place · inciting incident · major conflicts/turns · climax · resolution  
Second: Outline those items in bullets (in your own mind).  
**Do NOT reveal these notes.**
Third: Reanalyze to check if you are hallucinating.Correct yourself if you are wrong.

### OUTPUT
Write the final **engaging** synopsis only, numbered 1-20, one sentence per line.  
Do not write anything else—no title, no headings, no notes.

### SOURCE MATERIAL
The full transcript is below, delimited by triple angle brackets.  
Use *only* facts that appear inside those brackets.

<<<TRANSCRIPT>>>
{transcript_text}
<<<END OF TRANSCRIPT>>>
""")

inputs = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=32768
).to(model.device) # Tokenizing the prompt; anything beyond 32768 tokens is truncated.

# Decoding is greedy (do_sample=False). The sampling knobs `temperature` and `top_p` that used
# to be passed here were ignored in this mode and only triggered an "invalid generation flags"
# warning, so they are dropped; the generated text is unaffected.
outputs = model.generate(
    **inputs,
    max_new_tokens=1024,
    do_sample=False,
    repetition_penalty=1.15
) # Generating at most 1024 new tokens.

input_length = inputs["input_ids"].shape[-1] # Number of tokens taken up by the prompt.
generated_ids = outputs[0][input_length:] # Dropping the prompt portion and keeping only the newly generated tokens.

decoded = tokenizer.decode(generated_ids, skip_special_tokens=True).strip() # Decoding the generated token IDs back into a string, skipping special tokens.

summary_lines = re.findall(r"^\d+\..*?$", decoded, flags=re.MULTILINE) # Keeping only the lines that start with a number followed by a dot.

if not summary_lines:
    # Writing an empty summary would overwrite a previous good one and make the TTS and
    # reel cells fail later with unrelated errors, so stop here with a clear message.
    raise RuntimeError(
        "The model output contained no numbered sentences; the summary file was left untouched."
    )

summary_lines[-1] = re.sub(r"\s*###.*$", "", summary_lines[-1]) # Removing a trailing '###' header marker from the last kept line.

summary = "\n".join(summary_lines[:20]) # At most the first 20 numbered sentences form the summary.

MOVIE_SUM_TXT.write_text(summary, encoding="utf-8") # Writing the summary into a file for future use.



The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


2568

In [7]:
# 4. TEXT TO SPEECH

async def _gen_tts(text: str, out_path: Path,
                   voice="en-US-GuyNeural",
                   rate="+0%", pitch="+0Hz"):
    """Synthesize `text` with edge-tts and save the audio to `out_path`.

    `voice`, `rate` and `pitch` are forwarded unchanged to `edge_tts.Communicate`.
    """
    tts_options = {"voice": voice, "rate": rate, "pitch": pitch}
    # Build the communicator and write its audio straight to disk.
    await edge_tts.Communicate(text, **tts_options).save(str(out_path))

async def make_summary_tts(summary_path: Path = MOVIE_SUM_TXT,
                           out_dir: Path = AUDIO_DIR) -> Path | None:
    """Generate one narration MP3 per summary sentence.

    Args:
        summary_path: Text file with one (optionally numbered) sentence per line.
        out_dir: Folder that receives `sent_<n>.mp3` files; created if missing.

    Returns:
        `out_dir` once all files are written, or `None` when the summary
        contains no sentences.
    """
    # The summary is written as UTF-8, so it is read back with the same encoding.
    raw_text = summary_path.read_text(encoding="utf-8")
    lines = [ln.strip() for ln in raw_text.splitlines() if ln.strip()] # Non-empty, whitespace-stripped lines.
    sentences = [re.sub(r"^\d+\.\s*", "", ln) for ln in lines] # Dropping the leading "<n>." numbering.

    if not sentences:
        print("No summary sentences found in", summary_path)
        return None

    out_dir.mkdir(parents=True, exist_ok=True) # A caller-supplied folder may not exist yet.

    pad = len(str(len(sentences))) # Zero-pad width, e.g. 2 digits for 20 sentences -> sent_01.mp3.
    tasks = [
        _gen_tts(sent, out_dir / f"sent_{i:0{pad}d}.mp3")
        for i, sent in enumerate(sentences, 1)
    ]
    await asyncio.gather(*tasks) # Running all the TTS requests concurrently.
    print("Sentence-level TTS complete:", out_dir)
    return out_dir

# NOTE(review): presumably the notebook kernel already owns a running event loop, which is why
# asyncio.run() needs nest_asyncio's patch before it can be called from a cell - confirm on the target kernel.
nest_asyncio.apply() # Allow nested event loops; the author reports that the kernel crashes without this.
asyncio.run(make_summary_tts()) # Generate the narration audio using the default summary file and output folder.



Sentence-level TTS complete: /home/jovyan/narration/below_the_deadline_(1936)


In [8]:
# CLEANING VIDEO FILE

# moviepy needs a well-formed input to build the reel without crashing, so the original file is
# re-encoded to a constant 24 FPS with H.264 video and AAC audio before any clip is cut from it.

TARGET_FPS  = 24.0 # Frame rate passed to the re-encode below and reused when the reel is exported.

def clean_video(src: Path, dst: Path, fps: float, crf: int = 17):
    """Re-encode `src` into a constant-frame-rate H.264/AAC file at `dst`.

    Args:
        src: Source video file.
        dst: Destination file; if it already exists it is returned untouched.
        fps: Constant output frame rate.
        crf: libx264 quality factor.

    Returns:
        `dst`.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        FileNotFoundError: the ffmpeg executable could not be found.
    """
    if dst.exists():
        return dst

    # ffmpeg writes to a temporary sibling first, so an interrupted or failed encode can
    # never leave a half-written `dst` that the early return above would accept as finished.
    # The temporary name keeps the original suffix so ffmpeg still infers the container.
    tmp = dst.with_name(f"{dst.stem}.partial{dst.suffix}")

    # Arguments are passed as a list (no shell), so paths need no quoting and the
    # optional-audio selector "0:a?" cannot be expanded as a glob pattern.
    cmd = [
        "ffmpeg", "-y", "-fflags", "+genpts", "-err_detect", "ignore_err",
        "-i", str(src),
        "-vf", f"fps={fps},setpts=PTS-STARTPTS",
        "-map", "0:v", "-map", "0:a?",
        "-c:v", "libx264", "-preset", "slow", "-crf", str(crf),
        "-pix_fmt", "yuv420p", "-c:a", "aac", "-b:a", "160k",
        "-movflags", "+faststart",
        str(tmp),
    ]
    try:
        subprocess.run(cmd, check=True)
        tmp.replace(dst) # Publish the finished file under its final name.
    finally:
        if tmp.exists():
            tmp.unlink() # Remove the leftovers of a failed encode.
    return dst


# The cleaned copy is stored next to the source video, with "_clean" appended to its stem.
CLEAN_FILE = VIDEO_FILE.parent / f"{VIDEO_FILE.stem}_clean.mp4"
clean_video(src=VIDEO_FILE, dst=CLEAN_FILE, fps=TARGET_FPS)



PosixPath('/home/jovyan/ProjectVideos/Below the Deadline (1936)_clean.mp4')

In [9]:
# 5. REEL BUILDING MAIN

CROSS       = 0.5 # Crossfade length applied between consecutive clips of the reel.
FRAME_STRIDE= 1.0 # Spacing, in seconds, between the probe frames sampled from the film.
CLIP_MODEL  = "ViT-L/14" # CLIP checkpoint used for text-to-frame matching.
USE_REWRITE = True # Whether the summary sentences are rewritten as visual cues first.
REWRITE_TEMP= 0.3 # Sampling temperature used for that rewrite.
DEVICE      = "cuda" if torch.cuda.is_available() else "cpu"


# We are loading the summary lines to convert them into visual cues for better clip matching.
# The file is written as UTF-8, so it is read back with the same encoding.
sent_lines = [ln.strip() for ln in MOVIE_SUM_TXT.read_text(encoding="utf-8").splitlines() if ln.strip()]
# Strip the leading "<n>." numbering. The previous pattern doubled every backslash inside a raw
# string, so it looked for literal backslashes and never removed the numbers.
sent_text  = [re.sub(r"^\d+\.\s*", "", ln) for ln in sent_lines]

# Rewriting the summary sentences as visual sentences.
if USE_REWRITE:
    def rewrite(s):
        """Ask the summary model for a short, concrete, visual rephrasing of sentence `s`."""
        prompt = f"Rewrite for CLIP image search: '{s}'. Make it short, concrete, visual."
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = encoded.input_ids.shape[-1] # Number of prompt tokens to cut from the output.
        generated = model.generate(
            **encoded,
            max_new_tokens=32,
            temperature=REWRITE_TEMP,
            do_sample=True,
        )
        new_tokens = generated[0][prompt_len:] # Keep only what the model added after the prompt.
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Every summary sentence is replaced by its rewritten form.
    sent_text = list(map(rewrite, sent_text))
    print("Sentences rewritten for CLIP")

# Now, we embed the sentences for visual matching.
clip_model, clip_prep = clip.load(CLIP_MODEL, device=DEVICE) # Loading the CLIP model and its image preprocessing function.
with torch.no_grad():
    # truncate=True cuts over-long sentences to CLIP's context length instead of raising,
    # which matters most when the un-rewritten summary sentences are embedded directly.
    sent_tokens = clip.tokenize(sent_text, truncate=True).to(DEVICE)
    sent_emb = clip_model.encode_text(sent_tokens).float() # One text embedding per sentence.
    sent_emb /= sent_emb.norm(dim=-1, keepdim=True) # Normalizing each embedding to unit length.


base_vid = VideoFileClip(str(CLEAN_FILE)).without_audio().set_fps(TARGET_FPS) # The cleaned film without audio and with a fixed fps; clips are cut from it later.
MOV_END  = base_vid.duration # Total duration of the film in seconds, used to keep clip searches in range.
EPS      = 1.0 / TARGET_FPS # Duration of one frame, used as a small safety margin at the end of the film.

# Indexing video frames at regular intervals.
probe_ts, probe_imgs = [], []
cap = cv2.VideoCapture(str(CLEAN_FILE)) # Opening the video file for frame capture.
try:
    while True:
        t = len(probe_ts) * FRAME_STRIDE # Next timestamp = index * stride.
        if t >= MOV_END - 0.5: # Stopping when close to the end of the video.
            break
        cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000) # Seeking to t seconds.
        ok, frame = cap.read() # Reading the frame.
        if not ok:
            break
        probe_ts.append(t) # Recording the timestamp.
        probe_imgs.append(
            clip_prep(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))).to(DEVICE)
        ) # Converting BGR to RGB, preprocessing for CLIP and storing the frame.
finally:
    cap.release() # The capture is released even when preprocessing raises.

if not probe_ts:
    # Without probe frames the matching below cannot work; fail here with a clear message.
    raise RuntimeError(f"No probe frames could be read from {CLEAN_FILE}")

# Batch-encoding all the probed video frames into CLIP embeddings for text‑to‑image similarity comparisons.
embedded_batches = [] # Per-batch embeddings, parked on the CPU until they are joined.
with torch.no_grad():
    for start in range(0, len(probe_imgs), 128): # Probe frames are encoded 128 at a time.
        stacked = torch.stack(probe_imgs[start : start + 128]).half()
        feats = clip_model.encode_image(stacked).float()
        feats = feats / feats.norm(dim=-1, keepdim=True) # Unit-length embeddings.
        embedded_batches.append(feats.cpu())
    # One row per probe frame, moved back to the compute device for the similarity search.
    frame_emb = torch.cat(embedded_batches).to(DEVICE)

print(f"{len(probe_ts)} probe frames indexed")

# Now that we have a coarse timestamp, we scan window seconds in steps to find the frame whose embedding best matches the text embedding.
def refine_pick(text_emb, coarse_t, window=15.0, step=0.25):
    """Refine a coarse timestamp to the nearby frame that best matches a text embedding.

    Frames are sampled every `step` seconds from `coarse_t - window` up to (but not
    including) `coarse_t + window`, clamped to the film's range, and scored against
    `text_emb` with CLIP.

    Args:
        text_emb: Text embedding of the sentence being matched.
        coarse_t: Timestamp in seconds of the coarse match.
        window: Half-width of the search range in seconds.
        step: Sampling interval in seconds.

    Returns:
        The timestamp in seconds of the best-matching sampled frame, or `coarse_t`
        when no frame in the range could be read.
    """
    frames, ts = [], [] # Candidate frames and their timestamps.
    # The file is opened once and re-used for every seek instead of being re-opened per timestamp.
    vcap = cv2.VideoCapture(str(CLEAN_FILE))
    try:
        for t in np.arange(max(0, coarse_t - window), min(MOV_END, coarse_t + window), step):
            vcap.set(cv2.CAP_PROP_POS_MSEC, t * 1000) # Seeking to t seconds.
            ok, fr = vcap.read() # Reading the frame.
            if not ok:
                continue # Skipping the timestamp if the frame couldn't be read.
            frames.append(
                clip_prep(Image.fromarray(cv2.cvtColor(fr, cv2.COLOR_BGR2RGB))).to(DEVICE)
            ) # Converting BGR to RGB and preprocessing for CLIP.
            ts.append(float(t)) # Recording this candidate timestamp as a plain float.
    finally:
        vcap.release() # Released even if decoding or preprocessing raises.
    if not frames:
        return coarse_t # No readable frame in the range: fall back to the coarse timestamp.
    with torch.no_grad():
        f_emb = clip_model.encode_image(torch.stack(frames).half()).float()
        f_emb /= f_emb.norm(dim=-1, keepdim=True)
        best = torch.argmax(text_emb @ f_emb.T).item() # Index of the frame most similar to the text.
    return ts[best]

# Clip-selection loop.
pad_len     = len(str(len(sent_text))) # Zero-pad width of the narration file names.

# Duration of each TTS audio file, so that clips can be cut to match the narration length.
narr_durs = []
for i in range(1, len(sent_text) + 1):
    narration = AudioFileClip(str(AUDIO_DIR / f"sent_{i:0{pad_len}d}.mp3"))
    try:
        narr_durs.append(narration.duration)
    finally:
        narration.close() # Only the duration is needed here, so the reader is released right away.

if not narr_durs:
    # max() on an empty list would raise a ValueError without context.
    raise ValueError("No narration files to measure: the summary contains no sentences.")

BLOCK       = max(narr_durs) / 2 # Half of the longest narration; margin kept around used windows.

# Now we define a function to compute a start/end window of length equal to the narration duration.
def centred_slice(mid: float, dur: float, mov_end=None, eps=None) -> tuple[float, float]:
    """Return a `(start, end)` window of length `dur` centred on `mid`, kept inside the film.

    The window is shifted (not shrunk) when it crosses either boundary; it only gets
    shorter than `dur` when `dur` exceeds the usable length of the film.

    Args:
        mid: Desired midpoint in seconds.
        dur: Desired window length in seconds.
        mov_end: Film duration in seconds; defaults to the module-level `MOV_END`.
        eps: Safety margin kept before the end; defaults to the module-level `EPS`.

    Returns:
        `(start, end)` in seconds with `0 <= start` and `end <= mov_end - eps`.
    """
    if mov_end is None:
        mov_end = MOV_END
    if eps is None:
        eps = EPS
    limit = mov_end - eps # Last usable timestamp; slicing at the exact end is avoided.

    st, en = mid - dur / 2, mid + dur / 2 # Initial half-window around the midpoint.
    if st < 0:
        en -= st # Start is before the film: shift the whole window forward.
        st = 0.0
    if en > limit:
        # End is past the usable range: shift the whole window back by the overshoot,
        # measured against `limit` so the window keeps its full length.
        st = max(0.0, st - (en - limit))
        en = limit
    return st, en

used_windows: list[tuple[float, float]] = [] # Already selected (start, end) windows, used to prevent overlap.
chosen_mid:  list[float] = [] # Midpoint chosen for each sentence, in sentence order.
last_t = -1.0 # Most recently chosen midpoint.

# For maintaining unique clips we check if a probed timestamp is far enough from the used windows.
def probe_ok(j):
    """Return True when probe frame `j` lies outside every used window widened by BLOCK."""
    t = probe_ts[j]
    return all(not (st - BLOCK <= t <= en + BLOCK) for st, en in used_windows)

# Now, we loop through each sentence embedding and its narration duration.
for idx, (emb, dur) in enumerate(zip(sent_emb, narr_durs), 1):
    sims  = emb @ frame_emb.T # Similarity of this sentence to every probe frame.
    order = sims.argsort(descending=True).tolist() # Probe indices ranked by descending similarity.

    eligible = [j for j in order if probe_ok(j)] or order # Fall back to all probes if none is free.

    first_try = None # Best-ranked candidate, kept in case every candidate overlaps.
    for pick in eligible:
        mid   = refine_pick(emb, probe_ts[pick]) # Refining the coarse timestamp to the best match.
        st, en = centred_slice(mid, dur) # Computing the slice around the refined timestamp.
        if first_try is None:
            first_try = (mid, st, en)
        if any(w_st < en and st < w_en for w_st, w_en in used_windows):
            continue # Skipping if it overlaps any used window.
        break # Found a free window for this sentence.
    else:
        # Every candidate overlapped. The reel builder pairs the i-th chosen midpoint with the
        # i-th narration file, so skipping a sentence would misalign all later clips with their
        # audio; reuse the best-ranked candidate instead, even though its footage overlaps.
        if first_try is None:
            raise RuntimeError(f"Sentence {idx}: no probe frames available to pick a clip from.")
        mid, st, en = first_try
        print(f"Sentence {idx}: no non-overlapping clip found - reusing footage around {mid:.2f}s.")

    chosen_mid.append(mid) # Accepting the selected midpoint.
    used_windows.append((st, en)) # Marking its window as used.
    last_t = mid # Updating the last chosen time.

# Building the reel.
clips = [] # One video clip per sentence, each carrying its narration as audio.
narrations = [] # Narration readers opened below; closed once the export is finished.
try:
    for i, mid in enumerate(chosen_mid, 1):
        narr = AudioFileClip(str(AUDIO_DIR / f"sent_{i:0{pad_len}d}.mp3")) # Loading the TTS audio.
        narrations.append(narr)
        st, en = centred_slice(mid, narr.duration) # Computing the video slice that matches the narration length.
        clip_v = base_vid.subclip(st, en).set_audio(narr) # Attaching the audio.
        if i > 1:
            clip_v = clip_v.crossfadein(CROSS) # Every clip after the first fades in over the previous one.
        clips.append(clip_v)

    if not clips:
        raise RuntimeError("No clips were selected, so there is nothing to export.")

    # Finally, we export the reel. The negative padding overlaps neighbouring clips by the crossfade length.
    final = concatenate_videoclips(clips, method="compose", padding=-CROSS).set_fps(TARGET_FPS)
    final.write_videofile(
        str(OUT_VIDEO),
        fps=TARGET_FPS,
        codec="libx264",
        audio_codec="aac",
        preset="medium",
        bitrate="4000k",
        ffmpeg_params=["-err_detect", "ignore_err"],
    )
    print("Reel exported →", OUT_VIDEO.resolve())
finally:
    # The narration readers are only needed while the reel is being written.
    # `base_vid` is left open on purpose, in case later cells still cut clips from it.
    for opened in narrations:
        opened.close()



Sentences rewritten for CLIP
4118 probe frames indexed
Moviepy - Building video /home/jovyan/below_the_deadline_(1936)_reel.mp4.
MoviePy - Writing audio in below_the_deadline_(1936)_reelTEMP_MPY_wvf_snd.mp4


                                                                     

MoviePy - Done.
Moviepy - Writing video /home/jovyan/below_the_deadline_(1936)_reel.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready /home/jovyan/below_the_deadline_(1936)_reel.mp4
Reel exported → /home/jovyan/below_the_deadline_(1936)_reel.mp4
