In [None]:
# Print the interpreter version string so the environment in use is visible in the output
import sys
print(sys.version)

In [None]:
# Install dependencies

!pip install --upgrade pip

!pip install replicate diffusers transformers accelerate scipy

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

!pip install torch torchvision transformers

!pip install pytube opencv-python


In [None]:
# Download YouTube Video Function (+ Some Dependencies)

!pip install yt-dlp
!brew install ffmpeg

import yt_dlp

import os

# Add ffmpeg's location to PATH (update if your path differs).
# Guarded so that re-running this cell does not append the same entry again,
# and tolerant of PATH being unset.
_ffmpeg_dir = "/opt/homebrew/bin"
_current_path = os.environ.get("PATH", "")
if _ffmpeg_dir not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        _current_path + os.pathsep + _ffmpeg_dir if _current_path else _ffmpeg_dir
    )

def download_youtube_video_single_stream(url, output_dir="videos", filename="input_video.mp4"):
    """
    Fetch a YouTube video with yt-dlp as one ready-made format (no audio/video merge).

    :param url: Address of the YouTube video to fetch
    :param output_dir: Folder that receives the file; created when it is missing
    :param filename: File name to use inside ``output_dir``
    :return: ``output_dir`` joined with ``filename``
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, filename)

    # "best" asks for a single format, so nothing needs merging afterwards;
    # "outtmpl" pins the output location to the path computed above.
    options = dict(format="best", outtmpl=destination)

    with yt_dlp.YoutubeDL(options) as session:
        session.download([url])

    print(f"Video saved to: {destination}")
    return destination

In [None]:
# Download a Video

# Fetch the source clip; the returned path is reused by the frame-splitting step
video_path = download_youtube_video_single_stream(url="https://www.youtube.com/watch?v=y8Kyi0WNg40")

In [None]:
# Split Video into Frames Function

import cv2

def split_video_into_frames(video_path, frames_dir="frames"):
    """
    Splits a video into individual frames and saves them as JPEG images.

    Frames are written as ``frame_0000.jpg``, ``frame_0001.jpg``, ... in the
    order they are read from the video.

    :param video_path: Path to the video file
    :param frames_dir: Directory to save the extracted frames (created if missing)
    :return: Number of frames extracted
    :raises IOError: If the video cannot be opened or a frame cannot be written
    """
    os.makedirs(frames_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    cap = cv2.VideoCapture(video_path)
    frame_count = 0

    try:
        # Without this check an unreadable/missing video looks like an empty one
        if not cap.isOpened():
            raise IOError(f"Could not open video: {video_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break  # Stop if the video has ended

            frame_path = os.path.join(frames_dir, f"frame_{frame_count:04d}.jpg")
            # imwrite reports failure through its return value rather than raising
            if not cv2.imwrite(frame_path, frame):
                raise IOError(f"Could not write frame: {frame_path}")
            frame_count += 1
    finally:
        # Release the capture handle even when an error interrupts extraction
        cap.release()

    print(f"Extracted {frame_count} frames to {frames_dir}")
    return frame_count

# Extract every frame of the downloaded clip into the default "frames" folder
frame_count = split_video_into_frames(video_path=video_path)

In [None]:
# CLIP Interrogator Building and Folder Processing Cell
# Takes images from the 'frames' folder and provides prompts for each, outputting a CSV file

# 1. Install dependencies
import subprocess

def setup():
    """
    Install the packages needed by the CLIP Interrogator cell.

    Each package is installed with ``<kernel python> -m pip`` so that it lands in
    the environment of the running kernel rather than whichever ``pip`` happens
    to be first on PATH.

    :raises subprocess.CalledProcessError: If any install command exits non-zero
    """
    import sys  # local import keeps this function self-contained within the cell

    packages = ['gradio', 'open_clip_torch', 'clip-interrogator']
    for package in packages:
        cmd = [sys.executable, '-m', 'pip', 'install', package]
        print(f"Installing: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)

# Run the installs before the imports below that depend on them
setup()

# 2. Import required libraries
import os
import sys
import pandas as pd
from PIL import Image
# Installs gradio and clip-interrogator using the kernel's own interpreter (sys.executable);
# both packages are also in the list that setup() installs.
!{sys.executable} -m pip install gradio clip-interrogator
import gradio as gr
from clip_interrogator import Config, Interrogator
# Only reached when the two imports above succeeded
print("Gradio and CLIP Interrogator installed successfully!")

# 3. Set configuration parameters
# These two names select the captioning model and the CLIP model that are
# copied onto the Config object below.
caption_model_name = 'blip-large'  # Options: 'blip-base', 'blip-large', 'git-large-coco'
clip_model_name = 'ViT-L-14/openai'  # Options: 'ViT-L-14/openai', 'ViT-H-14/laion2b_s32b_b79k'

# 4. Initialize Interrogator
# `ci` is the module-level interrogator instance that process_images_in_folder calls into.
config = Config()
config.clip_model_name = clip_model_name
config.caption_model_name = caption_model_name
ci = Interrogator(config)

# 5. Process all images in a folder and save prompts to CSV
def process_images_in_folder(input_folder, output_file, mode='best'):
    """
    Process all images in a folder, generate prompts, and save results to a CSV file.

    Files are visited in sorted name order; only ``.png``, ``.jpg`` and ``.jpeg``
    files (case-insensitive) are processed. An image that fails is reported and
    skipped so one bad file does not stop the run. The CSV has the columns
    ``filename`` and ``prompt`` and is only written when at least one prompt
    was generated.

    :param input_folder: Folder containing the images
    :param output_file: Output CSV file path (its directory is created if missing)
    :param mode: Mode for prompt generation ('best', 'classic', 'fast', 'negative')
    :raises ValueError: If ``mode`` is not one of the supported modes
    """
    # Map the mode to the correct CLIP Interrogator method
    mode_mapping = {
        'best': ci.interrogate,
        'classic': ci.interrogate_classic,
        'fast': ci.interrogate_fast,
        'negative': ci.interrogate_negative
    }

    if mode not in mode_mapping:
        raise ValueError("Invalid mode. Choose from 'best', 'classic', 'fast', or 'negative'.")

    interrogate_method = mode_mapping[mode]
    prompts = []

    # Iterate through all image files in the folder
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue  # Only process images

        image_path = os.path.join(input_folder, filename)
        print(f"Processing {filename}...")

        try:
            # The context manager closes the file handle; convert() hands back a
            # separate image, so it stays usable after the source is closed.
            with Image.open(image_path) as source_image:
                image = source_image.convert("RGB")
            prompt = interrogate_method(image)
            prompts.append({"filename": filename, "prompt": prompt})
        except Exception as e:
            # Deliberately best-effort: report the failure and move on
            print(f"Error processing {filename}: {e}")

    # Save results to a CSV file
    if prompts:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df = pd.DataFrame(prompts)
        df.to_csv(output_file, index=False)
        print(f"Saved {len(prompts)} prompts to {output_file}")
    else:
        print("No prompts were generated. Please check for errors or empty input folder.")

# 6. Run the interrogator over every extracted frame
# Input: folder of extracted frames; output: CSV of generated prompts
input_folder, output_file = "frames", "generated_prompts.csv"

process_images_in_folder(input_folder, output_file, "best")

# I strongly recommend pausing at this stage and reworking the prompts. The model tends to produce NSFW prompts, so it is worth reviewing and sanitising them before generating the frames.

In [None]:
# NEED TO CREATE SOMETHING THAT GENERATES THE FRAMES FROM THE PROMPTS

Note: I have temporarily managed this by feeding the generated prompts into Stable Diffusion running through ComfyUI. The longer-term, lower-effort approach would be to integrate the model here and feed each prompt into it automatically to generate the frames. Currently, each prompt needs to be pasted in individually. Automating this would be a significant future improvement to the workflow.

In [None]:
# Stitch the Video Back Together

def frames_to_video(frames_dir="output_frames", output_video="output_video.mp4", fps=24):
    """
    Combines the ``.jpg`` frames in a directory into a video.

    Frames are ordered by the numbers embedded in their file names, so
    ``frame_9999.jpg`` comes before ``frame_10000.jpg``. The video size is taken
    from the first frame.

    :param frames_dir: Directory containing the frames
    :param output_video: Path to save the output video
    :param fps: Frames per second for the video
    :raises ValueError: If no frames are found or a frame image cannot be read
    :raises IOError: If the video writer cannot be opened
    """
    # Imported locally so the function does not depend on a later cell's imports
    import glob
    import os
    import re

    def _frame_order(path):
        # re.split with a capturing group alternates text / digit runs, so
        # odd positions are always the numeric parts.
        tokens = re.split(r"(\d+)", os.path.basename(path))
        return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]

    frames = sorted(glob.glob(f"{frames_dir}/*.jpg"), key=_frame_order)
    if not frames:
        raise ValueError("No frames found in the directory!")

    # Get frame dimensions (imread signals an unreadable file by returning None)
    first_frame = cv2.imread(frames[0])
    if first_frame is None:
        raise ValueError(f"Could not read frame: {frames[0]}")
    height, width = first_frame.shape[:2]

    # Initialize video writer
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_video, fourcc, fps, (width, height))

    try:
        if not out.isOpened():
            raise IOError(f"Could not open video writer for: {output_video}")

        for frame in frames:
            img = cv2.imread(frame)
            if img is None:
                raise ValueError(f"Could not read frame: {frame}")
            out.write(img)
    finally:
        # Always release so the output file is finalized/closed
        out.release()

    print(f"Video saved to {output_video}")

In [None]:
# The below code may help in certain instances if you're having problems

In [None]:
# Generate Prompts from Frames 'Helper' Function

import torch
from torchvision import transforms
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model
# Use the GPU when CUDA is usable, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The preprocessor and the model weights come from the same checkpoint
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)

def generate_clip_prompt(image_path):
    """
    Picks, from a fixed list of candidate prompts, the one CLIP scores as most
    similar to the given image.

    Note that no free-form text is generated: the result is always one of the
    entries in ``text_prompts`` below.

    :param image_path: Path to the image file
    :return: The candidate prompt with the highest image/text similarity
    """
    # Send inputs to wherever the model actually lives. The module-level `device`
    # can be rebound by another cell after the model was loaded, which would
    # otherwise cause a device mismatch.
    target_device = getattr(model, "device", device)

    # Close the file handle promptly; convert() returns a separate image
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Example text prompts (you can expand this list)
    text_prompts = [
        "a photo of a cat",
        "a photo of a dog",
        "a scenic landscape",
        "an image of a car",
        "a portrait of a person",
        "a surreal painting",
    ]

    inputs = processor(images=image, return_tensors="pt", padding=True).to(target_device)
    text_inputs = processor(text=text_prompts, return_tensors="pt", padding=True).to(target_device)

    # Inference only: keep both encoders out of autograd
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
        text_features = model.get_text_features(**text_inputs)

        # Normalize so the dot product below compares directions only
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Compute similarities
        similarities = torch.matmul(image_features, text_features.T)

    best_match_idx = similarities.argmax().item()
    return text_prompts[best_match_idx]

In [None]:
# Frame Processing Pipeline Function

import glob
import os

def generate_prompts_from_frames_clip(frames_dir="frames", prompts_file="prompts_clip.txt"):
    """
    Write one CLIP-selected prompt per ``.jpg`` frame to a text file.

    Each output line has the form ``<frame file name>: <prompt>``. A frame that
    raises during processing is reported and left out of the file.

    :param frames_dir: Folder holding the input frames
    :param prompts_file: Text file that receives the prompts (overwritten)
    """
    # Sorted paths keep the frames in file-name order
    frame_paths = glob.glob(f"{frames_dir}/*.jpg")
    frame_paths.sort()

    # Create the destination folder when the output path includes one
    target_dir = os.path.dirname(prompts_file)
    if target_dir != "":
        os.makedirs(target_dir, exist_ok=True)

    with open(prompts_file, "w") as sink:
        for current in frame_paths:
            print(f"Processing frame: {current}")
            try:
                label = os.path.basename(current)
                text = generate_clip_prompt(current)
                sink.write(f"{label}: {text}\n")
            except Exception as e:
                print(f"Error processing {current}: {e}")

    print(f"Prompts saved to {prompts_file}")

In [None]:
# Generate Prompts from Frames Main Function

# Install and Upgrade Required Libraries
!pip install --upgrade diffusers accelerate transformers torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu

# Import Libraries
import os
import glob
import torch
from PIL import Image

# Check for Device Support (MPS for Apple Silicon or CPU)
device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
# generate_clip_prompt sends its inputs to `device`; move the CLIP model loaded in
# the helper cell to the same device so inputs and weights do not end up mismatched.
model = model.to(device)
print(f"Using device: {device}")

# Function to Generate Prompts from Frames Using CLIP
def generate_prompts_from_frames_clip(frames_dir="frames", prompts_file="prompts_clip.txt"):
    """
    Generates prompts for each frame and saves them to a text file.

    This redefines the function of the same name from the earlier
    "Frame Processing Pipeline" cell; whichever cell ran last wins.

    Frames are processed in numeric file-name order (``frame_9999.jpg`` before
    ``frame_10000.jpg``). Each output line is ``<frame file name>: <prompt>``.
    A frame that raises during processing is reported and skipped.

    :param frames_dir: Directory containing input frames
    :param prompts_file: File to save the generated prompts (overwritten)
    """
    import re  # local import keeps the function self-contained

    def _frame_order(path):
        # re.split with a capturing group alternates text / digit runs, so
        # odd positions are always the numeric parts.
        tokens = re.split(r"(\d+)", os.path.basename(path))
        return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]

    frames = sorted(glob.glob(f"{frames_dir}/*.jpg"), key=_frame_order)
    if not frames:
        # Make an empty or mistyped folder visible instead of silently writing an empty file
        print(f"Warning: no .jpg frames found in {frames_dir}")

    # Ensure the directory for the prompts file exists
    prompts_dir = os.path.dirname(prompts_file)
    if prompts_dir:
        os.makedirs(prompts_dir, exist_ok=True)

    # Explicit encoding so the output does not depend on the platform default
    with open(prompts_file, "w", encoding="utf-8") as f:
        for frame_path in frames:
            print(f"Processing frame: {frame_path}")
            try:
                prompt = generate_clip_prompt(frame_path)  # Using the helper function
                f.write(f"{os.path.basename(frame_path)}: {prompt}\n")
            except Exception as e:
                # Best-effort: report the failing frame and continue with the rest
                print(f"Error processing {frame_path}: {e}")

    print(f"Prompts saved to {prompts_file}")

# Where the frames are read from and where the prompts are written
frames_dir, prompts_file = "frames", "prompts_clip.txt"

# Generate a prompt for every frame in the input folder
generate_prompts_from_frames_clip(frames_dir, prompts_file)