# Pipeline for interpolation of Keyframes


## Imports

This code uses the `diffusers-0-27-0` environment
<br>
<span style="color:red">**NOTE:** Define all imports and install/shell commands here</span>


In [3]:
# %%capture
# add install or other terminal commands here
# %pip install numpy matplotlib opencv-python scikit-image scikit-video pillow
# %pip install tabulate
# %conda install -c conda-forge ffmpeg

In [None]:
import cv2
import subprocess
import sys
from KeyFrameDetector.key_frame_detector import smartKeyframeDetection
import numpy as np
from moviepy.editor import concatenate_videoclips, VideoFileClip
from tabulate import tabulate
from svd.attn_ctrl.attention_control import (
    AttentionStore,
    register_temporal_self_attention_control,
    register_temporal_self_attention_flip_control,
)
from svd.custom_diffusers.schedulers.scheduling_euler_discrete import EulerDiscreteScheduler
from svd.custom_diffusers.pipelines.pipeline_frame_interpolation_with_noise_injection import FrameInterpolationWithNoiseInjectionPipeline
from diffusers import UNetSpatioTemporalConditionModel
from diffusers.utils import load_image, export_to_video
import time
import torch
import shutil
import os
from utils.ffmpeg_evaluator import FFmpegEvaluator
import pandas as pd
import torch
import gc
os.chdir("/root/VideoReconstruction")

## Pipeline

There are two main stages in this pipeline:

1. `run_interpolation` - This function takes in 2 keyframes and interpolates between them to get the intermediate frames.
2. `generate_all_interpolations` and `stitch_video_segments` - These functions take a list of keyframes and interpolate between every two consecutive keyframes to get a list of intermediate video segments, which are then concatenated to get the final video.


### Interpolation

The `run_interpolation` function interpolates between two keyframes and saves the output segment video to the specified path.

The `generate_all_interpolations` function takes an input directory containing all the keyframes of a video, interpolates between every two consecutive keyframes, and saves each resulting video segment to the output path.

The `stitch_video_segments` function stitches the video segments to get the final video.

In [5]:
def run_interpolation(checkpoint_dir, frame1_path, frame2_path, out_path, resize_specs, fps, pretrained_model_name_or_path, duration, num_frames, seed=42, num_inference_steps=50, weighted_average=False, noise_injection_steps=0, noise_injection_ratio=0.5, decode_chunk_size=8, device="cuda:0"):
    """
    Run key frame interpolation between two frames using a pretrained model and noise injection. It saves the interpolated video or gif to the specified output path.

    The frame file names must end in `_<timestamp>.<ext>` (e.g. `frame_1.23.jpg`); the
    timestamps, in seconds, determine how many frames are generated.

    Args:
        checkpoint_dir (str): Directory containing the checkpoint for the fine-tuned UNet model.
        frame1_path (str): Path to the first frame image.
        frame2_path (str): Path to the second frame image.
        out_path (str): Path to save the output interpolated video or gif.
        resize_specs (tuple): Tuple specifying the resize dimensions (width, height) for the input frames.
        fps (int): Frames per second for the output video.
        pretrained_model_name_or_path (str): Path or name of the pretrained model.
        duration (int): Duration for which each frame will be displayed in the gif (in milliseconds).
        num_frames (int): Accepted for interface compatibility. The value is overridden by
            `round((timestamp2 - timestamp1) * fps)` computed from the frame file names.
        seed (int, optional): Random seed for reproducibility. Defaults to 42.
        num_inference_steps (int, optional): Number of inference steps for the interpolation. Defaults to 50.
        weighted_average (bool, optional): Whether to use weighted average during interpolation. Defaults to False.
            True: Produces a video with a gradual shift from image1 to image2. This can give the effect of morphing or smooth interpolation between two images.
            False: Maintains an equal influence of both images across all frames. This results in a more consistent combination of features from both image1 and image2.
        noise_injection_steps (int, optional): Number of steps for noise injection. Defaults to 0.
        noise_injection_ratio (float, optional): Ratio of noise injection. Defaults to 0.5.
        decode_chunk_size (int, optional): How many frames are decoded at a time from the latent representations during the video generation process. Defaults to 8.
        device (str, optional): Device to run the interpolation on. Defaults to "cuda:0".

    Returns:
        None

    Raises:
        ValueError: If a frame file name does not end in a numeric timestamp.
    """
    def _frame_timestamp(frame_path):
        # Parse only the file name (not the directories) and take the text after the
        # last "_", so underscores elsewhere in the path cannot break the parse. This
        # matches the sort key used by generate_all_interpolations.
        stem = os.path.splitext(os.path.basename(frame_path))[0]
        return float(stem.split("_")[-1])

    # Load noise scheduler and pipeline
    noise_scheduler = EulerDiscreteScheduler.from_pretrained(
        pretrained_model_name_or_path, subfolder="scheduler")

    pipe = FrameInterpolationWithNoiseInjectionPipeline.from_pretrained(
        pretrained_model_name_or_path, scheduler=noise_scheduler, variant="fp16", torch_dtype=torch.float16)

    # The pipeline's original UNet acts as the reference; pipe.unet receives the fine-tuned delta
    ref_unet = pipe.ori_unet
    state_dict = pipe.unet.state_dict()

    # Load the fine-tuned and the original UNet so the weight delta can be computed
    finetuned_unet = UNetSpatioTemporalConditionModel.from_pretrained(
        checkpoint_dir, subfolder="unet", torch_dtype=torch.float16)

    ori_unet = UNetSpatioTemporalConditionModel.from_pretrained(
        pretrained_model_name_or_path, subfolder="unet", variant="fp16", torch_dtype=torch.float16)

    # Apply delta (fine-tuned minus original) to the value/output projections of the
    # first temporal self-attention layer only
    finetuned_state_dict = finetuned_unet.state_dict()
    ori_state_dict = ori_unet.state_dict()
    for name, param in finetuned_state_dict.items():
        if "temporal_transformer_blocks.0.attn1.to_v" in name or "temporal_transformer_blocks.0.attn1.to_out.0" in name:
            delta_w = param - ori_state_dict[name]
            state_dict[name] = state_dict[name] + delta_w
    pipe.unet.load_state_dict(state_dict)

    # The two helper UNets were only needed for the delta; release them before inference
    del finetuned_state_dict, ori_state_dict, finetuned_unet, ori_unet, state_dict

    # Setup attention controllers
    controller_ref = AttentionStore()
    register_temporal_self_attention_control(ref_unet, controller_ref)

    controller = AttentionStore()
    register_temporal_self_attention_flip_control(
        pipe.unet, controller, controller_ref)

    # Move pipeline to specified device
    pipe = pipe.to(device)

    # Set random seed
    generator = torch.Generator(device=device)
    if seed is not None:
        generator = generator.manual_seed(seed)

    # Load and resize frames
    frame1 = load_image(frame1_path).resize(resize_specs)
    frame2 = load_image(frame2_path).resize(resize_specs)

    timestamp_f1 = _frame_timestamp(frame1_path)
    timestamp_f2 = _frame_timestamp(frame2_path)

    # The number of generated frames follows the time gap between the two keyframes
    num_frames = int(np.round((timestamp_f2 - timestamp_f1) * fps))

    print("Interpolating between frames at timestamps", frame1_path, frame2_path)

    print("The num frames is ", num_frames)
    print("The fps is ", fps)

    # weighted_average is forwarded from the caller (it used to be hard-coded to False)
    frames = pipe(image1=frame1, image2=frame2, num_inference_steps=num_inference_steps, generator=generator, weighted_average=weighted_average,
                  noise_injection_steps=noise_injection_steps, noise_injection_ratio=noise_injection_ratio, num_frames=num_frames, decode_chunk_size=decode_chunk_size, fps=fps).frames[0]

    # dirname is "" for a bare file name, and os.makedirs("") raises
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # duration = the time for which each frame will be displayed in the gif
    if out_path.endswith(".gif"):
        print(f"Saving {len(frames)} frames to {out_path} as gif")
        frames[0].save(out_path, save_all=True,
                       append_images=frames[1:], duration=duration, loop=0)
    else:
        print(f"Saving {len(frames)} frames to {out_path} as video")
        export_to_video(frames, out_path, fps=fps)

    print(f"Interpolated video saved to {out_path}")

    # Free GPU memory after inference: drop every reference to the pipeline first,
    # otherwise empty_cache() has nothing it is allowed to release.
    del frames, controller, controller_ref, ref_unet, pipe
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
def generate_all_interpolations(checkpoint_dir, input_sub_dir, output_dir, model_name, num_frames, decode_chunk_size, extension=".gif", resize_specs=(1024, 576), fps=7, duration=142, seed=42, inference_steps=20, noise_injection_steps=2, noise_injection_ratio=0.5, device="cuda:0", video_name=None):
    """
    Generates interpolated videos from key frames in the input directory.

    Every pair of consecutive key frames (ordered by the timestamp at the end of the
    file name, `..._<timestamp>.<ext>`) is passed to `run_interpolation`, and the
    result is written to `<output_dir>/interm_videos_<video_name>/segment_<i><extension>`.

    Args:
        checkpoint_dir (str): Directory containing the model checkpoints.
        input_sub_dir (str): Directory containing the input frames.
        output_dir (str): Directory to save the output videos.
        model_name (str): Name of the pre-trained model to use for interpolation.
        num_frames (int): Number of frames to generate between each pair of key frames.
        decode_chunk_size (int): Size of the chunk to decode.
        extension (str, optional): Extension of the output video files. Defaults to ".gif".
        resize_specs (tuple, optional): Tuple specifying the width and height to resize the frames. Defaults to (1024, 576).
        fps (int, optional): Frames per second for the output video. Defaults to 7.
        duration (int, optional): Time each frame is displayed when saving a gif, in milliseconds. Defaults to 142.
        seed (int, optional): Random seed for reproducibility. Defaults to 42.
        inference_steps (int, optional): Number of inference steps for the model. Defaults to 20.
        noise_injection_steps (int, optional): Number of steps to inject noise during inference. Defaults to 2.
        noise_injection_ratio (float, optional): Ratio of noise to inject during inference. Defaults to 0.5.
        device (str, optional): Device to run the model on. Defaults to "cuda:0".
        video_name (str, optional): Name of the video. If None, it will be derived from the input_sub_dir. Defaults to None.

    Returns:
        None
    """
    def _frame_timestamp(frame_path):
        # Strip whatever image extension the file has (not just ".jpg") before parsing
        stem = os.path.splitext(os.path.basename(frame_path))[0]
        return float(stem.split("_")[-1])

    if video_name is None:
        # normpath so a trailing slash does not produce an empty name
        video_name = os.path.basename(os.path.normpath(input_sub_dir))

    intermediate_videos_dir = os.path.join(
        output_dir, f"interm_videos_{video_name}")
    os.makedirs(intermediate_videos_dir, exist_ok=True)

    frames = sorted(os.path.join(input_sub_dir, f) for f in os.listdir(
        input_sub_dir) if f.endswith((".png", ".jpeg", ".jpg")))

    print(f"Found {len(frames)} frames in {input_sub_dir}")

    if len(frames) < 2:
        print(f"Skipping {video_name} as it has less than 2 frames")
        return

    # Chronological order; the sort is stable, so equal timestamps keep name order
    frames.sort(key=_frame_timestamp)

    for i in range(len(frames) - 1):
        frame1_path = frames[i]
        frame2_path = frames[i + 1]

        print(f"Interpolating between frames {frame1_path} and {frame2_path}")

        if "<s>" in frame1_path:
            print(f"Skipping {frame1_path} as its entrie bucket is already saved")
            continue

        segment_output_path = os.path.join(
            intermediate_videos_dir, f"segment_{i}{extension}")

        run_interpolation(
            checkpoint_dir=checkpoint_dir,
            frame1_path=frame1_path,
            frame2_path=frame2_path,
            out_path=segment_output_path,
            resize_specs=resize_specs,
            fps=fps,
            pretrained_model_name_or_path=model_name,
            duration=duration,
            seed=seed,
            num_inference_steps=inference_steps,
            noise_injection_ratio=noise_injection_ratio,
            noise_injection_steps=noise_injection_steps,
            num_frames=num_frames,
            # These two were accepted by this function but silently dropped before
            decode_chunk_size=decode_chunk_size,
            device=device,
        )
        torch.cuda.empty_cache()
    torch.cuda.empty_cache()

In [7]:
def stitch_video_segments(intermediate_videos_dir, output_path, fps):
    """
    Stitches together video segments from a specified directory into a single video file.

    Segments named `segment_<n>.<ext>` are concatenated in increasing numeric order of
    `n` (so `segment_2` comes before `segment_10`); any other files follow in name order.

    Args:
        intermediate_videos_dir (str): The directory containing the video segments to be stitched together.
        output_path (str): The file path where the final stitched video will be saved.
        fps (int): The frames per second (fps) rate for the final video.

    Returns:
        None

    Prints:
        The number of segments found and a confirmation message when the final video is saved.
        If the directory holds no segments, a notice is printed and nothing is written.
    """
    def _segment_sort_key(segment_path):
        # Plain string sorting would put "segment_10" before "segment_2"
        stem = os.path.splitext(os.path.basename(segment_path))[0]
        index = stem.rsplit("_", 1)[-1]
        if index.isdigit():
            return (0, int(index), stem)
        return (1, 0, stem)

    candidates = (os.path.join(intermediate_videos_dir, f)
                  for f in os.listdir(intermediate_videos_dir))
    # Ignore sub-directories (e.g. notebook checkpoint folders); only files can be clips
    segment_paths = sorted(
        (path for path in candidates if os.path.isfile(path)),
        key=_segment_sort_key)

    print(f"Found {len(segment_paths)} segments to stitch.")
    if not segment_paths:
        print(f"No segments found in {intermediate_videos_dir}; nothing to stitch.")
        return

    clips = []
    final_video = None
    try:
        for segment in segment_paths:
            clips.append(VideoFileClip(segment).set_fps(fps))
        final_video = concatenate_videoclips(clips, method="compose")
        final_video.write_videofile(output_path, fps=fps)
        print(f"Final video saved to {output_path}")
    finally:
        # Release the clips even when loading or writing fails
        if final_video is not None:
            final_video.close()
        for clip in clips:
            clip.close()

In [None]:
def trim_videos(input_dir, trim_length, output_dir):
    """
    Trims all `.mp4` videos in the specified directory to the specified length.

    Each video is cut to its first `trim_length` seconds and written under the same
    file name into `output_dir`. Videos whose name already exists in `output_dir`
    are skipped.

    Args:
        input_dir (str): The directory containing the videos to be trimmed.
        trim_length (int): The duration to trim the videos to (in seconds).
        output_dir (str): The directory the trimmed videos are written to. It is
            created if it does not exist.

    Returns:
        None

    Prints:
        A confirmation message when the trimmed videos are saved.
    """
    # Listing a missing output directory would raise before any work is done
    os.makedirs(output_dir, exist_ok=True)
    already_trimmed = set(os.listdir(output_dir))

    video_names = sorted(f for f in os.listdir(input_dir) if f.endswith(".mp4"))

    for video_name in video_names:
        if video_name in already_trimmed:
            print(f"Skipping {video_name} as it is already trimmed")
            continue

        video = VideoFileClip(os.path.join(input_dir, video_name))
        try:
            trimmed_video = video.subclip(0, trim_length)
            try:
                trimmed_video.write_videofile(
                    os.path.join(output_dir, video_name),
                    codec="libx264", fps=video.fps, audio_codec="aac")
            finally:
                trimmed_video.close()
        finally:
            # Always release the source clip, even if trimming or encoding failed
            video.close()

    print(f"Trimmed videos saved to {output_dir}")

In [9]:
# untrimmed_dir = "/root/VideoReconstruction/original_vids/"
# output_dir_trimmed = "/root/VideoReconstruction/input_videos/"
# trim_videos(untrimmed_dir, 10, output_dir_trimmed )

### Keyframe Generation and Savings Analysis

This part of the code processes all the videos in the input folder, generating keyframes for each video and storing them in the output folder. It also benchmarks the file sizes of the original videos and the videos reconstructed from the keyframes.

To ensure a fair comparison, the original videos—initially in 4K resolution—are resized to match the resolution of the videos that can be generated by our system, which is currently limited to 1024x576 due to hardware and model constraints. By resizing the original videos to 1024x576 before conducting the bandwidth savings analysis, we maintain consistency and avoid skewing results caused by differences in resolution.

To further standardize the comparison, the same codec (`libx264`) and bitrate are applied to both the resized original videos and the videos reconstructed from keyframes. This ensures that differences in compression settings do not affect the analysis, making the results reflect genuine bandwidth savings attributable to keyframe interpolation.

> **Note:** While the output videos are limited to 1024x576 resolution, the keyframes are still generated from the original 4K frames. This approach leverages the higher quality of 4K frames during interpolation, resulting in better-quality reconstructed videos even though the final resolution is reduced to 1024x576. This ensures that the interpolation process operates on the highest possible quality inputs, maximizing the fidelity of the generated videos.

In [10]:
def get_directory_size(directory):
    """Return the combined size, in bytes, of every file found under `directory` (recursively)."""
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(directory)
        for name in names
    )

In [None]:
# Parameters passed to smartKeyframeDetection in the cells below.

# Number of frames in each bucket/segment of the video. The video is chunked into
# buckets of this many frames, and keyframes are interpolated within each bucket.
bucket_size_in_frames = 60
# Threshold for the keyframe detection algorithm. The higher the threshold, the fewer the keyframes.
threshold = 0.3
# Minimum number of frames between two keyframes.
minimum_frames_in_between = 20
# Maximum number of frames between two keyframes.
maximum_frames_in_between = 25
# Number of buckets that are NOT interpolated; their frames are copied as-is so that
# high-motion segments keep their original motion.
top_k_no_interpolation = 0
# FPS of the video segments that are not interpolated (see top_k_no_interpolation).
segment_fps = 15

In [None]:
# Benchmark cell: for every video in `input_dir`, transcode it to 1024x576 H.264,
# extract keyframes, and record the size comparison in a CSV.

# DO NOT PUT _ in the input directory name. We parse at the _ to get the timestamp of the frame. This can mess up the paths.
input_dir = "/root/VideoReconstruction/alloriginalvids"  # this is the directory where the input videos are stored.
output_dir_base = "/root/VideoReconstruction/outputs/bucketedkeyframes"
smaller_video_output_dir = "/root/VideoReconstruction/outputs/bucketed_keyframes_video"

# !rm -rf $output_dir_base
# !rm -rf $smaller_video_output_dir

os.makedirs(output_dir_base, exist_ok=True)
os.makedirs(smaller_video_output_dir, exist_ok=True)

videos = os.listdir(input_dir)

summary_data = []

evaluator = FFmpegEvaluator()

# Scratch directory for the resized reference videos; removed when the cell finishes
interims = "./interims"
os.makedirs(interims, exist_ok=True)

try:
    for video in videos:
        video_path = os.path.join(input_dir, video)
        video_name = os.path.splitext(video)[0]
        output_dir = os.path.join(output_dir_base, video_name)
        interim_output_path = os.path.join(interims, f"{video_name}_resized.mp4")

        # Check first: an already-processed video needs neither probing nor transcoding
        if os.path.exists(output_dir):
            print(f"Skipping {video_name} as it is already processed")
            continue

        # `source_fps` / `source_duration` (rather than `fps` / `duration`) so this loop
        # does not overwrite the notebook-level output settings of the same name.
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        source_fps = cap.get(cv2.CAP_PROP_FPS)
        cap.release()

        # A frame rate or frame count of 0 would make the duration below divide by zero
        if source_fps <= 0 or total_frames <= 0:
            print(f"Skipping {video} as it could not be read as a video")
            continue
        source_duration = total_frames / source_fps

        command = [
            'ffmpeg',
            '-i', video_path,  # Input video
            '-r', str(source_fps),  # Set frame rate
            '-vf', 'scale=1024:576',  # Resize to 1024x576
            '-c:v', 'libx264',  # Use H.264 codec
            '-pix_fmt', 'yuv420p',  # Set pixel format
            '-t', str(source_duration),
            '-y',  # Overwrite existing file
            interim_output_path
        ]

        # check=True: a failed transcode raises CalledProcessError instead of continuing
        subprocess.run(command, check=True, capture_output=True)

        video_size_in_bytes = os.path.getsize(interim_output_path)
        video_size_in_megabytes = video_size_in_bytes / (1024 * 1024)

        smartKeyframeDetection(
            video_path,
            bucket_size_in_frames,
            threshold,
            output_dir,
            minimum_frames_in_between,
            maximum_frames_in_between,
            segment_fps,
            top_k_no_interpolation
        )

        # get directory size
        keyframes = os.listdir(output_dir)
        keyframe_count = len(keyframes)

        keyframes_dir_size_in_bytes = get_directory_size(output_dir)
        keyframes_dir_size_in_megabytes = keyframes_dir_size_in_bytes / (1024 * 1024)

        compressed_size, savings_from_original_video, savings_from_keyframes_video = evaluator.evaluate_smaller_video(
            interim_output_path, smaller_video_output_dir, output_dir)

        summary_data.append(
            [
                video,
                f"{video_size_in_megabytes:.2f} MB",
                total_frames,
                f"{source_fps:.2f} FPS",
                keyframe_count,
                f"{keyframes_dir_size_in_megabytes:.2f} MB",
                f"{compressed_size/(1024*1024):.2f} MB",
                f"{savings_from_original_video:.2f} %",
                f"{savings_from_keyframes_video:.2f} %"
            ]
        )
finally:
    # Remove the scratch directory even when a video fails part-way through
    shutil.rmtree(interims, ignore_errors=True)

headers = [
    "Video Name",
    "Original Size",
    "Total Frames",
    "FPS",
    "Total Keyframes",
    "Keyframes Size",
    "Compressed Video Size",
    "Savings Original Video",
    "Savings Keyframes Video"
]

df = pd.DataFrame(summary_data, columns=headers)
output_csv_path = "/root/VideoReconstruction/logs/bandwidth_savings_data.csv"
# NOTE(review): skipped videos contribute no row, so re-running this cell once every
# video is processed rewrites the CSV with the header only - confirm this is intended.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
df.to_csv(output_csv_path, index=False)

df.style.set_table_styles(
    [{'selector': 'th', 'props': [('font-weight', 'bold'), ('background-color', '#000000')]}]
).set_properties(**{'text-align': 'center'})
    

Skipping multiplewaterfalls as it is already processed
For /root/VideoReconstruction/alloriginalvids/multiplewaterfalls.mp4, selected 2 frames, and inserted 10 frames.
Skipping pancakechocolate as it is already processed
For /root/VideoReconstruction/alloriginalvids/pancakechocolate.mp4, selected 2 frames, and inserted 9 frames.
Skipping inkmixing as it is already processed
For /root/VideoReconstruction/alloriginalvids/inkmixing.mp4, selected 2 frames, and inserted 9 frames.
Skipping girleatingcakespoon as it is already processed
For /root/VideoReconstruction/alloriginalvids/girleatingcakespoon.mp4, selected 2 frames, and inserted 10 frames.
Skipping coffeeart as it is already processed
For /root/VideoReconstruction/alloriginalvids/coffeeart.mp4, selected 2 frames, and inserted 8 frames.
Skipping tallwaterfall as it is already processed
For /root/VideoReconstruction/alloriginalvids/tallwaterfall.mp4, selected 2 frames, and inserted 10 frames.
Skipping countrysideaerial as it is already

Unnamed: 0,Video Name,Original Size,Total Frames,FPS,Total Keyframes,Keyframes Size,Compressed Video Size,Savings Original Video,Savings Keyframes Video
0,forest.mp4,3.79 MB,300,29.97 FPS,13,36.82 MB,0.08 MB,97.83 %,968.72 %
1,fog.mp4,0.78 MB,179,24.00 FPS,8,9.64 MB,0.03 MB,95.65 %,1229.76 %
2,multiplewaterfalls.mp4,7.23 MB,299,29.97 FPS,12,44.55 MB,0.11 MB,98.43 %,614.22 %
3,pancakechocolate.mp4,0.59 MB,250,25.00 FPS,11,7.34 MB,0.03 MB,95.36 %,1230.35 %
4,motorway.mp4,1.61 MB,300,29.97 FPS,12,23.00 MB,0.05 MB,96.77 %,1422.16 %
5,cliffs.mp4,2.58 MB,300,29.97 FPS,13,27.10 MB,0.05 MB,97.92 %,1048.71 %
6,inkmixing.mp4,1.14 MB,250,25.00 FPS,11,9.93 MB,0.03 MB,97.48 %,867.77 %
7,coast.mp4,1.55 MB,300,29.97 FPS,12,22.41 MB,0.03 MB,97.83 %,1443.79 %
8,fisher.mp4,0.91 MB,250,25.00 FPS,9,4.15 MB,0.02 MB,97.27 %,454.45 %
9,girleatingcakespoon.mp4,0.87 MB,250,25.00 FPS,12,12.38 MB,0.03 MB,96.02 %,1413.08 %


### Run the pipeline


#### Constants


In [8]:

# Configuration for the interpolation run: paths, model choice and generation settings.

# INPUT_DIR is the directory from which the keyframes will be read
# (one sub-directory of keyframes per video).
# DO NOT PUT _ in the input directory name. We parse at the _ to get the timestamp of the frame. This can mess up the paths. 

INPUT_DIR = "/root/VideoReconstruction/outputs/bucketedkeyframes"
OUT_DIR = "/root/VideoReconstruction/our_results"
os.makedirs(OUT_DIR, exist_ok=True)

# NOTE(review): this path already ends in "/unet", and run_interpolation loads the
# fine-tuned UNet from it with subfolder="unet" - confirm the checkpoint really has
# a nested unet/ directory.
CHECKPOINT_DIR = "/root/VideoReconstruction/svd/checkpoints/svd_reverse_motion_with_attnflip/svd_reverse_motion_with_attnflip/unet"  # path for docker image

# Keyframe-detection threshold; re-declares the value from the parameters cell above.
threshold = 0.3

# Model options and selection. More models can be added to this list later.
models_to_try = ["stabilityai/stable-video-diffusion-img2vid-xt",
                 "stabilityai/stable-video-diffusion-img2vid-xt-1-1"]
MODEL_NAME = models_to_try[0]

# Noise injection parameters
noise_injection_steps = 2
noise_injection_ratio = 0.5

# if this is .gif, the output will be a gif otherwise it will be a video
interpolation_extension = ".mp4"
final_extension = ".mp4"

# Resize specifications (width, height), frame rate, and duration etc
resize_specs = (1024, 576)
fps = 25 # the frame rate of the output video
duration = 142  # per-frame display time in milliseconds; only used when a segment is saved as a .gif
inference_steps = 15 # The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
seed = 42
# decode_chunk_size controls how many frames are decoded at a time from the latent
# representations during the video generation process. A smaller value will use less
# memory but may be slower. The higher the chunk size, the higher the temporal
# consistency between frames, but also the higher the memory consumption.
decode_chunk_size = 6
# Requested number of frames per segment. run_interpolation recomputes the actual
# value from the keyframe timestamps and fps, so this acts only as a placeholder.
num_frames = 24

# ===========================
# Display Configuration
# ===========================

settings = [
    ["Model", MODEL_NAME],
    ["Output Directory", OUT_DIR],
    ["Resize Specifications", f"{resize_specs[0]} x {resize_specs[1]}"],
    ["FPS", fps],
    ["Frame Duration (ms)", duration],
    ["Inference Steps", inference_steps],
    ["Random Seed", seed],
    ["Saving interpolated video as", "gif file" if interpolation_extension ==
        ".gif" else "mp4 video file"],
    ["Number of Frames", num_frames],
]

print("\nSettings for Image-to-Video Interpolation:")
print(tabulate(settings, headers=["Parameter", "Value"], tablefmt="grid"))


Settings for Image-to-Video Interpolation:
+------------------------------+-----------------------------------------------+
| Parameter                    | Value                                         |
| Model                        | stabilityai/stable-video-diffusion-img2vid-xt |
+------------------------------+-----------------------------------------------+
| Output Directory             | /root/VideoReconstruction/our_results         |
+------------------------------+-----------------------------------------------+
| Resize Specifications        | 1024 x 576                                    |
+------------------------------+-----------------------------------------------+
| FPS                          | 25                                            |
+------------------------------+-----------------------------------------------+
| Frame Duration (ms)          | 142                                           |
+------------------------------+---------------------------------

#### Looped interpolation

Here, we loop through all the keyframe directories in `INPUT_DIR` (one per video, produced by the keyframe generation step above), interpolate between consecutive keyframes, and stitch the resulting segments into the final video. Videos that already have a final output in `OUT_DIR` are skipped.

In [None]:
# Loop over each keyframe directory in the input directory: interpolate between its
# keyframes, then stitch the segments into `<video_name>_interpolated<final_extension>`.

for filename in sorted(os.listdir(INPUT_DIR)):

    keyframes_dir = os.path.join(INPUT_DIR, filename)
    # Only directories hold keyframes; stray files in INPUT_DIR are ignored
    if not os.path.isdir(keyframes_dir):
        continue

    # The directory name is the video name (no extension to strip, and a name
    # containing "." must not be truncated)
    video_name = filename
    final_video_path = os.path.join(
        OUT_DIR, f"{video_name}_interpolated{final_extension}")

    # Check against OUT_DIR itself rather than a path relative to the working directory
    if os.path.exists(final_video_path):
        print(f"Skipping {video_name} as it is already done")
        continue

    start_time = time.time()
    try:
        generate_all_interpolations(
            checkpoint_dir=CHECKPOINT_DIR,
            input_sub_dir=keyframes_dir,
            output_dir=OUT_DIR,
            model_name=MODEL_NAME,
            resize_specs=resize_specs,
            fps=fps,
            duration=duration,
            extension=interpolation_extension,
            seed=seed,
            inference_steps=inference_steps,
            noise_injection_steps=noise_injection_steps,
            noise_injection_ratio=noise_injection_ratio,
            decode_chunk_size=decode_chunk_size,
            num_frames=num_frames,
            video_name=video_name
        )
        stitch_video_segments(
            intermediate_videos_dir=os.path.join(OUT_DIR, f"interm_videos_{video_name}"),
            output_path=final_video_path,
            fps=fps
        )

    except Exception as e:
        print("Error while generate_all_interpolations")
        print(f"Error: {e}")
        # Kill the kernel processes on GPU errors
        # NOTE(review): this SIGKILLs every PID on an nvidia-smi line that matches the
        # environment name - presumably including this kernel; confirm that is intended.
        print("Killing the kernel processes")
        env_name = os.path.basename(sys.prefix)
        subprocess.call(
            f"nvidia-smi | grep {env_name} | awk '{{print $5}}' | xargs -I {{}} kill -9 {{}}",
            shell=True
        )
        # Bare raise keeps the original traceback; cleanup happens in `finally`
        raise

    finally:
        # Collect garbage first so tensors that are no longer referenced can be released
        gc.collect()
        torch.cuda.empty_cache()

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time for {video_name}: {elapsed_time:.2f} seconds")

#### Individual Video Processing

Run the cell below to generate keyframes and run the interpolation for a single video instead of the whole input directory.

In [None]:
# Process one video end to end: keyframe detection, interpolation, stitching.
# NOTE: despite its name, `input_dir` here is the path of a single video file.
input_dir = "/root/VideoReconstruction/input_videos/cliffwater.mp4"
output_keyframes_dir_base = "/root/VideoReconstruction/outputs/bucketedkeyframes"
output_video_dir = "/root/VideoReconstruction/our_results/"
video_name = os.path.splitext(os.path.basename(input_dir))[0]
os.makedirs(output_keyframes_dir_base, exist_ok=True)
output_keyframes_dir = os.path.join(output_keyframes_dir_base, video_name)

# Keyframes go into the per-video directory (as in the batch cell above), which is
# also where generate_all_interpolations reads them from. Detection is skipped when
# that directory already exists, to avoid regenerating the keyframes needlessly.
if os.path.isdir(output_keyframes_dir):
    print(f"Skipping {video_name} as it is already processed")
else:
    smartKeyframeDetection(input_dir, bucket_size_in_frames, threshold, output_keyframes_dir, minimum_frames_in_between, maximum_frames_in_between, segment_fps, top_k_no_interpolation)

try:
    generate_all_interpolations(checkpoint_dir=CHECKPOINT_DIR, input_sub_dir=output_keyframes_dir, output_dir=output_video_dir, model_name=MODEL_NAME, resize_specs=resize_specs, fps=fps, duration=duration, extension=interpolation_extension,
                    seed=seed, inference_steps=inference_steps, noise_injection_steps=noise_injection_steps, noise_injection_ratio=noise_injection_ratio, decode_chunk_size=decode_chunk_size, num_frames=num_frames)
    # Read the segments from the same directory they were just written to
    stitch_video_segments(intermediate_videos_dir=os.path.join(output_video_dir, f"interm_videos_{video_name}"), output_path=os.path.join(
            output_video_dir, f"{video_name}_interpolated{final_extension}"), fps=fps)
except Exception as e:
    print(f"Error: {e}")
    # NOTE(review): this SIGKILLs every PID on an nvidia-smi line that matches the
    # environment name - presumably including this kernel; confirm that is intended.
    print("Killing the kernel processes")
    env_name = os.path.basename(sys.prefix)
    subprocess.call(
    f"nvidia-smi | grep {env_name} | awk '{{print $5}}' | xargs -I {{}} kill -9 {{}}",
    shell=True
    )


In [None]:
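# We can kill the kernel processes after we are done with the interpolation to free GPU memory.
# WARNING: this sends SIGKILL to every PID on an nvidia-smi line that matches the active conda
# environment's name - presumably including this notebook's own kernel, so run it last.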

!nvidia-smi | grep $(conda env list | grep '*' | awk '{print $1}') | awk '{print $5}' | xargs -I {} kill -9 {}

In [None]:
!nvidia-smi

Tue Dec 17 13:24:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0  On |                  Off |
|  0%   41C    P5              22W / 450W |  23257MiB / 24564MiB |     28%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    