<h1>Video Generator By Scene</h1>

In [1]:
from __future__ import print_function
import sys
import os
import pickle
import PIL
import skimage.io as io
import torch

from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.stats_manager import StatsManager

from scenedetect.detectors.content_detector import ContentDetector
from scenedetect.detectors.threshold_detector import ThresholdDetector
from typing import List, Tuple

import cv2

BASE_DIR=os.environ['PROJECT_DIRECTORY']
sys.path.append(BASE_DIR+'software_utils/')

# To account for path errors
try:
    from models.image_captioner import ImageCaptioner
    from models.video_captioner import VideoCaptioner
    from models.encoderCNN import EncoderCNN
except ImportError:
    pass

try:
    from software_utils.create_transformer import create_transformer
    from software_utils.vocabulary import Vocabulary
except ImportError:
    pass

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
root_path=BASE_DIR
coco_vocab_path=BASE_DIR+'Data/processed/coco_vocab.pkl'
msrvtt_vocab_path='Data/processed/msrvtt_vocab.pkl'
base_model='resnet152'
ic_model_path='models/image_caption-model11-20-0.1309-5.0.pkl'
vc_model_path='models/video_models/video_caption-model11-110-0.3354-5.0.pkl'
im_embedding_size=2048
vid_embedding_size=2048
embed_size=256
hidden_size=512
num_frames=40
max_caption_length=35
ic_rnn_type='lstm'
vc_rnn_type='gru'
im_res=224

In [3]:
with open(msrvtt_vocab_path, 'rb') as f:
    msrvtt_vocab = pickle.load(f)
with open(coco_vocab_path, 'rb') as f:
    coco_vocab = pickle.load(f)

In [4]:
transformer = create_transformer()
encoder = EncoderCNN(base_model)



In [5]:
video_captioner = VideoCaptioner(
            vid_embedding_size,
            embed_size,
            hidden_size,
            len(msrvtt_vocab),
            rnn_type='lstm',
            start_id=msrvtt_vocab.word2idx[msrvtt_vocab.start_word],
            end_id=msrvtt_vocab.word2idx[msrvtt_vocab.end_word]
        )

Selected RNN Type is lstm


In [6]:
if torch.cuda.is_available():
    vc_checkpoint = torch.load(root_path + vc_model_path)
else:
    vc_checkpoint = torch.load(root_path + vc_model_path, map_location='cpu')
video_captioner.load_state_dict(vc_checkpoint['params'])

  vc_checkpoint = torch.load(root_path + vc_model_path)


<All keys matched successfully>

In [7]:
if torch.cuda.is_available():
    encoder.cuda()
    video_captioner.cuda()

encoder.eval()
video_captioner.eval()

VideoCaptioner(
  (inp): Linear(in_features=2048, out_features=256, bias=True)
  (inp_dropout): Dropout(p=0.2, inplace=False)
  (inp_bn): BatchNorm1d(256, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (embed): Embedding(14748, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (out): Linear(in_features=512, out_features=14748, bias=True)
)

In [8]:
def find_scene_changes(video_path: str, method: str = 'threshold', new_stat_file: bool = True) -> List[Tuple[int, int]]:
    """
    Detect scene changes in a given video.

    Args:
        video_path: Path to the video to analyze.
        method: Method for detecting scene changes ('content' or 'threshold').
        new_stat_file: Whether to create a new stats file.

    Returns:
        List of scene changes with their corresponding frame ranges as tuples.
    """
    # Initialize the VideoManager and StatsManager.
    video_manager = VideoManager([video_path])
    stats_manager = StatsManager()

    # Create a SceneManager and add the appropriate detector.
    scene_manager = SceneManager(stats_manager)
    if method == 'content':
        scene_manager.add_detector(ContentDetector(threshold=30, min_scene_len=40))
    else:
        scene_manager.add_detector(ThresholdDetector(threshold=125, min_scene_len=40))

    # Set the path for the stats file.
    stats_file_path = f'{video_path}.{method}.stats.csv'
    scene_list = []

    try:
        # Load stats file if it exists and new_stat_file is False.
        if not new_stat_file and os.path.exists(stats_file_path):
            with open(stats_file_path, 'r') as stats_file:
                stats_manager.load_from_csv(stats_file)

        # Set the downscale factor for faster processing.
        video_manager.set_downscale_factor(2)

        # Start the video manager.
        video_manager.start()

        # Perform scene detection.
        scene_manager.detect_scenes(video_manager)

        # Obtain the list of scenes.
        scene_list = scene_manager.get_scene_list()
        # Each scene is a tuple of (start_frame, end_frame).

        # Save stats if required.
        if stats_manager.is_save_required():
            with open(stats_file_path, 'w') as stats_file:
                stats_manager.save_to_csv(stats_file)

    finally:
        # Release the video manager resources.
        video_manager.release()
    # print(scene_list)
    return scene_list

In [39]:
video_path=os.environ['PROJECT_DIRECTORY'] + 'Dataset/MSR-VTT/TrainVal/video66.mp4'

In [None]:
scenes = find_scene_changes(video_path, method='content', new_stat_file=True)
print(f'Scenes: {scenes}')
scene_change_timecodes = [(scene[0].get_timecode(), scene[1].get_timecode())for scene in scenes]
print(f"Scene change timecode: {scene_change_timecodes}")
scene_change_idxs = [scene[0].get_frames() for scene in scenes]

print(f"Scene change idxs:{scene_change_idxs}")

VideoManager is deprecated and will be removed.


Scenes: [(00:00:00.000 [frame=0, fps=25.000], 00:00:02.080 [frame=52, fps=25.000]), (00:00:02.080 [frame=52, fps=25.000], 00:00:05.520 [frame=138, fps=25.000]), (00:00:05.520 [frame=138, fps=25.000], 00:00:07.200 [frame=180, fps=25.000]), (00:00:07.200 [frame=180, fps=25.000], 00:00:10.280 [frame=257, fps=25.000]), (00:00:10.280 [frame=257, fps=25.000], 00:00:11.000 [frame=275, fps=25.000])]
Scene change timecode: [('00:00:00.000', '00:00:02.080'), ('00:00:02.080', '00:00:05.520'), ('00:00:05.520', '00:00:07.200'), ('00:00:07.200', '00:00:10.280'), ('00:00:10.280', '00:00:11.000')]
Scene change idxs:[0, 52, 138, 180, 257]


In [41]:
if len(scene_change_idxs) == 0:
    print("No Scene Change!")
    scene_change_timecodes = ['00:00:00']
    scene_change_idxs = [0]
else:
    print("Scene Change detected!")

Scene Change detected!


In [42]:
# Empty torch tensor's to store values
vid_embeddings = torch.zeros(
    len(scene_change_idxs), num_frames, vid_embedding_size)
if torch.cuda.is_available():
    vid_embeddings = vid_embeddings.cuda()

In [43]:
# Determine last frame to analyze
last_frame = scene_change_idxs[-1] + num_frames + 1

frame_idx = 0
cap_start_idx = 0

In [44]:
cap = cv2.VideoCapture(video_path)


In [45]:
# Loop through and store relevant frames
while True:
    ret, frame = cap.read()

    if not ret or frame_idx == last_frame:
        break

    # Start storing frames
    if frame_idx in scene_change_idxs:
        cap_start_idx = frame_idx
        vid_array = torch.zeros(num_frames, 3, 224, 224)

    # Transform, and store
    if frame_idx - cap_start_idx < num_frames:
        try:
            frame = PIL.Image.fromarray(frame).convert('RGB')

            if torch.cuda.is_available():
                frame = transformer(frame).cuda().unsqueeze(0)
            else:
                frame = transformer(frame).unsqueeze(0)

            vid_array[frame_idx - cap_start_idx] = frame

        except OSError as e:
            print(e + " could not process frame in " + f)

    # If at scene ending frame, encode the collected scene
    if frame_idx - cap_start_idx == num_frames:
        if torch.cuda.is_available():
            vid_array = vid_array.cuda()
        vid_embeddings[scene_change_idxs.index(
            cap_start_idx)] =encoder(vid_array)

    frame_idx += 1

cap.release()

In [46]:
# Predict captions using the video embeddings
encoded_captions = video_captioner.predict(
    vid_embeddings, beam_size=5).cpu().numpy().astype(int)

In [47]:
# Convert word ids to word tags
captions = []
for caption in encoded_captions:
    captions.append(msrvtt_vocab.decode(
        caption, clean=True, join=True))

In [48]:
for cap, t in zip(captions, scene_change_timecodes):
    print (t, cap)

('00:00:00.000', '00:00:02.080') a girl is singing on stage
('00:00:02.080', '00:00:05.520') a girl is singing
('00:00:05.520', '00:00:07.200') a girl is singing on stage
('00:00:07.200', '00:00:10.280') a girl is singing on stage
('00:00:10.280', '00:00:11.000') displaying on screen
