# Multimodal Video Trimming - Experiments

In [1]:
# Execution-mode switch: True runs the batch loop over every entry in `experiments`;
# False skips that loop and runs only the single test-experiment cell instead.
run_all_experiments_mode = True

# Hyperparameters

In [2]:
# Hyperparameters forwarded to multimodal_video_summarization.py as CLI flags.
# Each trailing comment gives the supported range; the ranges are enforced at
# the bottom of this cell so a typo fails here instead of mid-run.

# Auto Summary
AUTO_SUMMARY_SUMMARY_LENGTH_PERCENTAGE = 0.3 # Min: 0.2, Max: 0.5
AUTO_SUMMARY_MIN_SUMMARY_LENGTH = 30         # Min: 30, Max: 60
AUTO_SUMMARY_MAX_SUMMARY_LENGTH = 600        # Min: 100, Max: 1000

# Deletion Metric
DELETION_METRIC_THRESHOLD = 0.2              # Min: 0.05, Max: 0.5

# Metric 1
METRIC_1_MODEL_SIZE = "base"                 # "base", "large"
METRIC_1_WEIGHT = 1.0                        # Min: 0.0, Max: 1.0

# Metric 2
METRIC_2_WEIGHT = 0.3                        # Min: 0.0, Max: 1.0
METRIC_2_MIN_SCENE_LEN = 15                  # Min: 15, Max: 9000
METRIC_2_THRESHOLD = 25                      # Min: 10, Max: 50


## --- DEFAULTS ---
# DELETION_METRIC_THRESHOLD = 0.2

# METRIC_1_WEIGHT = 1.0
# METRIC_1_MODEL_SIZE = "base"

# METRIC_2_WEIGHT = 0.3
# METRIC_2_MIN_SCENE_LEN = 15
# METRIC_2_THRESHOLD = 25

# AUTO_SUMMARY_SUMMARY_LENGTH_PERCENTAGE = 0.3
# AUTO_SUMMARY_MIN_SUMMARY_LENGTH = 30
# AUTO_SUMMARY_MAX_SUMMARY_LENGTH = 600


def _validate_hyperparameters():
    """Raise ValueError if any hyperparameter is outside its documented range.

    The bounds mirror the Min/Max comments next to each constant above.
    """
    numeric_ranges = {
        "AUTO_SUMMARY_SUMMARY_LENGTH_PERCENTAGE": (AUTO_SUMMARY_SUMMARY_LENGTH_PERCENTAGE, 0.2, 0.5),
        "AUTO_SUMMARY_MIN_SUMMARY_LENGTH": (AUTO_SUMMARY_MIN_SUMMARY_LENGTH, 30, 60),
        "AUTO_SUMMARY_MAX_SUMMARY_LENGTH": (AUTO_SUMMARY_MAX_SUMMARY_LENGTH, 100, 1000),
        "DELETION_METRIC_THRESHOLD": (DELETION_METRIC_THRESHOLD, 0.05, 0.5),
        "METRIC_1_WEIGHT": (METRIC_1_WEIGHT, 0.0, 1.0),
        "METRIC_2_WEIGHT": (METRIC_2_WEIGHT, 0.0, 1.0),
        "METRIC_2_MIN_SCENE_LEN": (METRIC_2_MIN_SCENE_LEN, 15, 9000),
        "METRIC_2_THRESHOLD": (METRIC_2_THRESHOLD, 10, 50),
    }
    for name, (value, low, high) in numeric_ranges.items():
        if not low <= value <= high:
            raise ValueError(
                f"{name}={value!r} is outside the supported range [{low}, {high}]"
            )

    allowed_model_sizes = ("base", "large")
    if METRIC_1_MODEL_SIZE not in allowed_model_sizes:
        raise ValueError(
            f"METRIC_1_MODEL_SIZE={METRIC_1_MODEL_SIZE!r} must be one of {allowed_model_sizes}"
        )


# Fail fast: a bad value here would otherwise only surface inside the batch run.
_validate_hyperparameters()

# Environment Setup

In [13]:
import os
import sys
import shutil
from IPython.display import clear_output
from IPython.display import FileLink

In [4]:
# Update installers
!pip install --upgrade pip
!sudo apt-get update

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [66.7 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]           
Get:7 https:

In [5]:
# ffmpeg
!sudo apt-get install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 135 not upgraded.


# Clone Project

In [6]:
project_dir = '/kaggle/working/multimodal-video-trimming'

# Check if the directory exists
if os.path.exists(project_dir):
    # Change to the project directory
    os.chdir(project_dir)
    # Pull the latest changes from the repository
    !git pull origin main  # Replace "main" with your branch name if different
else:
    # Clone the project if it doesn't exist
    !git clone https://github.com/Dada-Tech/multimodal-video-trimming.git
    # Change to the project directory
    os.chdir(project_dir)

Cloning into 'multimodal-video-trimming'...
remote: Enumerating objects: 595, done.[K
remote: Counting objects: 100% (103/103), done.[K4/103)[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 595 (delta 62), reused 38 (delta 16), pack-reused 492 (from 1)[K
Receiving objects: 100% (595/595), 827.30 KiB | 16.55 MiB/s, done.
Resolving deltas: 100% (302/302), done.


In [7]:
# Install requirements
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting faster-whisper==1.1.0 (from -r requirements.txt (line 7))
  Downloading faster_whisper-1.1.0-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<4.5.0 (from -r requirements.txt (line 8))
  Downloading ctranslate2-4.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting datasets==3.3.1 (from -r requirements.txt (line 14))
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting deepmultilingualpunctuation==1.0.1 (from -r requirements.txt (line 15))
  Downloading deepmultilingualpunctuation-1.0.1-py3-none-any.whl.metadata (4.0 kB)
Collecting pytextrank==3.3.0 (from -r requirements.txt (line 19))
  Downloading pytextrank-3.3.0-py3-none-any.whl.metadata (12 kB)
Collecting pymediainfo==7.0.1 (from -r requirements.txt (line 21))
  Downloading pymediainfo-7.0.1-py3-none-manylinux_2_27_x86_64.whl.metadata (9.0 kB)
Collecting silero-vad==5.1.2 (f

## MMVTrim NLP Dependencies Download

In [8]:
import nltk
import spacy

In [9]:
# NLTK 'punkt' tokenizer resources
for nltk_package in ('punkt', 'punkt_tab'):
    nltk.download(nltk_package)

# Download (this does not load) the small English spaCy pipeline
spacy.cli.download("en_core_web_sm")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Experiments Setup

In [10]:
# Where the source videos live
video_dir = '/kaggle/input/tvsum50/tvsum50/video'

# Every entry found in that directory, videos or not
all_files = os.listdir(video_dir)

# Keep only names with a recognised video extension, in sorted order
experiment_videos = sorted(
    name for name in all_files if name.endswith(('.mp4', '.avi', '.mov'))
)

n_experiments = len(experiment_videos)

print(f"experiments loaded: {n_experiments}")

videos loaded: 50


In [11]:
# Copy the videos into the working directory 'dataset'
dataset_dir = '/kaggle/working/multimodal-video-trimming/dataset'

# dirs_exist_ok=True keeps this cell re-runnable: without it copytree raises
# FileExistsError as soon as dataset_dir exists (e.g. on a second execution).
# Files already present in dataset_dir that are not in video_dir are left alone.
shutil.copytree(video_dir, dataset_dir, dirs_exist_ok=True)

# Full path of every experiment video inside the working copy
experiments = [os.path.join(dataset_dir, exp) for exp in experiment_videos]

# Run Experiments (All)

In [None]:
starting_experiment_index = 0

if run_all_experiments_mode:
    for i, experiment in enumerate(experiments[starting_experiment_index:], start=starting_experiment_index):
        clear_output(wait=True) # clear output
        print(f"=== Experiment: {i} of {n_experiments}\n\n")
        sys.stdout.flush()

        !python multimodal_video_summarization.py \
        --experiment_mode \
        --skip_nlp_downloads \
        --video_input {experiment} \
        --auto_summary_summary_length_percentage {AUTO_SUMMARY_SUMMARY_LENGTH_PERCENTAGE} \
        --auto_summary_min_summary_length {AUTO_SUMMARY_MIN_SUMMARY_LENGTH} \
        --auto_summary_max_summary_length {AUTO_SUMMARY_MAX_SUMMARY_LENGTH} \
        --deletion_metric_threshold {DELETION_METRIC_THRESHOLD} \
        --metric_1_model_size {METRIC_1_MODEL_SIZE} \
        --metric_1_weight {METRIC_1_WEIGHT} \
        --metric_2_weight {METRIC_2_WEIGHT} \
        --metric_2_min_scene_len {METRIC_2_MIN_SCENE_LEN} \
        --metric_2_threshold {METRIC_2_THRESHOLD}

=====Experiment: 10


Notebook Mode: False

=== skipping installation



=== importing...

2025-02-21 21:47:36.226780: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-21 21:47:36.248152: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-21 21:47:36.254720: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
/usr/local/lib/python3.10/dist-packages

=== importing done



=== extracting audio from video

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0

# Experiment 1 (Test)

In [None]:
if not run_all_experiments_mode:
    experiment = experiments[2]
    
    !python multimodal_video_summarization.py \
    --experiment_mode \
    --skip_nlp_downloads \
    --video_input {experiment} \
    --auto_summary_summary_length_percentage {AUTO_SUMMARY_SUMMARY_LENGTH_PERCENTAGE} \
    --auto_summary_min_summary_length {AUTO_SUMMARY_MIN_SUMMARY_LENGTH} \
    --auto_summary_max_summary_length {AUTO_SUMMARY_MAX_SUMMARY_LENGTH} \
    --deletion_metric_threshold {DELETION_METRIC_THRESHOLD} \
    --metric_1_model_size {METRIC_1_MODEL_SIZE} \
    --metric_1_weight {METRIC_1_WEIGHT} \
    --metric_2_weight {METRIC_2_WEIGHT} \
    --metric_2_min_scene_len {METRIC_2_MIN_SCENE_LEN} \
    --metric_2_threshold {METRIC_2_THRESHOLD}
    
    # !python multimodal_video_summarization.py --experiment_mode --skip_nlp_downloads --video_input {experiment}

# Export

In [None]:
# Name of the export folder, created under project_dir.
# Leave blank to use the default name, which is built from METRIC_1_WEIGHT,
# METRIC_2_WEIGHT and DELETION_METRIC_THRESHOLD.
custom_experiment_name = ""

In [None]:
# Resolve the export folder: fall back to a hyperparameter-derived name when no
# custom name was given.
if len(custom_experiment_name) == 0:
    experiment_name = f"experiment_m1w-{METRIC_1_WEIGHT}_m2w-{METRIC_2_WEIGHT}_mdt-{DELETION_METRIC_THRESHOLD}"
    export_dir = os.path.join(project_dir, experiment_name)
else:
    export_dir = os.path.join(project_dir, custom_experiment_name)

# Never overwrite a previous export; each experiment needs its own folder name.
if os.path.exists(export_dir):
    print(f"The directory {export_dir} already exists.\n\nPlease rename the experiment.")
else:
    os.makedirs(export_dir, exist_ok=False)

    # Only the .csv and .json files from the dataset folder are exported
    result_files = [entry for entry in os.listdir(dataset_dir) if entry.endswith((".csv", ".json"))]
    for entry in result_files:
        source_path = os.path.join(dataset_dir, entry)
        target_path = os.path.join(export_dir, entry)
        shutil.copy(source_path, target_path)

    # Produces <export_dir>.zip next to the export folder
    shutil.make_archive(export_dir, 'zip', export_dir)
    print(f"zip ready: {export_dir}.zip")