<a href="https://colab.research.google.com/github/Dada-Tech/multimodal-video-trimming/blob/main/MultiModal_Video_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuration

In [1]:
import os
import subprocess

# Run-time switches.
#   notebook_mode: "auto" to detect Google Colab, or an explicit True / False.
#   dev_mode: turns on the extra output emitted through dev_mode_print.
notebook_mode = "auto"
dev_mode = False

# Resolve "auto": the COLAB_GPU environment variable marks a Colab instance.
if notebook_mode == "auto":
  notebook_mode = 'COLAB_GPU' in os.environ
print(f"""Notebook Mode: {notebook_mode}""")

Notebook Mode: True


In [2]:
def print_section(message, separator_length=40, separator_char="="):
  """Prints a three-line section banner with a custom message embedded.

  The banner is a full-width rule, the message centred between separator
  characters, and a second rule followed by a blank line.

  Args:
    message: The message to embed within the separator.
    separator_length: Total width of each banner line (default 40).
    separator_char: Character used to draw the banner (default "=").

  A message that does not fit is cut off so the message line never exceeds
  `separator_length` characters.
  """
  # Separator line
  rule = separator_char * separator_length

  # Message line: the message wrapped in single spaces, then padded on both
  # sides. When the padding is odd the extra character goes on the right, so
  # the line is exactly `separator_length` wide for any width.
  inner = " " + message + " "
  pad_total = max(separator_length - len(inner), 0)
  pad_left = pad_total // 2
  pad_right = pad_total - pad_left
  message_line = separator_char * pad_left + inner + separator_char * pad_right
  message_line = message_line[:separator_length]

  print(rule)
  print(message_line)
  print(rule,'\n')


def print_info(message, preview = None, max_length=80):
  """Prints a "=== message" status line, optionally followed by a preview.

  Args:
    message: Text shown after the "===" marker.
    preview: Optional text to preview; skipped when empty or None.
    max_length: Longest preview shown in full; anything longer is cut to
      this many characters and suffixed with "...".
  """
  print(f"""\n=== {message}\n""")
  if not preview:
    return

  print("--- Preview:")
  if len(preview) > max_length:
    shown = preview[:max_length] + "..."
  else:
    shown = preview
  print(shown)


def notebook_mode_print(message_or_df):
  """Shows a value only when the module-level `notebook_mode` flag is truthy.

  DataFrames go through `display` for rich rendering; anything else is
  printed as plain text.
  """
  if not notebook_mode:
    return

  if isinstance(message_or_df, pd.DataFrame):
    display(message_or_df)
  else:
    print(message_or_df)

def dev_mode_print(message):
  """Prints `message` only while the module-level `dev_mode` flag is truthy."""
  if not dev_mode:
    return
  print(message)

# Installation & Setup

In [3]:
if notebook_mode:
    import sys

    print_section("installing deps")

    # download requirements.txt from repository
    # -f makes curl exit non-zero on an HTTP error (e.g. 404); without it the
    # error page would be saved as requirements.txt and check=True would pass.
    subprocess.run(["curl", "-f", "-O", "https://raw.githubusercontent.com/Dada-Tech/multimodal-video-trimming/main/requirements.txt"], check=True)

    # Install with the interpreter that runs this kernel so the packages land
    # in the environment the following cells import from, rather than
    # whichever `python` happens to be first on PATH.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', '-r', 'requirements.txt'])

    print_info("installation done.")
else:
    print_info("skipping installation")



=== installation done.



In [4]:
from pydantic import BaseModel, validator, conint, confloat, ValidationError
from enum import Enum
import argparse

class AutoSummary(BaseModel):
    """Validated length settings for the automatic paragraph summary."""
    # Target summary length as a fraction of the input length; must be in [0.2, 0.5].
    summary_length_percentage: confloat(ge=0.2, le=0.5)
    # Lower bound applied to the summary length; must be in [30, 60].
    min_summary_length: conint(ge=30, le=60)
    # Upper bound applied to the summary length; must be in [100, 1000].
    max_summary_length: conint(ge=100, le=1000)

class DeletionMetric(BaseModel):
    """Validated settings for the sentence deletion step."""
    # Score cut-off in [0.2, 0.5]; the deletion step keeps sentences whose
    # final score is >= threshold and marks the rest for deletion.
    threshold: confloat(ge=0.2, le=0.5)

class Metric1(BaseModel):
    """Validated settings for metric 1 (sentence vs. summarized-paragraph relevancy)."""
    # Size tag substituted into the Longformer checkpoint name
    # 'allenai/longformer-{model_size}-4096'. The CLI restricts it to
    # "base"/"large"; this model itself accepts any string.
    model_size: str

class Hyperparameters(BaseModel):
    """Top-level schema used to validate the nested `hyperparameters` dict."""
    # Summary length settings.
    auto_summary: AutoSummary
    # Deletion threshold settings.
    deletion_metric: DeletionMetric
    # Metric 1 model settings.
    metric_1: Metric1

# Inputs & Hyperparameters

### Auto Summary

*   **`summary_length_percentage`**: 0.3
    *   Determines the target length of the summary as a percentage of the original text length.
*   **`min_summary_length`**: 30
    *   Sets the minimum number of words (or tokens) allowed in the summary.
*   **`max_summary_length`**: 600
    *   Sets the maximum number of words (or tokens) allowed in the summary.

### Deletion Metric

*   **`threshold`**: 0.3
    *   The minimum relevance score a sentence needs to have to be included in the final output. Sentences below this threshold are considered for removal.

In [5]:
if not notebook_mode:
  # Script mode: every input and hyperparameter comes from the command line.
  parser = argparse.ArgumentParser(description="Process video and hyperparameters.")

  # Inputs
  parser.add_argument("--video_export_max_length_seconds", type=int, default=0, help="Maximum length of the video to export (in seconds)")
  parser.add_argument("--video_input", "-i", type=str, required=True, help="Path to the video input file")
  # parser.add_argument("--video_output", "-o", type=str, default=None, help="Optional path to save the output video")

  # Hyperparameters, one flag per value
  parser.add_argument("--auto_summary_summary_length_percentage", type=float, default=0.3, help="Summary length as a percentage")
  parser.add_argument("--auto_summary_min_summary_length", type=int, default=30, help="Minimum summary length")
  parser.add_argument("--auto_summary_max_summary_length", type=int, default=600, help="Maximum summary length")
  parser.add_argument("--deletion_metric_threshold", type=float, default=0.3, help="Threshold for deletion metric")
  parser.add_argument("--metric_1_model_size", type=str, choices=["base", "large"], default="base", help="Model size for metric 1")

  args = parser.parse_args()

  # Copy the parsed values into the names the rest of the notebook reads.
  video_input = args.video_input
  video_export_max_length_seconds = args.video_export_max_length_seconds

  hyperparameters = {
      "auto_summary": {
          "summary_length_percentage": args.auto_summary_summary_length_percentage,
          "min_summary_length": args.auto_summary_min_summary_length,
          "max_summary_length": args.auto_summary_max_summary_length,
      },
      "deletion_metric": {"threshold": args.deletion_metric_threshold},
      "metric_1": {"model_size": args.metric_1_model_size},
  }

else:
  # Notebook mode: inputs are edited directly in this cell.
  video_input = "dataset/teamwork in the classroom.mov"
  # video_output = "dataset/teamwork in the classroom_skimmed.mov"

  # set develop video max length to export a shortened version of the multimedia
  video_export_max_length_seconds = 0

  # original was max_length=150, min_length=30
  hyperparameters = {
      "auto_summary": {
          "summary_length_percentage": 0.3,
          "min_summary_length": 30,
          "max_summary_length": 600,
      },
      "deletion_metric": {"threshold": 0.3},
      "metric_1": {"model_size": "base"},
  }



# Validate Hyperparameters
try:
    # Raises ValidationError when a value is missing, mistyped or out of range.
    validated_hyperparameters = Hyperparameters(**hyperparameters)
except ValidationError as e:
    print(f"Hyperparameter validation error: {e}")

    print_info("exiting...")
    # Exit through SystemExit rather than os._exit: os._exit skips stdio
    # flushing (the messages above can be lost when stdout is piped) and
    # terminates a notebook kernel outright instead of stopping the cell.
    raise SystemExit(1)

# Imports



In [6]:
print_info("importing...")

import os
import numpy as np
import pandas as pd
import tarfile
import gdown
import re
from functools import reduce
import subprocess

# ML General
from datasets import load_dataset
import torch
import torchaudio
import torch.nn.functional as F
from transformers import \
LongformerTokenizer, LongformerModel, LongformerForSequenceClassification, LongformerConfig, \
RobertaTokenizer, RobertaForTokenClassification, TrainingArguments, \
LEDTokenizer, LEDForConditionalGeneration

# Text
import pytextrank
import nltk
from nltk.tokenize import sent_tokenize
import spacy
import srt

# Audio
import whisperx
import silero_vad
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from pydub import AudioSegment

# Video
import ffmpeg

print_info("importing done")


=== importing...

/usr/local/lib/python3.10/dist-packages

=== importing done



In [7]:
print_info("downloading NLTK libraries...")

# Fetch the NLTK tokenizer packages used by this notebook.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

print_info("downloading done")


=== downloading NLTK libraries...



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



=== downloading done



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Variables

In [8]:
# Break the input path into directory, file name, stem and extension.
full_base, filename = os.path.split(video_input)
path_dataset = full_base
filename_without_extension, filename_video_extension = os.path.splitext(filename)

# File names of the input and of every derived artefact.
filename_video_input = filename
filename_subtitles_output = f"{filename_without_extension}.srt"
filename_audio_output = f"{filename_without_extension}.wav"
filename_audio_output_skimmed = f"{filename_without_extension}_skimmed.wav"
filename_video_output_skimmed = f"{filename_without_extension}_skimmed{filename_video_extension}"

# Full paths: every derived artefact lives next to the input video.
subtitles_output = os.path.join(full_base, filename_subtitles_output)
audio_output = os.path.join(full_base, filename_audio_output)
audio_output_skimmed = os.path.join(full_base, filename_audio_output_skimmed)
video_output_skimmed = os.path.join(full_base, filename_video_output_skimmed)

# Placeholders that later cells fill in.
video = audio = subtitles = sentences = ''

# Functions

In [9]:
def drop_if_exists(df, col_name):
  """Removes a column from a DataFrame in place; a missing column is ignored.

  Args:
    df: The pandas DataFrame to modify.
    col_name: The name of the column to drop.
  """
  if col_name not in df.columns:
    return
  df.drop(columns=col_name, inplace=True)

# Datasets

- teamwork in the classroom.mov - `190MB`
- flipped learning basics.mov - `380MB`
- assessing students without exams.mov - `830MB`

In [10]:
if notebook_mode:
  from google.colab import files

  # Google Drive folder holding the dataset. The id is the last path segment
  # of the folder URL: drive.google.com/drive/u/1/folders/1k7D...(this part)
  folder_id = '1k7DLJPl1xz9lpU4l3dZYtPe1XawhrXeC'
  gdown.download_folder(
      id=folder_id,
      use_cookies=False,
      quiet=False,
  )

Retrieving folder contents


Processing file 1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt teamwork in the classroom.mov
Processing file 1ZsA-X-HsSk0WJiZaGcHIC3t17DchdXIX teamwork in the classroom.srt
Processing file 1lb2rCvjouElVhqEHAyojUOvb_ECgUqjF teamwork in the classroom.wav


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt
From (redirected): https://drive.google.com/uc?id=1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt&confirm=t&uuid=de84784c-1f5d-40a0-8f6a-09743aeebab5
To: /content/dataset/teamwork in the classroom.mov
100%|██████████| 202M/202M [00:05<00:00, 37.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZsA-X-HsSk0WJiZaGcHIC3t17DchdXIX
To: /content/dataset/teamwork in the classroom.srt
100%|██████████| 7.73k/7.73k [00:00<00:00, 10.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1lb2rCvjouElVhqEHAyojUOvb_ECgUqjF
To: /content/dataset/teamwork in the classroom.wav
100%|██████████| 62.4M/62.4M [00:01<00:00, 60.7MB/s]
Download completed


# Preprocessing

In [None]:
# Banner marking the start of the preprocessing stage in the run output.
print_section("Preprocessing")




## Audio - Extract

In [None]:
# Extract audio (wav) from video.
# Shell equivalent:
#   ffmpeg -y -i "$video_input" -vn -acodec pcm_s16le -ar 44100 -ac 2 "$audio_output"
# check=True raises CalledProcessError when ffmpeg exits with a non-zero status.
print_info("extracting audio from video")

subprocess.run(
    [
        'ffmpeg', '-y',
        '-i', video_input,
        '-vn',
        '-acodec', 'pcm_s16le',
        '-ar', '44100',
        '-ac', '2',
        audio_output,
    ],
    check=True,
)


=== extracting audio from video



CompletedProcess(args=['ffmpeg', '-y', '-i', 'dataset/teamwork in the classroom.mov', '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '2', 'dataset/teamwork in the classroom.wav'], returncode=0)

## Audio - SRT File Generation

##### Time Taken: ~4min

SRT  
each **`subtitle`** in the subtitles array has the following properties:

1. **`index`**
   - The sequential number of the subtitle within the SRT file.
   - `1`, `2`, `3`, etc. (Integer)
2. **`start`**
   - The time (with millisecond precision) at which the subtitle should appear on the screen.
   - `00:00:05,000` (String representing HH:MM:SS,SSS)
3. **`end`**
   - The time (with millisecond precision) at which the subtitle should disappear from the screen.
   - `00:00:10,000` (String representing HH:MM:SS,SSS)
4. **`content`**
   - The actual text of the subtitle that will be displayed.
   - "Hello, world!" (String)
5. **`proprietary`**
   - This field holds any additional data or formatting specific to the SRT file or software used to create it. Often empty and can usually be ignored.
   - `''` (Empty string, or sometimes contains specific formatting codes)

In [None]:
def seconds_to_srt_timestamp(seconds):
    """
    Convert a duration in seconds to an SRT timestamp string.

    Args:
        seconds: Non-negative number of seconds (int or float). Negative
            values are clamped to zero, since SRT timestamps cannot be negative.

    Returns:
        The timestamp formatted as HH:MM:SS,mmm, rounded to the nearest
        millisecond.
    """
    # Convert to whole milliseconds once, then split with integer arithmetic.
    # Truncating the fractional part separately loses a millisecond to float
    # error (e.g. 1.001 -> ",000") and cannot carry 59.9996 s into the next
    # minute.
    total_ms = max(int(round(float(seconds) * 1000)), 0)

    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    # Format as HH:MM:SS,MS
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"

In [None]:
# Select device (GPU if available, otherwise CPU)
language="en"

from multiprocessing import Queue

# GPU
if torch.cuda.is_available():
  device = "cuda"
  compute_type = "float16"
  batch_size = 16
  model_whisperx = "base"

  print_info(f"""Generating SRT File with {device}...""")
else:
  # CPU fallback: smaller model, int8 compute and batch size 1.
  device = "cpu"
  compute_type = "int8"
  batch_size = 1
  model_whisperx = "tiny"

  # NOTE(review): this queue is not referenced anywhere else in the visible
  # code; confirm whether it is still needed.
  queue = Queue(maxsize=200)

  print_info(f"""WARNING: Generating SRT File with {device}...""")


# Model WhisperX
model = whisperx.load_model(model_whisperx, device=device, language=language, compute_type=compute_type) # Choose "base" or "large" model

# Transcribe audio
aligned_segments = model.transcribe(audio_output, batch_size=batch_size)

# Align with forced alignment (the name is rebound to the aligned result)
alignment_model, metadata = whisperx.load_align_model(language_code=aligned_segments["language"], device=device)
aligned_segments = whisperx.align(aligned_segments["segments"], alignment_model, metadata, audio_output, device)

# Generate SRT file with aligned sentences.
# Written as UTF-8 explicitly because the file is read back as UTF-8 later;
# the platform default encoding could otherwise corrupt non-ASCII text.
with open(subtitles_output, "w", encoding="utf-8") as f:
    for i, segment in enumerate(aligned_segments["segments"], 1):
        # Get start and end times in SRT format
        start_time = seconds_to_srt_timestamp(segment["start"])
        end_time = seconds_to_srt_timestamp(segment["end"])

        # Write SRT entry: index, time range, text, blank separator line
        f.write(f"{i}\n{start_time} --> {end_time}\n{segment['text']}\n\n")

print_info("SRT file generated", subtitles_output)


=== Generating SRT File with cuda...



INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../usr/local/lib/python3.10/dist-packages/whisperx/assets/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


## Text - Load SRT File

In [11]:
# Subtitles: parse the SRT file into a list of subtitle entries
with open(subtitles_output, mode="r", encoding="utf-8") as srt_file:
    subtitles = [*srt.parse(srt_file.read())]

## Text - Sentence Segmentation

In [12]:
def format_timedelta(timedelta_obj):
    """Formats a datetime.timedelta object into HH:MM:SS.mmm timestamp.

    Args:
        timedelta_obj: The datetime.timedelta object.

    Returns:
        A string representing the timestamp in HH:MM:SS.mmm format.
        Sub-millisecond precision is truncated; durations of a day or more
        keep counting hours past 23.
    """
    # Use the timedelta's integer fields instead of total_seconds(): the float
    # round-trip truncates to the wrong millisecond for some values
    # (1.001 s would format as ".000").
    total_ms = (
        (timedelta_obj.days * 86400 + timedelta_obj.seconds) * 1000
        + timedelta_obj.microseconds // 1000
    )

    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

In [13]:
# One row per subtitle: position in the SRT file, formatted start/end times
# and the subtitle text.
df_sentences = pd.DataFrame([
    {
        'base_idx': idx,
        'start_time': format_timedelta(subtitle.start),
        'end_time': format_timedelta(subtitle.end),
        'sentence': subtitle.content,
    }
    for idx, subtitle in enumerate(subtitles)
])

# Plain list of sentence strings used by the following cells.
sentences = df_sentences['sentence'].tolist()

notebook_mode_print(df_sentences)

Unnamed: 0,base_idx,start_time,end_time,sentence
0,0,00:00:01.274,00:00:08.685,"Hello, this is Lino Cordia and today I want t..."
1,1,00:00:11.057,00:00:13.319,So why is teamwork in the classroom so import...
2,2,00:00:13.359,00:00:17.480,"Well, for our students, it allows them to deve..."
3,3,00:00:17.699,00:00:20.059,"Communication skills, leadership skills, et ce..."
4,4,00:00:20.620,00:00:23.861,"Also, when you're working with a team, you get..."
...,...,...,...,...
69,69,00:05:46.624,00:05:48.245,"And yeah, please reach out."
70,70,00:05:48.384,00:05:49.406,I would love to hear from you.
71,71,00:05:49.526,00:05:50.266,Thank you so much.
72,72,00:05:50.547,00:05:51.487,I am Lino Cordia.


## Text - Paragraph
combination of all subtitle parts.  

WhisperX adds basic punctuation to the transcription.

In [14]:
# Concatenate all sentences, each stripped and followed by a single space
# (so the paragraph ends with a trailing space).
paragraph = ''.join(sentence.strip() + ' ' for sentence in sentences)

# Print the paragraph
notebook_mode_print(paragraph)
print_info("paragraph sample", paragraph)

Hello, this is Lino Cordia and today I want to talk to you about a very important topic, challenging topic, teamwork in the classroom. So why is teamwork in the classroom so important? Well, for our students, it allows them to develop a bunch of new skills, right? Communication skills, leadership skills, et cetera. Also, when you're working with a team, you get different perspectives. Ideally, you are part of a team that has people with different genders, people with different age groups, people with different academic backgrounds, right? So when you're talking to them, you get all these fresh perspectives that inform your the task that you're trying to solve. Also teams will motivate you, they will support you, you will feel empowered by them ideally, right? This is like the things that should happen. And also this is how the world works, right? Pretty much everything we do, we need to do it as a team. Now here's the thing about teamwork in the classroom. Students hate it. But why do 

## Text - Paragraph Summarized

##### Time Taken: ~1min

In [15]:
# Status line announcing the start of the paragraph summarization step.
print_info("Summarizing Paragraph")


=== Summarizing Paragraph



In [16]:
# Model: Longformer Encoder-Decoder (LED)
model_name = "allenai/led-base-16384"
tokenizer = LEDTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)
text = paragraph

# Tokenization: one sequence, cut off at 4096 tokens
inputs = tokenizer(
    text,
    truncation=True,
    max_length=4096,
    return_tensors="pt",
)

# Summary length settings
max_summary_length = hyperparameters["auto_summary"]["max_summary_length"]
min_summary_length = hyperparameters["auto_summary"]["min_summary_length"]
summary_length_percentage = hyperparameters["auto_summary"]["summary_length_percentage"]

# Dynamic summary length: a percentage of the input token count, clamped to
# the configured [min, max] range.
input_length = inputs["input_ids"].shape[1]
summary_length = int(input_length * summary_length_percentage)
if summary_length > max_summary_length:
    summary_length = max_summary_length
if summary_length < min_summary_length:
    summary_length = min_summary_length

# Summary Generation (beam search)
summary_ids = model.generate(
    inputs["input_ids"],
    min_length=min_summary_length,
    max_length=summary_length,
    num_beams=4,
    length_penalty=1.2,
    early_stopping=True,
)

# Decode the first (only) generated sequence back to text.
paragraph_summarized = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print_info("paragraph summarized", paragraph_summarized)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Input ids are automatically padded from 1141 to 2048 to be a multiple of `config.attention_window`: 1024



=== paragraph summarized

--- Preview:
Hello, this is Lino Cordia and today I want to talk to you about a very importan...


In [17]:
# Simple Metrics
print_info("Simple Metrics")

# Lengths are character counts of the two strings, not token counts.
original_length = len(paragraph)
summary_length = len(paragraph_summarized)

print(f"original length: {original_length}")
print(f"summary length: {summary_length}")

# Fraction of the original text removed by summarization, i.e.
# 1 - summary/original. The label states this explicitly: the value is the
# reduction, not the summary/original ratio. An empty paragraph yields 0.0
# instead of a ZeroDivisionError.
summarization_ratio = (original_length - summary_length) / original_length if original_length else 0.0
print(f"Length Reduction Ratio (1 - Summarized/Original): {summarization_ratio:.2f}")


=== Simple Metrics

original length: 5209
summary length: 1555
Summarized/Original Length Ratio: 0.70


# Text

## Metric 1: Sentence-Summarized Paragraph Relevancy (Cosine Similarity)

##### Time Taken: ~2min

In [18]:
# Banner marking the start of the Metric 1 stage in the run output.
print_section("Metric 1: Sentence-Summarized Relevancy")

 Metric 1: Sentence-Summarized Relevancy



In [19]:
# config
attention_window = 256
model_size = hyperparameters["metric_1"]["model_size"]
model_name_lf = f'allenai/longformer-{model_size}-4096'
config = LongformerConfig.from_pretrained(
    model_name_lf,
    attention_window=attention_window,
)

# model: Longformer encoder and its tokenizer
# NOTE(review): model_max_length is set to the attention window (256). The
# summarized paragraph is later tokenized without truncation and can exceed
# it, which produces the "Token indices sequence length is longer than the
# specified maximum sequence length" warning. Confirm 256 is intended here.
model_lf = LongformerModel.from_pretrained(model_name_lf, config=config)
tokenizer_lf = LongformerTokenizer.from_pretrained(
    model_name_lf,
    model_max_length=attention_window,
)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [20]:
# 2: Tokenization
# The summarized paragraph is encoded as one sequence; no truncation is requested.
paragraph_tokens = tokenizer_lf(paragraph_summarized, return_tensors='pt')

# All sentences are encoded together as one padded, truncated batch
# (instead of one tokenizer call per sentence).
sentence_tokens = tokenizer_lf(
    sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (342 > 256). Running this sequence through the model will result in indexing errors


In [21]:
# 3: Embedding
# Gradients are not needed for inference, so they are disabled.
with torch.no_grad():
    # Hidden state at position 0 (the [CLS] token) of the summarized paragraph
    paragraph_embedding = model_lf(**paragraph_tokens).last_hidden_state[:, 0]

    # Same position for every sentence in the batch
    sentence_embeddings = model_lf(**sentence_tokens).last_hidden_state[:, 0]

Input ids are automatically padded from 342 to 512 to be a multiple of `config.attention_window`: 256
Input ids are automatically padded from 40 to 256 to be a multiple of `config.attention_window`: 256


Embedding Explanation  
The [CLS] (classification) token is often used in transformer models to represent the overall meaning or summary of the input sequence. By extracting its embedding, you're essentially obtaining a representation that captures the main point or essence of the paragraph.

In [22]:
# 4: Relevance scores
# Cosine similarity between the summarized-paragraph embedding and each sentence embedding.
relevance_scores = [torch.cosine_similarity(paragraph_embedding, sentence_embedding).item() for sentence_embedding in sentence_embeddings]

# Normalization: min-max normalization
if relevance_scores:
    min_score = min(relevance_scores)
    max_score = max(relevance_scores)
    score_range = max_score - min_score
    if score_range > 0:
        normalized_scores = [(score - min_score) / score_range for score in relevance_scores]
    else:
        # All scores are identical (e.g. a single sentence): there is nothing
        # to rank, so every sentence gets the top score instead of the
        # normalization dividing by zero.
        normalized_scores = [1.0 for _ in relevance_scores]
else:
    # No sentences to score.
    normalized_scores = []

# round: two significant digits, stored as strings
normalized_scores = [np.format_float_positional(score, precision=2, unique=False, fractional=False, trim='k') for score in normalized_scores]

In [23]:
# 5: Display Results
# Drop any column left by a previous run of this cell, then put the scores
# in the left-most position.
drop_if_exists(df_sentences, "metric_1_score")
df_sentences.insert(loc=0, column="metric_1_score", value=normalized_scores)

notebook_mode_print(df_sentences)

Unnamed: 0,metric_1_score,base_idx,start_time,end_time,sentence
0,1.0,0,00:00:01.274,00:00:08.685,"Hello, this is Lino Cordia and today I want t..."
1,0.46,1,00:00:11.057,00:00:13.319,So why is teamwork in the classroom so import...
2,0.58,2,00:00:13.359,00:00:17.480,"Well, for our students, it allows them to deve..."
3,0.48,3,00:00:17.699,00:00:20.059,"Communication skills, leadership skills, et ce..."
4,0.53,4,00:00:20.620,00:00:23.861,"Also, when you're working with a team, you get..."
...,...,...,...,...,...
69,0.38,69,00:05:46.624,00:05:48.245,"And yeah, please reach out."
70,0.32,70,00:05:48.384,00:05:49.406,I would love to hear from you.
71,0.50,71,00:05:49.526,00:05:50.266,Thank you so much.
72,0.23,72,00:05:50.547,00:05:51.487,I am Lino Cordia.


In [24]:
# Interactive Sheet for easy exporting
# from google.colab import sheets
# sheet = sheets.InteractiveSheet(df=df_sentences)

## Metric 2: Inter-sentence relevancy
Score each sentence by whether it is needed by its adjacent sentences.

In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer

# # Load pre-trained model and tokenizer
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Store predictions for each sentence
# predictions = []

# # Iterate through sentence pairs
# for i in range(len(sentences) - 1):
#     sentence1 = sentences[i]
#     sentence2 = sentences[i + 1]

#     # Tokenize and prepare input
#     inputs = tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding=True, add_special_tokens=True)

#     # Get model prediction
#     outputs = model(**inputs)
#     prediction = torch.argmax(outputs.logits).item()

#     # Store prediction
#     predictions.append(prediction)

# # Handle last sentence (no next sentence)
# predictions.append(0)  # Assume last sentence doesn't need a next sentence

In [None]:
# # Add predictions to DataFrame
# df_sentences = df_sentences.assign(**{"Previous Sentence Needed": predictions})

# display(df_sentences)

## Metric 3: Intelligent Sentence-Paragraph Relevancy

##### Time Taken: 13min - 26min

In [None]:
# tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
# model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

# # Ensure the model is in evaluation mode
# model.eval()

# # Example usage
# body_paragraph = paragraph

# relevance_scores = []

# for sentence in sentences:
#     # Prepare the input for Longformer
#     inputs = tokenizer(
#         body_paragraph,
#         sentence,
#         return_tensors='pt',
#         max_length=4096,
#         truncation=True,
#         padding='max_length'  # Pad to max length to avoid issues with model input size
#     )

#     # Get model predictions
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Assuming binary classification (relevant/not relevant)
#     relevance_score = torch.softmax(outputs.logits, dim=1)[0][1].item()  # Probability of being relevant
#     relevance_scores.append((sentence, relevance_score))

# # Sort sentences based on relevance scores
# sorted_sentences = sorted(relevance_scores, key=lambda x: x[1], reverse=True)
# ranked_sentences = [sentence for sentence, score in sorted_sentences]

In [None]:
# relevance_scores[0]

In [None]:
# sentence_indices = list(range(len(relevance_scores)))
# scores = [score for sentence, score in relevance_scores]
# sentences_text = [sentence for sentence, score in relevance_scores]

# df_relevance = pd.DataFrame({'Sentence Index': sentence_indices, 'Score': scores, 'Sentence': sentences_text})
# df_relevance

## Metric 4: Keyword extraction and Ranking
using TextRank

In [None]:
# # Load a spaCy model
# nlp = spacy.load("en_core_web_sm")

# # Add the pytextrank pipeline component to spaCy
# nlp.add_pipe("textrank")

# phrase_data = []

# # Process the text
# doc = nlp(paragraph)

# for phrase in doc._.phrases:
#   phrase_data.append([phrase.text, phrase.rank, phrase.count])

# df_phrases = pd.DataFrame(phrase_data, columns=['Phrase', 'Rank', 'Count'])
# df_phrases.sort_values(by=['Rank'], ascending=False, inplace=True)

# display(df_phrases)

## Metric 5: Question-Answering Evaluation

In [30]:
# from transformers import pipeline
# from sentence_transformers import SentenceTransformer, util


# # Load the question-answering pipeline
# # qa_pipeline = pipeline(
# #     "question-answering",
# #     model="distilbert-base-cased-distilled-squad"
# #     # model="valhalla/longformer-base-4096-finetuned-squadv1"
# #   )

# question_generation_pipeline = pipeline("text2text-generation", model="t5-small")
# # question_generation_pipeline = pipeline("text2text-generation", model="facebook/bart-large")
# # question_generation_pipeline = pipeline("text2text-generation", model="valhalla/t5-base-qa-qg-hl")

Device set to use cuda:0


In [34]:
# unsummarized_text = """
# Albert Einstein was a theoretical physicist born in Germany. He developed the theory of relativity,
# one of the two pillars of modern physics. He won the Nobel Prize in Physics in 1921.
# """

In [43]:
# # Prompt BART to generate questions
# prompt = f"translate english to french: {unsummarized_text}"

# # Generate questions
# # generated_questions = question_generation_pipeline(prompt)
# generated_questions = question_generation_pipeline(
#     prompt,
#     max_length=50,           # Maximum length of generated text (in tokens)
#     num_beams=5,             # Enable beam search with 5 beams
#     num_return_sequences=1)  # Generate 3 different sequences

# generated_questions
# # Print the generated questions
# print(generated_questions)

[{'generated_text': "Albert Einstein, physicien théorique né en Allemagne, a développé la théorie de la relativité, l'un des deux piliers de la physique moderne, et "}]


In [None]:
# answers = {}

# for question in questions:
#     result = qa_pipeline(question=question, context=text)
#     if result['score'] >= threshold:
#         answers[question] = result  # Store the entire answer object
#     else:
#         answers[question] = {"answer": "I don't know", "score": result['score']}

In [None]:
# # Display generated questions and answers
# print("Generated Questions and Answers from Unsummarized Text:")
# for q, a in answers_unsummarized.items():
#     print(f"Q: {q}\nA: {a}\n")

# Audio

## Metric 6: Silence Detection
* From the paragraph boundaries, get the times in the audio that we care about
* For each time in audio we care about, analyze if they are low volume

OR
* analyze all potential sentence boundaries first
* match with end of sentences

In [None]:
# # 0: Load audio, extract timestamps

# SAMPLING_RATE = 16000 # 16 kHz

# model = load_silero_vad()
# wav = read_audio(audio_output)
# speech_timestamps = get_speech_timestamps(wav, model)

# # Check the shape of the wav tensor
# print(f"Audio shape: {wav.shape}")
# print(f"Audio length (seconds): {len(wav) / SAMPLING_RATE:.2f}")

Audio shape: torch.Size([5657259])
Audio length (seconds): 353.58


In [None]:
# # Speech Intervals
# speech_intervals = []
# for i in range(0, len(speech_timestamps)-1):
#     speech_intervals.append((speech_timestamps[i]['start'] / SAMPLING_RATE, speech_timestamps[i]['end'] / SAMPLING_RATE))

# # Silence Intervals
# silence_intervals = []
# for i in range(1, len(speech_timestamps)):
#     silence_start = speech_timestamps[i-1]['end']  # End of previous speech segment
#     silence_end = speech_timestamps[i]['start']     # Start of current speech segment
#     silence_intervals.append((silence_start / SAMPLING_RATE, silence_end / SAMPLING_RATE))

In [None]:
# notebook_mode_print(speech_timestamps[0:3])
# notebook_mode_print(speech_intervals[0:3])
# notebook_mode_print(silence_intervals[0:3])

[{'start': 20000, 'end': 113120}, {'start': 116768, 'end': 140256}, {'start': 176672, 'end': 219104}]
[(1.25, 7.07), (7.298, 8.766), (11.042, 13.694)]
[(7.07, 7.298), (8.766, 11.042), (13.694, 14.05)]


# Video

# Final Score - Metric Weighting

In [None]:
# Add Final Metric Column (re-created on every run, initialised to 1)
drop_if_exists(df_sentences, "metric_final")
df_sentences.insert(loc=0, column="metric_final", value=1)

# Metric 1 Apply: weight 1, scores converted from their string form to float
metric_1_weight = 1
df_sentences['metric_final'] = (metric_1_weight * df_sentences['metric_final']).mul(
    df_sentences['metric_1_score'].astype(float)
)


notebook_mode_print(df_sentences)

Unnamed: 0,metric_final,metric_1_score,base_idx,start_time,end_time,sentence
0,1.00,1.0,0,00:00:01.274,00:00:08.685,"Hello, this is Lino Cordia and today I want t..."
1,0.46,0.46,1,00:00:11.057,00:00:13.319,So why is teamwork in the classroom so import...
2,0.58,0.58,2,00:00:13.359,00:00:17.480,"Well, for our students, it allows them to deve..."
3,0.48,0.48,3,00:00:17.699,00:00:20.059,"Communication skills, leadership skills, et ce..."
4,0.53,0.53,4,00:00:20.620,00:00:23.861,"Also, when you're working with a team, you get..."
...,...,...,...,...,...,...
69,0.38,0.38,69,00:05:46.624,00:05:48.245,"And yeah, please reach out."
70,0.32,0.32,70,00:05:48.384,00:05:49.406,I would love to hear from you.
71,0.50,0.50,71,00:05:49.526,00:05:50.266,Thank you so much.
72,0.23,0.23,72,00:05:50.547,00:05:51.487,I am Lino Cordia.


### Deletion Metric

In [None]:
# Threshold: sentences scoring at or above it are kept, those strictly below
# it are marked for deletion.
threshold = hyperparameters["deletion_metric"]["threshold"]

filtered_df_to_keep = df_sentences.loc[df_sentences["metric_final"] >= threshold]
filtered_df_to_delete = df_sentences.loc[df_sentences["metric_final"] < threshold]
filtered_df = filtered_df_to_keep

# Percentage
# percentage_cutoff = 0.2
# percentile_20 = df_sentences['metric_final'].quantile(0.2)
# filtered_df = df_sentences[df_sentences['metric_final'] <= percentile_20]

# Timestamps: one (start_time, end_time) pair per retained sentence
# sample_timestamps = [('00:00:00.00','00:00:01.25'), ('00:00:08.766', '00:00:11.042')]
sentence_timestamps = [
    (start, end)
    for start, end in zip(filtered_df["start_time"], filtered_df["end_time"])
]

notebook_mode_print(sentence_timestamps)

[('00:00:01.274', '00:00:08.685'), ('00:00:11.057', '00:00:13.319'), ('00:00:13.359', '00:00:17.480'), ('00:00:17.699', '00:00:20.059'), ('00:00:20.620', '00:00:23.861'), ('00:00:24.042', '00:00:34.804'), ('00:00:34.825', '00:00:39.506'), ('00:00:40.426', '00:00:42.225'), ('00:00:43.408', '00:00:49.691'), ('00:00:49.710', '00:00:52.091'), ('00:00:52.551', '00:00:54.213'), ('00:00:54.253', '00:00:57.094'), ('00:00:59.396', '00:01:01.155'), ('00:01:06.628', '00:01:09.087'), ('00:01:17.152', '00:01:24.495'), ('00:01:24.775', '00:01:26.215'), ('00:01:27.956', '00:01:29.918'), ('00:01:30.138', '00:01:36.120'), ('00:01:36.322', '00:01:39.305'), ('00:01:43.046', '00:01:44.629'), ('00:01:44.709', '00:01:48.331'), ('00:01:48.652', '00:01:50.953'), ('00:01:52.013', '00:01:53.795'), ('00:01:53.834', '00:01:57.018'), ('00:01:58.367', '00:02:04.552'), ('00:02:04.611', '00:02:13.900'), ('00:02:14.479', '00:02:17.703'), ('00:02:17.861', '00:02:31.376'), ('00:02:32.137', '00:02:39.818'), ('00:02:40.03

### Text to Keep

In [None]:
# Show score, timing and text of the sentences that survive the threshold.
notebook_mode_print(
    filtered_df_to_keep.loc[:, ["metric_final", "start_time", "end_time", "sentence"]]
)

Unnamed: 0,metric_final,start_time,end_time,sentence
0,1.0,00:00:01.274,00:00:08.685,"Hello, this is Lino Cordia and today I want t..."
1,0.46,00:00:11.057,00:00:13.319,So why is teamwork in the classroom so import...
2,0.58,00:00:13.359,00:00:17.480,"Well, for our students, it allows them to deve..."
3,0.48,00:00:17.699,00:00:20.059,"Communication skills, leadership skills, et ce..."
4,0.53,00:00:20.620,00:00:23.861,"Also, when you're working with a team, you get..."
5,0.64,00:00:24.042,00:00:34.804,"Ideally, you are part of a team that has peopl..."
6,0.61,00:00:34.825,00:00:39.506,"So when you're talking to them, you get all th..."
7,0.43,00:00:40.426,00:00:42.225,the task that you're trying to solve.
8,0.46,00:00:43.408,00:00:49.691,"Also teams will motivate you, they will suppor..."
9,0.38,00:00:49.710,00:00:52.091,This is like the things that should happen.


In [None]:
# Concatenate the retained sentences, in row order, into one space-separated string.
text_to_keep = " ".join(filtered_df_to_keep["sentence"])

notebook_mode_print(text_to_keep)

 Hello, this is Lino Cordia and today I want to talk to you about a very important topic, challenging topic, teamwork in the classroom.  So why is teamwork in the classroom so important? Well, for our students, it allows them to develop a bunch of new skills, right? Communication skills, leadership skills, et cetera. Also, when you're working with a team, you get different perspectives. Ideally, you are part of a team that has people with different genders, people with different age groups, people with different academic backgrounds, right? So when you're talking to them, you get all these fresh perspectives that inform your  the task that you're trying to solve. Also teams will motivate you, they will support you, you will feel empowered by them ideally, right? This is like the things that should happen. And also this is how the world works, right? Pretty much everything we do, we need to do it as a team. Now here's the thing about teamwork in the classroom. Well, these are the things

### Text to Delete

In [None]:
# Show score, timing and text of the sentences that fall below the threshold.
notebook_mode_print(
    filtered_df_to_delete.loc[:, ["metric_final", "start_time", "end_time", "sentence"]]
)

Unnamed: 0,metric_final,start_time,end_time,sentence
13,0.23,00:01:02.356,00:01:03.438,Students hate it.
14,0.066,00:01:05.367,00:01:06.507,But why do they hate it?
16,0.15,00:01:09.769,00:01:13.950,They don't know their teammates or they do kno...
17,0.22,00:01:14.391,00:01:17.111,"They're also concerned about an even workload,..."
20,0.0,00:01:26.254,00:01:27.075,They're concerned because
24,0.29,00:01:40.926,00:01:42.766,So what can we do as professors?
41,0.11,00:03:11.132,00:03:16.193,"And so I approached them and say, why do you g..."
42,0.26,00:03:16.774,00:03:18.734,And their answer was we had dinner together.
46,0.12,00:03:38.043,00:03:46.367,Now when I create teams if it's a short collab...
50,0.22,00:04:08.907,00:04:15.229,I have a final project and I tell them you get...


In [None]:
# Concatenate the removed sentences, in row order, into one space-separated string.
text_to_delete = " ".join(filtered_df_to_delete["sentence"])

notebook_mode_print(text_to_delete)

Students hate it.  But why do they hate it? They don't know their teammates or they do know their teammates, but they don't like them. They're also concerned about an even workload, right? They're concerned because So what can we do as professors? And so I approached them and say, why do you get along so well as a team? And their answer was we had dinner together. Now when I create teams if it's a short collaboration and by this I mean they're going to be working together about 10 minutes every every class I have a final project and I tell them you get to pick whoever you can work with anybody that you want. You need to determine when is a good idea for self-selection or when is a good idea to think of. There's tables with four chairs, right? So make sense that we don't need to disrupt the classroom destroy things, move around furniture too much. I find that it's easy to identify what each member has done.  But what are your strategies? I am Lino Cordia.


# PostProcessing

#### Time Taken: ~1.5min
6min video: ~2min to process, ~30sec to download

In [None]:
# Banner marking the start of the post-processing stage in the cell output.
print_section(message="Postprocessing")




In [None]:
def ts_to_s(timestamp):
    """Converts a timestamp string in HH:MM:SS[.fff] format to seconds.

    The fractional part may have any number of digits and is interpreted as a
    decimal fraction of a second ('.25' is 250 ms, '.274' is 274 ms). It may
    also be omitted entirely ('00:01:02').

    Args:
        timestamp: The timestamp string, e.g. '00:01:02.500'.

    Returns:
        The timestamp in seconds as a float.

    Raises:
        ValueError: If the string does not have exactly three ':'-separated
            fields or a field is not an integer.
    """
    # Strip first so surrounding whitespace cannot inflate the digit count
    # used to scale the fractional part below.
    hours, minutes, seconds_field = timestamp.strip().split(':')
    whole_seconds, _, fraction = seconds_field.partition('.')

    # Scale by the actual number of fractional digits rather than assuming
    # exactly three (milliseconds).
    fractional_seconds = int(fraction) / 10 ** len(fraction) if fraction else 0.0

    return int(hours) * 3600 + int(minutes) * 60 + int(whole_seconds) + fractional_seconds

In [None]:
def skim_video(input_video, output_video, segments_to_retain):
    """
    Skims a video by keeping only the specified segments and dropping the rest.

    FFmpeg is invoked once, re-encoding the input with a `select`/`aselect`
    filter pair that passes only frames/samples whose timestamp falls inside
    one of the retained ranges, then regenerates timestamps so the output
    plays back contiguously.

    Args:
        input_video (str): Path to the input video file.
        output_video (str): Path to the output video file (overwritten if present).
        segments_to_retain (list of tuples): List of tuples where each tuple contains
                                             (start_time, end_time) in seconds to retain.

    Returns:
        None. Progress is reported through print_info; if FFmpeg exits with a
        non-zero status its stderr is printed and no exception is raised.
    """
    segments = list(segments_to_retain)
    if not segments:
        # An empty selection would yield an invalid `select=''` filter, so
        # report it instead of launching FFmpeg.
        print_info("No segments to retain; skipping video processing.")
        return

    print_info("Processing video...")

    # The same time-range expression drives both the video and the audio
    # filter: a sum of between() terms is non-zero inside any retained range.
    select_expression = '+'.join(
        f"between(t,{start},{end})" for start, end in segments
    )

    # os.cpu_count() may return None when the count cannot be determined.
    thread_count = os.cpu_count() or 1

    # Construct the ffmpeg command with the specified filters
    ffmpeg_command = [
        "ffmpeg",
        "-y",
        "-i", input_video,
        "-vf", f"select='{select_expression}',setpts=N/FRAME_RATE/TB",
        "-af", f"aselect='{select_expression}',asetpts=N/SR/TB",
        "-threads", str(thread_count),
        "-preset", "ultrafast",
        output_video
    ]

    # Run the FFmpeg command and capture the output
    result = subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check if FFmpeg finished successfully or if there were errors
    if result.returncode != 0:
        print("FFmpeg Error:")
        # errors="replace" keeps undecodable bytes in the log from raising here
        print(result.stderr.decode(errors="replace"))
    else:
        print_info("Video processed successfully.")

In [None]:
def get_video_length(input_video):
    """Return the duration of a video file in seconds, as reported by ffmpeg.probe.

    Args:
        input_video: Path to the video file to inspect.

    Returns:
        The container-level duration (`format.duration`) as a float.
    """
    probe_options = {
        "v": "error",
        "select_streams": "v:0",
        "show_entries": "format=duration",
    }
    metadata = ffmpeg.probe(input_video, **probe_options)
    duration = metadata['format']['duration']
    return float(duration)

# def get_video_length(video_input):

#     # Run ffmpeg to get video information
#     result = subprocess.run([ffmpeg, '-i', video_input], stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True)

#     # Extract duration from stderr output
#     stderr_output = result.stderr
#     for line in stderr_output.split('\n'):
#         if 'Duration' in line:
#             # Extract the duration: 'Duration: hh:mm:ss.xx'
#             duration_str = line.split(',')[0].split('Duration: ')[1].strip()
#             h, m, s = map(float, duration_str.split(':'))
#             return h * 3600 + m * 60 + s  # Convert to seconds
#     return 0  # Return 0 if duration is not found

In [None]:
def generate_keep_timestamps(timestamps_to_remove, video_length=None):
    """
    Given a list of timestamps to remove from a video, generates the list of timestamps to keep.

    The removal segments may be unsorted and may overlap or nest; they are
    processed in start-time order and the input sequence is not modified.

    Args:
        timestamps_to_remove (list of tuples): List of segments to remove (start_time, end_time) in seconds.
        video_length (float, optional): Total length of the video in seconds. If not provided,
            nothing after the end of the last removal is kept.

    Returns:
        list of tuples: Segments to keep, as (start_time, end_time) in seconds, in ascending order.
        With no removals, this is the whole video when video_length is given, otherwise an empty list.
    """
    timestamps = []

    # End of the region removed so far; the video starts at 0.
    removed_until = 0.0

    # sorted() leaves the caller's list untouched (list.sort() would reorder it).
    for start, end in sorted(timestamps_to_remove):
        # Anything between the removed region and the next removal is kept.
        if start > removed_until:
            timestamps.append((removed_until, start))
        # Use the furthest end seen so far so a removal nested inside an
        # earlier, longer one cannot re-open a gap.
        removed_until = max(removed_until, end)

    # If there is time left after the last removal, keep it
    if video_length is not None and removed_until < video_length:
        timestamps.append((removed_until, video_length))

    return timestamps

In [None]:
# Timestamp pre-processing
original_video_length = get_video_length(video_input)

# Dev mode: export shorter video. A positive video_export_max_length_seconds
# caps the exported range; otherwise the full video length is used.
video_length = (
    min(original_video_length, video_export_max_length_seconds)
    if video_export_max_length_seconds > 0
    else original_video_length
)

# Trim Method 1: Video with sentences to remove, removed
# timestamps_to_remove = list(map(lambda x: (ts_to_s(x[0]), ts_to_s(x[1])), sentence_timestamps))
# timestamps_to_keep = generate_keep_timestamps(timestamps_to_remove, video_length)

# Trim method 2: Video of only sentences to keep.
# Segments are limited to video_length so the export cap above actually takes
# effect: segments starting at or after the cap are dropped, and a segment
# straddling it is cut at the cap.
timestamps_to_keep = [
    (start, min(end, video_length))
    for start, end in (
        (ts_to_s(start_ts), ts_to_s(end_ts))
        for start_ts, end_ts in sentence_timestamps
    )
    if start < video_length
]

# print(f"Timestamps to remove: {timestamps_to_remove}")
notebook_mode_print(f"Timestamps to keep: {timestamps_to_keep}")

Timestamps to keep: [(1.274, 8.685), (11.057, 13.319), (13.359, 17.48), (17.699, 20.059), (20.62, 23.861), (24.042, 34.804), (34.825, 39.506), (40.426, 42.225), (43.408, 49.691), (49.71, 52.091), (52.551, 54.213), (54.253, 57.094), (59.396, 61.155), (66.628, 69.087), (77.152, 84.495), (84.775, 86.215), (87.956, 89.918), (90.138, 96.12), (96.322, 99.305), (103.046, 104.629), (104.709, 108.331), (108.652, 110.953), (112.013, 113.795), (113.834, 117.018), (118.367, 124.552), (124.611, 133.9), (134.479, 137.703), (137.861, 151.376), (152.137, 159.818), (160.038, 164.878), (165.9, 170.02), (171.181, 177.52), (180.508, 182.55), (183.25, 186.27), (186.729, 190.491), (199.014, 201.633), (201.854, 208.395), (209.497, 216.02), (227.497, 233.401), (234.199, 244.985), (245.846, 248.705), (255.307, 255.807), (256.168, 265.632), (273.814, 281.067), (282.471, 285.752), (285.793, 287.353), (287.733, 292.156), (292.536, 295.896), (306.141, 314.67), (320.497, 326.403), (330.271, 330.992), (331.413, 333.

In [None]:
# Skim Video: re-encode the input, retaining only the selected time ranges
skim_video(
    input_video=video_input,
    output_video=video_output_skimmed,
    segments_to_retain=timestamps_to_keep,
)


=== Video processed successfully.



In [None]:
# Download the skimmed video when running as a notebook.
# NOTE(review): `files` is presumably google.colab.files, imported earlier - confirm.
if notebook_mode:
    print_info("Downloading video...")
    files.download(video_output_skimmed)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Simple Metrics
print_section("Simple Metrics")

# Re-probed here so the cell does not depend on the pre-processing cell's state.
original_video_length = get_video_length(video_input)
print(f"Original Video Length: {original_video_length:.2f}s\n")

skimmed_video_length = get_video_length(video_output_skimmed)
print(f"Skimmed Video Length: {skimmed_video_length:.2f}s\n")

# Fraction of the original running time that was cut (0 = nothing removed).
# The label states this explicitly: the value is NOT skimmed/original.
summarization_ratio = (original_video_length - skimmed_video_length) / original_video_length
print(f"Fraction of Original Video Length Removed: {summarization_ratio:.2f}")


Original Video Length: 353.63s

Skimmed Video Length: 248.35s

Summarized/Original Video Length Ratio: 0.30
