# Installation & Setup

In [None]:
%%capture
!pip install nltk
!pip install transformers
!pip install datasets
!pip install srt
!pip install gdown
!apt install ffmpeg
!pip install deepmultilingualpunctuation
!pip install silero-vad
!pip install spacy
!pip install pytextrank
!pip install pydub
!pip install ffmpeg-python pymediainfo

!pip install git+https://github.com/m-bain/whisperX.git

In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import files
import tarfile
import gdown
import re
from functools import reduce

# ML General
from datasets import load_dataset
import torch
import torchaudio
import torch.nn.functional as F
from transformers import \
LongformerTokenizer, LongformerModel, LongformerForSequenceClassification, LongformerConfig, \
RobertaTokenizer, RobertaForTokenClassification, Trainer, TrainingArguments, \
LEDTokenizer, LEDForConditionalGeneration

# Text
import pytextrank
import nltk
from nltk.tokenize import sent_tokenize
import spacy
import srt

# Audio
import whisperx
import silero_vad
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from pydub import AudioSegment

# Video
from moviepy.editor import VideoFileClip, concatenate_videoclips

/usr/local/lib/python3.10/dist-packages


  torchaudio.set_audio_backend("soundfile")
  backend = torchaudio.get_audio_backend()
  from speechbrain.pretrained import (
  torchaudio.set_audio_backend(backend)
  from torchaudio.backend.common import AudioMetaData
  if event.key is 'enter':



In [None]:
%%capture
# NLTK
nltk.download('punkt')
nltk.download('punkt_tab')

# Notebook config
os.environ["WANDB_DISABLED"] = "true"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Variables

In [None]:
video_input = "dataset/teamwork in the classroom.mov"

full_base = os.path.dirname(video_input)
path_dataset = full_base
filename = os.path.basename(video_input)
filename_without_extension = os.path.splitext(filename)[0]
filename_video_extension = video = os.path.splitext(video_input)[1]


filename_video_input = filename
filename_subtitles_output = filename_without_extension + ".srt"
filename_audio_output = filename_without_extension + ".wav"
filename_audio_output_skimmed = filename_without_extension + "_skimmed.wav"
filename_video_output_skimmed = filename_without_extension + "_skimmed" + filename_video_extension

subtitles_output = os.path.join(full_base, filename_subtitles_output)
audio_output = os.path.join(full_base, filename_audio_output)
audio_output_skimmed = os.path.join(full_base, filename_audio_output_skimmed)
video_output_skimmed = os.path.join(full_base, filename_video_output_skimmed)

video = ''
audio = ''
subtitles = ''
sentences = ''

# Datasets

In [None]:
# Google Drive Dataset Location
folder_id = '1k7DLJPl1xz9lpU4l3dZYtPe1XawhrXeC' # taken from drive.google.com/drive/u/1/folders/1k7D...(this part)
gdown.download_folder(id=folder_id, quiet=False, use_cookies=False)

Retrieving folder contents


Processing file 1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp assessing students without exams.mov
Processing file 1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR flipped learning basics.mov
Processing file 1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt teamwork in the classroom.mov
Processing file 1ZsA-X-HsSk0WJiZaGcHIC3t17DchdXIX teamwork in the classroom.srt
Processing file 1lb2rCvjouElVhqEHAyojUOvb_ECgUqjF teamwork in the classroom.wav


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp
From (redirected): https://drive.google.com/uc?id=1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp&confirm=t&uuid=dfdbd51a-9ed4-4d57-b867-bf8246c15f24
To: /content/dataset/assessing students without exams.mov
100%|██████████| 875M/875M [00:15<00:00, 57.4MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR
From (redirected): https://drive.google.com/uc?id=1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR&confirm=t&uuid=77512c8b-f397-4628-a5e6-9df01095b9ff
To: /content/dataset/flipped learning basics.mov
100%|██████████| 399M/399M [00:02<00:00, 171MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt
From (redirected): https://drive.google.com/uc?id=1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt&confirm=t&uuid=acee06b0-9546-405c-bb74-b

['/content/dataset/assessing students without exams.mov',
 '/content/dataset/flipped learning basics.mov',
 '/content/dataset/teamwork in the classroom.mov',
 '/content/dataset/teamwork in the classroom.srt',
 '/content/dataset/teamwork in the classroom.wav']

In [None]:
# Simple Test Dataset:
paragrah_simple = "Renewable energy is crucial for reducing carbon emissions. Solar power, in particular, is sustainable and abundant. Interestingly, solar panels were first invented in 1954. With continued advancements, solar energy is becoming more accessible in everyday life."
paragraph_simple_unpunct = "Renewable energy is crucial for reducing carbon emissions  Solar power, in particular, is sustainable and abundant Interestingly, solar panels were first invented in 1954 With continued advancements, solar energy is becoming more accessible in everyday life"

# Other: CNN/Daily Mail
# dataset_news = load_dataset("cnn_dailymail", "3.0.0")
# paragraph_news = dataset_news['train']['article'][0]
# summary_news = dataset_news['train']['highlights'][0]

SRT  
each **`subtitle`** in the subtitles array has the following properties:

1. **`index`**
   - The sequential number of the subtitle within the SRT file.
   - `1`, `2`, `3`, etc. (Integer)
2. **`start`**
   - The time (in milliseconds) when the subtitle should appear on the screen.
   - `00:00:05,000` (String representing HH:MM:SS,SSS)
3. **`end`**
   - The time (in milliseconds) when the subtitle should disappear from the screen.
   - `00:00:10,000` (String representing HH:MM:SS,SSS)
4. **`content`**
   - The actual text of the subtitle that will be displayed.
   - "Hello, world!" (String)
5. **`proprietary`**
   - This field holds any additional data or formatting specific to the SRT file or software used to create it. Often empty and can usually be ignored.
   - `''` (Empty string, or sometimes contains specific formatting codes)

# Preprocessing

## Audio - Extract

In [None]:
%%capture
# Extract audio (wav) from video
!ffmpeg -y -i "$video_input" -vn -acodec pcm_s16le -ar 44100 -ac 2 "$audio_output"

## Audio - SRT File Generation

##### Time Taken: ~4min

In [None]:
def seconds_to_srt_timestamp(seconds):
    """
    Extract hours, minutes, seconds, and milliseconds
    from a given number of seconds.
    """

    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    milliseconds = int((seconds - int(seconds)) * 1000)

    # Format as HH:MM:SS,MS
    return f"{hours:02}:{minutes:02}:{int(seconds):02},{milliseconds:03}"

In [None]:
# Select device (GPU if available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
language="en"
compute_type="int8"

# Model WhisperX
model = whisperx.load_model("base", device=device, language=language, compute_type=compute_type) # Choose "base" or "large" model

# Transcribe audio
aligned_segments = model.transcribe(audio_output)

# Align with forced alignment
alignment_model, metadata = whisperx.load_align_model(language_code=aligned_segments["language"], device=device)
aligned_segments = whisperx.align(aligned_segments["segments"], alignment_model, metadata, audio_output, device)

# Generate SRT file with aligned sentences
with open(subtitles_output, "w") as f:
    for i, segment in enumerate(aligned_segments["segments"], 1):
        # Get start and end times in SRT format
        start_time = seconds_to_srt_timestamp(segment["start"])
        end_time = seconds_to_srt_timestamp(segment["end"])

        # Write SRT entry
        f.write(f"{i}\n{start_time} --> {end_time}\n{segment['text']}\n\n")

print(f"SRT file generated: {subtitles_output}")

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.0+cu121. Bad things might happen unless you revert torch to 1.x.
SRT file generated: dataset/teamwork in the classroom.srt


## Text - Load SRT File

In [None]:
# Subtitles:
with open(subtitles_output, "r", encoding="utf-8") as f:
    subtitles = list(srt.parse(f.read()))

## Text - Sentence Segmentation

In [None]:
def format_timedelta(timedelta_obj):
    """Formats a datetime.timedelta object into HH:MM:SS.mmm timestamp.

    Args:
        timedelta_obj: The datetime.timedelta object.

    Returns:
        A string representing the timestamp in HH:MM:SS.mmm format.
    """
    total_seconds = timedelta_obj.total_seconds()
    hours = int(total_seconds // 3600)  # Get hours
    minutes = int((total_seconds % 3600) // 60)  # Get minutes
    seconds = int(total_seconds % 60)  # Get seconds
    milliseconds = int((total_seconds % 1) * 1000)  # Get milliseconds

    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

In [None]:
sentences = []
for i, segment in enumerate(subtitles):
    sentences.append({
        'base_idx': i,
        'start_time': format_timedelta(segment.start),
        'end_time': format_timedelta(segment.end),
        'sentence': segment.content
    })

df_sentences = pd.DataFrame(sentences)
sentences = df_sentences['sentence'].tolist()

display(df_sentences)

Unnamed: 0,base_idx,start_time,end_time,sentence
0,0,00:00:01.274,00:00:08.685,"Hello, this is Lino Cordia and today I want t..."
1,1,00:00:11.057,00:00:13.319,So why is teamwork in the classroom so import...
2,2,00:00:13.359,00:00:17.480,"Well, for our students, it allows them to deve..."
3,3,00:00:17.699,00:00:20.059,"Communication skills, leadership skills, et ce..."
4,4,00:00:20.620,00:00:23.861,"Also, when you're working with a team, you get..."
...,...,...,...,...
69,69,00:05:46.624,00:05:48.245,"And yeah, please reach out."
70,70,00:05:48.384,00:05:49.406,I would love to hear from you.
71,71,00:05:49.526,00:05:50.266,Thank you so much.
72,72,00:05:50.547,00:05:51.487,I am Lino Cordia.


## Text - Paragraph
combination of all subtitle parts.  

WhisperAI enhances transcription with basic punctuation.

In [None]:
paragraph = reduce(lambda acc, seg: acc + seg.strip() + ' ', sentences, '')

# Print the paragraph
print(paragraph)

Hello, this is Lino Cordia and today I want to talk to you about a very important topic, challenging topic, teamwork in the classroom. So why is teamwork in the classroom so important? Well, for our students, it allows them to develop a bunch of new skills, right? Communication skills, leadership skills, et cetera. Also, when you're working with a team, you get different perspectives. Ideally, you are part of a team that has people with different genders, people with different age groups, people with different academic backgrounds, right? So when you're talking to them, you get all these fresh perspectives that inform your the task that you're trying to solve. Also teams will motivate you, they will support you, you will feel empowered by them ideally, right? This is like the things that should happen. And also this is how the world works, right? Pretty much everything we do, we need to do it as a team. Now here's the thing about teamwork in the classroom. Students hate it. But why do 

## Text - Paragraph Summarized

##### Time Taken: ~1min

In [None]:
# Model: Longformer Encoder-Decoder
model_name = "allenai/led-base-16384"
tokenizer = LEDTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)
text = paragraph

# Tokenization
inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)

# Summary Generation
summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
paragraph_summarized = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Input ids are automatically padded from 1141 to 2048 to be a multiple of `config.attention_window`: 1024


# Text

In [None]:
# Simple Metrics
original_length = len(paragraph)
summary_length = len(paragraph_summarized)

print(f"original length: {original_length}")
print(f"summary length: {summary_length}")

compression_ratio = (original_length - summary_length) / original_length
print(f"auto-summary compression ratio: {compression_ratio:.2f}")

original length: 5209
summary length: 708
auto-summary compression ratio: 0.86


## Metric 1: Sentence- Summarized Paragraph Relevancy (Cosine Similarity)

##### Time Taken: ~2min

In [None]:
%%capture
# config
attention_window = 256
config = LongformerConfig.from_pretrained('allenai/longformer-base-4096', attention_window=attention_window)

# model: Longformer
model_lf = LongformerModel.from_pretrained('allenai/longformer-base-4096', config=config)
tokenizer_lf = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096', model_max_length=attention_window)

In [None]:
# 2: Tokenization
paragraph_tokens = tokenizer_lf(paragraph_summarized, return_tensors='pt')
# sentence_tokens = [tokenizer_lf(sentence, return_tensors='pt') for sentence in sentences]

sentence_tokens = tokenizer_lf(sentences, padding=True, truncation=True, return_tensors='pt')

In [None]:
# 3: Embedding
with torch.no_grad():  # Disable gradient computation for efficiency
    paragraph_embedding = model_lf(**paragraph_tokens).last_hidden_state[:, 0, :]  # Get the [CLS] token embedding

    # Process batched sentence tokens
    sentence_embeddings = model_lf(**sentence_tokens).last_hidden_state[:, 0, :]

Input ids are automatically padded from 150 to 256 to be a multiple of `config.attention_window`: 256
Input ids are automatically padded from 40 to 256 to be a multiple of `config.attention_window`: 256


Embedding Explanation  
The [CLS] (classification) token is often used in transformer models to represent the overall meaning or summary of the input sequence. By extracting its embedding, you're essentially obtaining a representation that captures the main point or essence of the paragraph.

In [None]:
# 4: Relevance scores
relevance_scores = [torch.cosine_similarity(paragraph_embedding, sentence_embedding).item() for sentence_embedding in sentence_embeddings]

# Normalization: min-max normalization
min_score = min(relevance_scores)
max_score = max(relevance_scores)
normalized_scores = [(score - min_score) / (max_score - min_score) for score in relevance_scores]

# round
normalized_scores = [np.format_float_positional(score, precision=2, unique=False, fractional=False, trim='k') for score in normalized_scores]

In [None]:
# 5: Display Results
df_sentences.insert(0, "metric_1_score", normalized_scores)

display(df_sentences)

Unnamed: 0,metric_1_score,base_idx,start_time,end_time,sentence
0,1.0,0,00:00:01.274,00:00:08.685,"Hello, this is Lino Cordia and today I want t..."
1,0.45,1,00:00:11.057,00:00:13.319,So why is teamwork in the classroom so import...
2,0.57,2,00:00:13.359,00:00:17.480,"Well, for our students, it allows them to deve..."
3,0.47,3,00:00:17.699,00:00:20.059,"Communication skills, leadership skills, et ce..."
4,0.53,4,00:00:20.620,00:00:23.861,"Also, when you're working with a team, you get..."
...,...,...,...,...,...
69,0.39,69,00:05:46.624,00:05:48.245,"And yeah, please reach out."
70,0.33,70,00:05:48.384,00:05:49.406,I would love to hear from you.
71,0.51,71,00:05:49.526,00:05:50.266,Thank you so much.
72,0.25,72,00:05:50.547,00:05:51.487,I am Lino Cordia.


## Metric 2: Intra-sentence relevancy
Score by if current sentence is needded by adjacent sentences.

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Store predictions for each sentence
predictions = []

# Iterate through sentence pairs
for i in range(len(sentences) - 1):
    sentence1 = sentences[i]
    sentence2 = sentences[i + 1]

    # Tokenize and prepare input
    inputs = tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding=True, add_special_tokens=True)

    # Get model prediction
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()

    # Store prediction
    predictions.append(prediction)

# Handle last sentence (no next sentence)
predictions.append(0)  # Assume last sentence doesn't need a next sentence

In [None]:
# Add predictions to DataFrame
df_sentences = df_sentences.assign(**{"Previous Sentence Needed": predictions})

display(df_sentences)

## Metric 3: Intelligent Sentence-Paragraph Relevancy

##### Time Taken: 13min - 26min

In [None]:
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

# Ensure the model is in evaluation mode
model.eval()

# Example usage
body_paragraph = paragraph

relevance_scores = []

for sentence in sentences:
    # Prepare the input for Longformer
    inputs = tokenizer(
        body_paragraph,
        sentence,
        return_tensors='pt',
        max_length=4096,
        truncation=True,
        padding='max_length'  # Pad to max length to avoid issues with model input size
    )

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Assuming binary classification (relevant/not relevant)
    relevance_score = torch.softmax(outputs.logits, dim=1)[0][1].item()  # Probability of being relevant
    relevance_scores.append((sentence, relevance_score))

# Sort sentences based on relevance scores
sorted_sentences = sorted(relevance_scores, key=lambda x: x[1], reverse=True)
ranked_sentences = [sentence for sentence, score in sorted_sentences]

In [None]:
relevance_scores[0]

In [None]:
sentence_indices = list(range(len(relevance_scores)))
scores = [score for sentence, score in relevance_scores]
sentences_text = [sentence for sentence, score in relevance_scores]

df_relevance = pd.DataFrame({'Sentence Index': sentence_indices, 'Score': scores, 'Sentence': sentences_text})
df_relevance

## Metric 4: Keyword extraction and Ranking
using TextRank

In [None]:
# Load a spaCy model
nlp = spacy.load("en_core_web_sm")

# Add the pytextrank pipeline component to spaCy
nlp.add_pipe("textrank")

phrase_data = []

# Process the text
doc = nlp(paragraph)

for phrase in doc._.phrases:
  phrase_data.append([phrase.text, phrase.rank, phrase.count])

df_phrases = pd.DataFrame(phrase_data, columns=['Phrase', 'Rank', 'Count'])
df_phrases.sort_values(by=['Rank'], ascending=False, inplace=True)

display(df_phrases)

# Audio

## Metric 5: Silence Detection
* From the Paragraph boundaries, get the time in aduio that we care about
* For each time in audio we care about, analyze if they are low volume

OR
* analyze all potential sentence boundaries first
* match with end of sentences

In [None]:
# 0: Load audio, extract timestamps

SAMPLING_RATE = 16000 # 16 kHz

model = load_silero_vad()
wav = read_audio(audio_output)
speech_timestamps = get_speech_timestamps(wav, model)

# Check the shape of the wav tensor
print(f"Audio shape: {wav.shape}")
print(f"Audio length (seconds): {len(wav) / SAMPLING_RATE:.2f}")

Audio shape: torch.Size([5657259])
Audio length (seconds): 353.58


In [None]:
# Speech Intervals
speech_intervals = []
for i in range(0, len(speech_timestamps)-1):
    speech_intervals.append((speech_timestamps[i]['start'] / SAMPLING_RATE, speech_timestamps[i]['end'] / SAMPLING_RATE))

# Silence Intervals
silence_intervals = []
for i in range(1, len(speech_timestamps)):
    silence_start = speech_timestamps[i-1]['end']  # End of previous speech segment
    silence_end = speech_timestamps[i]['start']     # Start of current speech segment
    silence_intervals.append((silence_start / SAMPLING_RATE, silence_end / SAMPLING_RATE))

In [None]:
print(speech_timestamps[0:3])
print(speech_intervals[0:3])
print(silence_intervals[0:3])

[{'start': 20000, 'end': 113120}, {'start': 116768, 'end': 140256}, {'start': 176672, 'end': 219104}]
[(1.25, 7.07), (7.298, 8.766), (11.042, 13.694)]
[(7.07, 7.298), (8.766, 11.042), (13.694, 14.05)]


# Video

# Final Score - Metric Weighting

In [None]:
if 'metric_final' in df_sentences.columns:
    df_sentences.drop('metric_final', axis=1, inplace=True)

df_sentences.insert(0, 'metric_final', 1)

# Metric 1 Apply
df_sentences['metric_final'] = 1 * df_sentences['metric_final'] * df_sentences['metric_1_score'].astype(float)

display(df_sentences)

Unnamed: 0,metric_final,metric_1_score,base_idx,start_time,end_time,sentence
0,1.00,1.0,0,00:00:01.274,00:00:08.685,"Hello, this is Lino Cordia and today I want t..."
1,0.45,0.45,1,00:00:11.057,00:00:13.319,So why is teamwork in the classroom so import...
2,0.57,0.57,2,00:00:13.359,00:00:17.480,"Well, for our students, it allows them to deve..."
3,0.47,0.47,3,00:00:17.699,00:00:20.059,"Communication skills, leadership skills, et ce..."
4,0.53,0.53,4,00:00:20.620,00:00:23.861,"Also, when you're working with a team, you get..."
...,...,...,...,...,...,...
69,0.39,0.39,69,00:05:46.624,00:05:48.245,"And yeah, please reach out."
70,0.33,0.33,70,00:05:48.384,00:05:49.406,I would love to hear from you.
71,0.51,0.51,71,00:05:49.526,00:05:50.266,Thank you so much.
72,0.25,0.25,72,00:05:50.547,00:05:51.487,I am Lino Cordia.


# PostProcessing

In [51]:
# set develop mode to true to export a shortened version of the multimedia
dev_mode = True
dev_media_output_start_s = 0
dev_media_output_end_s = 15

In [None]:
def ts_to_ms(timestamp):
    """Converts a timestamp string in HH:MM:SS.mmm format to milliseconds.

    Args:
        timestamp: The timestamp string in HH:MM:SS.mmm format.

    Returns:
        The timestamp in milliseconds as an integer.
    """
    hours, minutes, seconds_milliseconds = re.split(r':', timestamp)
    seconds, milliseconds = seconds_milliseconds.split('.')

    # Convert to milliseconds
    total_milliseconds = (int(hours) * 3600 + int(minutes) * 60 + int(seconds)) * 1000 + int(milliseconds)

    return total_milliseconds

In [44]:
def ts_to_s(timestamp):
    """Converts a timestamp string in HH:MM:SS.mmm format to seconds.

    Args:
        timestamp: The timestamp string in HH:MM:SS.mmm format.

    Returns:
        The timestamp in seconds as a float.
    """
    hours, minutes, seconds_milliseconds = re.split(r':', timestamp)
    seconds, milliseconds = seconds_milliseconds.split('.')

    # Convert to seconds
    total_seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000.0

    return total_seconds

In [None]:
def trim_audio_multiple(audio_path, timestamps):
    """Trims audio at multiple locations and combines the remaining segments.

    Args:
        audio: raw audio.
        timestamps: A list of tuples, where each tuple contains
                    the start and end times (in milliseconds)
                    of the segment to remove.

    Returns:
        An AudioSegment object containing the trimmed audio.
    """
    audio = AudioSegment.from_file(audio_path)
    segments = []
    last_end = 0

    for start_trim, end_trim in timestamps:
        segments.append(audio[last_end:start_trim])
        last_end = end_trim

    segments.append(audio[last_end:])  # Add the remaining part

    trimmed_audio = sum(segments)  # Combine all segments
    return trimmed_audio

In [57]:
def trim_video_multiple(video_path, timestamps):
    """Trims video at multiple locations and combines the remaining segments.

    Args:
        video_path: Path to the video file.
        timestamps: A list of tuples, where each tuple contains
                    the start and end times (in seconds)
                    of the segment to remove.

    Returns:
        A VideoFileClip object containing the trimmed video.
    """
    video = VideoFileClip(video_path)
    print(f"Original video length: {video.duration}s")

    segments = []
    last_end = 0

    for start_trim, end_trim in timestamps:
        segments.append(video.subclip(last_end, start_trim))
        last_end = end_trim

    segments.append(video.subclip(last_end))  # Add the remaining part

    trimmed_video = concatenate_videoclips(segments)  # Combine all segments
    return trimmed_video

In [58]:
def get_video_codec(video_path):
    """Returns the codec of the video file using ffmpeg probe."""
    try:
        probe = ffmpeg.probe(video_path, v='error', select_streams='v:0', show_entries='stream=codec_name')
        codec = probe['streams'][0]['codec_name']
        return codec
    except ffmpeg.Error as e:
        print(f"Error detecting codec: {e}")
        return None

In [59]:
timestamps_to_trim = [('00:00:00.00','00:00:01.25'), ('00:00:08.766', '00:00:11.042')]
timestamps = list(map(lambda x: (ts_to_s(x[0]), ts_to_s(x[1])), timestamps_to_trim))
# timestamps_ms = list(map(lambda x: (x[0] / 1000, x[1] / 1000), timestamps_ms))

In [60]:
# Skim Video
skimmed_video = trim_video_multiple(video_input, timestamps)
print(f"Skimmed video length: {skimmed_video.duration}s")

if dev_mode:
  skimmed_video = skimmed_video.subclip(dev_media_output_start_s, dev_media_output_end_s)
  print(f"Output Video Length (Dev Mode): {skimmed_video.duration}s")

# Export Video
# skimmed_video.write_videofile(
#     video_output_skimmed,
#     codec="copy",
#     threads=2,
#     progress_bar=True
# )

Original video length: 353.63s
Skimmed video length: 350.329s
Video Length (Dev Mode): 15s


In [37]:
# # Skim Audio
# skimmed_audio = trim_audio_multiple(audio_output, timestamps)

# if develop_mode:
#   skimmed_audio = skimmed_audio[develop_mode_sample_start_ms:develop_mode_sample_end_ms]

# # Export Audio
# skimmed_audio.export(audio_output_skimmed, format="wav")

In [None]:
# Download
files.download(audio_output_skimmed)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>