# Installation & Setup

In [157]:
%%capture
!pip install nltk==3.9.1
!pip install transformers==4.45.2
!pip install datasets==3.0.2
!pip install srt==3.5.3
!pip install gdown==5.2.0
!apt install ffmpeg==1.4
!pip install deepmultilingualpunctuation==1.0.1
!pip install silero-vad==5.1.2

In [193]:
import os
import numpy as np
import pandas as pd
import tarfile
import nltk
from nltk.tokenize import sent_tokenize
import torch
import torchaudio
from transformers import LongformerTokenizer, LongformerModel, LongformerForSequenceClassification, RobertaTokenizer, RobertaForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import load_dataset
import torch.nn.functional as F
import gdown
import srt
from deepmultilingualpunctuation import PunctuationModel

import silero_vad
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps

In [110]:
# Notebook config
os.environ["WANDB_DISABLED"] = "true"

In [111]:
%%capture
# NLTK
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Variables

In [128]:
path_dataset = "dataset/"

filename = "teamwork in the classroom"
filename_video_input = filename + ".mov"
filename_subtitles_input = filename + ".srt"
filename_audio_output = filename + ".wav"

video_input = os.path.join(path_dataset, filename_video_input)
subtitles_input = os.path.join(path_dataset, filename_subtitles_input)
audio_output = os.path.join(path_dataset, filename_audio_output)

video = ''
audio = ''
subtitles = ''
sentences = ''

# Datasets

In [113]:
# Google Drive Dataset Location
folder_id = '1k7DLJPl1xz9lpU4l3dZYtPe1XawhrXeC'
gdown.download_folder(id=folder_id, quiet=False, use_cookies=False)

Retrieving folder contents


Processing file 1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp assessing students without exams.mov
Processing file 1i8XfB6AOly9l0vsdrGXz2FBs0OHxhFeP assessing students without exams.srt
Processing file 1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR flipped learning basics.mov
Processing file 1O5Q1jZLdbmzpZdkytc3byg-xMSkGAs7u flipped learning basics.srt
Processing file 1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt teamwork in the classroom.mov
Processing file 10-kRFMxQqrI7j0K7dR6dBRBLSgpbjgUO teamwork in the classroom.srt


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp
From (redirected): https://drive.google.com/uc?id=1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp&confirm=t&uuid=eacd6000-0319-4286-a64a-8ed059eb2514
To: /content/dataset/assessing students without exams.mov
100%|██████████| 875M/875M [00:07<00:00, 119MB/s]
Downloading...
From: https://drive.google.com/uc?id=1i8XfB6AOly9l0vsdrGXz2FBs0OHxhFeP
To: /content/dataset/assessing students without exams.srt
100%|██████████| 17.1k/17.1k [00:00<00:00, 49.2MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR
From (redirected): https://drive.google.com/uc?id=1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR&confirm=t&uuid=bc01f23c-c859-426a-8337-7743b71e8bca
To: /content/dataset/flipped learning basics.mov
100%|██████████| 399M/399M [00:05<00:00, 74.6MB/s]
Downloading...
From: htt

['/content/dataset/assessing students without exams.mov',
 '/content/dataset/assessing students without exams.srt',
 '/content/dataset/flipped learning basics.mov',
 '/content/dataset/flipped learning basics.srt',
 '/content/dataset/teamwork in the classroom.mov',
 '/content/dataset/teamwork in the classroom.srt']

In [129]:
# Subtitles:
with open(subtitles_input, "r", encoding="utf-8") as f:
    subtitles = list(srt.parse(f.read()))

# Simple Test Dataset:
paragrah_simple = "Renewable energy is crucial for reducing carbon emissions. Solar power, in particular, is sustainable and abundant. Interestingly, solar panels were first invented in 1954. With continued advancements, solar energy is becoming more accessible in everyday life."
paragraph_simple_unpunct = "Renewable energy is crucial for reducing carbon emissions  Solar power, in particular, is sustainable and abundant Interestingly, solar panels were first invented in 1954 With continued advancements, solar energy is becoming more accessible in everyday life"

# Other: CNN/Daily Mail
# dataset_news = load_dataset("cnn_dailymail", "3.0.0")
# paragraph_news = dataset_news['train']['article'][0]
# summary_news = dataset_news['train']['highlights'][0]

### SRT
each **`subtitle`** in the subtitles array has the following properties:

1. **`index`**
   - The sequential number of the subtitle within the SRT file.
   - `1`, `2`, `3`, etc. (Integer)
2. **`start`**
   - The time (in milliseconds) when the subtitle should appear on the screen.
   - `00:00:05,000` (String representing HH:MM:SS,SSS)
3. **`end`**
   - The time (in milliseconds) when the subtitle should disappear from the screen.
   - `00:00:10,000` (String representing HH:MM:SS,SSS)
4. **`content`**
   - The actual text of the subtitle that will be displayed.
   - "Hello, world!" (String)
5. **`proprietary`**
   - This field holds any additional data or formatting specific to the SRT file or software used to create it. Often empty and can usually be ignored.
   - `''` (Empty string, or sometimes contains specific formatting codes)

# Text:

#### Challenges:
* Imposing/Detecting punctuation
* Incorrect words parsed to transcript

 ## Preprocessing

In [115]:
# Intermediate exploration

# 'proprietary' field can be safely ignored"
proprietary_values = sum([len(subtitle.proprietary) for subtitle in subtitles])
print(proprietary_values) # returns 0

0


### Paragraph (Unpunctuated)
combination of all subtitle parts.  
As text is parsed from audio automatically there is no inherent punctuation (periods are missing)

In [130]:
# Paragraph
paragraph_unpunct = " ".join([subtitle.content for subtitle in subtitles])

In [131]:
paragraph_unpunct

"hello this is Leno Kia and today I want to talk to you about a very important topic challenging topic teamwork in the classroom so why is teamwork in the classroom so important well for our students it allows them to develop a bunch of new skills right uh communication skills leadership skills Etc also when you're working with a team you get different perspectives ideally you are part of a team that has uh people with different genders people with uh different a in different age groups people with different academic backgrounds right so when you're talking to them you get all this fresh perspectives that inform your uh the task that you're trying to solve uh also teams will motivate you they will support you you will feel empowered by them ideally right this is like kind the thing that should happen and also this is how the world works right pretty much everything we do we need to do it as a team now here's the thing about teamwork in the classroom students hate it but why do they hat

### Paragraph (Punctuated)
Using DeepPunct model

In [132]:
# DeepPunct model to restore punctuation
model = PunctuationModel()
text = paragraph_unpunct
text_tokens = model.preprocess(text)
# labled_words = model.predict(clean_text)
paragraph_punct = model.restore_punctuation(text)



In [133]:
paragraph_punct

"hello, this is Leno Kia, and today I want to talk to you about a very important topic, challenging topic: teamwork in the classroom. so why is teamwork in the classroom so important? well, for our students. it allows them to develop a bunch of new skills, right? uh, communication skills, leadership skills, Etc. also, when you're working with a team, you get different perspectives. ideally, you are part of a team that has, uh people with different genders, people with uh different a in different age groups, people with different academic backgrounds- right, so when you're talking to them, you get all this fresh perspectives that inform your uh the task that you're trying to solve. uh, also, teams will motivate you. they will support you. you will feel empowered by them. ideally, right, this is, like kind, the thing that should happen, and also, this is how the world works. right, pretty much everything we do, we need to do it as a team. now here's the thing about teamwork in the classr


Individual sentences from the paragraph.

### Sentence Segmentation

In [228]:
# Segmentation
sentences = sent_tokenize(paragraph_punct)
print(len(sentences))
print(sentences)

54
['hello, this is Leno Kia, and today I want to talk to you about a very important topic, challenging topic: teamwork in the classroom.', 'so why is teamwork in the classroom so important?', 'well, for our students.', 'it allows them to develop a bunch of new skills, right?', 'uh, communication skills, leadership skills, Etc.', "also, when you're working with a team, you get different perspectives.", "ideally, you are part of a team that has, uh people with different genders, people with uh different a in different age groups, people with different academic backgrounds- right, so when you're talking to them, you get all this fresh perspectives that inform your uh the task that you're trying to solve.", 'uh, also, teams will motivate you.', 'they will support you.', 'you will feel empowered by them.', 'ideally, right, this is, like kind, the thing that should happen, and also, this is how the world works.', 'right, pretty much everything we do, we need to do it as a team.', "now here'

## Metric 1: Simple Sentence-Paragraph Relevancy (Cosine Similarity)

In [135]:
%%capture
# 1: Longformer Model
tokenizer_lf = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model_lf = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [136]:
# 2: Tokenization
paragraph_tokens = tokenizer_lf(paragraph, return_tensors='pt')
sentence_tokens = [tokenizer_lf(sentence, return_tensors='pt') for sentence in sentences]

### Embedding Explanation
The [CLS] (classification) token is often used in transformer models to represent the overall meaning or summary of the input sequence. By extracting its embedding, you're essentially obtaining a representation that captures the main point or essence of the paragraph.

In [137]:
# 3: Embedding - ~2.5min for 850MB video
with torch.no_grad(): # Disable gradient computation for efficiency
    paragraph_embedding = model_lf(**paragraph_tokens).last_hidden_state[:, 0, :]  # Get the [CLS] token embedding
    sentence_embeddings = [model_lf(**tokens).last_hidden_state[:, 0, :] for tokens in sentence_tokens]

In [138]:
# 4: Relevance scores
relevance_scores = [torch.cosine_similarity(paragraph_embedding, sentence_embedding).item() for sentence_embedding in sentence_embeddings]

# Normalization: min-max normalization
min_score = min(relevance_scores)
max_score = max(relevance_scores)
normalized_scores = [(score - min_score) / (max_score - min_score) for score in relevance_scores]

# round
normalized_scores = [np.format_float_positional(score, precision=2, unique=False, fractional=False, trim='k') for score in normalized_scores]

In [145]:
# 5: Display Results
df = pd.DataFrame({"Sentence Index": range(len(sentences)), "Score": normalized_scores, "Sentence": sentences })
df.sort_values(by=['Score'], ascending=False, inplace=True)

display(df)

Unnamed: 0,Sentence Index,Score,Sentence
52,52,1.0,I am Leno Kia.
8,8,1.0,they will support you.
18,18,0.97,they're also concerned about their grades.
2,2,0.97,"well, for our students."
45,45,0.95,I find that it's easy to identify what each me...
23,23,0.94,I tell them about my own life experiences.
34,34,0.94,something like that is going to help you connect.
11,11,0.93,"right, pretty much everything we do, we need t..."
9,9,0.93,you will feel empowered by them.
20,20,0.92,so what can we do as professors?


## Metric 2: Intra-sentence relevancy
Score by if current sentence is needded by adjacent sentences.

In [146]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Store predictions for each sentence
predictions = []

# Iterate through sentence pairs
for i in range(len(sentences) - 1):
    sentence1 = sentences[i]
    sentence2 = sentences[i + 1]

    # Tokenize and prepare input
    inputs = tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding=True, add_special_tokens=True)

    # Get model prediction
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()

    # Store prediction
    predictions.append(prediction)

# Handle last sentence (no next sentence)
predictions.append(0)  # Assume last sentence doesn't need a next sentence

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [147]:
df.sort_values(by=['Sentence Index'], ascending=True, inplace=True)

# Add predictions to DataFrame
df = df.assign(**{"Previous Sentence Needed": predictions})

display(df)

Unnamed: 0,Sentence Index,Score,Sentence,Previous Sentence Needed
0,0,0.46,"hello, this is Leno Kia, and today I want to t...",0
1,1,0.87,so why is teamwork in the classroom so important?,0
2,2,0.97,"well, for our students.",0
3,3,0.86,it allows them to develop a bunch of new skill...,0
4,4,0.66,"uh, communication skills, leadership skills, Etc.",0
5,5,0.86,"also, when you're working with a team, you get...",0
6,6,0.18,"ideally, you are part of a team that has, uh p...",0
7,7,0.82,"uh, also, teams will motivate you.",0
8,8,1.0,they will support you.,0
9,9,0.93,you will feel empowered by them.,0


## Metric 3: Intelligent Sentence-Paragraph Relevancy

In [None]:
# runtime: ~4min

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

# Ensure the model is in evaluation mode
model.eval()

# Example usage
body_paragraph = paragraph_punct

relevance_scores = []

for sentence in sentences:
    # Prepare the input for Longformer
    inputs = tokenizer(
        body_paragraph,
        sentence,
        return_tensors='pt',
        max_length=4096,
        truncation=True,
        padding='max_length'  # Pad to max length to avoid issues with model input size
    )

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Assuming binary classification (relevant/not relevant)
    relevance_score = torch.softmax(outputs.logits, dim=1)[0][1].item()  # Probability of being relevant
    relevance_scores.append((sentence, relevance_score))

# Sort sentences based on relevance scores
sorted_sentences = sorted(relevance_scores, key=lambda x: x[1], reverse=True)
ranked_sentences = [sentence for sentence, score in sorted_sentences]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Audio

## Loading

In [None]:
%%capture
# Extract audio (wav) from video
!ffmpeg -y -i "$video_input" -vn -acodec pcm_s16le -ar 44100 -ac 2 "$audio_output"

In [None]:
# Download Audio file
# from google.colab import files
# files.download(os.path.join('/content', audio_output))

## Preprocessing
* From the Paragraph boundaries, get the time in aduio that we care about
* For each time in audio we care about, analyze if they are low volume

OR
* analyze all potential sentence boundaries first
* match with end of sentences

In [208]:
# 0: Load audio, extract timestamps

SAMPLING_RATE = 16000 # 16 kHz

model = load_silero_vad()
wav = read_audio(audio_output)
speech_timestamps = get_speech_timestamps(wav, model)

# Check the shape of the wav tensor
print(f"Audio shape: {wav.shape}")
print(f"Audio length (seconds): {len(wav) / SAMPLING_RATE:.2f}")

Audio shape: torch.Size([5657259])
Audio length (seconds): 353.58


In [212]:
# Speech Intervals
speech_intervals = []
for i in range(0, len(speech_timestamps)-1):
    speech_intervals.append((speech_timestamps[i]['start'] / SAMPLING_RATE, speech_timestamps[i+1]['end'] / SAMPLING_RATE))

# Silence Intervals
silence_intervals = []
for i in range(1, len(speech_timestamps)):
    silence_start = speech_timestamps[i-1]['end']  # End of previous speech segment
    silence_end = speech_timestamps[i]['start']     # Start of current speech segment
    silence_intervals.append((silence_start / SAMPLING_RATE, silence_end / SAMPLING_RATE))

In [213]:
print(speech_timestamps[0:3])
print(speech_intervals[0:3])
print(silence_intervals[0:3])

[{'start': 20000, 'end': 113120}, {'start': 116768, 'end': 140256}, {'start': 176672, 'end': 219104}]
[(1.25, 8.766), (7.298, 13.694), (11.042, 20.382)]
[(7.07, 7.298), (8.766, 11.042), (13.694, 14.05)]


In [149]:
# Enhancing SRV
# import re

# # Split the enhanced text by sentence boundaries
# sentences = re.split(r'(?<=[.!?]) +', enhanced_text)

# # Update subtitles with enhanced sentences
# for i, subtitle in enumerate(subtitles):
#     subtitle.content = sentences[i] if i < len(sentences) else ""

# # Reconstruct the SRT
# enhanced_srt_content = srt.compose(subtitles)

# # Save the improved SRT
# with open("enhanced_subtitle_file.srt", "w") as f:
#     f.write(enhanced_srt_content)


# Video