# Installation & Setup

In [21]:
%%capture
!pip install nltk==3.9.1
!pip install transformers==4.45.2
!pip install datasets==3.0.2
!pip install srt==3.5.3
!pip install gdown==5.2.0
!apt install ffmpeg==1.4
!pip install deepmultilingualpunctuation==1.0.1

In [24]:
import os
import numpy as np
import pandas as pd
import tarfile
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import LongformerTokenizer, LongformerModel, LongformerForSequenceClassification, RobertaTokenizer, RobertaForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import load_dataset
import torch.nn.functional as F
import gdown
import srt
from deepmultilingualpunctuation import PunctuationModel

In [3]:
# Notebook config
# Sets WANDB_DISABLED for this process.
# NOTE(review): presumably read by the Hugging Face Trainer imported above to
# skip Weights & Biases logging; confirm.
os.environ.update({"WANDB_DISABLED": "true"})

In [4]:
%%capture
# NLTK
nltk.download('punkt')
nltk.download('punkt_tab')
# nltk.download('treebank')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Datasets

In [None]:
# Google Drive Dataset Location
folder_id = '1k7DLJPl1xz9lpU4l3dZYtPe1XawhrXeC'

# Download the Drive folder identified by `folder_id`, with progress output
# enabled (quiet=False) and cookies disabled.
gdown.download_folder(
    id=folder_id,
    quiet=False,
    use_cookies=False,
)

In [22]:
# Local Dataset
path_dataset = "dataset/"
filename_video = "assessing students without exams.mov"
filename_subtitles = "assessing students without exams.srt"

# Subtitles: read the .srt file and parse it into a list of subtitle objects
with open(os.path.join(path_dataset, filename_subtitles), "r", encoding="utf-8") as f:
    subtitles = list(srt.parse(f.read()))

# Simple Test Dataset: a short punctuated paragraph and the same text with the
# sentence-ending periods removed.
paragraph_simple = "Renewable energy is crucial for reducing carbon emissions. Solar power, in particular, is sustainable and abundant. Interestingly, solar panels were first invented in 1954. With continued advancements, solar energy is becoming more accessible in everyday life."
# Misspelled legacy name, kept so any existing reference keeps working.
paragrah_simple = paragraph_simple
paragraph_simple_unpunct = "Renewable energy is crucial for reducing carbon emissions  Solar power, in particular, is sustainable and abundant Interestingly, solar panels were first invented in 1954 With continued advancements, solar energy is becoming more accessible in everyday life"


# Other: CNN/Daily Mail
# dataset_news = load_dataset("cnn_dailymail", "3.0.0")
# paragraph_news = dataset_news['train']['article'][0]
# summary_news = dataset_news['train']['highlights'][0]

### SRT
Each **`subtitle`** in the `subtitles` list has the following properties:

1. **`index`**
   - The sequential number of the subtitle within the SRT file.
   - `1`, `2`, `3`, etc. (Integer)
2. **`start`**
   - The time at which the subtitle should appear on the screen. The `srt` library parses it into a `datetime.timedelta`.
   - `00:00:05,000` in the SRT file (`HH:MM:SS,mmm`, i.e. millisecond precision)
3. **`end`**
   - The time at which the subtitle should disappear from the screen. The `srt` library parses it into a `datetime.timedelta`.
   - `00:00:10,000` in the SRT file (`HH:MM:SS,mmm`, i.e. millisecond precision)
4. **`content`**
   - The actual text of the subtitle that will be displayed.
   - "Hello, world!" (String)
5. **`proprietary`**
   - This field holds any additional data or formatting specific to the SRT file or software used to create it. Often empty and can usually be ignored.
   - `''` (Empty string, or sometimes contains specific formatting codes)

# Text: Preprocessing

#### Challenges:
* Imposing/Detecting punctuation
* Incorrect words parsed to transcript

In [37]:
# Intermediate exploration

# Add up the length of every subtitle's 'proprietary' field; a total of 0
# means the field is empty everywhere and can be safely ignored.
proprietary_values = sum(len(subtitle.proprietary) for subtitle in subtitles)
print(proprietary_values) # returns 0

0


## Parameters

### Paragraph (Unpunctuated)
Combination of all subtitle parts.  
Because the text is transcribed from audio automatically, it has no inherent punctuation (periods are missing).

In [38]:
# Paragraph: the text of every subtitle, in list order, separated by single spaces
paragraph = " ".join(subtitle.content for subtitle in subtitles)

In [39]:
# Show the joined subtitle text (no punctuation restored yet)
paragraph

"hello everybody this is Loria and today I will be talking about a very controversial topic for us Educators assessing students without exams but before I tell you about this I'm going to share a little bit about my life when I was an undergrad student um I had to do a lot of exams and let's say for every course usually the only assessment was two exams one in the middle of the semester and then a final exam so this was a very stressful situation because you couldn't make mistakes one mistake would cost you would would cost you dearly right like let's say I only had three problems to solve in this midterm exam if I got one problem wrong I was already getting a 66 out of 100 on that exam so lots of stress I I I won't go into details but I had every every morning when I had an exam I would wake up and I had to go to the bathroom at least twice it was super stressful really really really bad but I became really good at doing exams I I'm going to brag a little bit I was the highest uh grad

### Paragraph (Punctuated)
Using DeepPunct model

In [43]:
# DeepPunct model to restore punctuation
model = PunctuationModel()
# restore_punctuation() takes the raw text directly. The earlier separate
# preprocess() call produced a value that was never used, so it is dropped.
paragraph_punct_restored = model.restore_punctuation(paragraph)



In [44]:
# Show the transcript after punctuation restoration
paragraph_punct_restored

"hello everybody, this is Loria, and today I will be talking about a very controversial topic for us Educators: assessing students without exams. but before I tell you about this, I'm going to share a little bit about my life when I was an undergrad student. um, I had to do a lot of exams and, let's say, for every course, usually the only assessment was two exams: one in the middle of the semester and then a final exam. so this was a very stressful situation because you couldn't make mistakes. one mistake would cost you, would would cost you dearly, right like, let's say, I only had three problems to solve in this midterm exam. if I got one problem wrong, I was already getting a 66 out of 100 on that exam. so lots of stress. I, I- I won't go into details, but I had every, every morning when I had an exam, I would wake up and I had to go to the bathroom at least twice. it was super stressful, really, really, really bad. but I became really good at doing exams. I- I'm going to brag a lit


Individual sentences from the paragraph.

### Sentence Segmentation

In [50]:
# Segmentation: split the punctuation-restored paragraph into sentences
sentences_segmented = sent_tokenize(paragraph_punct_restored)

# First output line is the sentence count, second is the full list
print(len(sentences_segmented), sentences_segmented, sep="\n")

92
['hello everybody, this is Loria, and today I will be talking about a very controversial topic for us Educators: assessing students without exams.', "but before I tell you about this, I'm going to share a little bit about my life when I was an undergrad student.", "um, I had to do a lot of exams and, let's say, for every course, usually the only assessment was two exams: one in the middle of the semester and then a final exam.", "so this was a very stressful situation because you couldn't make mistakes.", "one mistake would cost you, would would cost you dearly, right like, let's say, I only had three problems to solve in this midterm exam.", 'if I got one problem wrong, I was already getting a 66 out of 100 on that exam.', 'so lots of stress.', "I, I- I won't go into details, but I had every, every morning when I had an exam, I would wake up and I had to go to the bathroom at least twice.", 'it was super stressful, really, really, really bad.', 'but I became really good at doing ex

## Metric 1: Simple Sentence-Paragraph Relevancy (Cosine Similarity)

In [51]:
%%capture
# 1: Longformer Model
tokenizer_lf = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model_lf = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [52]:
# 2: Tokenization
# The whole paragraph is encoded once; each segmented sentence is encoded
# separately. All encodings are returned as PyTorch tensors.
paragraph_tokens = tokenizer_lf(paragraph, return_tensors='pt')

sentence_tokens = []
for sentence in sentences_segmented:
    sentence_tokens.append(tokenizer_lf(sentence, return_tensors='pt'))

### Embedding Explanation
The [CLS] (classification) token — the first token of the input, written `<s>` in Longformer's RoBERTa-style vocabulary — is often used in transformer models to represent the overall meaning or summary of the input sequence. By extracting its embedding (position 0 of the last hidden state), you're essentially obtaining a representation that captures the main point or essence of the paragraph.

In [53]:
# 3: Embedding - ~2.5min
# Run the encoder without gradient tracking and keep only the hidden state at
# position 0 (the first token) of each input.
with torch.no_grad():
    paragraph_hidden = model_lf(**paragraph_tokens).last_hidden_state
    paragraph_embedding = paragraph_hidden[:, 0, :]

    sentence_embeddings = []
    for tokens in sentence_tokens:
        sentence_hidden = model_lf(**tokens).last_hidden_state
        sentence_embeddings.append(sentence_hidden[:, 0, :])

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


In [55]:
# 4: Relevance scores
# Cosine similarity between the paragraph embedding and each sentence
# embedding, collected as plain Python floats.
relevance_scores = []
for sentence_embedding in sentence_embeddings:
    similarity = torch.cosine_similarity(paragraph_embedding, sentence_embedding)
    relevance_scores.append(similarity.item())

# Divide the scores by the temperature before the softmax over all sentences.
temperature = 0.0002
scaled_scores = torch.tensor(relevance_scores) / temperature
relevance_scores_softmax = F.softmax(scaled_scores, dim=0)

In [56]:
# Express the softmax-normalised relevance scores as percentages.
# The value is recomputed from `relevance_scores` and `temperature` instead of
# scaling `relevance_scores_softmax` in place, so re-running this cell cannot
# multiply the scores by 100 a second time.
relevance_scores_softmax = F.softmax(torch.tensor(relevance_scores) / temperature, dim=0) * 100

# Print NumPy floats with two significant digits
np.set_printoptions(formatter={'float': lambda x: f"{x:.2g}"})

In [57]:
# 5: Display Results

# One row per sentence (Index = position in `sentences_segmented`), ordered
# from the highest to the lowest cosine-similarity score.
df = pd.DataFrame({
    "Index": range(len(sentences_segmented)),
    "Score": relevance_scores,
    "Sentence": sentences_segmented,
}).sort_values(by=['Score'], ascending=False)

display(df)

Unnamed: 0,Index,Score,Sentence
4,4,0.995219,"one mistake would cost you, would would cost y..."
2,2,0.994973,"um, I had to do a lot of exams and, let's say,..."
29,29,0.994465,"also, my professors would get really limited f..."
39,39,0.994364,"uh, sometimes my students do the quizzes on th..."
25,25,0.994350,"I'm sure there's better ways of writing exams,..."
...,...,...,...
34,34,0.992047,"so then what do you suggest, right?"
83,83,0.992023,what if I tweak this parameter?
51,51,0.991934,"it's not just this midterm and this final, the..."
52,52,0.991934,"right, they, they, they, they have to cover mo..."


## Metric 2: Inter-sentence relevancy
Score each sentence by whether it is needed by its adjacent sentences.

In [None]:

from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained model and tokenizer
# NOTE(review): 'bert-base-uncased' is loaded into a 2-label sequence
# classification model with no fine-tuning step in this notebook. The
# classification layer is presumably freshly initialised, so these predictions
# should not be interpreted until the model is fine-tuned (or a pretrained
# next-sentence head is used). Confirm the intended model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.eval()  # make inference mode explicit

sentences = sentences_segmented

# predictions[i] is the predicted label for the pair (sentences[i], sentences[i + 1])
predictions = []

# Inference only: skip gradient tracking, as the Metric 1 embedding cell does.
with torch.no_grad():
    # Iterate through adjacent sentence pairs
    for sentence1, sentence2 in zip(sentences, sentences[1:]):
        # Encode the pair as a single two-segment input. No padding is needed
        # because each call holds exactly one pair.
        inputs = tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, add_special_tokens=True)

        # Predicted label = index of the largest logit along the label axis
        logits = model(**inputs).logits
        predictions.append(torch.argmax(logits, dim=-1).item())

# Handle last sentence (no next sentence): assume label 0.
# Skipped when there are no sentences at all, so `predictions` always has
# exactly one entry per sentence.
if sentences:
    predictions.append(0)

In [None]:
# `predictions` is a plain list, so pandas assigns it by row position:
# predictions[i] must land on the row whose Index is i. Restore the original
# sentence order first, and fail early with a clear message on a length mismatch.
assert len(predictions) == len(df), "expected exactly one prediction per sentence"

# NOTE(review): predictions[i] was computed for the pair (sentence i,
# sentence i + 1), while the column is called "Previous Sentence Needed".
# Confirm which sentence of the pair the label is meant to describe.
df = (
    df.sort_values(by=['Index'], ascending=True)
      .assign(**{"Previous Sentence Needed": predictions})
)

display(df)

# Audio

## Loading

In [None]:
# Extract audio (wav) from video
filename_video = "teamwork in the classroom.mov"

filename_base = os.path.splitext(filename_video)[0]
audio_output = filename_base + ".wav"

filename_input = os.path.join(path_dataset, filename_video)
audio_output = os.path.join(path_dataset, audio_output)

!ffmpeg -y -i "$filename_input" -vn -acodec copy "$audio_output"

# Download if necessary
# from google.colab import files
# files.download(os.path.join('/content', audio_output))