# Installation & Setup

In [1]:
%%capture
!pip install nltk==3.9.1
!pip install transformers==4.45.2
!pip install datasets==3.0.2
!pip install srt==3.5.3
!pip install gdown
!apt install ffmpeg

In [2]:
import os
import numpy as np
import pandas as pd
import tarfile
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import LongformerTokenizer, LongformerModel, LongformerForSequenceClassification
from datasets import load_dataset
import torch.nn.functional as F
import gdown
import srt

In [3]:
%%capture
# NLTK
nltk.download('punkt')
nltk.download('punkt_tab')

# Other Dataset: CNN/DailyMail
dataset_news = load_dataset("cnn_dailymail", "3.0.0")

# Dataset
folder_id = '1k7DLJPl1xz9lpU4l3dZYtPe1XawhrXeC'
gdown.download_folder(id=folder_id, quiet=False, use_cookies=False)

path_dataset = "dataset/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# Datasets

In [5]:
path_dataset = "dataset/"
filename_video = "teamwork in the classroom.mov"
filename_subtitles = "teamwork in the classroom.srt"

# Subtitles:
with open(path_dataset + filename_subtitles, "r", encoding="utf-8") as f:
    subtitles = list(srt.parse(f.read()))

# Other: Simple
paragrah_simple = "Renewable energy is crucial for reducing carbon emissions.  \
Solar power, in particular, is sustainable and abundant. Interestingly, \
solar panels were first invented in 1954. With continued advancements, \
solar energy is becoming more accessible in everyday life."

# Other: CNN/Daily News
paragraph_news = dataset_news['train']['article'][0]
summary_news = dataset_news['train']['highlights'][0]

### SRT
each **`subtitle`** in the subtitles array has the following properties:

1. **`index`**
   - The sequential number of the subtitle within the SRT file.
   - `1`, `2`, `3`, etc. (Integer)
2. **`start`**
   - The time (in milliseconds) when the subtitle should appear on the screen.
   - `00:00:05,000` (String representing HH:MM:SS,SSS)
3. **`end`**
   - The time (in milliseconds) when the subtitle should disappear from the screen.
   - `00:00:10,000` (String representing HH:MM:SS,SSS)
4. **`content`**
   - The actual text of the subtitle that will be displayed.
   - "Hello, world!" (String)
5. **`proprietary`**
   - This field holds any additional data or formatting specific to the SRT file or software used to create it. Often empty and can usually be ignored.
   - `''` (Empty string, or sometimes contains specific formatting codes)

# Text: Preprocessing

In [24]:
# Intermediate exploration

# 'proprietary' field can be safely ignored"
proprietary_values = sum([len(subtitle.proprietary) for subtitle in subtitles])
print(proprietary_values) # returns 0

dict_keys(['index', 'start', 'end', 'content', 'proprietary'])
0


## Parameters

### Paragraph
combination of all subtitle parts.

In [7]:
# Paragraph
paragraph = " ".join([subtitle.content for subtitle in subtitles])

In [8]:
paragraph

"hello this is Leno Kia and today I want to talk to you about a very important topic challenging topic teamwork in the classroom so why is teamwork in the classroom so important well for our students it allows them to develop a bunch of new skills right uh communication skills leadership skills Etc also when you're working with a team you get different perspectives ideally you are part of a team that has uh people with different genders people with uh different a in different age groups people with different academic backgrounds right so when you're talking to them you get all this fresh perspectives that inform your uh the task that you're trying to solve uh also teams will motivate you they will support you you will feel empowered by them ideally right this is like kind the thing that should happen and also this is how the world works right pretty much everything we do we need to do it as a team now here's the thing about teamwork in the classroom students hate it but why do they hat

### Sentences
Individual sentences from the paragraph.

#### Challenges:
* Imposing/Detecting punctuation
* Incorrect word parsed to transcript

In [None]:
# Legacy segmentation when sentences were already perfect

# Segmentation
# sentences_segmented = sent_tokenize(sentences)
# print(len(sentences_segmented), '\n', sentences_segmented)

In [None]:
dataset = load_dataset("universal_dependencies", "en_ewt")

#3 Prepare dataset for token classification
def prepare_labels(examples):
    labels = []
    for sentence in examples["tokens"]:
        label = [0] * len(sentence)
        label[-1] = 1  # Mark the last token in each sentence as the end of the sentence
        labels.append(label)
    return {"tokens": examples["tokens"], "labels": labels}

# Apply the function to each split
train_dataset = dataset["train"].map(prepare_labels)
validation_dataset = dataset["validation"].map(prepare_labels)
test_dataset = dataset["test"].map(prepare_labels)




#4 Tokenize and align labels
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length")
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # For special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Only label the first subtoken
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
validation_tokenized = validation_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True)


#5 Initialize model
from transformers import RobertaForTokenClassification

model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=2)

#6 set up trainer
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
)

# 7 train
trainer.train()

# 8 Predict
def predict_sentence_boundaries(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs).logits
    predictions = torch.argmax(outputs, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    result = []
    for token, pred in zip(tokens, predictions[0].numpy()):
        result.append((token, pred))
    return result

# Example
text = "This is a sample paragraph Without punctuation It should be split into sentences"
print(predict_sentence_boundaries(text))



## Metric 1: Simple Sentence-Paragraph Relevancy (Cosine Similarity)

In [10]:
%%capture
# 2: Longformer Model
tokenizer_lf = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model_lf = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [11]:
# 3: Tokenization
paragraph_tokens = tokenizer_lf(paragraph, return_tensors='pt')
sentence_tokens = [tokenizer_lf(sentence, return_tensors='pt') for sentence in sentences_segmented]

### Embedding Explanation
The [CLS] (classification) token is often used in transformer models to represent the overall meaning or summary of the input sequence. By extracting its embedding, you're essentially obtaining a representation that captures the main point or essence of the paragraph.

In [12]:
# 4: Embedding
with torch.no_grad(): # Disable gradient computation for efficiency
    paragraph_embedding = model_lf(**paragraph_tokens).last_hidden_state[:, 0, :]  # Get the [CLS] token embedding
    sentence_embeddings = [model_lf(**tokens).last_hidden_state[:, 0, :] for tokens in sentence_tokens]

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


In [13]:
# 5: Relevance scores
relevance_scores = [torch.cosine_similarity(paragraph_embedding, sentence_embedding).item() for sentence_embedding in sentence_embeddings]


temperature = 0.0002
relevance_scores_softmax = F.softmax(torch.tensor(relevance_scores)/temperature, dim=0)

In [14]:
# print(relevance_scores_softmax * 100)
relevance_scores_softmax = relevance_scores_softmax * 100

np.set_printoptions(formatter={'float': lambda x: f"{x:.2g}"})

In [15]:
# 6: Display Results

df = pd.DataFrame({"Index": range(len(sentences_segmented)), "Score": relevance_scores, "Sentence": sentences_segmented })

df.sort_values(by=['Score'], ascending=False, inplace=True)

display(df)

Unnamed: 0,Index,Score,Sentence
13,13,0.990917,"""But I try very hard not to go that way becaus..."
9,9,0.990096,"""Hopefully none of you will be reading about it."""
15,15,0.989744,Watch I-Reporter give her review of Potter's l...
19,19,0.989673,"Earlier this year, he made his stage debut pla..."
2,2,0.989543,"""I don't plan to be one of those people who, a..."
12,12,0.989525,"""People are always looking to say 'kid star go..."
5,5,0.989461,"At 18, Radcliffe will be able to gamble in a c..."
8,8,0.989144,"""I'll definitely have some sort of party,"" he ..."
21,21,0.989084,E-mail to a friend .
4,4,0.989071,"""The things I like buying are things that cost..."


## Metric 2: Intra-sentence relevancy
Score by if current sentence is needded by adjacent sentences.

In [16]:

from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


sentences = sentences_segmented

# Store predictions for each sentence
predictions = []

# Iterate through sentence pairs
for i in range(len(sentences) - 1):
    sentence1 = sentences[i]
    sentence2 = sentences[i + 1]

    # Tokenize and prepare input
    inputs = tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding=True, add_special_tokens=True)

    # Get model prediction
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()

    # Store prediction
    predictions.append(prediction)

# Handle last sentence (no next sentence)
predictions.append(0)  # Assume last sentence doesn't need a next sentence

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
df.sort_values(by=['Index'], ascending=True, inplace=True)

# Add predictions to DataFrame
df = df.assign(**{"Previous Sentence Needed": predictions})

display(df)

Unnamed: 0,Index,Score,Sentence,Previous Sentence Needed
0,0,0.986152,"LONDON, England (Reuters) -- Harry Potter star...",0
1,1,0.988646,"Daniel Radcliffe as Harry Potter in ""Harry Pot...",0
2,2,0.989543,"""I don't plan to be one of those people who, a...",0
3,3,0.987571,"""I don't think I'll be particularly extravagant.",0
4,4,0.989071,"""The things I like buying are things that cost...",0
5,5,0.989461,"At 18, Radcliffe will be able to gamble in a c...",0
6,6,0.988843,Details of how he'll mark his landmark birthda...,0
7,7,0.988864,His agent and publicist had no comment on his ...,0
8,8,0.989144,"""I'll definitely have some sort of party,"" he ...",0
9,9,0.990096,"""Hopefully none of you will be reading about it.""",0


# Audio

## Loading

In [None]:
# Extract audio (wav) from video
filename_video = "teamwork in the classroom.mov"

filename_base = os.path.splitext(filename_video)[0]
audio_output = filename_base + ".wav"

filename_input = os.path.join(path_dataset, filename_video)
audio_output = os.path.join(path_dataset, audio_output)

!ffmpeg -y -i "$filename_input" -vn -acodec copy "$audio_output"

# Download if necessary
# from google.colab import files
# files.download(os.path.join('/content', audio_output))