# Installation & Setup

In [1]:
%%capture
!pip install nltk==3.9.1
!pip install transformers==4.45.2
# !pip install transformers==4.27.4
!pip install datasets==3.0.2
!pip install srt==3.5.3
!pip install gdown
!apt install ffmpeg

In [15]:
import os
import numpy as np
import pandas as pd
import tarfile
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import LongformerTokenizer, LongformerModel, LongformerForSequenceClassification, RobertaTokenizer, RobertaForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import load_dataset
import torch.nn.functional as F
import gdown
import srt

In [None]:
# Notebook config
os.environ["WANDB_DISABLED"] = "true"

In [3]:
%%capture
# NLTK
nltk.download('punkt')
nltk.download('punkt_tab')
# nltk.download('treebank')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


# Datasets

In [17]:
# Local Dataset
path_dataset = "dataset/"
filename_video = "teamwork in the classroom.mov"
filename_subtitles = "teamwork in the classroom.srt"

folder_id = '1k7DLJPl1xz9lpU4l3dZYtPe1XawhrXeC'
gdown.download_folder(id=folder_id, quiet=False, use_cookies=False)

# Subtitles:
with open(path_dataset + filename_subtitles, "r", encoding="utf-8") as f:
    subtitles = list(srt.parse(f.read()))

# Other: Simple
paragrah_simple = "Renewable energy is crucial for reducing carbon emissions.  \
Solar power, in particular, is sustainable and abundant. Interestingly, \
solar panels were first invented in 1954. With continued advancements, \
solar energy is becoming more accessible in everyday life."

# Other: CNN/Daily Mail
# dataset_news = load_dataset("cnn_dailymail", "3.0.0")
# paragraph_news = dataset_news['train']['article'][0]
# summary_news = dataset_news['train']['highlights'][0]

Retrieving folder contents


Processing file 1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp assessing students without exams.mov
Processing file 1i8XfB6AOly9l0vsdrGXz2FBs0OHxhFeP assessing students without exams.srt
Processing file 1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR flipped learning basics.mov
Processing file 1O5Q1jZLdbmzpZdkytc3byg-xMSkGAs7u flipped learning basics.srt
Processing file 1wslcvTNd88FQMXJgvGbKR3sjwXORR6xt teamwork in the classroom.mov
Processing file 10-kRFMxQqrI7j0K7dR6dBRBLSgpbjgUO teamwork in the classroom.srt


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp
From (redirected): https://drive.google.com/uc?id=1vuIW3CVm2p_Ig-_srJ5sIwUxzTGqdGHp&confirm=t&uuid=6d2b37b8-9f98-40ce-b9c6-313b61b3333b
To: /content/dataset/assessing students without exams.mov
100%|██████████| 875M/875M [00:17<00:00, 51.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1i8XfB6AOly9l0vsdrGXz2FBs0OHxhFeP
To: /content/dataset/assessing students without exams.srt
100%|██████████| 17.1k/17.1k [00:00<00:00, 27.6MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR
From (redirected): https://drive.google.com/uc?id=1OP3zzSmpKJ0RDPasl9AGQD2yNeCEXtoR&confirm=t&uuid=f0fa3518-4c94-4771-93ac-fa4ed4558833
To: /content/dataset/flipped learning basics.mov
100%|██████████| 399M/399M [00:03<00:00, 123MB/s]
Downloading...
From: htt

### SRT
each **`subtitle`** in the subtitles array has the following properties:

1. **`index`**
   - The sequential number of the subtitle within the SRT file.
   - `1`, `2`, `3`, etc. (Integer)
2. **`start`**
   - The time (in milliseconds) when the subtitle should appear on the screen.
   - `00:00:05,000` (String representing HH:MM:SS,SSS)
3. **`end`**
   - The time (in milliseconds) when the subtitle should disappear from the screen.
   - `00:00:10,000` (String representing HH:MM:SS,SSS)
4. **`content`**
   - The actual text of the subtitle that will be displayed.
   - "Hello, world!" (String)
5. **`proprietary`**
   - This field holds any additional data or formatting specific to the SRT file or software used to create it. Often empty and can usually be ignored.
   - `''` (Empty string, or sometimes contains specific formatting codes)

# Text: Preprocessing

In [6]:
# Intermediate exploration

# 'proprietary' field can be safely ignored"
proprietary_values = sum([len(subtitle.proprietary) for subtitle in subtitles])
print(proprietary_values) # returns 0

0


## Parameters

### Paragraph
combination of all subtitle parts.

In [7]:
# Paragraph
paragraph = " ".join([subtitle.content for subtitle in subtitles])

In [8]:
paragraph

"hello this is Leno Kia and today I want to talk to you about a very important topic challenging topic teamwork in the classroom so why is teamwork in the classroom so important well for our students it allows them to develop a bunch of new skills right uh communication skills leadership skills Etc also when you're working with a team you get different perspectives ideally you are part of a team that has uh people with different genders people with uh different a in different age groups people with different academic backgrounds right so when you're talking to them you get all this fresh perspectives that inform your uh the task that you're trying to solve uh also teams will motivate you they will support you you will feel empowered by them ideally right this is like kind the thing that should happen and also this is how the world works right pretty much everything we do we need to do it as a team now here's the thing about teamwork in the classroom students hate it but why do they hat

### Sentences
Individual sentences from the paragraph.

#### Challenges:
* Imposing/Detecting punctuation
* Incorrect words parsed to transcript

In [None]:
# Load pre-trained tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=2)  # 2 labels: boundary (1), no boundary (0)

In [21]:
# Load OntoNotes dataset
dataset_ontonotes = load_dataset("conll2003", trust_remote_code=True)
dataset = dataset_ontonotes

In [36]:
dataset_ontonotes['train'][0]
# dataset_ontonotes

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [23]:
# Preprocess function to create sentence boundary labels
def preprocess_sentence_boundary(examples):
    inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for tokens in examples["tokens"]:
        sentence_labels = []
        for i, token in enumerate(tokens):
            # Assume a sentence boundary if it's the last token in a sentence
            label = 1 if (i == len(tokens) - 1) else 0
            sentence_labels.append(label)
        labels.append(sentence_labels)

    inputs["labels"] = labels
    return inputs

In [24]:
# Apply preprocessing to the train and validation sets
train_dataset = dataset["train"].map(preprocess_sentence_boundary, batched=True)
valid_dataset = dataset["validation"].map(preprocess_sentence_boundary, batched=True)

In [None]:
train_dataset[5]

In [38]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./roberta-ontonotes-sbd",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

In [49]:
# Fine-tune the model
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch

# Load the fine-tuned model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("path/to/your/fine-tuned-model")
model = RobertaForTokenClassification.from_pretrained("path/to/your/fine-tuned-model")

# Input paragraph
text = "This is the first sentence This is the second sentence And this is the third one"

# Tokenize the input
inputs = tokenizer(text, return_tensors="pt", truncation=True)
outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=2).squeeze()

# Map predictions back to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
sentences = []
sentence = []

for token, prediction in zip(tokens, predictions):
    if prediction.item() == 1:  # If it's a boundary
        sentence.append(token)
        sentences.append(" ".join(sentence))
        sentence = []
    else:
        sentence.append(token)

# Append the last sentence if exists
if sentence:
    sentences.append(" ".join(sentence))

# Print sentences
for i, sent in enumerate(sentences):
    print(f"Sentence {i + 1}: {sent.strip()}")

In [50]:
# Global approach

import torch
from transformers import LongformerTokenizer, LongformerForSequenceClassification


# Load Longformer tokenizer and model
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")

# Ensure model is in evaluation mode
model.eval()

# Define the input text
text = (
    "hello this is Leno Kia and today I want to talk to you about a very important topic challenging topic teamwork in the classroom "
    "so why is teamwork in the classroom so important well for our students it allows them to develop a bunch of new skills right "
    "uh communication skills leadership skills Etc also when you're working with a team you get different perspectives ideally you are "
    "part of a team that has uh people with different genders people with uh different a in different age groups people with different "
    "academic backgrounds right so when you're talking to them you get all this fresh perspectives that inform your uh the task that "
    "you're trying to solve uh also teams will motivate you they will support you you will feel empowered by them ideally right this is "
    "like kind the thing that should happen and also this is how the world works right pretty much everything we do we need to do it as a "
    "team now here's the thing about teamwork in the classroom students hate it but why do they hate it well these are the things that I can "
    "identify they don't know their teammates or or they do know their their teammates but they don't like them uh they're also concerned about "
    "an even workload right I'm going to be working all night while this guy is like doing something else uh that is not teamwork and things "
    "like that they're also concerned about their grades they're concerned because a lot of it is out of their control right it doesn't matter "
    "how hard I work if my teammates are not doing enough work it's going to impact my my own grade and so yeah all of these things are legitimate "
    "concerns so what can we do as professors these are the things that I that I do I explain to my students why teamwork is so important I tell "
    "them about my own life experiences I am positive but realistic when I talk about teamwork I acknowledge that it's challenging and then once "
    "I have created teams it's important to assign class time for team building exercises now if this team is going to work together for 10 minutes "
    "every week then maybe you just need a a short uh ice breaker right so that they know each other and they can start working together if you are "
    "going to have this team work together for the entire semester it doesn't hurt to use one of your 2-hour lectures uh for stronger longer more "
    "comprehensive ensive team building exercise and the first time you give your students a task as a team make it a low stake uh task right "
    "something that doesn't really impact their grade or if it does it's tiny it's it's minimal and ideally the first time they work together as a "
    "team they do this in person working together in person is always uh creates a stronger connection than if you do this virtually now this is "
    "the best thing that I have learned about teamwork I had this class and my students were working in teams and after a few weeks I saw that this "
    "team was working really well together and so I I approached them and say uh why do you get along so well as a team and their answer was we had "
    "dinner together and so this is my advice to every team go and grab a coffee go and you know get lunch together something like that is going to "
    "help you connect uh as a team you're going to see everybody as a person and you're going to know each other and yeah it's going to be better "
    "trust me now when I create teams uh if it's a short collaboration and by this I mean they're going to be working together about 10 minutes every "
    "every class then you can randomly assign them and especially if it's a large class it's going to be a good strategy sometimes especially after "
    "they've been they know each other for a month or so you can also let them um you know choose their own teammates and and yeah just pick your "
    "team right if it's a long collaboration sometimes self- selection works I have a final project and I tell them you get to pick whoever you you "
    "can work with anybody that you want right and so that works but sometimes it's important that you choose a team based on the skills that the team "
    "has or based on their interest right so you need to determine when is a good idea for self uh selection or when is a good idea to think of um "
    "interest or skills when when created a team depends on the goals of of that uh exercise of that project things like that now the size of the team "
    "again this is not what you should do this is what I do right for short collaborations uh usually I create uh four member teams and this is also "
    "because of the structure of the classrooms where I work there's the tables with four chairs right uh so makes sense that uh we don't need to disrupt "
    "the classroom destroy things uh Move Around furniture too much but if it is a one month or a one semester project I usually create teams of two or "
    "three members I find that it's easy to identify what each member has uh has done uh it's easy also for them to for communication purposes and this "
    "is what has worked for me but what are your strategies I you know I would love to learn I'm not an expert in team building right I'm just sharing "
    "what I've done what has worked for me and what I have learned from my students but I would love to hear your experiences your strategies your uh "
    "solutions to this uh teamwork issue and uh yeah please reach out I would love to hear from you thank you so much I am Leno Kia see you next time"
)

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)

# Move tensors to the same device as the model (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Predict sentence boundaries (this is a mock prediction)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Process the logits (assuming binary classification for sentence boundary)
# Here, you'll have to apply your sentence boundary logic based on the logits
# For simplicity, we consider logits > 0 as boundary indications
predicted_labels = torch.argmax(logits, dim=-1).cpu().numpy()

# Split text into sentences based on predicted labels
# Note: This is a mock processing; you'll need a proper method to determine boundaries
sentences = []
start_idx = 0

for i, label in enumerate(predicted_labels[0]):
    if label == 1:  # Assuming 1 indicates a boundary
        sentences.append(text[start_idx:inputs['input_ids'][0][i].item()])
        start_idx = inputs['input_ids'][0][i].item()

# Add the last segment if any text remains
if start_idx < len(text):
    sentences.append(text[start_idx:])

# Print the segmented sentences
for i, sentence in enumerate(sentences):
    print(f"Sentence {i + 1}: {sentence.strip()}")


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


TypeError: 'numpy.int64' object is not iterable

In [9]:
# Legacy segmentation when sentences were already perfect

# Segmentation
# sentences_segmented = sent_tokenize(sentences)
# print(len(sentences_segmented), '\n', sentences_segmented)

## Metric 1: Simple Sentence-Paragraph Relevancy (Cosine Similarity)

In [None]:
%%capture
# 2: Longformer Model
tokenizer_lf = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model_lf = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [None]:
# 3: Tokenization
paragraph_tokens = tokenizer_lf(paragraph, return_tensors='pt')
sentence_tokens = [tokenizer_lf(sentence, return_tensors='pt') for sentence in sentences_segmented]

### Embedding Explanation
The [CLS] (classification) token is often used in transformer models to represent the overall meaning or summary of the input sequence. By extracting its embedding, you're essentially obtaining a representation that captures the main point or essence of the paragraph.

In [None]:
# 4: Embedding
with torch.no_grad(): # Disable gradient computation for efficiency
    paragraph_embedding = model_lf(**paragraph_tokens).last_hidden_state[:, 0, :]  # Get the [CLS] token embedding
    sentence_embeddings = [model_lf(**tokens).last_hidden_state[:, 0, :] for tokens in sentence_tokens]

In [None]:
# 5: Relevance scores
relevance_scores = [torch.cosine_similarity(paragraph_embedding, sentence_embedding).item() for sentence_embedding in sentence_embeddings]


temperature = 0.0002
relevance_scores_softmax = F.softmax(torch.tensor(relevance_scores)/temperature, dim=0)

In [None]:
# print(relevance_scores_softmax * 100)
relevance_scores_softmax = relevance_scores_softmax * 100

np.set_printoptions(formatter={'float': lambda x: f"{x:.2g}"})

In [None]:
# 6: Display Results

df = pd.DataFrame({"Index": range(len(sentences_segmented)), "Score": relevance_scores, "Sentence": sentences_segmented })

df.sort_values(by=['Score'], ascending=False, inplace=True)

display(df)

## Metric 2: Intra-sentence relevancy
Score by if current sentence is needded by adjacent sentences.

In [None]:

from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


sentences = sentences_segmented

# Store predictions for each sentence
predictions = []

# Iterate through sentence pairs
for i in range(len(sentences) - 1):
    sentence1 = sentences[i]
    sentence2 = sentences[i + 1]

    # Tokenize and prepare input
    inputs = tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding=True, add_special_tokens=True)

    # Get model prediction
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits).item()

    # Store prediction
    predictions.append(prediction)

# Handle last sentence (no next sentence)
predictions.append(0)  # Assume last sentence doesn't need a next sentence

In [None]:
df.sort_values(by=['Index'], ascending=True, inplace=True)

# Add predictions to DataFrame
df = df.assign(**{"Previous Sentence Needed": predictions})

display(df)

# Audio

## Loading

In [None]:
# Extract audio (wav) from video
filename_video = "teamwork in the classroom.mov"

filename_base = os.path.splitext(filename_video)[0]
audio_output = filename_base + ".wav"

filename_input = os.path.join(path_dataset, filename_video)
audio_output = os.path.join(path_dataset, audio_output)

!ffmpeg -y -i "$filename_input" -vn -acodec copy "$audio_output"

# Download if necessary
# from google.colab import files
# files.download(os.path.join('/content', audio_output))