In [None]:
!pip install nltk==3.9.1
!pip install transformers==4.45.2

In [3]:
import os
import tarfile
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import LongformerTokenizer, LongformerModel

In [12]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# # TVSum Dataset

# # Dataset Download: TVSum
# !wget -O TVSum.tgz https://people.csail.mit.edu/yalesong/tvsum/tvsum50_ver_1_1.tgz

# # Extract the dataset
# with tarfile.open("TVSum.tgz", 'r:gz') as tar_ref:
#     tar_ref.extractall("./")


# !ls ydata-tvsum50-v1_1

# # Path to the extracted dataset (adjust if needed)
# dataset_path = "ydata-tvsum50-v1_1"

# # Example: List the video files in the dataset
# video_files = [f for f in os.listdir(dataset_path) ]
# # video_files = [f for f in os.listdir(dataset_path) if f.endswith(".mp4")]
# print("Video files found:", video_files)

# Text

In [24]:
# 1: Text Processing - Segmentation
paragraph = "Renewable energy is crucial for reducing carbon emissions. Solar power, in particular, is sustainable and abundant. Interestingly, solar panels were first invented in 1954. With continued advancements, solar energy is becoming more accessible in everyday life."
sentences = sent_tokenize(paragraph)
print(len(sentences), '\n', sentences)

4 
 ['Renewable energy is crucial for reducing carbon emissions.', 'Solar power, in particular, is sustainable and abundant.', 'Interestingly, solar panels were first invented in 1954.', 'With continued advancements, solar energy is becoming more accessible in everyday life.']


In [6]:
# 2: Longformer Model
tokenizer_lf = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model_lf = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [14]:
# 3: Tokenization
paragraph_tokens = tokenizer_lf(paragraph, return_tensors='pt')
sentence_tokens = [tokenizer_lf(sentence, return_tensors='pt') for sentence in sentences]

In [18]:
# 4: Embedding
with torch.no_grad(): # Disable gradient computation for efficiency
    paragraph_embedding = model_lf(**paragraph_tokens).last_hidden_state[:, 0, :]  # Get the [CLS] token embedding
    sentence_embeddings = [model_lf(**tokens).last_hidden_state[:, 0, :] for tokens in sentence_tokens]

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


The [CLS] (classification) token is often used in transformer models to represent the overall meaning or summary of the input sequence. By extracting its embedding, you're essentially obtaining a representation that captures the main point or essence of the paragraph.

In [25]:
# 5: Relevance scores
relevance_scores = [torch.cosine_similarity(paragraph_embedding, sentence_embedding).item() for sentence_embedding in sentence_embeddings]

In [26]:
print(relevance_scores)

[0.9989673495292664, 0.9986081719398499, 0.9978553056716919, 0.9988285303115845]
