In [15]:
%%capture
!pip install nltk==3.9.1
!pip install transformers==4.45.2
!pip install datasets==3.0.2

In [46]:
import os
import pandas as pd
import tarfile
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import LongformerTokenizer, LongformerModel
from datasets import load_dataset

In [16]:
%%capture
# Datasets
dataset_news = load_dataset("cnn_dailymail", "3.0.0")

# NLTK
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# # TVSum Dataset

# # Dataset Download: TVSum
# !wget -O TVSum.tgz https://people.csail.mit.edu/yalesong/tvsum/tvsum50_ver_1_1.tgz

# # Extract the dataset
# with tarfile.open("TVSum.tgz", 'r:gz') as tar_ref:
#     tar_ref.extractall("./")


# !ls ydata-tvsum50-v1_1

# # Path to the extracted dataset (adjust if needed)
# dataset_path = "ydata-tvsum50-v1_1"

# # Example: List the video files in the dataset
# video_files = [f for f in os.listdir(dataset_path) ]
# # video_files = [f for f in os.listdir(dataset_path) if f.endswith(".mp4")]
# print("Video files found:", video_files)

# Text

In [33]:
# Datasets

# Simple: short hand-written paragraph (the variable name keeps its original
# spelling so any cell that already refers to it keeps working).
paragrah_simple = (
    "Renewable energy is crucial for reducing carbon emissions.  "
    "Solar power, in particular, is sustainable and abundant. Interestingly, "
    "solar panels were first invented in 1954. With continued advancements, "
    "solar energy is becoming more accessible in everyday life."
)

# CNN/Daily News: first example of the training split.
# Index the row first and the column second: the reverse order
# (dataset_news['train']['article'][0]) loads the entire column just to read
# a single element.
_first_news_example = dataset_news['train'][0]
paragraph_news = _first_news_example['article']
summary_news = _first_news_example['highlights']

In [39]:
# 0: Parameters
# NOTE(review): `paragraph` was originally annotated as the "unsummarized" text,
# yet it is assigned the highlights (summary_news), not the article
# (paragraph_news) -- confirm which reference text is intended.
paragraph = summary = summary_news  # both names currently hold the summarized highlights
sentences = paragraph_news  # full article text used as the sentence source

In [40]:
# 1: Text Processing - Segmentation
# Split the text into sentences, then print the sentence count followed by the list.
sentences_segmented = sent_tokenize(sentences)
print(f"{len(sentences_segmented)} \n {sentences_segmented}")

24 
 ["LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him.", 'Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties.', '"I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month.', '"I don\'t think I\'ll be particularly extravagant.', '"The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs."', 'At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box off

In [41]:
%%capture
# 2: Longformer Model
tokenizer_lf = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model_lf = LongformerModel.from_pretrained('allenai/longformer-base-4096')

In [42]:
# 3: Tokenization
# truncation=True caps each input at the tokenizer's maximum length, so an
# over-long text is cut instead of failing in the model's forward pass.
# Inputs that already fit are tokenized exactly as before.
paragraph_tokens = tokenizer_lf(paragraph, return_tensors='pt', truncation=True)
sentence_tokens = [
    tokenizer_lf(sentence, return_tensors='pt', truncation=True)
    for sentence in sentences_segmented
]

### Embedding Explanation
The [CLS] (classification) token is often used in transformer models to represent the overall meaning of the input sequence. By extracting its embedding, we obtain a single vector that summarizes the main point of the paragraph. Note that Longformer uses a RoBERTa-style tokenizer, so the token at position 0 is `<s>`, which plays the role of [CLS].

In [43]:
# 4: Embedding
# No gradients are needed here, so both forward passes run under no_grad.
with torch.no_grad():
    # Keep only position 0 of the last hidden state (the leading special token).
    paragraph_hidden = model_lf(**paragraph_tokens).last_hidden_state
    paragraph_embedding = paragraph_hidden[:, 0, :]

    sentence_embeddings = []
    for tokens in sentence_tokens:
        sentence_hidden = model_lf(**tokens).last_hidden_state
        sentence_embeddings.append(sentence_hidden[:, 0, :])

In [44]:
# 5: Relevance scores
# Cosine similarity between the paragraph embedding and each sentence
# embedding, collected as plain Python floats.
relevance_scores = []
for sentence_embedding in sentence_embeddings:
    similarity = torch.cosine_similarity(paragraph_embedding, sentence_embedding)
    relevance_scores.append(similarity.item())

In [45]:
# Show the raw relevance scores as a plain list.
print(relevance_scores)

[0.9945123791694641, 0.9955788850784302, 0.9938606023788452, 0.9924793839454651, 0.9923685789108276, 0.9952806234359741, 0.9940185546875, 0.9938303828239441, 0.9937145113945007, 0.9941849708557129, 0.9953868985176086, 0.9941660165786743, 0.9941638112068176, 0.9943624138832092, 0.9950680732727051, 0.9957815408706665, 0.9938970804214478, 0.9941508173942566, 0.9942107200622559, 0.994347095489502, 0.9928677678108215, 0.9937551021575928, 0.9920448064804077, 0.9908921718597412]


In [48]:
# 6: Display Results

# One row per sentence, ordered from the highest to the lowest score.
# "Sentence #" records each sentence's position in sentences_segmented.
df = pd.DataFrame(
    {
        "Score": relevance_scores,
        "Sentence": sentences_segmented,
        "Sentence #": range(len(sentences_segmented)),
    }
).sort_values(by=['Score'], ascending=False)

display(df)

Unnamed: 0,Score,Sentence,Sentence #
15,0.995782,Watch I-Reporter give her review of Potter's l...,15
1,0.995579,"Daniel Radcliffe as Harry Potter in ""Harry Pot...",1
10,0.995387,Radcliffe's earnings from the first five Potte...,10
5,0.995281,"At 18, Radcliffe will be able to gamble in a c...",5
14,0.995068,"His latest outing as the boy wizard in ""Harry ...",14
0,0.994512,"LONDON, England (Reuters) -- Harry Potter star...",0
13,0.994362,"""But I try very hard not to go that way becaus...",13
19,0.994347,"Earlier this year, he made his stage debut pla...",19
18,0.994211,"He will also appear in ""December Boys,"" an Aus...",18
9,0.994185,"""Hopefully none of you will be reading about it.""",9
