# Libraries & Packages

In [None]:
!pip install rouge

In [None]:
!pip install sentence_transformers

In [4]:
import nltk
import string
import string
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from gensim.test.utils import common_texts
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from rouge import Rouge
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:
from transformers import AutoTokenizer, AutoModelWithLMHead
tokenizer=AutoTokenizer.from_pretrained('T5-base')
model=AutoModelWithLMHead.from_pretrained('T5-base', return_dict=True)


# Data and Methodology

In [5]:
# Clean the srt and get a clean text
def clean_text(subs):
    clean_subtitles = []
    for sub in subs:
        sub = sub.lower()
        sub = sub.translate(str.maketrans('', '', string.punctuation.replace('.', '')))
        sub = ''.join([i for i in sub if not i.isdigit()])
        clean_subtitles.append(sub)
    cleaned_text = ' '.join(clean_subtitles)
    return cleaned_text.replace("  ", " ")

In [6]:
# Split into sentences
def split_into_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences


In [7]:
# Tokenize sentences
def tokenize_sentences(sentences):
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    return tokenized_sentences


In [8]:
# Convert sentences into vectors and Use TF-IDF to find keyword scores
def vectorize_sentences(tokenized_sentences):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
    tfidf_matrix = tfidf_vectorizer.fit_transform(tokenized_sentences)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names


In [9]:
def vectorize_sentences2(tokenized_sentences):
    # Create TaggedDocument objects for Doc2Vec
    tagged_data = [TaggedDocument(words=sentence, tags=[str(i)]) for i, sentence in enumerate(tokenized_sentences)]

    # Train a Doc2Vec model
    model = Doc2Vec(vector_size=100, window=2, min_count=1, workers=4, epochs=20)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

    # Create vectors for each sentence
    doc2vec_vectors = [model.infer_vector(sentence) for sentence in tokenized_sentences]

    return doc2vec_vectors


In [10]:
def vectorize_sentences3(text):
  # Try the T5 embbeding
  inputs=tokenizer.encode("sumarize: " + text, return_tensors='pt', max_length=512, truncation=True)
  output = model.generate(inputs, min_length=80, max_length=100)
  summary=tokenizer.decode(output[0])
  return summary

In [11]:
# Create a similarity matrix
def create_similarity_matrix(tfidf_matrix):

    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix


In [12]:
# Create a graph and Use PageRank to rank nodes (sentences)
def rank_sentences(similarity_matrix):
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    return scores, nx_graph


In [13]:
# Create a graph and Use HITS to rank nodes (sentences)
def rank_sentences_hits(similarity_matrix):
    # Create a graph from the similarity matrix
    nx_graph = nx.from_numpy_array(similarity_matrix)

    # Apply the HITS algorithm
    hub_scores, authority_scores = nx.hits(nx_graph)

    # You can choose to use either hub or authority scores for ranking
    scores = hub_scores

    return scores


In [14]:
# Output the top k sentences
def summarize_text(scores, sentences, k):
    top_sentence_indices = sorted(((scores[i], i) for i in range(len(scores))), reverse=True)[:k]
    top_sentence_indices.sort(key=lambda x: x[1])
    summary = ' '.join([sentences[i] for score, i in top_sentence_indices])
    return summary


In [15]:
# With tfidf and rank scentences

def tfidf_summary(cleaned_text, summary_size):
    sentences = split_into_sentences(cleaned_text)
    tokenized_sentences = tokenize_sentences(sentences)

    tfidf_matrix, names = vectorize_sentences(tokenized_sentences)

    similarity_matrix = create_similarity_matrix(tfidf_matrix)
    scores, graph = rank_sentences(similarity_matrix)

    # Generate a summary of the specified size
    summary = summarize_text(scores, sentences, summary_size)
    return summary

In [16]:
# With doc2vec and HITS

def doc2vec_summary(cleaned_text, summary_size):
    sentences = split_into_sentences(cleaned_text)
    tokenized_sentences = tokenize_sentences(sentences)

    doc2vec_matrix = vectorize_sentences2(tokenized_sentences)

    similarity_matrix = create_similarity_matrix(doc2vec_matrix)

    scores = rank_sentences_hits(similarity_matrix)

    # Generate a summary of the specified size
    summary3 = summarize_text(scores, sentences, summary_size)

    return summary3


In [102]:
# Encode the sentences usinf SentenceTransformer model
def vectorize_sentences_4(sentences):
  emb_list = []
  for i in sentences:
    emb_list.append(model.encode(i))
  return emb_list


In [138]:
# Summarize with the transformer approach
def transformer_summrizer(cleaned_text):
  sentences = split_into_sentences(cleaned_text)

  transformer_matrix = vectorize_sentences_4(sentences)

  similarity_matrix = create_similarity_matrix(transformer_matrix)
  scores = rank_sentences_hits(similarity_matrix)

  # Generate a summary of the specified size
  summary = summarize_text(scores, sentences, 5)
  # Some summaries are corrupted so we need to make sure
  if len(summary) < 20:
    return "NaN"
  return summary

In [156]:
def fast_text_vectorization(scentences):
  ft_l = []
  model = FastText(scentences, vector_size=128, window=5, min_count=3, epochs=10, seed=42, sg=1)
  for i in scentences:
    ftext = model.wv
    ft_l.append(ftext[i])

  return ft_l


In [165]:

def fast_text_summary(clean_text):
  sentences = split_into_sentences(clean_text)

  ft_matrix = fast_text_vectorization(sentences)

  similarity_matrix = create_similarity_matrix(ft_matrix)
  scores = rank_sentences_hits(similarity_matrix)
  summary = summarize_text(scores, sentences, 5)

  return summary


### Generate Summaries after cleaning...

In [19]:
# Read the data
data = pd.read_csv("/content/mr_beast.csv")
print(data.shape)
data.head()

(523, 2)


Unnamed: 0,text,summary
0,- I hit six million subscribers. When I hit fi...,"When I hit five million subscribers, I gave fi..."
1,Nice day. So I live with my brown zone. So I s...,It took him like 30 seconds to pick his move a...
2,"Ah, sorry, dude, I started tilting. Redo it, r...","""I've actually never seen these people before ..."
3,"What's up, guys? I just wanted to do a quick c...",I just wanted to do a quick commentary to talk...
4,"Off a talking about Black Ops three, you see, ...",The only games I'm gonna be buying is Black Op...


In [20]:
# Clean the subtitle
data["text"] = data["text"].apply(lambda x: clean_text(x.split()))
data["summary"] = data["summary"].apply(lambda x: clean_text(x.split()))

data.head()

Unnamed: 0,text,summary
0,i hit six million subscribers. when i hit fiv...,when i hit five million subscribers i gave fiv...
1,nice day. so i live with my brown zone. so i s...,it took him like seconds to pick his move and ...
2,ah sorry dude i started tilting. redo it redo ...,ive actually never seen these people before in...
3,whats up guys i just wanted to do a quick comm...,i just wanted to do a quick commentary to talk...
4,off a talking about black ops three you see th...,the only games im gonna be buying is black ops...


# Getting the summary for each method of vectorization as a column in the data

#### Doc2Vec...

In [None]:
# Iterate through the data and generate summaries while handling exceptions
for index, row in data.iterrows():
    try:
        summary = doc2vec_summary(row['text'], 5)
        data.at[index, 'summary_doc2vec'] = summary
    except ValueError as e:
        # Handle the exception, e.g., print an error message
        print(f"Error for subtitle at index {index}: {e}")
        # Drop the row with the problematic subtitle
        data = data.drop(index)


#### TF-IDF...

In [None]:
# Apply the function to create summaries with variable sizes
data['summary_tfidf'] = data.apply(lambda row: tfidf_summary(row['text'], 5), axis=1)

#### T5...

In [37]:
# Iterate through the data and generate summaries while handling exceptions
for index, row in data[505:].iterrows():
    try:
        summary = vectorize_sentences3(row['text'])
        data.at[index, 't5_summary'] = summary
    except ValueError as e:
        # Handle the exception, e.g., print an error message
        print(f"Error for subtitle at index {index}: {e}")
        # Drop the row with the problematic subtitle
        data = data.drop(index)


In [44]:
# Drop the NaN values
data.dropna(inplace=True)
data.isna().sum()

text               0
summary            0
summary_doc2vec    0
summary_tfidf      0
t5_summary         0
dtype: int64

In [40]:
print(data.shape)
data.head()

(522, 5)


Unnamed: 0,text,summary,summary_doc2vec,summary_tfidf,t5_summary
0,i hit six million subscribers. when i hit fiv...,when i hit five million subscribers i gave fiv...,you know i showed up i dumped a couple million...,. . . . .,<pad> i didnt make my millionth subscriber ear...
1,nice day. so i live with my brown zone. so i s...,it took him like seconds to pick his move and ...,he predicted me to try to give magic bounces s...,i did not set up the reflect and i was going t...,<pad><extra_id_0> this is really stall ii.<ext...
2,ah sorry dude i started tilting. redo it redo ...,ive actually never seen these people before in...,were pranking people that ive never met before...,im sorry i just do it over xored. i dont you k...,<pad> ah sorry dude i started tilting. redo it...
3,whats up guys i just wanted to do a quick comm...,i just wanted to do a quick commentary to talk...,yesterday i uploaded a trolling montage where ...,and yeah so pretty much. i thought it was pret...,<pad> i just wanted to do a quick commentary t...
4,off a talking about black ops three you see th...,the only games im gonna be buying is black ops...,seriously here not alia youre not t. martins j...,ive been at basketball camp to pass like three...,<pad> black ops three is one of my favorite ga...


In [45]:
# Clean the T5 summary
def clean_summary(summary):
    # Remove <pad>, <extra_id_1>, and other unwanted tags

    cleaned_summary = re.sub(r'<pad>|<extra_id_\d+>', '', summary)
    # Remove extra whitespaces
    cleaned_summary = ' '.join(cleaned_summary.split())

    return cleaned_summary

# Apply the cleaning function to the 't5_summary' column in your DataFrame
data['t5_summary'] = data['t5_summary'].apply(clean_summary)


In [52]:
# Filter the data from the corruption
filtered_data = data[data['t5_summary'].apply(lambda x: len(x) > 100)]
filtered_data.shape

(517, 5)

#### Transformer...

In [141]:
# Apply the function to create summaries with variable sizes
data['transformer_summary'] = data.apply(lambda row: transformer_summrizer(row['text']), axis=1)

In [None]:
# Drop the NaN
filtered_indices = data[data['transformer_summary'].apply(lambda x: x == 'NaN')].index
data.drop(filtered_indices, inplace=True)

#### FastText...

In [None]:
# Apply the function to create summaries with variable sizes
data['fast_text_summary'] = data.apply(lambda row: fast_text_summary(row['text']), axis=1)

In [167]:
data.head()

Unnamed: 0,text,summary,summary_doc2vec,summary_tfidf,t5_summary,transformer_summary,fast_text_summary
1,nice day. so i live with my brown zone. so i s...,it took him like seconds to pick his move and ...,he predicted me to try to give magic bounces s...,i did not set up the reflect and i was going t...,this is really stall ii. i was like nice day. ...,he predicted me to try to give magic bounces s...,he predicted me to try to give magic bounces s...
2,ah sorry dude i started tilting. redo it redo ...,ive actually never seen these people before in...,were pranking people that ive never met before...,im sorry i just do it over xored. i dont you k...,ah sorry dude i started tilting. redo it redo ...,i dont you know that is whos people um yeah. w...,im sorry i just do it over xored. ok yep so im...
3,whats up guys i just wanted to do a quick comm...,i just wanted to do a quick commentary to talk...,yesterday i uploaded a trolling montage where ...,and yeah so pretty much. i thought it was pret...,i just wanted to do a quick commentary to talk...,ive been playing all day and i just didnt get ...,so im going to keep playing tonight and tomorr...
4,off a talking about black ops three you see th...,the only games im gonna be buying is black ops...,seriously here not alia youre not t. martins j...,ive been at basketball camp to pass like three...,black ops three is one of my favorite games of...,off a talking about black ops three you see th...,off a talking about black ops three you see th...
5,you whats up guys before we get into the topic...,today were going to be youtube a.k.a. heres fi...,but i just randomly thought about that like wh...,this is just something that i see occasionally...,a video about a car wreck is being aired on yo...,this is just something that i see occasionally...,so it made the guy in front of me think that i...


# Rouge Comaprison

## Calculate the Rouge For TFIDF

In [53]:
rouge_tfidf = Rouge()
scores = []
for index, row in filtered_data.iterrows():
    hypothesis = row['summary']
    reference = row['summary_tfidf']
    try:

      # Calculate ROUGE scores
      score = rouge_tfidf.get_scores(hypothesis, reference)
      scores.append(score)
    except RecursionError as e:
      print("RecursionError: Maximum recursion depth exceeded. ROUGE scores calculation ignored.")

In [54]:
avg_rouge_tfidf = {
    "rouge-1": {
        "f": sum(score[0]['rouge-1']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-1']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-1']['r'] for score in scores) / len(scores),
    },
    "rouge-2": {
        "f": sum(score[0]['rouge-2']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-2']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-2']['r'] for score in scores) / len(scores),
    }
}
tfidf_avg_rouge = pd.DataFrame(avg_rouge_tfidf)

In [55]:
tfidf_avg_rouge

Unnamed: 0,rouge-1,rouge-2
f,0.278163,0.137758
p,0.325172,0.177857
r,0.275488,0.119796


## Calculate the Rouge For Doc2Vec

In [56]:
rouge_doc2vec = Rouge()
scores = []
for index, row in filtered_data.iterrows():
    hypothesis = row['summary']
    reference = row['summary_doc2vec']
    try:
      # Calculate ROUGE scores
      score = rouge_doc2vec.get_scores(hypothesis, reference)
      scores.append(score)
    except RecursionError as e:
      print("RecursionError: Maximum recursion depth exceeded. ROUGE scores calculation ignored.")

In [57]:
avg_rouge_doc2vec = {
    "rouge-1": {
        "f": sum(score[0]['rouge-1']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-1']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-1']['r'] for score in scores) / len(scores),
    },
    "rouge-2": {
        "f": sum(score[0]['rouge-2']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-2']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-2']['r'] for score in scores) / len(scores),
    }
}
doc2vec_avg_rouge = pd.DataFrame(avg_rouge_doc2vec)

In [58]:
doc2vec_avg_rouge

Unnamed: 0,rouge-1,rouge-2
f,0.324874,0.142302
p,0.420695,0.194106
r,0.276855,0.118902


## Calculate the Rouge For T5

In [62]:
rouge_t5 = Rouge()
scores = []
for index, row in filtered_data.iterrows():
    hypothesis = row['summary']
    reference = row['t5_summary']
    try:
      # Calculate ROUGE scores
      score = rouge_t5.get_scores(hypothesis, reference)
      scores.append(score)
    except RecursionError as e:
      print("RecursionError: Maximum recursion depth exceeded. ROUGE scores calculation ignored.")

In [63]:
avg_rouge_t5 = {
    "rouge-1": {
        "f": sum(score[0]['rouge-1']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-1']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-1']['r'] for score in scores) / len(scores),
    },
    "rouge-2": {
        "f": sum(score[0]['rouge-2']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-2']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-2']['r'] for score in scores) / len(scores),
    }
}
t5_avg_rouge = pd.DataFrame(avg_rouge_t5)

In [64]:
t5_avg_rouge

Unnamed: 0,rouge-1,rouge-2
f,0.371217,0.196883
p,0.350029,0.191089
r,0.423908,0.217118


## Calculate the Rouge For SenteneTransformer

In [151]:
rouge_transformer = Rouge()
scores = []
for index, row in data.iterrows():
    hypothesis = row['summary']
    reference = row['transformer_summary']
    try:
      # Calculate ROUGE scores
      score = rouge_transformer.get_scores(hypothesis, reference)
      scores.append(score)
    except RecursionError as e:
      print("RecursionError: Maximum recursion depth exceeded. ROUGE scores calculation ignored.")

In [152]:
avg_rouge_transformer = {
    "rouge-1": {
        "f": sum(score[0]['rouge-1']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-1']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-1']['r'] for score in scores) / len(scores),
    },
    "rouge-2": {
        "f": sum(score[0]['rouge-2']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-2']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-2']['r'] for score in scores) / len(scores),
    }
}
transformer_avg_rouge = pd.DataFrame(avg_rouge_transformer)

In [153]:
transformer_avg_rouge

Unnamed: 0,rouge-1,rouge-2
f,0.39087,0.206566
p,0.43734,0.253593
r,0.380516,0.185136


## Calculate the Rouge For FasText

In [168]:
rouge_fast_text = Rouge()
scores = []
for index, row in data.iterrows():
    hypothesis = row['summary']
    reference = row['fast_text_summary']
    try:
      # Calculate ROUGE scores
      score = rouge_fast_text.get_scores(hypothesis, reference)
      scores.append(score)
    except RecursionError as e:
      print("RecursionError: Maximum recursion depth exceeded. ROUGE scores calculation ignored.")

In [169]:
avg_rouge_fast_text = {
    "rouge-1": {
        "f": sum(score[0]['rouge-1']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-1']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-1']['r'] for score in scores) / len(scores),
    },
    "rouge-2": {
        "f": sum(score[0]['rouge-2']['f'] for score in scores) / len(scores),
        "p": sum(score[0]['rouge-2']['p'] for score in scores) / len(scores),
        "r": sum(score[0]['rouge-2']['r'] for score in scores) / len(scores),
    }
}
fast_text_avg_rouge = pd.DataFrame(avg_rouge_fast_text)

In [170]:
fast_text_avg_rouge

Unnamed: 0,rouge-1,rouge-2
f,0.388317,0.194147
p,0.463742,0.250577
r,0.355428,0.169194
