# **Importing Libraries**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import re
import string
import spacy
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
try:
    import contractions
except ImportError:
    !pip install contractions
    import contractions

from contractions import fix  # Ensure contractions library is installed
from contractions import fix  # Assuming you have a function to expand contractions
# Load English language model for lemmatization
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


# **Loading Dataset**

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
dataset_path = '/content/drive/My Drive/NLP DATASET/YT_Summary_100_ROW.csv'


`data` contains YouTube video transcript chunks and their reference summaries.

In [None]:
# Read the transcript/summary CSV; `data` is the raw frame before any cleaning
data = pd.read_csv(dataset_path)
data.shape

(102, 3)

In [None]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
data.head(5)

Unnamed: 0,video_key,chunks,summary
0,pjoQdz0nxf4,A ruthless murder was committed. Someone kille...,"During the last days of the Cretaceous period,..."
1,pjoQdz0nxf5,The ancient continents almost resembled the wo...,"Before the asteroid impact, the Deccan Traps, ..."
2,pjoQdz0nxf6,"so, after the initial warming, a period of coo...","After the initial warming, a period of cooling..."
3,pjoQdz0nxf7,"If you want to see what this was like, we made...",Deccan Traps went on expelling tens of trillio...
4,pjoQdz0nxf8,"Until recently, many scientists thought that t...",Project crates from KiwiCo are like a little a...


In [None]:
data.shape

(102, 3)

# **Preprocessing**

### **Checking For Null Values**

In [None]:
# Work on an independent copy: a plain `df = data` only creates a second name for
# the same object, so cleaning `df` in place would silently change `data` as well.
df = data.copy()
df.shape

(102, 3)

In [None]:
df.isnull().sum()  # count the number of missing values (NaNs) in each column of a DataFrame df.

video_key    2
chunks       2
summary      2
dtype: int64

In [None]:
# Drop every row that has a missing value. Rebinding `df` (instead of inplace=True)
# leaves the frame held in `data` untouched, and the fresh 0..n-1 index keeps `df`
# label-aligned with the summary frames that are built with a default RangeIndex.
df = df.dropna().reset_index(drop=True)

In [None]:
df.nunique()  #used to count the number of unique values in each column of a DataFrame df.

video_key    100
chunks       100
summary      100
dtype: int64

In [None]:
df.info()  #used to get a concise summary of a DataFrame

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   video_key  100 non-null    object
 1   chunks     100 non-null    object
 2   summary    100 non-null    object
dtypes: object(3)
memory usage: 3.1+ KB


In [None]:
df.shape

(100, 3)

In [None]:
# df2 is a one-column frame holding only the transcript text ("chunks"),
# carrying the same row index as df.
df2 = df[['chunks']].copy()
df2.shape

(100, 1)

In [None]:
df2.head(2)

Unnamed: 0,chunks
0,A ruthless murder was committed. Someone kille...
1,The ancient continents almost resembled the wo...


In [None]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   chunks  100 non-null    object
dtypes: object(1)
memory usage: 1.6+ KB


# **Model Selection For Transcript Summarization**

## **A. Extractive Summarization**

### **1. TextRank Algorithm**

In [None]:
!pip install rouge
!pip install bert_score

from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
from rouge import Rouge




In [None]:
# Function to implement TextRank for extractive summarization
def textrank_summarize(text, num_sentences=3, damping=0.50, num_iterations=100):
    """Return the top-ranked sentences of ``text`` using a PageRank-style walk.

    Sentences are converted to bag-of-words count vectors (English stop words
    removed). Two different sentences are linked when their cosine similarity
    is positive, scores are propagated over that unweighted graph for a fixed
    number of iterations, and the best ``num_sentences`` sentences are
    returned in their original order.

    Args:
        text: Document to summarise.
        num_sentences: Maximum number of sentences to keep.
        damping: Share of a sentence's score that is passed on to its
            neighbours in every iteration.
        num_iterations: Number of score-propagation passes.

    Returns:
        str: The selected sentences joined by single spaces. Empty, blank or
        non-string input yields ``''``.
    """
    # Nothing to tokenise: avoids the vectoriser failing on an empty corpus.
    if not isinstance(text, str) or not text.strip():
        return ''

    sentences = sent_tokenize(text)
    n = len(sentences)

    # Every sentence would be selected anyway, so ranking is unnecessary.
    if n <= num_sentences:
        return ' '.join(sentences)

    try:
        X = CountVectorizer(stop_words='english').fit_transform(sentences)
    except ValueError:
        # Empty vocabulary (e.g. only stop words): there is no similarity
        # signal to rank on, so fall back to the leading sentences.
        return ' '.join(sentences[:num_sentences])

    similarity_matrix = cosine_similarity(X, X)

    # neighbours[i] lists every other sentence that shares vocabulary with i.
    neighbours = [
        [j for j in range(n) if j != i and similarity_matrix[i][j] > 0.0]
        for i in range(n)
    ]

    scores = np.ones(n) / n
    for _ in range(num_iterations):
        new_scores = np.ones(n) * (1 - damping) / n
        for i, targets in enumerate(neighbours):
            # Sentence i splits its damped score evenly among its neighbours;
            # a sentence without neighbours passes nothing on.
            for j in targets:
                new_scores[j] += damping * scores[i] / len(targets)
        scores = new_scores

    # Best-scoring sentences, restored to document order for readability.
    ranked_sentences_indices = np.argsort(scores)[::-1][:num_sentences]
    return ' '.join(sentences[i] for i in sorted(ranked_sentences_indices))


In [None]:
# Summarise every transcript chunk with TextRank; df3 keeps the result in one column
df3 = df2['chunks'].apply(textrank_summarize).to_frame(name='textrank_summary')

In [None]:
df3.head(2)

Unnamed: 0,textrank_summary
0,Witnesses say that an Everest-sized asteroid h...
1,"About 800,000 years before the impact, the Dec..."


In [None]:
# ROUGE of the TextRank summaries (hypotheses) against the reference summaries
rouge = Rouge()
rouge_scores = rouge.get_scores(
    df3['textrank_summary'].tolist(), df['summary'].tolist(), avg=True
)

print("ROUGE Scores:")
# Loop variable is `values`, not `score`, so it never overwrites the `score`
# function that the BERTScore cells import from bert_score.
for metric, values in rouge_scores.items():
    print(f"{metric}: {values}")

ROUGE Scores:
rouge-1: {'r': 0.34488464619166914, 'p': 0.2923146126101963, 'f': 0.303760081127223}
rouge-2: {'r': 0.18220415164880438, 'p': 0.13862657634036116, 'f': 0.15034229602571758}
rouge-l: {'r': 0.3157460072024653, 'p': 0.2679065462114868, 'f': 0.27837182020478246}


In [None]:
# BERTScore (precision / recall / F1) of the TextRank summaries
from bert_score import score

# Normalise whitespace: every run of blanks/newlines becomes a single space
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df3['textrank_summary'].map(lambda text: ' '.join(text.split())).tolist()

# Candidates are passed first, references second; the means are taken over all pairs
P, R, F1 = score(generated_summaries, reference_summaries, lang='en', verbose=True)
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 3.28 seconds, 30.52 sentences/sec
BERTScore Precision: 0.855155348777771
BERTScore Recall: 0.8689554333686829
BERTScore F1: 0.8618507385253906


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sample, a list of tokenised references (just one here)
# and a single tokenised hypothesis; tokens are whitespace-separated words.
reference_summaries = [[tokens] for tokens in df['summary'].str.split()]
generated_summaries = df3['textrank_summary'].str.split().tolist()

# Corpus-level BLEU over all summaries
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

print("BLEU Score:", round(bleu_score, 2))




BLEU Score: 0.13


### **2. LexRank Algorithm**

In [None]:
# Function to implement LexRank for extractive summarization
def lexrank_summarize(text, num_sentences=3, damping=0.85, num_iterations=100):
    """Return the top-ranked sentences of ``text`` using a graph-based ranking.

    Sentences are converted to bag-of-words count vectors (English stop words
    removed). Two different sentences are linked when their cosine similarity
    is positive, scores are propagated over that unweighted graph for a fixed
    number of iterations, and the best ``num_sentences`` sentences are
    returned in their original order. The procedure is the same as in
    ``textrank_summarize``; only the default damping factor differs.

    Args:
        text: Document to summarise.
        num_sentences: Maximum number of sentences to keep.
        damping: Share of a sentence's score that is passed on to its
            neighbours in every iteration.
        num_iterations: Number of score-propagation passes.

    Returns:
        str: The selected sentences joined by single spaces. Empty, blank or
        non-string input yields ``''``.
    """
    # Nothing to tokenise: avoids the vectoriser failing on an empty corpus.
    if not isinstance(text, str) or not text.strip():
        return ''

    sentences = sent_tokenize(text)
    n = len(sentences)

    # Every sentence would be selected anyway, so ranking is unnecessary.
    if n <= num_sentences:
        return ' '.join(sentences)

    try:
        X = CountVectorizer(stop_words='english').fit_transform(sentences)
    except ValueError:
        # Empty vocabulary (e.g. only stop words): there is no similarity
        # signal to rank on, so fall back to the leading sentences.
        return ' '.join(sentences[:num_sentences])

    similarity_matrix = cosine_similarity(X, X)

    # neighbours[i] lists every other sentence that shares vocabulary with i.
    neighbours = [
        [j for j in range(n) if j != i and similarity_matrix[i][j] > 0.0]
        for i in range(n)
    ]

    scores = np.ones(n) / n
    for _ in range(num_iterations):
        new_scores = np.ones(n) * (1 - damping) / n
        for i, targets in enumerate(neighbours):
            # Sentence i splits its damped score evenly among its neighbours;
            # a sentence without neighbours passes nothing on.
            for j in targets:
                new_scores[j] += damping * scores[i] / len(targets)
        scores = new_scores

    # Best-scoring sentences, restored to document order for readability.
    ranked_sentences_indices = np.argsort(scores)[::-1][:num_sentences]
    return ' '.join(sentences[i] for i in sorted(ranked_sentences_indices))

In [None]:
# Summarise every transcript chunk with LexRank; df4 keeps the result in one column
df4 = df2['chunks'].apply(lexrank_summarize).to_frame(name='lexrank_summary')

In [None]:
df4.head(2)

Unnamed: 0,lexrank_summary
0,Witnesses say that an Everest-sized asteroid h...
1,"About 800,000 years before the impact, the Dec..."


In [None]:
# ROUGE of the LexRank summaries (hypotheses) against the reference summaries
rouge = Rouge()
rouge_scores = rouge.get_scores(
    df4['lexrank_summary'].tolist(), df['summary'].tolist(), avg=True
)

print("ROUGE Scores:")
# Loop variable is `values`, not `score`, so it never overwrites the `score`
# function that the BERTScore cells import from bert_score.
for metric, values in rouge_scores.items():
    print(f"{metric}: {values}")

ROUGE Scores:
rouge-1: {'r': 0.3510699470282868, 'p': 0.29492346113752516, 'f': 0.307583975617605}
rouge-2: {'r': 0.19320141383253372, 'p': 0.14388958437573007, 'f': 0.158013246706234}
rouge-l: {'r': 0.32198074234920815, 'p': 0.2715829247291931, 'f': 0.2828436084718318}


In [None]:
# BERTScore (precision / recall / F1) of the LexRank summaries
from bert_score import score

# Normalise whitespace: every run of blanks/newlines becomes a single space
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df4['lexrank_summary'].map(lambda text: ' '.join(text.split())).tolist()

# Candidates are passed first, references second; the means are taken over all pairs
P, R, F1 = score(generated_summaries, reference_summaries, lang='en', verbose=True)
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 3.55 seconds, 28.20 sentences/sec
BERTScore Precision: 0.8558818101882935
BERTScore Recall: 0.8699812889099121
BERTScore F1: 0.8627180457115173


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sample, a list of tokenised references (just one here)
# and a single tokenised hypothesis; tokens are whitespace-separated words.
reference_summaries = [[tokens] for tokens in df['summary'].str.split()]
generated_summaries = df4['lexrank_summary'].str.split().tolist()

# Corpus-level BLEU over all summaries
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

print("BLEU Score:", round(bleu_score, 2))



BLEU Score: 0.13


### **3. BERTSUM**

In [None]:
!pip install --upgrade transformers torch

import torch
from transformers import BertTokenizer, BertForPreTraining, BertModel
from transformers import BertTokenizer, BertForPreTraining, BertConfig
from transformers import BertModel, BertConfig
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import BertConfig, BertForPreTraining



In [None]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

def bertsum_extractive_summarize_batch(texts, num_sentences=3):
    """Extractive summary per text: keep the sentences closest to the document centroid.

    Each text is split into sentences. Every sentence is embedded by
    mean-pooling the last hidden states of ``bert-base-uncased`` over its
    non-padding tokens, and sentences are ranked by cosine similarity to the
    mean of all sentence embeddings of that text. The ``num_sentences`` best
    sentences are returned in their original order.

    Note: this uses the plain pre-trained encoder as a sentence embedder; it
    is not a fine-tuned BertSum checkpoint.

    Args:
        texts: Iterable of input strings.
        num_sentences: Maximum number of sentences kept per text.

    Returns:
        list[str]: One summary per input text, in input order.

    Raises:
        ValueError: If ``num_sentences`` is negative.
    """
    if num_sentences < 0:
        raise ValueError("num_sentences must be non-negative")

    texts = list(texts)
    if not texts:
        return []  # nothing to do: skip loading the checkpoint

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    model.eval()

    summaries = []
    for text in texts:
        sentences = sent_tokenize(text)

        # Short texts: every sentence would be selected, no ranking needed.
        if len(sentences) <= num_sentences:
            summaries.append(' '.join(sentences))
            continue

        # Encode all sentences of this text as one padded batch.
        inputs = tokenizer(sentences, return_tensors="pt", max_length=512,
                           truncation=True, padding=True).to(device)
        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state

        # Mean-pool over real tokens only; padding positions are masked out.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        sentence_vectors = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1)

        # Rank sentences by how close they are to the average sentence vector.
        document_vector = sentence_vectors.mean(dim=0, keepdim=True)
        relevance = torch.nn.functional.cosine_similarity(
            sentence_vectors, document_vector, dim=1
        )
        best = torch.topk(relevance, k=num_sentences).indices.tolist()

        # Restore document order so the extract reads naturally.
        summaries.append(' '.join(sentences[i] for i in sorted(best)))
    return summaries

cuda


In [None]:
# Summarise every chunk in df2['chunks'] with the BERT-based extractor and store the result in df5.
# The function already walks the texts one by one, so a single call is enough: the
# tokenizer/model are loaded once instead of once per slice, and the frame is built
# in one go rather than grown with pd.concat inside a loop.
bertsum_summaries = bertsum_extractive_summarize_batch(df2['chunks'].tolist())
df5 = pd.DataFrame({'bertsum_summary': bertsum_summaries})
df5.head(2)

Unnamed: 0,bertsum_summary
0,a ruthless murder was committed. someone kille...
1,the ancient continents almost resembled the wo...


In [None]:
# ROUGE of the BERT-based summaries (hypotheses) against the reference summaries
rouge = Rouge()
rouge_scores = rouge.get_scores(
    df5['bertsum_summary'].tolist(), df['summary'].tolist(), avg=True
)

print("ROUGE Scores:")
# Loop variable is `values`, not `score`, so it never overwrites the `score`
# function that the BERTScore cells import from bert_score.
for metric, values in rouge_scores.items():
    print(f"{metric}: {values}")

ROUGE Scores:
rouge-1: {'r': 0.685866842126585, 'p': 0.19378583376289207, 'f': 0.2947711593830912}
rouge-2: {'r': 0.5379355401446253, 'p': 0.10563649210762208, 'f': 0.17236003730428526}
rouge-l: {'r': 0.6715574066143962, 'p': 0.18834644511437582, 'f': 0.28700616245575167}


In [None]:
from bert_score import score

# Normalise whitespace: every run of blanks/newlines becomes a single space
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df5['bertsum_summary'].map(lambda text: ' '.join(text.split())).tolist()

# Candidates are passed first, references second; the means are taken over all pairs
P, R, F1 = score(generated_summaries, reference_summaries, lang='en', verbose=True)
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 8.72 seconds, 11.47 sentences/sec
BERTScore Precision: 0.8155910968780518
BERTScore Recall: 0.9069635272026062
BERTScore F1: 0.8585578799247742


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sample, a list of tokenised references (just one here)
# and a single tokenised hypothesis; tokens are whitespace-separated words.
reference_summaries = [[tokens] for tokens in df['summary'].str.split()]
generated_summaries = df5['bertsum_summary'].str.split().tolist()

# Corpus-level BLEU over all summaries
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

print("BLEU Score:", round(bleu_score, 2))

BLEU Score: 0.09


## **B. Abstractive Summarization**

### **1. Pegasus**

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

def generate_pegasus_summary_batch(texts, max_length=50, num_beams=5):
    """Summarise each text with the ``google/pegasus-large`` checkpoint.

    Texts are encoded and decoded one at a time with beam search, so the
    result for a text does not depend on how the caller groups the inputs.

    Args:
        texts: Iterable of input strings.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        list[str]: One decoded summary per input text, in input order.
    """
    texts = list(texts)
    if not texts:
        return []  # nothing to do: skip loading the checkpoint

    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
    model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
    model.to(device)

    summaries = []
    for text in texts:
        # Encode and move every tensor of the encoding to the model's device once.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest").to(device)

        # Pass the attention mask explicitly instead of letting generate() infer it.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
        )

        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries




In [None]:
# Summarise every chunk in df2['chunks'] with Pegasus and store the result in df6.
# The function already walks the texts one by one, so a single call is enough: the
# tokenizer/model are loaded once instead of once per slice, and the frame is built
# in one go rather than grown with pd.concat inside a loop.
pegasus_summaries = generate_pegasus_summary_batch(df2['chunks'].tolist())
df6 = pd.DataFrame({'pegasus_summary': pegasus_summaries})

df6.head(2)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Unnamed: 0,pegasus_summary
0,"In the last few years, new evidence has reinvi..."
1,Clouds of ash darkened the sky as rivers of ma...


In [None]:
# ROUGE of the Pegasus summaries (hypotheses) against the reference summaries
rouge = Rouge()
rouge_scores = rouge.get_scores(
    df6['pegasus_summary'].tolist(), df['summary'].tolist(), avg=True
)

print("ROUGE Scores:")
# Loop variable is `values`, not `score`, so it never overwrites the `score`
# function that the BERTScore cells import from bert_score.
for metric, values in rouge_scores.items():
    print(f"{metric}: {values}")



ROUGE Scores:
rouge-1: {'r': 0.24432027378751653, 'p': 0.3461084979753702, 'f': 0.2798119507621808}
rouge-2: {'r': 0.12193589408586442, 'p': 0.1692472960675295, 'f': 0.13948052705571926}
rouge-l: {'r': 0.21992256799504387, 'p': 0.3126112028742016, 'f': 0.2522877424065126}


In [None]:
from bert_score import score

# Normalise whitespace: every run of blanks/newlines becomes a single space
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df6['pegasus_summary'].map(lambda text: ' '.join(text.split())).tolist()

# Candidates are passed first, references second; the means are taken over all pairs
P, R, F1 = score(generated_summaries, reference_summaries, lang='en', verbose=True)
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.93 seconds, 34.18 sentences/sec
BERTScore Precision: 0.8612805008888245
BERTScore Recall: 0.8537490963935852
BERTScore F1: 0.8574093580245972


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sample, a list of tokenised references (just one here)
# and a single tokenised hypothesis; tokens are whitespace-separated words.
reference_summaries = [[tokens] for tokens in df['summary'].str.split()]
generated_summaries = df6['pegasus_summary'].str.split().tolist()

# Corpus-level BLEU over all summaries
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

print("BLEU Score:", round(bleu_score, 2))

BLEU Score: 0.09


### **2. BART (large-sized model) Model**

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

def generate_bart_summary_batch(texts, max_length=100, min_length=30, num_beams=5):
    """Summarise each text with the ``facebook/bart-large-cnn`` checkpoint.

    Texts are encoded (truncated to 1024 tokens) and decoded one at a time
    with beam search, so the result for a text does not depend on how the
    caller groups the inputs.

    Args:
        texts: Iterable of input strings.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        list[str]: One decoded summary per input text, in input order.
    """
    texts = list(texts)
    if not texts:
        return []  # nothing to do: skip loading the checkpoint

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    model.to(device)

    summaries = []
    for text in texts:
        # Encode and move every tensor of the encoding to the model's device once.
        inputs = tokenizer(text, return_tensors="pt", max_length=1024,
                           truncation=True, padding="longest").to(device)

        # Pass the attention mask explicitly instead of letting generate() infer it.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            early_stopping=True,
        )

        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries



In [None]:
# Summarise every chunk in df2['chunks'] with BART and store the result in df7.
# The function already walks the texts one by one, so a single call is enough: the
# tokenizer/model are loaded once instead of once per slice, and the frame is built
# in one go rather than grown with pd.concat inside a loop.
bart_summaries = generate_bart_summary_batch(df2['chunks'].tolist())
df7 = pd.DataFrame({'bart_summary': bart_summaries})

df7.head(2)

Unnamed: 0,bart_summary
0,"66 million years ago, Earth was barely recogni..."
1,"About 800,000 years before the asteroid impact..."


In [None]:
# ROUGE of the BART summaries (hypotheses) against the reference summaries
rouge = Rouge()
rouge_scores = rouge.get_scores(
    df7['bart_summary'].tolist(), df['summary'].tolist(), avg=True
)

print("ROUGE Scores:")
# Loop variable is `values`, not `score`, so it never overwrites the `score`
# function that the BERTScore cells import from bert_score.
for metric, values in rouge_scores.items():
    print(f"{metric}: {values}")



ROUGE Scores:
rouge-1: {'r': 0.517812613961106, 'p': 0.5708036783387799, 'f': 0.5321938192762887}
rouge-2: {'r': 0.42670624325818596, 'p': 0.4645482355537207, 'f': 0.4381618813090851}
rouge-l: {'r': 0.5059744648884523, 'p': 0.5562406853182956, 'f': 0.5198807425152947}


In [None]:
from bert_score import score

# Normalise whitespace: every run of blanks/newlines becomes a single space
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df7['bart_summary'].map(lambda text: ' '.join(text.split())).tolist()

# Candidates are passed first, references second; the means are taken over all pairs
P, R, F1 = score(generated_summaries, reference_summaries, lang='en', verbose=True)
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.91 seconds, 34.37 sentences/sec
BERTScore Precision: 0.9141034483909607
BERTScore Recall: 0.9070139527320862
BERTScore F1: 0.9103907942771912


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sample, a list of tokenised references (just one here)
# and a single tokenised hypothesis; tokens are whitespace-separated words.
reference_summaries = [[tokens] for tokens in df['summary'].str.split()]
generated_summaries = df7['bart_summary'].str.split().tolist()

# Corpus-level BLEU over all summaries
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

print("BLEU Score:", round(bleu_score, 2))


BLEU Score: 0.35


### **3. DistilBART Model**

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd

def generate_distilbart_summary_batch(texts, max_length=70, min_length=10, num_beams=5):
    """Summarise each text with the ``sshleifer/distilbart-cnn-12-6`` checkpoint.

    Texts are encoded (truncated to 1024 tokens) and decoded one at a time
    with beam search, so the result for a text does not depend on how the
    caller groups the inputs.

    Args:
        texts: Iterable of input strings.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        list[str]: One decoded summary per input text, in input order.
    """
    texts = list(texts)
    if not texts:
        return []  # nothing to do: skip loading the checkpoint

    tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
    model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
    model.to(device)

    summaries = []
    for text in texts:
        # Encode and move every tensor of the encoding to the model's device once.
        inputs = tokenizer(text, return_tensors="pt", max_length=1024,
                           truncation=True, padding="longest").to(device)

        # Pass the attention mask explicitly instead of letting generate() infer it.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            early_stopping=True,
        )

        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries


In [None]:
# Summarise every chunk in df2['chunks'] with DistilBART and store the result in df8.
# The function already walks the texts one by one, so a single call is enough: the
# tokenizer/model are loaded once instead of once per slice, and the frame is built
# in one go rather than grown with pd.concat inside a loop.
distilbart_summaries = generate_distilbart_summary_batch(df2['chunks'].tolist())
df8 = pd.DataFrame({'distil_bart_summary': distilbart_summaries})

df8.head(2)

Unnamed: 0,distil_bart_summary
0,Witnesses say an Everest-sized asteroid hit E...
1,"About 800,000 years before the asteroid impac..."


In [None]:
# ROUGE of the DistilBART summaries (hypotheses) against the reference summaries
rouge = Rouge()
rouge_scores = rouge.get_scores(
    df8['distil_bart_summary'].tolist(), df['summary'].tolist(), avg=True
)

print("ROUGE Scores:")
# Loop variable is `values`, not `score`, so it never overwrites the `score`
# function that the BERTScore cells import from bert_score.
for metric, values in rouge_scores.items():
    print(f"{metric}: {values}")

ROUGE Scores:
rouge-1: {'r': 0.4032919896101341, 'p': 0.45176151762725314, 'f': 0.4163443080322462}
rouge-2: {'r': 0.2891039831586818, 'p': 0.3072261136632284, 'f': 0.29276898754460356}
rouge-l: {'r': 0.38590641101498635, 'p': 0.4306788966760689, 'f': 0.3978987727293309}


In [None]:
from bert_score import score

# Normalise whitespace: every run of blanks/newlines becomes a single space
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df8['distil_bart_summary'].map(lambda text: ' '.join(text.split())).tolist()

# Candidates are passed first, references second; the means are taken over all pairs
P, R, F1 = score(generated_summaries, reference_summaries, lang='en', verbose=True)
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.82 seconds, 35.43 sentences/sec
BERTScore Precision: 0.8903689384460449
BERTScore Recall: 0.8837271332740784
BERTScore F1: 0.8869249224662781


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sample, a list of tokenised references (just one here)
# and a single tokenised hypothesis; tokens are whitespace-separated words.
reference_summaries = [[tokens] for tokens in df['summary'].str.split()]
generated_summaries = df8['distil_bart_summary'].str.split().tolist()

# Corpus-level BLEU over all summaries
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

print("BLEU Score:", round(bleu_score, 2))

BLEU Score: 0.23


### **4. Fine-Tuned T5 Small for Text Summarization**

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd

def generate_falconsai_summary_batch(texts, max_length=60, min_length=20, num_beams=5):
    """Summarise each text with the ``Falconsai/text_summarization`` checkpoint.

    Texts are encoded (truncated to 1024 tokens) and decoded one at a time
    with beam search, so the result for a text does not depend on how the
    caller groups the inputs.

    Args:
        texts: Iterable of input strings.
        max_length: ``max_length`` forwarded to ``model.generate``.
        min_length: ``min_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        list[str]: One decoded summary per input text, in input order.
    """
    texts = list(texts)
    if not texts:
        return []  # nothing to do: skip loading the checkpoint

    tokenizer = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
    model = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")
    model.to(device)

    summaries = []
    for text in texts:
        # NOTE(review): T5-style checkpoints are commonly prompted with a
        # "summarize: " prefix; the text is passed as-is here - confirm whether
        # this checkpoint expects the prefix.
        inputs = tokenizer(text, return_tensors="pt", max_length=1024,
                           truncation=True, padding="longest").to(device)

        # Pass the attention mask explicitly instead of letting generate() infer it.
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            early_stopping=True,
        )

        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries


In [None]:
# Summarise every chunk in df2['chunks'] with the Falconsai T5 model and store the result in df9.
# The function already walks the texts one by one, so a single call is enough: the
# tokenizer/model are loaded once instead of once per slice, and the frame is built
# in one go rather than grown with pd.concat inside a loop.
falconsai_summaries = generate_falconsai_summary_batch(df2['chunks'].tolist())
df9 = pd.DataFrame({'falconsai_summary': falconsai_summaries})

df9.head(2)

Unnamed: 0,falconsai_summary
0,"a big asteroid hit Earth, devastated the plane..."
1,apocalypse began quietly and silently. About 8...


In [None]:
# ROUGE of the Falconsai T5 summaries (hypotheses) against the reference summaries
rouge = Rouge()
rouge_scores = rouge.get_scores(
    df9['falconsai_summary'].tolist(), df['summary'].tolist(), avg=True
)

print("ROUGE Scores:")
# Loop variable is `values`, not `score`, so it never overwrites the `score`
# function that the BERTScore cells import from bert_score.
for metric, values in rouge_scores.items():
    print(f"{metric}: {values}")



ROUGE Scores:
rouge-1: {'r': 0.2470866974215159, 'p': 0.3661546436124765, 'f': 0.2857619973562931}
rouge-2: {'r': 0.13443671199652837, 'p': 0.18549651705582573, 'f': 0.15158581147954048}
rouge-l: {'r': 0.2361443575262066, 'p': 0.34906017749260093, 'f': 0.2728266158614223}


In [None]:
from bert_score import score

# Normalise whitespace: every run of blanks/newlines becomes a single space
reference_summaries = df['summary'].map(lambda text: ' '.join(text.split())).tolist()
generated_summaries = df9['falconsai_summary'].map(lambda text: ' '.join(text.split())).tolist()

# Candidates are passed first, references second; the means are taken over all pairs
P, R, F1 = score(generated_summaries, reference_summaries, lang='en', verbose=True)
print("BERTScore Precision:", P.mean().item())
print("BERTScore Recall:", R.mean().item())
print("BERTScore F1:", F1.mean().item())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.94 seconds, 34.00 sentences/sec
BERTScore Precision: 0.8596431016921997
BERTScore Recall: 0.8511035442352295
BERTScore F1: 0.8552549481391907


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, per sample, a list of tokenised references (just one here)
# and a single tokenised hypothesis; tokens are whitespace-separated words.
reference_summaries = [[tokens] for tokens in df['summary'].str.split()]
generated_summaries = df9['falconsai_summary'].str.split().tolist()

# Corpus-level BLEU over all summaries
bleu_score = corpus_bleu(reference_summaries, generated_summaries)

print("BLEU Score:", round(bleu_score, 2))

BLEU Score: 0.1
