In [1]:
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import heapq
import numpy as np

#nltk.download('punkt')
#nltk.download('stopwords')

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
# Pseudocode functions
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def F_first_K_Sents(text, f):
    return text[:f]

def SelectImpSentences(scores, N, sentences):
    selected_indices = heapq.nlargest(N, scores, key=scores.get)
    selected_sentences = [sentences[i] for i in selected_indices]
    return [sentence for sublist in selected_sentences for sentence in sublist]


def extractiveApproach(dataset, f, N):
    output_dataset = []
    for input_text in dataset:
        important_sentences = []
        tokenized_sents = nltk.sent_tokenize(input_text)
        tokenized_sents = [nltk.word_tokenize(sent) for sent in tokenized_sents]
        tokenized_sents = [remove_stopwords(tokens) for tokens in tokenized_sents]
        first_k_sents = F_first_K_Sents(tokenized_sents, f)
        
        sentence_embeddings = [np.mean(np.array([token.vector for token in nlp(' '.join(tokens))]), axis=0) for tokens in first_k_sents]
        similarity_matrix = cosine_similarity(sentence_embeddings, sentence_embeddings)
        
        for i, sent in enumerate(first_k_sents):
            scores = {j: similarity_matrix[i][j] for j in range(len(first_k_sents))}
            important_sentences.append(sent)
        
        selected_sentences = SelectImpSentences(scores, N, first_k_sents)
        summary = ' '.join(selected_sentences)


        
        output_dataset.append(summary)
    
    return output_dataset

In [4]:
# Example usage
dataset = [
    "Hi,my name is Annarhysa Albert",
    "Mumbai is my favourite place",
    "Listen to me carefully"
]

f = 2  # Number of sentences to consider
N = 5  # Number of words per sentence

summarized_dataset = extractiveApproach(dataset, f, N)
for summary in summarized_dataset:
    print(summary)


Hi , name Annarhysa Albert
Mumbai favourite place
Listen carefully


In [5]:
import pandas as pd

df = pd.read_csv("./Data/test.csv")
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [6]:
df = df[:1000]

In [7]:
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [8]:
summarized_dataset = extractiveApproach(df['highlights'], f, N)

df["Summary"] = summarized_dataset

df.head()

Unnamed: 0,id,article,highlights,Summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...


In [9]:
df.drop(columns = ["id", "article"])

Unnamed: 0,highlights,Summary
0,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...
1,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin..."
2,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...
4,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...
...,...,...
995,Transport for London used actors in the uncomf...,Encourages women report sexual harassment publ...
996,WARNING: GRAPHIC CONTENT .\nThe week-long fest...,"week-long festival marks trial , crucifixion r..."
997,Floyd Mayweather and Manny Pacquiao fight in L...,men nearing end respective training camps . Fl...
998,ComRes survey for ITV shows Ukip falling behin...,Latest series polls showing losing ground Tori...


In [10]:
df.to_csv('summarized-data.csv', index = False)

In [11]:
#calculating the rouge score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def calculate_rouge_scores(generated_summaries, reference_summaries):
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    for generated, reference in zip(generated_summaries, reference_summaries):
        # Tokenize the generated and reference summaries
        gen_tokens = word_tokenize(generated)
        ref_tokens = word_tokenize(reference)
        
        # Calculate ROUGE-N scores
        rouge_1_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(1, 0, 0), smoothing_function=SmoothingFunction().method1))
        rouge_2_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(0.5, 0.5, 0), smoothing_function=SmoothingFunction().method1))
        
        # Calculate ROUGE-L score
        rouge_l_scores.append(sentence_bleu([ref_tokens], gen_tokens, weights=(0, 1, 0), smoothing_function=SmoothingFunction().method1))

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_rouge_1, avg_rouge_2, avg_rouge_l

# Example usage
generated_summaries = df['Summary']
reference_summaries = df['highlights']

rouge_1, rouge_2, rouge_l = calculate_rouge_scores(generated_summaries, reference_summaries)
print("ROUGE-1 Score:", rouge_1)
print("ROUGE-2 Score:", rouge_2)
print("ROUGE-L Score:", rouge_l)

ROUGE-1 Score: 0.2442227234029551
ROUGE-2 Score: 0.19392719940715814
ROUGE-L Score: 0.1549063430371625
