In [30]:
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import heapq
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\annar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\annar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [36]:
# Pseudocode functions
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def F_first_K_Sents(text, f):
    return text[:f]

def SelectImpSentences(scores, N, sentences):
    selected_indices = heapq.nlargest(N, scores, key=scores.get)
    selected_sentences = [sentences[i] for i in selected_indices]
    return [sentence for sublist in selected_sentences for sentence in sublist]


def extractiveApproach(dataset, f, N):
    output_dataset = []
    for input_text in dataset:
        important_sentences = []
        tokenized_sents = nltk.sent_tokenize(input_text)
        tokenized_sents = [nltk.word_tokenize(sent) for sent in tokenized_sents]
        tokenized_sents = [remove_stopwords(tokens) for tokens in tokenized_sents]
        first_k_sents = F_first_K_Sents(tokenized_sents, f)
        
        sentence_embeddings = [np.mean(np.array([token.vector for token in nlp(' '.join(tokens))]), axis=0) for tokens in first_k_sents]
        similarity_matrix = cosine_similarity(sentence_embeddings, sentence_embeddings)
        
        for i, sent in enumerate(first_k_sents):
            scores = {j: similarity_matrix[i][j] for j in range(len(first_k_sents))}
            important_sentences.append(sent)
        
        selected_sentences = SelectImpSentences(scores, N, first_k_sents)
        summary = ' '.join(selected_sentences)


        
        output_dataset.append(summary)
    
    return output_dataset

In [37]:
# Example usage
dataset = [
    "Hi,my name is Annarhysa Albert",
    "Mumbai is my favourite place",
    "Listen to me carefully"
]

f = 2  # Number of sentences to consider
N = 5  # Number of words per sentence

summarized_dataset = extractiveApproach(dataset, f, N)
for summary in summarized_dataset:
    print(summary)


Hi , name Annarhysa Albert
Mumbai favourite place
Listen carefully


In [50]:
import pandas as pd

df = pd.read_csv("./Data/test.csv")
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [51]:
df = df[:10]

In [52]:
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [64]:
summarized_dataset = extractiveApproach(df['highlights'], f, N)

df["Summary"] = summarized_dataset

df.head()

Unnamed: 0,id,article,highlights,Summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...


In [76]:
df.drop(columns = ["id", "article"])

Unnamed: 0,highlights,Summary
0,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...
1,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin..."
2,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...
4,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...
5,Giant pig fell into the swimming pool at his h...,took efforts team firefighters winch water . G...
6,Figures show that while millions still tune in...,Average listener spent ten hours week tuning l...
7,"Show will return with a one-hour special, foll...",announced show Monday night `` Jimmy Kimmel Li...
8,Reanne Evans faced Ken Doherty in World Champi...,Doherty world championship 1997 . Reanne Evans...
9,Gang have been jailed for a total of 31 years ...,"Offences happened cars , woods defendants ' ho..."
