# Importing and downloading necessary libraries

In [1]:
import numpy as np
import pandas as pd
import re
import os
import random
import string

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.spatial.distance import cosine

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt models behind
# sent_tokenize/word_tokenize, and the stop-word corpus read by
# stopwords.words('english') in preprocess_input_text.
nltk.download('punkt')
nltk.download('stopwords')

# Importing a dataset containing gold summaries (for reference)

In [None]:
# Load the BBC articles together with their reference ("gold") summaries and
# preview the first rows.
df = pd.read_csv(
    "../input/bbc-articles/BBCarticles_csv.csv",
    encoding="unicode_escape",
)
df.head()

# Preprocessing the dataset

In [None]:
# Drop incomplete rows. reset_index() is called without drop=True, so the old
# index is kept as a new leading "index" column (the positional lookups in the
# sampling cell depend on that layout).
df = df.dropna().reset_index()
# Flatten embedded line breaks in both text columns.
for column in ('Text', 'Summary'):
    df[column] = df[column].apply(lambda cell: cell.replace('\n',' '))
df.head()

# Setting up inputs

In [None]:
# Pick a random article. randrange() excludes its upper bound, so the result is
# always a valid row position; randint(0, n) can return n itself, which makes
# iloc raise IndexError.
rand = random.randrange(df.shape[0])
print(rand)
# Positional columns: 2 is read as the article text, 1 as its reference summary.
sample_text = df.iloc[rand,2]
gold_summary = df.iloc[rand,1]
print("\nText: ", sample_text)
print("\nGold Summary: ", gold_summary)

# sample_text= "Tanjiro Kamado is a kind-hearted and intelligent boy who lives with his family in the mountains. He became his family's breadwinner after his father's death, making trips to the nearby village to sell charcoal. Everything changed when he came home one day to discover that his family was attacked and slaughtered by a demon. Tanjiro and his sister Nezuko were the sole survivors of the incident, with Nezuko being transformed into a demon, but still surprisingly showing signs of human emotion and thought. After an encounter with Giyū Tomioka, a demon slayer, Tanjiro is recruited by Giyū and sent to his retired master Sakonji Urokodaki for training to also become a demon slayer, beginning his quest to help his sister turn into human again and avenge the death of his family. After two years of strenuous training, Tanjiro takes part in a formidable exam and is one of the few survivors to pass, officially making him a member of the Demon Slayer Corps. He begins his work of hunting down and slaying demons alongside Nezuko, who has been hypnotized to bring no harm to humans and who occasionally helps him in battle. One of Tanjiro's assignments brings him to Asakusa where he encounters Muzan Kibutsuji, the progenitor of all demons and the one who murdered his family. He also meets Tamayo, a demon who is free of Muzan's control. Tamayo allies with Tanjiro and begins to develop a cure for Nezuko, though it will require Tanjiro to supply her with blood from the Twelve Kizuki, the most powerful demons under Muzan's command."

# Preprocessing Input Text

In [None]:
def preprocess_input_text(text):
    """Split *text* into sentences and normalise each one.

    Every sentence is word-tokenized, stripped of punctuation tokens and
    English stop words, and lower-cased.

    Args:
        text: Raw document text.

    Returns:
        A list with one space-joined string per sentence returned by
        ``sent_tokenize(text)``, in document order. A sentence whose tokens
        are all filtered out yields an empty string instead of being dropped,
        so the list stays index-aligned with ``sent_tokenize(text)``.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)
    preprocessed_sentences = []
    for sent in sent_tokenize(text):
        kept = []
        for token in word_tokenize(sent):
            # Drop tokens made up solely of punctuation characters. A plain
            # ``token in string.punctuation`` is a substring test, which lets
            # multi-character tokens such as "--", "..." or the "``" / "''"
            # quote tokens slip through.
            if all(ch in punctuation_chars for ch in token):
                continue
            lowered = token.lower()
            if lowered not in stop_words:
                kept.append(lowered)
        preprocessed_sentences.append(" ".join(kept))
    return preprocessed_sentences

In [None]:
# Clean the sampled article, then split every cleaned sentence back into its
# tokens; the resulting list of token lists is the Word2Vec training corpus.
preprocessed_sentences = preprocess_input_text(sample_text)
tokenized_words = [word_tokenize(cleaned) for cleaned in preprocessed_sentences]

In [None]:
def sentence_tokenize(text):
    """Return the sentences of *text*, as split by ``sent_tokenize``, in a new list."""
    return list(sent_tokenize(text))

# Define Embedding Model

In [None]:
# Train Word2Vec on the sampled article only. sg=1 selects the skip-gram
# architecture and min_count=1 keeps every token in the vocabulary.
embedding_model = Word2Vec(
    sentences=tokenized_words,
    min_count=1,
    sg=1,
    epochs=1000,
)

# Calculate TF-IDF Scores

In [None]:
def calculate_tf_idf(sentences, centroid_limit=0.3):
    """Pick the centroid words of a document from its TF-IDF weights.

    Each sentence is treated as one document. The TF-IDF weight of every
    vocabulary term is summed over all sentences, the sums are scaled so that
    the heaviest term scores 1.0, and the terms scoring strictly above
    *centroid_limit* are returned.

    Args:
        sentences: Sentence strings to analyse (already preprocessed by the
            caller in this notebook).
        centroid_limit: Relative weight a term must exceed to be kept.
            Defaults to 0.3, the value that used to be hard-coded.

    Returns:
        A list of the selected vocabulary terms, in the vectorizer's
        feature order.
    """
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)
    # norm=None / smooth_idf=False: raw tf * idf weights, no row normalisation.
    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix).toarray()

    # Column sums give one aggregate weight per term; divide by the largest so
    # the threshold is relative to the document's heaviest term.
    term_weights = tfidf.sum(0)
    term_weights = np.divide(term_weights, term_weights.max())

    feature_names = vectorizer.get_feature_names_out()
    relevant_vector_indices = np.where(term_weights > centroid_limit)[0]
    return list(np.array(feature_names)[relevant_vector_indices])

# Populating word vectors (with word embeddings)

In [None]:
def word_vectors_cache(sentences, embedding_model):
    """Collect the embedding vector of every word occurring in *sentences*.

    Args:
        sentences: Sentence strings; each one is split with ``word_tokenize``.
        embedding_model: Model whose ``wv[word]`` lookup returns the vector of
            a word (the trained Word2Vec model in this notebook).

    Returns:
        A dict mapping each word to its vector. Words the model has no vector
        for are left out instead of raising ``KeyError``;
        ``build_embedding_representation`` already ignores words that are
        missing from this cache.
    """
    word_vectors = {}
    for sent in sentences:
        for w in word_tokenize(sent):
            if w in word_vectors:
                continue  # vector already cached for this word
            try:
                word_vectors[w] = embedding_model.wv[w]
            except KeyError:
                # Out-of-vocabulary word (e.g. the model was trained on a
                # different text): skip it rather than abort the summary.
                continue
    return word_vectors

# Sentence embedding representation with sum of word vectors

In [None]:
def build_embedding_representation(words, word_vectors, embedding_model):
    """Average the cached vectors of *words* into a single embedding.

    Args:
        words: Words to represent; those absent from *word_vectors* are ignored.
        word_vectors: Mapping from word to its embedding vector.
        embedding_model: Only its ``vector_size`` is read, to size the result.

    Returns:
        The element-wise mean of the vectors found (repeated words count each
        time they occur), or a float32 zero vector of length ``vector_size``
        when none of the words is in *word_vectors*.
    """
    found = [word_vectors[token] for token in words if token in word_vectors]

    averaged = np.zeros(embedding_model.vector_size, dtype="float32")
    for vector in found:
        averaged = averaged + vector
    if found:
        averaged = np.divide(averaged, len(found))
    return averaged

# Cosine Similarity

In [None]:
def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors, rescaled from [-1, 1] to [0, 1].

    Identical directions score 1.0, orthogonal vectors 0.5 and opposite
    directions 0.0. If either vector has no non-zero component the cosine is
    undefined and 0.0 is returned.
    """
    if np.count_nonzero(vector1) == 0 or np.count_nonzero(vector2) == 0:
        return 0.0
    # scipy's cosine() is a distance (1 - similarity); convert it back first.
    similarity = 1 - cosine(vector1, vector2)
    return (similarity + 1) / 2

# Generating Extractive Summary

In [None]:
def generate_summary(text, embedding_model):
    """Score every sentence of *text* by its similarity to the document centroid.

    The centroid is the averaged embedding of the words selected by
    ``calculate_tf_idf``; each sentence is represented by the averaged
    embedding of its preprocessed words.

    Args:
        text: Raw document text.
        embedding_model: Embedding model handed on to the vector helpers.

    Returns:
        A list of ``(position, raw_sentence, score, sentence_vector)`` tuples,
        sorted by score from highest to lowest.
    """
    original = sentence_tokenize(text)
    cleaned = preprocess_input_text(text)
    centroid_words = calculate_tf_idf(cleaned)
    vectors = word_vectors_cache(cleaned, embedding_model)
    centroid = build_embedding_representation(centroid_words, vectors, embedding_model)

    scored = []
    # Both lists come from the same sentence split, so they pair up one-to-one.
    for position, (raw, clean) in enumerate(zip(original, cleaned)):
        vector = build_embedding_representation(clean.split(), vectors, embedding_model)
        similarity = calculate_cosine_similarity(vector, centroid)
        scored.append((position, raw, similarity, vector))

    scored.sort(key=lambda entry: entry[2], reverse=True)
    return scored

# Removing redundancy

In [None]:
def remove_redundancy(sentence_scores, limit, limit_type):
    """Build the summary from the best-scoring, mutually dissimilar sentences.

    Sentences are taken in the given order (best first). A sentence is skipped
    when its vector is more than 0.95 similar to a sentence already selected.
    Selection stops once the budget is used up.

    Args:
        sentence_scores: Tuples ``(position, sentence, score, vector)`` ordered
            from best to worst, as returned by ``generate_summary``.
        limit: Budget, counted in the unit given by *limit_type*.
        limit_type: ``"word"`` counts whitespace-separated words of the
            selected sentences (the sentence that reaches the budget is kept
            whole, so the summary can exceed it); ``"sentence"`` counts
            selected sentences. With any other value nothing is counted, so
            every non-redundant sentence is kept.

    Returns:
        The selected sentences in document order, separated by single spaces.
    """
    count = 0
    sentences_summary = []
    for s in sentence_scores:
        # ">=" so that a limit of N sentences yields N sentences, not N + 1.
        if count >= limit:
            break
        # any() stops at the first near-duplicate instead of scanning them all.
        is_redundant = any(
            calculate_cosine_similarity(s[3], ps[3]) > 0.95
            for ps in sentences_summary
        )
        if is_redundant:
            continue
        sentences_summary.append(s)
        if limit_type == "word":
            count += len(s[1].split())
        elif limit_type == "sentence":
            count += 1

    # Restore document order once, after selection is finished.
    sentences_summary.sort(key=lambda el: el[0])
    # Join with a space so consecutive sentences do not run into each other.
    return " ".join(s[1] for s in sentences_summary)

# Get summary

In [None]:
# Summary budgets, in words and in sentences.
word_limit = 100
sentence_limit = 5

# Size of the sampled article, for comparison with the budgets above.
words = sample_text.split()
word_count = len(words)
print("Number of sentences: ", len(preprocessed_sentences))
print("Word count: ", word_count)

In [None]:
# Rank every sentence against the document centroid, then keep the best
# non-redundant ones within the sentence budget.
sentence_scores = generate_summary(sample_text, embedding_model)
extractive_summary = remove_redundancy(
    sentence_scores,
    limit=sentence_limit,
    limit_type="sentence",
)

# Print Summaries

In [None]:
# Show the source article, its reference summary and the generated extractive
# summary one after another for a side-by-side read.
print("\nText:  ", sample_text)
print("\nGold Summary: ", gold_summary)
print("\nExtractive Summary: ", extractive_summary)