In [2]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import pandas
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords   

In [4]:
df = pandas.read_csv("news.csv")

In [5]:
df.head()

Unnamed: 0,title,content,published_at,source,topic
0,BTS: RM is reminded of Bon Voyage as he travel...,"After reaching his hotel in the city, RM revea...",2022-07-30T07:00:00Z,2,13
1,RM recalls wondering if he 'made right decisio...,RM aka Kim Namjoon was the first member to joi...,2022-12-22T15:57:55Z,2,13
2,BTS: J-Hope and RM go bonkers at Billie Eilish...,"Billie Eilish's concert was held in Seoul, Sou...",2022-08-16T07:00:00Z,1,7
3,"BTS: J-Hope proudly states he raised Jungkook,...",BTS ARMY y'all would be missing the members a ...,2022-12-18T13:08:40Z,1,7
4,BTS: Jin aka Kim Seokjin takes us through the ...,BTS member Kim Seokjin aka Jin has the capacit...,2022-11-21T08:00:00Z,1,8


In [6]:
text = df["content"][0]

'After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner. As he sat at a roadside open-air restaurant, RM feasted on beer, burgers and fries. He said, "I\'m starving right now. I\'m out to grab some food. It\'s much quieter than I expected and feels like a rural town. I like the familiar atmosphere." RM attended Art Basel and explained on camera the details of the art fair. He also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap. Showing the pattern of a ping pong table, RM said, "The table looks like our (BTS) symbol." He also spoke about the art pieces as he viewed them. After that, RM took a tram to visit the Foundation Beyeler, a museum. He later took a walk through the city. On his third day, RM visited the Kunstmuseum Basel, the Vitra Design Museum and the gallery. As he walked around, RM showed a chair to his fans and said, "I have breaking news for you guys. Coldplay

In [9]:
# 1 Tokenizing the sentences
sents = sent_tokenize(text)
total_docs = len(sents)
print(sents)

['After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner.', 'As he sat at a roadside open-air restaurant, RM feasted on beer, burgers and fries.', 'He said, "I\'m starving right now.', "I'm out to grab some food.", "It's much quieter than I expected and feels like a rural town.", 'I like the familiar atmosphere."', 'RM attended Art Basel and explained on camera the details of the art fair.', 'He also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap.', 'Showing the pattern of a ping pong table, RM said, "The table looks like our (BTS) symbol."', 'He also spoke about the art pieces as he viewed them.', 'After that, RM took a tram to visit the Foundation Beyeler, a museum.', 'He later took a walk through the city.', 'On his third day, RM visited the Kunstmuseum Basel, the Vitra Design Museum and the gallery.', 'As he walked around, RM showed a chair to his fans and said, "I hav

In [10]:
def Create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [11]:
# 2 Create the Frequency matrix of the words in each sentence.
freq_matrix = Create_frequency_matrix(sents)
print(freq_matrix)

{'After reaching ': {'reach': 1, 'hi': 2, 'hotel': 1, 'citi': 1, ',': 1, 'rm': 1, 'reveal': 1, 'stay': 1, 'would': 2, 'four': 1, 'day': 1, 'ad': 1, 'step': 1, 'dinner': 1, '.': 1}, 'As he sat at a ': {'sat': 1, 'roadsid': 1, 'open-air': 1, 'restaur': 1, ',': 2, 'rm': 1, 'feast': 1, 'beer': 1, 'burger': 1, 'fri': 1, '.': 1}, 'He said, "I\'m s': {'said': 1, ',': 1, '``': 1, "'m": 1, 'starv': 1, 'right': 1, '.': 1}, "I'm out to grab": {"'m": 1, 'grab': 1, 'food': 1, '.': 1}, "It's much quiet": {"'s": 1, 'much': 1, 'quieter': 1, 'expect': 1, 'feel': 1, 'like': 1, 'rural': 1, 'town': 1, '.': 1}, 'I like the fami': {'like': 1, 'familiar': 1, 'atmospher': 1, '.': 1, "''": 1}, 'RM attended Art': {'rm': 1, 'attend': 1, 'art': 2, 'basel': 1, 'explain': 1, 'camera': 1, 'detail': 1, 'fair': 1, '.': 1}, 'He also gave a ': {'also': 1, 'gave': 1, 'glimps': 1, 'noodl': 2, 'beer': 1, 'wa': 1, 'follow': 1, 'soup': 1, 'wrap': 1, '.': 1}, 'Showing the pat': {'show': 1, 'pattern': 1, 'ping': 1, 'pong': 1, 

In [12]:
def Create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [13]:
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = Create_tf_matrix(freq_matrix)
print(tf_matrix)

{'After reaching ': {'reach': 0.06666666666666667, 'hi': 0.13333333333333333, 'hotel': 0.06666666666666667, 'citi': 0.06666666666666667, ',': 0.06666666666666667, 'rm': 0.06666666666666667, 'reveal': 0.06666666666666667, 'stay': 0.06666666666666667, 'would': 0.13333333333333333, 'four': 0.06666666666666667, 'day': 0.06666666666666667, 'ad': 0.06666666666666667, 'step': 0.06666666666666667, 'dinner': 0.06666666666666667, '.': 0.06666666666666667}, 'As he sat at a ': {'sat': 0.09090909090909091, 'roadsid': 0.09090909090909091, 'open-air': 0.09090909090909091, 'restaur': 0.09090909090909091, ',': 0.18181818181818182, 'rm': 0.09090909090909091, 'feast': 0.09090909090909091, 'beer': 0.09090909090909091, 'burger': 0.09090909090909091, 'fri': 0.09090909090909091, '.': 0.09090909090909091}, 'He said, "I\'m s': {'said': 0.14285714285714285, ',': 0.14285714285714285, '``': 0.14285714285714285, "'m": 0.14285714285714285, 'starv': 0.14285714285714285, 'right': 0.14285714285714285, '.': 0.142857142

In [14]:
def Create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [15]:
# 4 creating table for documents per words
count_doc_per_words = Create_documents_per_words(freq_matrix)
print(count_doc_per_words)

{'reach': 1, 'hi': 4, 'hotel': 1, 'citi': 2, ',': 12, 'rm': 14, 'reveal': 1, 'stay': 1, 'would': 1, 'four': 1, 'day': 3, 'ad': 2, 'step': 1, 'dinner': 1, '.': 25, 'sat': 1, 'roadsid': 1, 'open-air': 1, 'restaur': 1, 'feast': 1, 'beer': 2, 'burger': 1, 'fri': 1, 'said': 4, '``': 5, "'m": 3, 'starv': 1, 'right': 1, 'grab': 1, 'food': 1, "'s": 4, 'much': 1, 'quieter': 1, 'expect': 1, 'feel': 1, 'like': 3, 'rural': 1, 'town': 1, 'familiar': 1, 'atmospher': 1, "''": 5, 'attend': 2, 'art': 2, 'basel': 2, 'explain': 1, 'camera': 2, 'detail': 1, 'fair': 1, 'also': 3, 'gave': 1, 'glimps': 1, 'noodl': 1, 'wa': 2, 'follow': 1, 'soup': 1, 'wrap': 1, 'show': 3, 'pattern': 1, 'ping': 1, 'pong': 1, 'tabl': 1, 'look': 1, '(': 1, 'bt': 2, ')': 1, 'symbol': 1, 'spoke': 1, 'piec': 1, 'view': 1, 'took': 2, 'tram': 1, 'visit': 6, 'foundat': 1, 'beyel': 1, 'museum': 5, 'later': 1, 'walk': 3, 'third': 1, 'kunstmuseum': 1, 'vitra': 2, 'design': 2, 'galleri': 1, 'around': 1, 'chair': 2, 'fan': 1, 'break': 1, '

In [16]:
def Create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [17]:
# 5 Calculate IDF and generate a matrix
idf_matrix = Create_idf_matrix(freq_matrix, count_doc_per_words, total_docs)
print(idf_matrix)

{'After reaching ': {'reach': 1.3979400086720377, 'hi': 0.7958800173440752, 'hotel': 1.3979400086720377, 'citi': 1.0969100130080565, ',': 0.31875876262441283, 'rm': 0.2518119729937996, 'reveal': 1.3979400086720377, 'stay': 1.3979400086720377, 'would': 1.3979400086720377, 'four': 1.3979400086720377, 'day': 0.9208187539523752, 'ad': 1.0969100130080565, 'step': 1.3979400086720377, 'dinner': 1.3979400086720377, '.': 0.0}, 'As he sat at a ': {'sat': 1.3979400086720377, 'roadsid': 1.3979400086720377, 'open-air': 1.3979400086720377, 'restaur': 1.3979400086720377, ',': 0.31875876262441283, 'rm': 0.2518119729937996, 'feast': 1.3979400086720377, 'beer': 1.0969100130080565, 'burger': 1.3979400086720377, 'fri': 1.3979400086720377, '.': 0.0}, 'He said, "I\'m s': {'said': 0.7958800173440752, ',': 0.31875876262441283, '``': 0.6989700043360189, "'m": 0.9208187539523752, 'starv': 1.3979400086720377, 'right': 1.3979400086720377, '.': 0.0}, "I'm out to grab": {"'m": 0.9208187539523752, 'grab': 1.39794000

In [18]:
def Create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [19]:
# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = Create_tf_idf_matrix(tf_matrix, idf_matrix)
print(tf_idf_matrix)

{'After reaching ': {'reach': 0.09319600057813585, 'hi': 0.10611733564587669, 'hotel': 0.09319600057813585, 'citi': 0.0731273342005371, ',': 0.021250584174960854, 'rm': 0.016787464866253307, 'reveal': 0.09319600057813585, 'stay': 0.09319600057813585, 'would': 0.1863920011562717, 'four': 0.09319600057813585, 'day': 0.06138791693015835, 'ad': 0.0731273342005371, 'step': 0.09319600057813585, 'dinner': 0.09319600057813585, '.': 0.0}, 'As he sat at a ': {'sat': 0.12708545533382162, 'roadsid': 0.12708545533382162, 'open-air': 0.12708545533382162, 'restaur': 0.12708545533382162, ',': 0.05795613865898415, 'rm': 0.022891997544890873, 'feast': 0.12708545533382162, 'beer': 0.0997190920916415, 'burger': 0.12708545533382162, 'fri': 0.12708545533382162, '.': 0.0}, 'He said, "I\'m s': {'said': 0.11369714533486788, ',': 0.04553696608920183, '``': 0.0998528577622884, "'m": 0.13154553627891075, 'starv': 0.1997057155245768, 'right': 0.1997057155245768, '.': 0.0}, "I'm out to grab": {"'m": 0.2302046884880

In [20]:
def Score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [21]:
# 7 Important Algorithm: score the sentences
sentence_scores = Score_sentences(tf_idf_matrix)
print(sentence_scores)

{'After reaching ': 0.07937079834810307, 'As he sat at a ': 0.09728776505747891, 'He said, "I\'m s': 0.11286341950206036, "I'm out to grab": 0.23229367320602817, "It's much quiet": 0.12474492374479847, 'I like the fami': 0.1766267510252988, 'RM attended Art': 0.12259484029691597, 'He also gave a ': 0.12900218840672756, 'Showing the pat': 0.06437078589349442, 'He also spoke a': 0.17254302202712624, 'After that, RM ': 0.09257800370236059, 'He later took a': 0.18050315154562108, 'On his third da': 0.06751087039765545, 'As he walked ar': 0.06687911786499891, "Coldplay's Chri": 0.10140603409706238, 'If you see this': 0.14300875123103304, "You're amazing.": 0.21842812635500589, 'RM next visited': 0.11067421131654279, 'Recalling his p': 0.05684105835350934, 'He was also rem': 0.06364294234550026, 'Speaking to the': 0.06634625666732458, "RM's travel in ": 0.09453309619736366, 'Next, RM flew t': 0.0817487470274649, 'He then went to': 0.1747425010840047, "RM's vlog ended": 0.09859654598388592}


In [22]:
def Find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [23]:
# 8 Find the threshold
threshold = Find_average_score(sentence_scores)
print(threshold)

0.11716550326709461


In [32]:
def Generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''
    removed_lines = ''
    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1
        else:
          removed_lines+=" "+sentence

    return [summary,removed_lines]

In [34]:
# 9 Important Algorithm: Generate the summary
[summary,removed_lines]= Generate_summary(sents, sentence_scores, 0.6 * threshold)
print(summary)
print()
print(removed_lines)

 After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner. As he sat at a roadside open-air restaurant, RM feasted on beer, burgers and fries. He said, "I'm starving right now. I'm out to grab some food. It's much quieter than I expected and feels like a rural town. I like the familiar atmosphere." RM attended Art Basel and explained on camera the details of the art fair. He also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap. He also spoke about the art pieces as he viewed them. After that, RM took a tram to visit the Foundation Beyeler, a museum. He later took a walk through the city. Coldplay's Chris Martin made a chair and it's displayed in the Vitra Design Museum. If you see this Chris, give me a call. You're amazing." RM next visited Lucerne and hiked to Mount Rigi. RM's travel in Switzerland ended with a visit to the Museum Tinguely. Next, RM flew to Paris to attend t

In [35]:
def Summary(text):
  # 1 Tokenizing the sentences
  sents = sent_tokenize(text)
  total_docs = len(sents)

  # 2 Create the Frequency matrix of the words in each sentence.
  freq_matrix = Create_frequency_matrix(sents)
  
  # 3 Calculate TermFrequency and generate a matrix
  tf_matrix = Create_tf_matrix(freq_matrix)
  
  # 4 creating table for documents per words
  count_doc_per_words = Create_documents_per_words(freq_matrix)
  
  # 5 Calculate IDF and generate a matrix
  idf_matrix = Create_idf_matrix(freq_matrix, count_doc_per_words, total_docs)

  # 6 Calculate TF-IDF and generate a matrix
  tf_idf_matrix = Create_tf_idf_matrix(tf_matrix, idf_matrix)

  # 7 Important Algorithm: score the sentences
  sentence_scores = Score_sentences(tf_idf_matrix)

  # 8 Find the threshold
  threshold = Find_average_score(sentence_scores)

  # 9 Important Algorithm: Generate the summary
  [summary,removed_lines]= Generate_summary(sents, sentence_scores, 0.6 * threshold)

  return [summary,removed_lines]

In [36]:
Summary(text)

[' After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner. As he sat at a roadside open-air restaurant, RM feasted on beer, burgers and fries. He said, "I\'m starving right now. I\'m out to grab some food. It\'s much quieter than I expected and feels like a rural town. I like the familiar atmosphere." RM attended Art Basel and explained on camera the details of the art fair. He also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap. He also spoke about the art pieces as he viewed them. After that, RM took a tram to visit the Foundation Beyeler, a museum. He later took a walk through the city. Coldplay\'s Chris Martin made a chair and it\'s displayed in the Vitra Design Museum. If you see this Chris, give me a call. You\'re amazing." RM next visited Lucerne and hiked to Mount Rigi. RM\'s travel in Switzerland ended with a visit to the Museum Tinguely. Next, RM flew to Paris to

In [28]:
def Get_tfidf_score(text):
  # 1 Tokenizing the sentences
  sents = sent_tokenize(text)
  total_docs = len(sents)

  # 2 Create the Frequency matrix of the words in each sentence.
  freq_matrix = Create_frequency_matrix(sents)
  
  # 3 Calculate TermFrequency and generate a matrix
  tf_matrix = Create_tf_matrix(freq_matrix)
  
  # 4 creating table for documents per words
  count_doc_per_words = Create_documents_per_words(freq_matrix)
  
  # 5 Calculate IDF and generate a matrix
  idf_matrix = Create_idf_matrix(freq_matrix, count_doc_per_words, total_docs)

  # 6 Calculate TF-IDF and generate a matrix
  tf_idf_matrix = Create_tf_idf_matrix(tf_matrix, idf_matrix)

  # 7 Important Algorithm: score the sentences
  sentence_scores = Score_sentences(tf_idf_matrix)

  # 8 Find the threshold
  average_score = Find_average_score(sentence_scores)

  return average_score

In [30]:
Get_tfidf_score(Summary(text))

0.13091129058816642

In [37]:
def Get_summary_with_scores(text):
  [summary_text,removed_lines] = Summary(text)
  text_score = Get_tfidf_score(text)
  summary_text_score = Get_tfidf_score(summary_text)
  return [text,summary_text,removed_lines,text_score,summary_text_score]

In [38]:
Get_summary_with_scores(text)

['After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner. As he sat at a roadside open-air restaurant, RM feasted on beer, burgers and fries. He said, "I\'m starving right now. I\'m out to grab some food. It\'s much quieter than I expected and feels like a rural town. I like the familiar atmosphere." RM attended Art Basel and explained on camera the details of the art fair. He also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap. Showing the pattern of a ping pong table, RM said, "The table looks like our (BTS) symbol." He also spoke about the art pieces as he viewed them. After that, RM took a tram to visit the Foundation Beyeler, a museum. He later took a walk through the city. On his third day, RM visited the Kunstmuseum Basel, the Vitra Design Museum and the gallery. As he walked around, RM showed a chair to his fans and said, "I have breaking news for you guys. Coldpla