In [1]:
import pandas as pd

In [4]:
# Directory holding the dataset CSV files, relative to this notebook.
# Change it here if the data lives somewhere else.
DATA_DIR = "../data"

# Read each split into its own DataFrame.
train_df = pd.read_csv(DATA_DIR + "/train.csv")
val_df = pd.read_csv(DATA_DIR + "/validation.csv")
test_df = pd.read_csv(DATA_DIR + "/test.csv")

In [5]:
# Report how many rows each split contains.
for split_name, split_df in (("train", train_df), ("validation", val_df), ("test", test_df)):
    print(split_name + " # samples:", len(split_df))

train # samples: 287113
validation # samples: 13368
test # samples: 11490


In [7]:
# Display the first training record to see which columns each row carries.
train_df.iloc[0]

id                     0001d1afc246a7964130f43ae940af6bc6c57f01
article       By . Associated Press . PUBLISHED: . 14:11 EST...
highlights    Bishop John Folda, of North Dakota, is taking ...
Name: 0, dtype: object

### Data preprocessing

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer

import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cresc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
# Punctuation characters, taken from string.punctuation.
SYMBOLS = string.punctuation
# English stop-word list from the NLTK corpus, stored as a set.
STOPS = set(stopwords.words("english"))
# Snowball stemmer configured for English; used by preprocess() when stemming is requested.
stemmer = SnowballStemmer(language='english')

def preprocess(news, use_stemmer=False, remove_stopwords=True, remove_punctuation=False):
    """Normalize a piece of text.

    The text is always lower-cased. The remaining steps are optional and run
    in this order: punctuation removal, stop-word removal, stemming.

    :param news: the text to clean (a ``str``).
    :param use_stemmer: if True, replace every whitespace-separated token with
        the stem produced by the module-level ``stemmer``.
    :param remove_stopwords: if True, drop whitespace-separated tokens that
        are members of ``STOPS``.
    :param remove_punctuation: if True, delete every character listed in
        ``SYMBOLS`` before stop words are filtered. Defaults to False, which
        leaves punctuation untouched.
    :return: the cleaned text. Whenever stop-word removal or stemming runs,
        tokens are re-joined with single spaces.
    :rtype: str
    """
    news = news.lower()

    if remove_punctuation:
        news = news.translate(str.maketrans('', '', SYMBOLS))

    if remove_stopwords:
        # Tokens are compared verbatim, so a stop word with punctuation still
        # attached (e.g. "the,") is not removed.
        news = " ".join(word for word in news.split() if word not in STOPS)

    if use_stemmer:
        news = " ".join(stemmer.stem(word) for word in news.split())

    return news
        
        
# Add a cleaned copy of every article, produced by preprocess() with its default options.
train_df["cleaned_article"] = train_df["article"].apply(preprocess)

In [10]:
# Display the first record again; it now also carries the cleaned_article column.
train_df.iloc[0]

id                          0001d1afc246a7964130f43ae940af6bc6c57f01
article            By . Associated Press . PUBLISHED: . 14:11 EST...
highlights         Bishop John Folda, of North Dakota, is taking ...
cleaned_article    associated press published 1411 est 25 october...
Name: 0, dtype: object

### Strong Baseline: TF-IDF Summarizer

In [12]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
import math

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cresc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [13]:
def _create_frequency_matrix(sentences):
    """Count stemmed, non-stop-word tokens for each sentence.

    Tokens are produced by ``word_tokenize``, lower-cased, and stemmed with a
    Porter stemmer. A token is skipped when either its lower-cased form or its
    stem is in ``STOPS``.

    :param sentences: iterable of sentence strings.
    :return: dict mapping the first 15 characters of each sentence to a dict
        of ``{stem: occurrences in that sentence}``. Sentences that share
        their first 15 characters map to the same key, so the later sentence
        replaces the earlier one.
    :rtype: dict
    """
    frequency_matrix = {}
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for token in word_tokenize(sent):
            token = token.lower()
            # Filter on the surface form first: stemming can turn a stop word
            # into a string that is no longer in the stop-word set
            # (e.g. "was" -> "wa"), which would let it through.
            if token in STOPS:
                continue

            stem = ps.stem(token)
            # Also drop tokens whose stem is itself a stop word.
            if stem in STOPS:
                continue

            freq_table[stem] = freq_table.get(stem, 0) + 1

        # _generate_summary looks sentences up by this same 15-character prefix.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentence_value = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentence_value[sent] = total_score_per_sentence / count_words_in_sentence

    return sentence_value

def _find_average_score(sentence_value) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sum_values = 0
    for entry in sentence_value:
        sum_values += sentence_value[entry]

    # Average value of a sentence from original summary_text
    average = (sum_values / len(sentence_value))

    return average

def _generate_summary(sentences, sentence_value, threshold):
    sentence_count = 0
    rtv = ''

    for sentence in sentences:
        if sentence[:15] in sentence_value and sentence_value[sentence[:15]] >= (threshold):
            rtv += " " + sentence
            sentence_count += 1

    return rtv

In [15]:
# Split the first training article into sentences.
sentences = sent_tokenize(train_df["article"].iloc[0])

In [17]:
# Number of sentences sent_tokenize found in the first article.
len(sentences)

15

In [18]:
# TF-IDF summary of the first training article.
# Every sentence of the article is treated as one "document".

# A sentence is kept when its score is at least this multiple of the average
# sentence score.
THRESHOLD_FACTOR = 1.8

# 1 Split the article into sentences
sentences = sent_tokenize(train_df.iloc[0]["article"])
total_documents = len(sentences)

# 2 Count the words of every sentence
freq_matrix = _create_frequency_matrix(sentences)

# 3 Calculate TermFrequency and generate a matrix.
#   Term frequency (TF) is how often a word appears in a document, divided by
#   how many words there are in the document.
tf_matrix = _create_tf_matrix(freq_matrix)

# 4 Create the table of documents per word
count_doc_per_words = _create_documents_per_words(freq_matrix)

# 5 Calculate IDF and generate a matrix.
#   Inverse document frequency (IDF) is how unique or rare a word is.
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

# 7 Important Algorithm: score the sentences
sentence_scores = _score_sentences(tf_idf_matrix)

# 8 Find the threshold (the average sentence score)
threshold = _find_average_score(sentence_scores)

# 9 Important Algorithm: generate the summary
summary = _generate_summary(sentences, sentence_scores, THRESHOLD_FACTOR * threshold)
print(summary)

 Associated Press . PUBLISHED: . | . UPDATED: .


### References

The code is based upon the two sources below:
* https://www.kaggle.com/code/elvinagammed/text-summarization-with-bow-tf-idf-and-seq2seq
* https://towardsdatascience.com/text-summarization-using-tf-idf-e64a0644ace3 