In [18]:
# Text Summarizer | Type = Extractive
# Using Term Frequency - Inverse Document Frequency Technique
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print('punkt')
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    print('stopwords')
    nltk.download('stopwords')


try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akdev\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def remove_punctuation_marks(text):
    punctuation_marks = dict((ord(punctuation_mark), None)
                             for punctuation_mark in string.punctuation)
    # new_dict = [key for key in punctuation_marks.keys if punctuation_marks[key] != None]
    return text.translate(punctuation_marks)

In [20]:
def get_lemmatized_tokens(text):
    normalized_tokens = nltk.word_tokenize(
        remove_punctuation_marks(text.lower()))
    return [nltk.stem.WordNetLemmatizer().lemmatize(normalized_token) for normalized_token in normalized_tokens]

In [21]:
def get_average(values):
    greater_than_zero_count = 0
    total = 0
    for value in values:
        if value != 0:
            greater_than_zero_count += 1
            total += value
    if total == 0 and greater_than_zero_count == 0:
        return 0
    else:
        return total / greater_than_zero_count

In [22]:
def get_threshold(tfidf_results):
    i = 0
    total = 0
    while i < (tfidf_results.shape[0]):
        total += get_average(tfidf_results[i, :].toarray()[0])
        i += 1
    return total / tfidf_results.shape[0]

In [23]:
def get_summary(documents, tfidf_results, handicap=0.85):
    summary = ""
    i = 0
    while i < (tfidf_results.shape[0]):
        if (get_average(tfidf_results[i, :].toarray()[0])) >= get_threshold(tfidf_results) * handicap:
            summary += ' ' + documents[i]
        i += 1
    return summary

In [24]:
# Use below function for calling from different file | Pass Text and handicap and then get Summary
def summary_func(text="", handicap=0.9):
    documents = nltk.sent_tokenize(text)

    tfidf_results = TfidfVectorizer(tokenizer=get_lemmatized_tokens, stop_words=stopwords.words(
        'english')).fit_transform(documents)
    
    return get_summary(documents, tfidf_results, handicap)

In [25]:
if __name__ == "__main__":
        text = '''This product is good for students for Study You can not play heavy gamesike? GTA 5 on this laptop.
              Nice product. good for students but can improve in screen resolution?.
              Quite dissatisfied with the screen quality overall it's all fine?.
              Nothing great to talk about! The specs are mediocre and the screen quality is passable.
              The sound has a lot more to improve.
              The size is quite ok for a satchel/ backpack and is lightweight!
              But I believe there are better offers.
              Good for light personal use only like browsing and office work!
              ASUS VIVOBOOK15 (2021)CELERON N4020, is not adequate for our office purpose we want to return it or exchange it for one with i5 or i3 processor.
              need to know where to send this bought item for refund or exchange.
              Expecting a response at the earliest. Just Go For It Dont Think. The best laptop for students.'''

        documents = nltk.sent_tokenize(text)
        # print(documents)
        # print([get_lemmatized_tokens(d) for d in documents])

        stop_words_list = stopwords.words('english').extend([get_lemmatized_tokens(i) for i in stopwords.words('english')]) 
        tfidf_results = TfidfVectorizer(tokenizer=get_lemmatized_tokens, stop_words=stop_words_list).fit_transform(
            documents)
        # nxm vector output n rows =  num_sentences or docs
        # nxm vector output m cols =  num_tokens
        # print(tfidf_results)
        # Handicap Parameter = 0.9
        print(get_summary(documents, tfidf_results, 0.9))
        # Length of Summarized text is inversely prop. to Handicap

 GTA 5 on this laptop. Nice product. Nothing great to talk about! The sound has a lot more to improve. But I believe there are better offers. Expecting a response at the earliest. Just Go For It Dont Think. The best laptop for students.


In [26]:
print(tfidf_results[0:5, 0:7].toarray())

[[0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.47860244 0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.3568927
  0.        ]]


In [27]:
print(tfidf_results[:, 0:7].toarray())

[[0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.47860244 0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.3568927
  0.        ]
 [0.         0.         0.         0.47132396 0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.28122944]
 [0.         0.         0.31224164 0.         0.         0.
  0.        ]
 [0.         0.         0.25135036 0.         0.         0.
  0.25135036]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.25629645]
 [0.20168016 0.         0.         0.         0.20168016 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.34959

In [28]:
print(tfidf_results[0:5, :].toarray())

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.26642928
  0.         0.         0.         0.         0.         0.
  0.3187853  0.30593331 0.         0.23840072 0.         0.
  0.         0.30593331 0.         0.         0.         0.
  0.         0.21666008 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.26642928 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.30593331
  0.         0.26642928 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.23840072 0.30593331 0.
  0.         0.         0.         0.23840072 0.         0.
  0.         0.         0.         0.         0.         0.
  0.30593331]
 [0.      