### IMPORT

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from heapq import nlargest
import string
import nltk

In [2]:
punctuations=string.punctuation

### SUMMARIZATION

In [3]:
stopwords=pickle.load(open("stopwords.pkl", "rb"))
normalized_frequencies=pickle.load(open("normalized_frequencies.pkl", "rb"))

In [4]:
def get_summaries(text):
    sentences=text.strip().split(".")
    normalized_dict={}
    for sentence in sentences:
        if len(sentence)>0:
            temp_words_list=[]
            words=sentence.strip().split(" ")
            for word in words:
                if word not in stopwords and word not in punctuations:
                    b="".join([i for i in word if i not in punctuations])
                    if len(b)>0:
                        temp_words_list.append(b)
            if len(temp_words_list) >0:
                normalized_length=0
                for i in temp_words_list:
                    if i in normalized_frequencies:
                        normalized_length+=normalized_frequencies[i]
                    else:
                        normalized_length+=0
            normalized_dict[sentence]=normalized_length
    dict_length = int(len(normalized_dict) * 0.3)
    res=nlargest(dict_length, normalized_dict, key=normalized_dict.get)
    
    return ". ".join(res)

In [5]:
text=get_summaries("The cat (Felis catus), commonly referred to as the domestic cat or house cat, is the only domesticated species in the family Felidae. Recent advances in archaeology and genetics have shown that the domestication of the cat occurred in the Near East around 7500 BC. It is commonly kept as a house pet and farm cat, but also ranges freely as a feral cat avoiding human contact. It is valued by humans for companionship and its ability to kill vermin. Because of its retractable claws it is adapted to killing small prey like mice and rats. It has a strong flexible body, quick reflexes, sharp teeth, and its night vision and sense of smell are well developed. It is a social species, but a solitary hunter and a crepuscular predator. Cat communication includes vocalizations like meowing, purring, trilling, hissing, growling, and grunting as well as cat body language. It can hear sounds too faint or too high in frequency for human ears, such as those made by small mammals. It also secretes and perceives pheromones.")

### EXTRACTION

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus=[text]
vectorizer=TfidfVectorizer(stop_words='english')
tfidf_matrix=vectorizer.fit_transform(corpus)
features_names=vectorizer.get_feature_names_out()
if(len(features_names)> 20):
    n=int(len(features_names)*0.6)
    top_keywords= [features_names[i] for i in tfidf_matrix.sum(axis=0).argsort()[0, ::-1][:n]]
else:
    top_keywords=[features_names[i] for i in tfidf_matrix.sum(axis=0).argsort()[0, ::-1][:len(features_names)]]

In [7]:
top_keywords

[array([['cat', 'vocalizations', 'house', 'body', 'commonly',
         'communication', 'contact', 'farm', 'feral', 'freely',
         'growling', 'grunting', 'hissing', 'human', 'trilling',
         'includes', 'kept', 'language', 'like', 'meowing', 'perceives',
         'pet', 'pheromones', 'purring', 'ranges', 'secretes', 'avoiding']],
       dtype=object)]

### DISTRACTOR GENERATION

In [8]:
 from sense2vec import Sense2Vec
 # Loading pretrained model
 s2v = Sense2Vec().from_disk("../../installed_packages/s2v_old")

In [9]:
from collections import OrderedDict

In [10]:
def sense2vec_get_words(word, s2v):
    output=[]
    word=word.lower()
    word=word.replace(" ", "_")
    sense=s2v.get_best_sense(word)
    most_similar=s2v.most_similar(sense, n=20)
    for each_word in most_similar:
        append_word=each_word[0].split("|")[0].replace("_"," ").lower()
        if(append_word.lower()!=word):
            output.append(append_word.title())
    out2=[]
    for i in output:
        if i not in out2:
            out2.append(i)
    return out2

In [11]:
sense2vec_get_words("ice cream", s2v)

['Icecream',
 'Ice-Cream',
 'Frozen Yogurt',
 'Milkshakes',
 'Cookie Dough',
 'Cereal',
 'Chocolate Cake',
 'Pop Tarts',
 'Oreos',
 'Chocolate Chip Cookies',
 'Potato Chips',
 'Dessert',
 'Cinnamon Rolls',
 'Chocolate Ice Cream',
 'Cheesecake',
 'Popsicles',
 'Chocolate Milk',
 'Chocolate',
 'Pancakes']

### SENTENCE GENERATION

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [16]:
from nltk.tokenize import sent_tokenize

In [25]:
def tokenize_sentences(text):
    sentences = [sent_tokenize(text)]
    print(sentences)
    sentences=[y for x in sentences for y in x]
    print(sentences)
    sentences=[x.strip() for x in sentences if len(x)>20]
    print(sentences)
    return sentences

In [27]:
tokenize_sentences("Hello, Awesome Reader, how are you doing today? The weather is great, and Python is awesome.")

[['Hello, Awesome Reader, how are you doing today?', 'The weather is great, and Python is awesome.']]
['Hello, Awesome Reader, how are you doing today?', 'The weather is great, and Python is awesome.']
['Hello, Awesome Reader, how are you doing today?', 'The weather is great, and Python is awesome.']


['Hello, Awesome Reader, how are you doing today?',
 'The weather is great, and Python is awesome.']