In [1]:
from parrot import Parrot
import torch
import warnings
import stanfordnlp

# stanfordnlp.download('en')  # 2 GB
stf_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')

warnings.filterwarnings("ignore")

# Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

phrases = [
    "Can you recommend some upscale restaurants in Newyork?",
    "What is the answer to life, the universe, and everything?",
    "What are famous places worth seeing in Russia?",
]

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/simonilincev/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/simonilincev/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/simonilincev/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [20]:
# -*- coding: utf-8 -*-
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
digits = "([0-9])"

def count_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "..." in text: text = text.replace("...","<prd><prd><prd>")
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return len(sentences)  # sentences

In [91]:
def paraphrase_text(text):
    # split text into sentences with regex based on punctuation marks
    sentences = split_into_sentences(text)
    output_text = ""

    for sentence in sentences:
        paras = parrot.augment(input_phrase=sentence, use_gpu=False, do_diverse=True)
        if not paras:
            paras = [] 

        easiest_para = sentence
        easiest_para_length = 99999
        for para in paras:
            para = stf_nlp(para[0]).sentences[0].words


            out = [w.text.capitalize() + " " if w.upos in ["PROPN","NNS"] else w.text + " " for w in para]


            out[0] = out[0].capitalize() # capitalize first word


            # now if the word.upos is a 'PART', we need to remove the space (if any) before it and connect it to the previous word
            # this is because the 'PART' is a part of the previous word
            for i in range(1, len(para)):
                if (para[i].upos == "PART" or para[i].upos == "PUNCT" or para[i].text[0] in ["\'", '\"', "’", "“", "”"]) and para[i].text != "to":
                    out[i - 1] = out[i - 1].rstrip()

            # check for any punctuation at end
            if para[-1].upos != "PUNCT":
                out = out[:-1] + [out[-1].rstrip()] + ["."]

            out = ''.join(out)

            # print the average word length
            avg_word_length = sum([len(w.text) for w in para]) / len(para)

            if avg_word_length < easiest_para_length:
                easiest_para_length = avg_word_length
                easiest_para = out

        output_text += easiest_para + " "

    return output_text

In [92]:
text = """
    Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.
    
    Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.

    The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several years; in fact, Mrs. Dursley pretended she didn’t have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn’t want Dudley mixing with a child like that.
    """

text = [e for e in re.split("\n\s*", text) if e != ""]  # split into paragraphs
output = ""
for paragraph in text:
    output += paraphrase_text(paragraph) + "\n\n"

print(output)

# avg. length is 5000 characters
# here it takes ~40 seconds for 1500 characters
# so it will take ~2min 15

# therefore this will not be used in the future,
# but it can be mentioned somewhere for complexity points or whatnot

Dursley and his wife were proud to say that they were perfectly normal thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious because they just didn't hold such nonsense to them. 

Mr dursley was the director of a firm called grunnings which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache.  Mrs dursley was thin and blonde and had nearly twice the usual amount of neck which was very useful as she spent so much of her time craning over the garden fences spying on the neighbors. The dursleys had a small son called Dudley and in their opinion there wasn't a better boy. 

The dursleys had everything they wanted but they also had a secret and their greatest fear was that someone would find it. They didn't think they could bear it if anyone found out about the potters'  Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several years; in fact, Mrs. Dursley pretended she didn

In [21]:
import re
import nltk
import math
import heapq

article_text = """
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.
    
Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.

The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several years; in fact, Mrs. Dursley pretended she didn’t have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a small son, too, but they had never even seen him. This boy was another good reason for keeping the Potters away; they didn’t want Dudley mixing with a child like that.
"""

# per https://www.kaggle.com/code/imkrkannan/text-summarization-with-nltk-in-python/notebook
def get_summary(paragraph, output_sentence_count):
    sentence_list = nltk.sent_tokenize(paragraph)
    stopwords = nltk.corpus.stopwords.words('english')


    word_frequencies = {}
    for word in nltk.word_tokenize(paragraph):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
        maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

    sentence_scores = {}

    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    return ''.join(heapq.nlargest(output_sentence_count, sentence_scores, key=sentence_scores.get))

# remove some punctuation and numbers
# article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)

# split text into paragraphs
paragraphs = re.split("\n\s*", article_text)

summary = ""
for paragraph in paragraphs:
    paragraph = re.sub(r'\s+', ' ', paragraph)
    paragraph = re.sub('[^a-zA-Z]', ' ', paragraph)
    print(count_sentences(paragraph))
    summary += get_summary(paragraph, math.ceil(count_sentences(paragraph) / 2)) + "\n\n"

print(len(summary))


asfsdf    
0
asfsdf    
asfsdf  Mr  and Mrs  Dursley  of number four  Privet Drive  were proud to say that they were perfectly normal  thank you very much  They were the last people you d expect to be involved in anything strange or mysterious  because they just didn t hold with such nonsense   
0
asfsdf  Mr  and Mrs  Dursley  of number four  Privet Drive  were proud to say that they were perfectly normal  thank you very much  They were the last people you d expect to be involved in anything strange or mysterious  because they just didn t hold with such nonsense   
asfsdf  Mr  Dursley was the director of a firm called Grunnings  which made drills  He was a big  beefy man with hardly any neck  although he did have a very large mustache  Mrs  Dursley was thin and blonde and had nearly twice the usual amount of neck  which came in very useful as she spent so much of her time craning over garden fences  spying on the neighbors  The Dursleys had a small son called Dudley and in their opinio