In [192]:
# Check the resources doc if you haven't already!
# Also please add whatever interesting/useful sources you find on there for everyone else!
# Also, PLEASE check the documentation file in the Project folder if you are confused about any of the NLTK functions

# Text summarization tends to have two approaches: extraction and abstraction
# Because abstraction is more complex, we can try to build an extraction algorithm first
# Although I would definitely recommend checking it out to get an idea of how we can go forward from here

In [193]:
# For collaborators, right now I'm using the algorithm here:
# https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/


In [194]:
# TLDR; Our extraction algorithm will go like this:
# -obtain data
# -process text
# -tokenization
# -find weighted frequency of words (weigh by sentence length, paragraph length, etc)
# -substitute words with their weighted frequencies (exactly what it sounds like)
# -sum up the weighted frequencies in each sentence, and the sentences with highest sums make up our summary

In [195]:
# Starting off with imports

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/karengao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/karengao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [196]:
# Should try to make this less memory intensive
# paragraphContent should be fetched from database file
paragraphContent = """"""

In [197]:
# function to create word frequency table for the entire text of the paragraph
def create_frequency_table(content):
    frequency_table = {}
    word_list = word_tokenize(content)
    stop_words = set(stopwords.words('english'))
    stem = PorterStemmer()
    for word in word_list:
        word = stem.stem(word)
        if word in stop_words:
            continue
        if word in frequency_table: 
            frequency_table[word] += 1
        else: 
            frequency_table[word] = 1
# making punctuation have 0 frequency to prevent them from skewing our weighted frequency
    punctuation = {";", ":", "'", ".", ",", "!", "?", "(", ")"}
    for word in frequency_table:
        if word in punctuation:
            frequency_table[word] = 0
    return frequency_table

In [198]:
# takes in the word frequency dictionary from create_dictionary_table
def create_weighted_table(frequency_table):
    weighted_frequency_table = {}
    highestfreq = max(frequency_table.values())
    for word in frequency_table:
        weighted_frequency_table[word] = frequency_table[word] / highestfreq
    return weighted_frequency_table

In [199]:
def cutParagraph(content):
    allSentences = sent_tokenize(content)
    return allSentences;

In [200]:
# substitute words in each sentence with weighted frequencies, 
# sum up the weighted word frequencies in each sentence, these sums will be the sentence's "score"
# compare the sentence scores, and grab the ones with the highest scores for our summary

In [201]:
# clean sentences using PorterStemmer() and getting rid of stop words, similar to tokenization
# we can then calculate the scores in each sentence based on the "weighted frequencies" that we
# assigned to each of these words in our previously made dictionary
def clean_sentences(content):
    allSentences = sent_tokenize(content)
    stem = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    all_clean_sentences = []
    for sentence in allSentences:
        cleaned_sentence = []
        word_list = word_tokenize(sentence)
        for word in word_list:
            word = stem.stem(word)
            if (word not in stop_words):
                cleaned_sentence.append(word)
        all_clean_sentences.append(cleaned_sentence)
    return all_clean_sentences

In [202]:
# calculates the frequencies of each sentence as a sentence score, and returns the sentence with the greatest frequency 

def one_sentence_summary(content):
    freq_dict = create_frequency_table(content)
    weighted_dict = create_weighted_table(freq_dict)
    cleanedSentences = clean_sentences(content)
    addedNum = []
    
    for clean_sentence in cleanedSentences:
        num = 0
        for token in clean_sentence:
            if token in weighted_dict.keys():
                num = num + weighted_dict[token]
        addedNum.append(num)
        
    largest = addedNum[0]
    numTracker = 0
    finalNum = 0
    for i in addedNum:
        numTracker = numTracker + 1
        if i > largest:
            largest = i
            finalNum = numTracker
    # finalNum is the index of the sentence that has the highest score
    allSentences = cutParagraph(content)
    return allSentences[finalNum - 1]

In [203]:
content1 = """Sally sells seashells by the seashore. She worries that she doesn't have enough seashells to sell on the seashore. She won't give up. Sally wants to sell seashells on the seashore, because she likes to sell seashells on the seashore. She is selling seashells on the seashell shore. How much wood could the woodchuck chuck if the woodchuck could chuck wood? She sells seashells by the seashore."""


In [204]:
content12 = """His parents continued to question him. He didn't know what to say to them since they refused to believe the truth. He explained again and again, and they dismissed his explanation as a figment of his imagination. There was no way that grandpa, who had been dead for five years, could have told him where the treasure had been hidden. Of course, it didn't help that grandpa was roaring with laughter in the chair next to him as he tried to explain once again how he'd found it.The trail to the left had a Danger! Do Not Pass sign telling people to take the trail to the right. This wasn't the way Zeke approached his hiking. Rather than a warning, Zeke read the sign as an invitation to explore an area that would be adventurous and exciting. As the others in the group all shited to the right, Zeke slipped past the danger sign to begin an adventure he would later regret.Sometimes there isn't a good answer. No matter how you try to rationalize the outcome, it doesn't make sense. And instead of an answer, you are simply left with a question. Why?He ordered his regular breakfast. Two eggs sunnyside up, hash browns, and two strips of bacon. He continued to look at the menu wondering if this would be the day he added something new. This was also part of the routine. A few seconds of hesitation to see if something else would be added to the order before demuring and saying that would be all. It was the same exact meal that he had ordered every day for the past two years.Explain to me again why I shouldn't cheat? he asked. All the others do and nobody ever gets punished for doing so. I should go about being happy losing to cheaters because I know that I don't? That's what you're telling me?"Dave wasn't exactly sure how he had ended up in this predicament. He ran through all the events that had lead to this current situation and it still didn't make sense. He wanted to spend some time to try and make sense of it all, but he had higher priorities at the moment. The first was how to get out of his current situation of being naked in a tree with snow falling all around and no way for him to get down.The thing that's great about this job is the time sourcing the items involves no traveling. I just look online to buy it. It's really as simple as that. While everyone else is searching for what they can sell, I sit in front of my computer and buy better stuff for less money and spend a fraction of the time doing it.Greg understood that this situation would make Michael terribly uncomfortable. Michael simply had no idea what was about to come and even though Greg could prevent it from happening, he opted to let it happen. It was quite ironic, really. It was something Greg had said he would never wish upon anyone a million times, yet here he was knowingly letting it happen to one of his best friends. He rationalized that it would ultimately make Michael a better person and that no matter how uncomfortable, everyone should experience racism at least once in their lifetime.Spending time at national parks can be an exciting adventure, but this wasn't the type of excitement she was hoping to experience. As she contemplated the situation she found herself in, she knew she'd gotten herself in a little more than she bargained for. It wasn't often that she found herself in a tree staring down at a pack of wolves that were looking to make her their next meal.There had to be a better way. That's all Nancy could think as she sat at her desk staring at her computer screen. She'd already spent five years of her life in this little cubicle staring at her computer doing "work" that didn't seem to matter to anyone including her own boss. There had to be more to her life than this and there had to be a better way to make a living. That's what she was thinking when the earthquake struck."""

In [205]:
one_sentence_summary(content1)

'Sally wants to sell seashells on the seashore, because she likes to sell seashells on the seashore.'

In [206]:
def summarize(content, degree_of_summarization):
    # this time, we are able to summarize larger bodies of text
    # we also give the user control over how general the degree of summarization is with another parameter
    # degree_of_summarization should be an integer between 0 and 10, with 10 being the highest degree of summarization
    # 10 would output a one sentence summary
    freq_dict = create_frequency_table(content)
    weighted_dict = create_weighted_table(freq_dict)
    cleanedSentences = clean_sentences(content)
    addedNum = []
    
    for clean_sentence in cleanedSentences:
        num = 0
        for token in clean_sentence:
            if token in weighted_dict.keys():
                num = num + weighted_dict[token]
        addedNum.append(num)

    # find largest sentence score to compare others against
    largest_score = addedNum[0]
    for num in addedNum:
        if num > largest_score:
            largest_score = num

    # start collecting sentences
    index_list = []
    deg = degree_of_summarization
    for i in range(len(addedNum)):
        if addedNum[i] >= (largest_score * (deg / 10)):
            index_list.append(i)
    allSentences = cutParagraph(content)
    
    summary = """"""
    for i in index_list:
        summary = summary + " " + allSentences[i]
    return summary

In [207]:
def keyWords(content):
    freq_dict = create_frequency_table(content)
    weighted_dict = create_weighted_table(freq_dict)
    Sentences = cutParagraph(content)
    words = word_tokenize(content)
    ps = PorterStemmer()
    
    finalThree = []
    finalThreeSt = []
    mass = 0
    value = ""
    value2 = ""
    for j in words:
        if (j != "n't"):
            for i in weighted_dict.keys():
                if ps.stem(j) == i:
                    if weighted_dict[i] > mass:
                        mass = weighted_dict[i]
                        value = j
                        value2 = i
    finalThree.append(value)
    finalThreeSt.append(value2)
    
    mass = 0
    for j in words:
        if (j != "n't"):
            for i in weighted_dict.keys():
                if ps.stem(j) == i:
                    if weighted_dict[i] > mass and i != finalThreeSt[0]:
                        mass = weighted_dict[i]
                        value = j
                        value2 = i
    finalThree.append(value)
    finalThreeSt.append(value2)
    
    mass = 0
    for j in words:
        if (j != "n't"):
            for i in weighted_dict.keys():
                if ps.stem(j) == i:
                    if weighted_dict[i] > mass and i != finalThreeSt[0] and i != finalThreeSt[1]:
                        mass = weighted_dict[i]
                        value = j
                        value2 = i
    finalThree.append(value)
    finalThreeSt.append(value2)
    
    return finalThree
 

In [208]:
keyWords(content12)

['was', 'This', 'would']

In [209]:
# This a five paragraph text
content2 = """Twenty seconds were all that was left and Richard could hear each one tick by. Fifteen seconds now remained and the panic began to fully set in. Ten seconds and he wasn't sure he had enough time. Five seconds, four, three, two, one... Don't forget that gifts often come with costs that go beyond their purchase price. When you purchase a child the latest smartphone, you're also committing to a monthly phone bill. When you purchase the latest gaming system, you're likely not going to be satisfied with the games that come with it for long and want to purchase new titles to play. When you buy gifts it's important to remember that some come with additional costs down the road that can be much more expensive than the initial gift itself. He took a sip of the drink. He wasn't sure whether he liked it or not, but at this moment it didn't matter. She had made it especially for him so he would have forced it down even if he had absolutely hated it. That's simply the way things worked. She made him a new-fangled drink each day and he took a sip of it and smiled, saying it was excellent.The bush began to shake. Brad couldn't see what was causing it to shake, but he didn't care. He had a pretty good idea about what was going on and what was happening. He was so confident that he approached the bush carefree and with a smile on his face. That all changed the instant he realized what was actually behind the bush. She looked at her student wondering if she could ever get through. "You need to learn to think for yourself," she wanted to tell him. "Your friends are holding you back and bringing you down." But she didn't because she knew his friends were all that he had and even if that meant a life of misery, he would never give them up."""

In [210]:
summarize(content2, 10)

" Five seconds, four, three, two, one... Don't forget that gifts often come with costs that go beyond their purchase price."

In [211]:
summarize(content2, 7)

" Five seconds, four, three, two, one... Don't forget that gifts often come with costs that go beyond their purchase price. When you purchase the latest gaming system, you're likely not going to be satisfied with the games that come with it for long and want to purchase new titles to play. He wasn't sure whether he liked it or not, but at this moment it didn't matter. She made him a new-fangled drink each day and he took a sip of it and smiled, saying it was excellent.The bush began to shake. Brad couldn't see what was causing it to shake, but he didn't care."

In [212]:
summarize(content2, 5)

" Twenty seconds were all that was left and Richard could hear each one tick by. Ten seconds and he wasn't sure he had enough time. Five seconds, four, three, two, one... Don't forget that gifts often come with costs that go beyond their purchase price. When you purchase the latest gaming system, you're likely not going to be satisfied with the games that come with it for long and want to purchase new titles to play. When you buy gifts it's important to remember that some come with additional costs down the road that can be much more expensive than the initial gift itself. He wasn't sure whether he liked it or not, but at this moment it didn't matter. She made him a new-fangled drink each day and he took a sip of it and smiled, saying it was excellent.The bush began to shake. Brad couldn't see what was causing it to shake, but he didn't care. He had a pretty good idea about what was going on and what was happening. He was so confident that he approached the bush carefree and with a smi

In [213]:
# For some reason, any degree of summarization that's less than 3 returns a summary where every contraction has a backslash in them
# I still haven't found the reason for that
# For demo purposes, let's just keep the degree of summarization above 4 or so.

summarize(content2, 2)

' Twenty seconds were all that was left and Richard could hear each one tick by. Fifteen seconds now remained and the panic began to fully set in. Ten seconds and he wasn\'t sure he had enough time. Five seconds, four, three, two, one... Don\'t forget that gifts often come with costs that go beyond their purchase price. When you purchase a child the latest smartphone, you\'re also committing to a monthly phone bill. When you purchase the latest gaming system, you\'re likely not going to be satisfied with the games that come with it for long and want to purchase new titles to play. When you buy gifts it\'s important to remember that some come with additional costs down the road that can be much more expensive than the initial gift itself. He wasn\'t sure whether he liked it or not, but at this moment it didn\'t matter. She had made it especially for him so he would have forced it down even if he had absolutely hated it. She made him a new-fangled drink each day and he took a sip of it a