In [344]:
# Check the resources doc if you haven't already!
# Also please add whatever interesting/useful sources you find on there for everyone else!
# Also, PLEASE check the documentation file in the Project folder if you are confused about any of the NLTK functions

# Text summarization tends to have two approaches: extraction and abstraction
# Because abstraction is more complex, we can try to build an extraction algorithm first
# Although I would definitely recommend checking it out to get an idea of how we can go forward from here

In [345]:
# For collaborators, right now I'm using the algorithm here:
# https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/


In [346]:
# TLDR; Our extraction algorithm will go like this:
# -obtain data
# -process text
# -tokenization
# -find weighted frequency of words (weigh by sentence length, paragraph length, etc)
# -substitute words with their weighted frequencies (exactly what it sounds like)
# -sum up the weighted frequencies in each sentence, and the sentences with highest sums make up our summary

In [347]:
# Starting off with imports

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/karengao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/karengao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [348]:
# Should try to make this less memory intensive
# paragraphContent should be fetched from database file, but I'm filling it in with example text for testing purposes
paragraphContent = """Sally sells seashells by the seashore. She worries that she doesn't have enough seashells to sell on the seashore. She won't give up. Sally wants to sell seashells on the seashore, because she likes to sell seashells on the seashore. She is selling seashells on the seashell shore. How much wood could the woodchuck chuck if the woodchuck could chuck wood? She sells seashells by the seashore."""

In [349]:
# function to create word frequency table for the entire text of the paragraph
def create_frequency_table(content):
    frequency_table = {}
    word_list = word_tokenize(content)
    stop_words = set(stopwords.words('english'))
    stem = PorterStemmer()
    for word in word_list:
        word = stem.stem(word)
        if word in stop_words:
            continue
        if word in frequency_table: 
            frequency_table[word] += 1
        else: 
            frequency_table[word] = 1
# making punctuation have 0 frequency to prevent them from skewing our weighted frequency
    punctuation = {";", ":", "'", ".", ",", "!", "?", "(", ")"}
    for word in frequency_table:
        if word in punctuation:
            frequency_table[word] = 0
    return frequency_table

In [350]:
freq_dict = create_frequency_table(paragraphContent)
for i in freq_dict:
    print(i + "\t" + str(freq_dict[i]))

salli	2
sell	6
seashel	7
seashor	5
.	0
worri	1
doe	1
n't	2
enough	1
wo	1
give	1
want	1
,	0
becaus	1
like	1
shore	1
much	1
wood	2
could	2
woodchuck	2
chuck	2
?	0


In [351]:
# takes in the word frequency dictionary from create_dictionary_table
def create_weighted_table(frequency_table):
    weighted_frequency_table = {}
    highestfreq = max(frequency_table.values())
    for word in frequency_table:
        weighted_frequency_table[word] = frequency_table[word] / highestfreq
    return weighted_frequency_table

In [352]:
weighted_dict = create_weighted_table(freq_dict)
for i in weighted_dict:
    print(i + "\t" + str(weighted_dict[i]))

salli	0.2857142857142857
sell	0.8571428571428571
seashel	1.0
seashor	0.7142857142857143
.	0.0
worri	0.14285714285714285
doe	0.14285714285714285
n't	0.2857142857142857
enough	0.14285714285714285
wo	0.14285714285714285
give	0.14285714285714285
want	0.14285714285714285
,	0.0
becaus	0.14285714285714285
like	0.14285714285714285
shore	0.14285714285714285
much	0.14285714285714285
wood	0.2857142857142857
could	0.2857142857142857
woodchuck	0.2857142857142857
chuck	0.2857142857142857
?	0.0


In [353]:
def cutParagraph(content):
    allSentences = sent_tokenize(content)
    return allSentences;
allSentences = cutParagraph(paragraphContent)
for sentence in allSentences:
    print(sentence)
for sentence in allSentences:
    words = word_tokenize(sentence)
    print(words)

Sally sells seashells by the seashore.
She worries that she doesn't have enough seashells to sell on the seashore.
She won't give up.
Sally wants to sell seashells on the seashore, because she likes to sell seashells on the seashore.
She is selling seashells on the seashell shore.
How much wood could the woodchuck chuck if the woodchuck could chuck wood?
She sells seashells by the seashore.
['Sally', 'sells', 'seashells', 'by', 'the', 'seashore', '.']
['She', 'worries', 'that', 'she', 'does', "n't", 'have', 'enough', 'seashells', 'to', 'sell', 'on', 'the', 'seashore', '.']
['She', 'wo', "n't", 'give', 'up', '.']
['Sally', 'wants', 'to', 'sell', 'seashells', 'on', 'the', 'seashore', ',', 'because', 'she', 'likes', 'to', 'sell', 'seashells', 'on', 'the', 'seashore', '.']
['She', 'is', 'selling', 'seashells', 'on', 'the', 'seashell', 'shore', '.']
['How', 'much', 'wood', 'could', 'the', 'woodchuck', 'chuck', 'if', 'the', 'woodchuck', 'could', 'chuck', 'wood', '?']
['She', 'sells', 'seashe

In [354]:
# substitute words in each sentence with weighted frequencies, 
# pretty damn sure PortStemmer() fucked up the words somehow
# sum up the weighted word frequencies in each sentence, these sums will be the sentence's "score"
# compare the sentence scores, and grab the ones with the highest scores for our summary

In [355]:
# calculates the frequencies of each sentence, and returns the sentence with the greatest frequency 

def sentence_scores(content):
    weighted_dict = create_weighted_table(freq_dict)
    addedNum = []
    num = 0
    
    for i in weighted_dict:
        if weighted_dict.get(i) == 0.0:
            addedNum.append(num)
            num = 0
        else:
            num = num + weighted_dict[i]
    
    largest = addedNum[0]
    numTracker = 0
    finalNum = 0
    for i in addedNum:
        numTracker = numTracker + 1
        if i > largest:
            largest = i
            finalNum = numTracker
    allSentences = cutParagraph(paragraphContent)
    return allSentences[finalNum - 1]

In [356]:
sentence_scores(paragraphContent)

'She sells seashells by the seashore.'