In [1]:
# Check the resources doc if you haven't already!
# Also please add whatever interesting/useful sources you find on there for everyone else!
# Also, PLEASE check the documentation file in the Project folder if you are confused about any of the NLTK functions

# Text summarization tends to have two approaches: extraction and abstraction
# Because abstraction is more complex, we can try to build an extraction algorithm first
# That said, I would definitely recommend reading up on abstraction to get an idea of how we can go forward from here

In [2]:
# For collaborators, right now I'm using the algorithm here:
# https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/


In [3]:
# TL;DR: Our extraction algorithm will go like this:
# -obtain data
# -process text
# -tokenization
# -find weighted frequency of words (weigh by sentence length, paragraph length, etc)
# -substitute words with their weighted frequencies (exactly what it sounds like)
# -sum up the weighted frequencies in each sentence, and the sentences with highest sums make up our summary

In [4]:
# Starting off with import

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jrh25\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jrh25\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# word_tokenize breaks a sentence into its individual tokens
# (note in the output below that the trailing "." comes back as its own token)
text = "Sally sells seashells by the seashore."
words = word_tokenize(text)
print(words)

['Sally', 'sells', 'seashells', 'by', 'the', 'seashore', '.']


In [27]:
# TODO: try to make this less memory intensive
# paragraphContent should eventually be fetched from the database file; it is hard-coded here for testing purposes
paragraphContent = """Sally sells seashells by the seashore. She worries that she doesn't have enough seashells to sell on the seashore. She won't give up. Sally wants to sell seashells on the seashore, because she likes to sell seashells on the seashore. She is selling seashells on the seashell shore. How much wood could the woodchuck chuck if the woodchuck could chuck wood? She sells seashells by the seashore."""

In [29]:
def cutParagraph(content):
    """Split a block of text into its individual sentences.

    Thin wrapper around NLTK's ``sent_tokenize``.

    Args:
        content: The text to split.

    Returns:
        The result of ``sent_tokenize(content)``, i.e. the sentences of
        ``content`` in their original order.
    """
    return sent_tokenize(content)

In [30]:
# sent_tokenize (sentence tokenize), called inside cutParagraph, breaks a paragraph into its individual sentences
allSentences = cutParagraph(paragraphContent)
# print one sentence per line so the split can be checked by eye
for i in allSentences: 
    print(i)

Sally sells seashells by the seashore.
She worries that she doesn't have enough seashells to sell on the seashore.
She won't give up.
Sally wants to sell seashells on the seashore, because she likes to sell seashells on the seashore.
She is selling seashells on the seashell shore.
How much wood could the woodchuck chuck if the woodchuck could chuck wood?
She sells seashells by the seashore.


In [31]:
# Stop words are the common filler words (e.g. "the", "by", "on") in each sentence that we can feasibly ignore
# Dropping them would (ideally) leave us with just the main nouns, verbs and such in the sentence
# Stop words are used again in later functions (create_dictionary_table builds the same set)
stop_words = set(stopwords.words('english'))
print(stop_words)

{'ours', 'she', 'doesn', "should've", 'needn', 'on', 'was', 'hasn', 'weren', 'about', 'up', 'me', 'now', "aren't", "mightn't", 'been', 'does', 'this', 'against', 'he', 'i', 'theirs', 'being', "hasn't", 'more', 'few', 'it', 'each', 'themselves', 'our', 'from', 'same', 'have', 'how', 'itself', 'nor', 'because', 'that', 'than', 'do', 'when', 'very', "mustn't", 'ma', 'can', 'them', "weren't", 'o', 'any', "doesn't", 'are', 'for', 'were', 'no', 'over', "isn't", 'again', 'don', 'there', 'they', 're', 'has', 'with', 'where', 'their', 'so', "couldn't", 'between', 'other', 'himself', 'my', 'under', 'an', 'off', 'before', 'mightn', 'm', 'your', "you've", 'while', 'whom', "don't", 'should', 'of', 'who', 'didn', 'yours', 'hadn', "needn't", 's', 'having', 'him', "you'd", 'once', "wouldn't", "she's", 'as', 'in', 'the', 'will', 'into', 'above', 'doing', 'his', 'to', 't', 've', "hadn't", 'not', 'too', 'until', 'by', 'we', 'down', 'shan', 'or', "you'll", 'most', 'at', 'd', 'further', 'isn', 'yourselves'

In [32]:
# A PorterStemmer object reduces a word to its stem (root form) via its .stem() method
# For example, "cleaning" and "cleaned" both come back as "clean"
stem = PorterStemmer()
wd = stem.stem("cleaning")
print(wd)
wd1 = stem.stem("cleaned")
print(wd1)

clean
clean


In [33]:
# function to create word frequency table
def create_dictionary_table(sentence_text):
    """Build a word-frequency table for a piece of text.

    The text is tokenized with ``word_tokenize``; punctuation-only tokens and
    English stop words are discarded, and every remaining token is reduced to
    its Porter stem before being counted.

    Args:
        sentence_text: The text (one or more sentences) to analyse.

    Returns:
        A dict mapping each stem to the number of times it occurs in
        ``sentence_text``. Empty when no countable word is found.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    frequency_table = {}
    for token in word_tokenize(sentence_text):
        lowered = token.lower()
        # Tokens such as "." or "?" are not words; counting them would make
        # punctuation the most "frequent" entry in the table.
        if not any(ch.isalnum() for ch in lowered):
            continue
        # Test the surface form BEFORE stemming: the stemmer can rewrite a stop
        # word into something that is no longer in the list (e.g. "does" ->
        # "doe"), which would let it slip through the filter.
        if lowered in stop_words:
            continue
        root = stemmer.stem(lowered)
        # Also drop words whose stem is itself a stop word, so nothing the
        # previous stem-only check filtered out is let back in.
        if root in stop_words:
            continue
        frequency_table[root] = frequency_table.get(root, 0) + 1
    return frequency_table
        
    

In [None]:
# function to calculate sentence scores with weighted average occurrence of key words
def calculate_sentence_scores(sentences, frequency_table):
    """Score sentences by the weighted occurrence of their key words (work in progress).

    TODO: the scoring itself is not implemented in the lines shown here. So far
    only the token count of each sentence is computed; ``frequency_table`` is
    not read yet and ``sentence_weight`` is never filled in.

    Args:
        sentences: Iterable of sentence strings to score.
        frequency_table: Word-frequency table, presumably the dict produced by
            ``create_dictionary_table`` (not used yet, so unconfirmed).
    """
    sentence_weight = {}
    # Iterate over the argument rather than the notebook-level `allSentences`,
    # so the function works on whatever the caller passes in and does not
    # depend on an earlier cell having been run.
    for sentence in sentences:
        wordcount = len(word_tokenize(sentence))
        
    