In [1]:
# Check the resources doc if you haven't already!
# Also please add whatever interesting/useful sources you find on there for everyone else!

# Text summarization tends to have two approaches: extraction and abstraction
# Because abstraction is more complex, we can try to build an extraction algorithm first
# That said, I would definitely recommend reading up on abstraction as well, to get an idea of how we can go forward from here

In [2]:
# For collaborators, right now I'm using the algorithm here:
# https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/


In [None]:
# TL;DR: Our extraction algorithm will go like this:
# -obtain data
# -process text
# -tokenization
# -find the weighted frequency of each word (weighted by sentence length, paragraph length, etc.)
# -substitute words with their weighted frequencies (exactly what it sounds like)
# -sum up the weighted frequencies in each sentence, and the sentences with highest sums make up our summary

In [21]:
# Imports come first; the NLTK data packages are fetched afterwards

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the 'punkt' and 'stopwords' data packages (already-present packages are reported as up-to-date)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jrh25\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jrh25\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
# word_tokenize splits a string into a list of tokens; as the printed output shows,
# punctuation such as the final '.' comes out as its own token
text = "Sally sells seashells by the seashore."
words = word_tokenize(text)
print(words)

['Sally', 'sells', 'seashells', 'by', 'the', 'seashore', '.']


In [15]:
# TODO: make this less memory intensive
# TODO: fetch paragraphContent from the database file; it is hard-coded here for testing purposes only
paragraphContent = (
    "Sally sells seashells by the seashore. "
    "She sells seashells on the seashell shore. "
    "How much wood could the woodchuck chuck if the woodchuck could chuck wood? "
    "She sells seashells by the seashore."
)

In [16]:
def cutParagraph(content):
    """Split a paragraph into sentences.

    Args:
        content: the paragraph text to split.

    Returns:
        Whatever ``sent_tokenize`` returns for ``content`` (the sentences
        of the paragraph, in order).
    """
    return sent_tokenize(content)

In [17]:
# sent_tokenize (sentence tokenize) breaks a paragraph into its individual sentences.
# Split the sample paragraph and print one sentence per line to check the result.
cut = cutParagraph(paragraphContent)
for i in cut:
    print(i)

Sally sells seashells by the seashore.
She sells seashells on the seashell shore.
How much wood could the woodchuck chuck if the woodchuck could chuck wood?
She sells seashells by the seashore.


In [23]:
# Stop words are the common filler words in each sentence (e.g. 'by', 'is', 'of', 'to')
# that we can feasibly ignore when scoring.
# The English list is loaded into a set and printed in full so we can see what it contains.
stop_words = set(stopwords.words('english'))
print(stop_words)

{'out', 'they', 'before', 'why', 'will', 'these', 'their', 'if', 'off', 'our', 'because', 'for', 'but', "isn't", 'whom', 'where', 'just', 'when', 'or', 'shouldn', "shouldn't", 'during', 'up', 'than', 'her', 'had', 'himself', "hadn't", 'same', 'those', "shan't", 'haven', 'shan', 'do', 'there', 'me', 'above', 'didn', 'other', "it's", 'ourselves', 'further', 'under', 't', 'she', 'not', 'be', 's', 'aren', 'am', 've', 'i', 'theirs', 'what', "aren't", 'you', 'here', 'down', 'to', 'below', 'your', 'ma', 'of', 'being', 'any', "wasn't", 'does', 'how', 'very', 'which', 'having', "couldn't", 'nor', 'again', 'so', 'ours', 'herself', 'some', 'been', 'an', 'can', 'it', 'at', "needn't", 'its', 'mightn', 'each', "hasn't", "haven't", 'themselves', 'over', 'wasn', 'until', 'by', 'once', 'yourself', "you'd", 'about', 'is', 'll', 'all', 'should', "didn't", 'such', 'few', "you'll", "should've", 'no', 'won', 'from', 'as', 'have', 'we', 'yourselves', 'm', 'now', 'into', 'weren', 'needn', 'itself', 'doing', '