In [63]:
import spacy

In [64]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation
from heapq import nlargest

In [65]:
nlp = spacy.load('en_core_web_sm')

In [66]:
with open("text.txt") as my_file:
    text = my_file.read()

In [67]:
print(text)

The Orbiter Discovery, OV-103, is considered eligible for listing in the National Register of Historic Places (NRHP) in the context of the U.S. Space Shuttle Program (1969-2011) under Criterion A in the areas of Space Exploration and Transportation and under Criterion C in the area of Engineering. Because it has achieved significance within the past fifty years, Criteria Consideration G applies. Under Criterion A, Discovery is significant as the oldest of the three extant orbiter vehicles constructed for the Space Shuttle Program (SSP), the longest running American space program to date; she was the third of five orbiters built by NASA. Unlike the Mercury, Gemini, and Apollo programs, the SSP’s emphasis was on cost effectiveness and reusability, and eventually the construction of a space station. Including her maiden voyage (launched August 30, 1984), Discovery flew to space thirty-nine times, more than any of the other four orbiters; she was also the first orbiter to fly twenty missio

In [68]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [69]:
doc = nlp(text)

In [70]:
stop_words = set(stopwords.words('english'))

In [71]:
punctuation = punctuation + '\n'

In [73]:
word_frequencies = {}
for word in doc:
  if word.text.lower() not in stop_words:
    if word.text.lower() not in punctuation:
      if word.text not in word_frequencies.keys():
        word_frequencies[word.text] = 1
      else:
        word_frequencies[word.text] += 1

In [74]:
print(word_frequencies)

{'Orbiter': 1, 'Discovery': 7, 'OV-103': 1, 'considered': 1, 'eligible': 1, 'listing': 1, 'National': 1, 'Register': 1, 'Historic': 1, 'Places': 1, 'NRHP': 1, 'context': 1, 'U.S.': 2, 'Space': 10, 'Shuttle': 6, 'Program': 2, '1969': 1, '2011': 1, 'Criterion': 4, 'areas': 1, 'Exploration': 1, 'Transportation': 1, 'C': 2, 'area': 1, 'Engineering': 1, 'achieved': 1, 'significance': 1, 'within': 1, 'past': 1, 'fifty': 1, 'years': 1, 'Criteria': 1, 'Consideration': 1, 'G': 1, 'applies': 1, 'significant': 2, 'oldest': 1, 'three': 1, 'extant': 1, 'orbiter': 6, 'vehicles': 2, 'constructed': 1, 'SSP': 2, 'longest': 1, 'running': 1, 'American': 1, 'space': 3, 'program': 1, 'date': 1, 'third': 1, 'five': 3, 'orbiters': 2, 'built': 1, 'NASA': 1, 'Unlike': 1, 'Mercury': 1, 'Gemini': 1, 'Apollo': 1, 'programs': 1, '’s': 1, 'emphasis': 1, 'cost': 1, 'effectiveness': 1, 'reusability': 1, 'eventually': 1, 'construction': 2, 'station': 2, 'Including': 1, 'maiden': 1, 'voyage': 1, 'launched': 1, 'August'

In [75]:
max_frequency = max(word_frequencies.values())
print(max_frequency)

10


In [76]:
for word in word_frequencies.keys():
  word_frequencies[word] = word_frequencies[word]/max_frequency

In [77]:
sentences = sent_tokenize(text)

In [78]:
sentence_weight = dict()
for sentence in sentences:
    sentence_wordcount = len(word_tokenize(sentence))
    sentence_wordcount_no_sw = 0
    for word_weight in word_frequencies:
        if word_weight in sentence.lower():
            sentence_wordcount_no_sw += 1
            if sentence in sentence_weight:
                sentence_weight[sentence] += word_frequencies[word_weight]
            else:
                sentence_weight[sentence] = word_frequencies[word_weight]
    sentence_weight[sentence]

In [79]:
print(sentence_weight)

{'The Orbiter Discovery, OV-103, is considered eligible for listing in the National Register of Historic Places (NRHP) in the context of the U.S. Space Shuttle Program (1969-2011) under Criterion A in the areas of Space Exploration and Transportation and under Criterion C in the area of Engineering.': 2.3000000000000003, 'Because it has achieved significance within the past fifty years, Criteria Consideration G applies.': 0.7999999999999999, 'Under Criterion A, Discovery is significant as the oldest of the three extant orbiter vehicles constructed for the Space Shuttle Program (SSP), the longest running American space program to date; she was the third of five orbiters built by NASA.': 3.200000000000001, 'Unlike the Mercury, Gemini, and Apollo programs, the SSP’s emphasis was on cost effectiveness and reusability, and eventually the construction of a space station.': 1.4999999999999998, 'Including her maiden voyage (launched August 30, 1984), Discovery flew to space thirty-nine times, 

In [80]:
from heapq import nlargest

In [91]:
select_length = int(len(sentence_weight)*0.3)

In [92]:
summary = nlargest(select_length, sentence_weight, key = sentence_weight.get)

In [93]:
fin_summary = [word for word in summary]
summary = " ".join(fin_summary)

In [84]:
print(summary)

According to Wayne Hale, a flight director from Johnson Space Center, the Space Shuttle orbiter represents a “huge technological leap from expendable rockets and capsules to a reusable, winged, hypersonic, cargo-carrying spacecraft.” Although her base structure followed a conventional aircraft design, she used advanced materials that both minimized her weight for cargo-carrying purposes and featured low thermal expansion ratios, which provided a stable base for her Thermal Protection System (TPS) materials. Including her maiden voyage (launched August 30, 1984), Discovery flew to space thirty-nine times, more than any of the other four orbiters; she was also the first orbiter to fly twenty missions. The Space Shuttle orbiter also featured the first reusable TPS; all previous spaceflight vehicles had a single-use, ablative heat shield. As Hale stated, the Space Shuttle remains “the largest, fastest, winged hypersonic aircraft in history,” having regularly flown at twenty-five times the 

In [94]:
len(text), len(summary)

(2906, 1015)

In [86]:
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")
stopwords = list(STOP_WORDS)
doc = nlp(text)

word_freq = {}
for word in doc:
    if word.text.lower() not in stopwords and word.text.lower() not in punctuation:
        if word.text not in word_freq.keys():
            word_freq[word.text] = 1
        else:
            word_freq[word.text] += 1

max_freq = max(word_freq.values())

for word in word_freq.keys():
    word_freq[word] = word_freq[word]/max_freq

In [87]:
sent_scores = {}
sent_tokents = [sent for sent in doc.sents]

for sent in sent_tokents:
    for word in sent:
        if word.text in word_freq.keys():
            if sent not in sent_scores.keys():
                sent_scores[sent] = word_freq[word.text]
            else:
                sent_scores[sent] += word_freq[word.text]

select_len = int(len(sent_tokents) * 0.3)
summary_spacy = nlargest(select_len, sent_scores, key=sent_scores.get)
fin_summary = [word.text for word in summary_spacy]
summary_spacy = " ".join(fin_summary)

In [89]:
print(summary_spacy)

The Orbiter Discovery, OV-103, is considered eligible for listing in the National Register of Historic Places (NRHP) in the context of the U.S. Space Shuttle Program (1969-2011) under Criterion A in the areas of Space Exploration and Transportation and under Criterion C in the area of Engineering. According to Wayne Hale, a flight director from Johnson Space Center, the Space Shuttle orbiter represents a “huge technological leap from expendable rockets and capsules to a reusable, winged, hypersonic, cargo-carrying spacecraft.” In addition, Discovery was vital to the construction of the International Space Station (ISS); she flew thirteen of the thirty-seven total missions flown to the station by a U.S. Space Shuttle. Under Criterion A, Discovery is significant as the oldest of the three extant orbiter vehicles constructed for the Space Shuttle Program (SSP), the longest running American space program to date; she was the third of five orbiters built by NASA. As Hale stated, the Space S

In [90]:
print(f"Summarry: {len(summary_spacy)} Text: {len(text)}")

Summarry: 1141 Text: 2906
