In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [2]:
stopwords= list(STOP_WORDS)

In [3]:
nlp= spacy.load('en_core_web_sm')

In [4]:
text= '''At the same time, there began to take form a system of numbering, the calendar, hieroglyphic writing, and a technically advanced art, all of which later influenced other peoples. Within the framework of this gradual evolution or cultural
progress the Preclassic horizon has been divided into Lower, Middle and Upper periods, to which can be added a transitional or Protoclassic period with several features that would later distinguish the emerging civilizations of Mesoamerica.'''

In [5]:
doc= nlp(text)

In [6]:
tokens= [token.text for token in doc]
print(tokens)

['At', 'the', 'same', 'time', ',', 'there', 'began', 'to', 'take', 'form', 'a', 'system', 'of', 'numbering', ',', 'the', 'calendar', ',', 'hieroglyphic', 'writing', ',', 'and', 'a', 'technically', 'advanced', 'art', ',', 'all', 'of', 'which', 'later', 'influenced', 'other', 'peoples', '.', 'Within', 'the', 'framework', 'of', 'this', 'gradual', 'evolution', 'or', 'cultural', '\n', 'progress', 'the', 'Preclassic', 'horizon', 'has', 'been', 'divided', 'into', 'Lower', ',', 'Middle', 'and', 'Upper', 'periods', ',', 'to', 'which', 'can', 'be', 'added', 'a', 'transitional', 'or', 'Protoclassic', 'period', 'with', 'several', 'features', 'that', 'would', 'later', 'distinguish', 'the', 'emerging', 'civilizations', 'of', 'Mesoamerica', '.']


In [7]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# Now adding new line in punctuation
punctuation= punctuation + '/n'

In [9]:
# Words Frequencies
word_frequencies={}
for word in doc:
    if word.text.lower() not in stopwords:
        if word.text.lower() not in punctuation:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text]=1
            else:
                word_frequencies[word.text]+=1

In [11]:
print(word_frequencies)

{'time': 1, 'began': 1, 'form': 1, 'system': 1, 'numbering': 1, 'calendar': 1, 'hieroglyphic': 1, 'writing': 1, 'technically': 1, 'advanced': 1, 'art': 1, 'later': 2, 'influenced': 1, 'peoples': 1, 'framework': 1, 'gradual': 1, 'evolution': 1, 'cultural': 1, '\n': 1, 'progress': 1, 'Preclassic': 1, 'horizon': 1, 'divided': 1, 'Lower': 1, 'Middle': 1, 'Upper': 1, 'periods': 1, 'added': 1, 'transitional': 1, 'Protoclassic': 1, 'period': 1, 'features': 1, 'distinguish': 1, 'emerging': 1, 'civilizations': 1, 'Mesoamerica': 1}


In [12]:
max_frequency= max(word_frequencies.values())

In [13]:
max_frequency

2

In [14]:
for word in word_frequencies.keys():
    word_frequencies[word]= word_frequencies[word]/max_frequency

In [15]:
print(word_frequencies)

{'time': 0.5, 'began': 0.5, 'form': 0.5, 'system': 0.5, 'numbering': 0.5, 'calendar': 0.5, 'hieroglyphic': 0.5, 'writing': 0.5, 'technically': 0.5, 'advanced': 0.5, 'art': 0.5, 'later': 1.0, 'influenced': 0.5, 'peoples': 0.5, 'framework': 0.5, 'gradual': 0.5, 'evolution': 0.5, 'cultural': 0.5, '\n': 0.5, 'progress': 0.5, 'Preclassic': 0.5, 'horizon': 0.5, 'divided': 0.5, 'Lower': 0.5, 'Middle': 0.5, 'Upper': 0.5, 'periods': 0.5, 'added': 0.5, 'transitional': 0.5, 'Protoclassic': 0.5, 'period': 0.5, 'features': 0.5, 'distinguish': 0.5, 'emerging': 0.5, 'civilizations': 0.5, 'Mesoamerica': 0.5}


In [16]:
# Sentence Tokenization
sentence_tokens= [sent for sent in doc.sents]
print(sentence_tokens)

[At the same time, there began to take form a system of numbering, the calendar, hieroglyphic writing, and a technically advanced art, all of which later influenced other peoples., Within the framework of this gradual evolution or cultural
progress the Preclassic horizon has been divided into Lower, Middle and Upper periods, to which can be added a transitional or Protoclassic period with several features that would later distinguish the emerging civilizations of Mesoamerica.]


In [17]:
sentence_scores={}
for sent in sentence_tokens:
    for word in sent:
        if word.text.lower() in word_frequencies.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent]= word_frequencies[word.text.lower()]
            else:
                sentence_scores[sent]+= word_frequencies[word.text.lower()]

In [18]:
sentence_scores

{At the same time, there began to take form a system of numbering, the calendar, hieroglyphic writing, and a technically advanced art, all of which later influenced other peoples.: 7.5,
 Within the framework of this gradual evolution or cultural
 progress the Preclassic horizon has been divided into Lower, Middle and Upper periods, to which can be added a transitional or Protoclassic period with several features that would later distinguish the emerging civilizations of Mesoamerica.: 9.0}

In [19]:
from heapq import nlargest

In [23]:
# Select 50% of sentences
select_length= int(len(sentence_tokens)*0.5)
select_length

1

In [24]:
summary= nlargest(select_length, sentence_scores, key= sentence_scores.get)

In [25]:
summary

[Within the framework of this gradual evolution or cultural
 progress the Preclassic horizon has been divided into Lower, Middle and Upper periods, to which can be added a transitional or Protoclassic period with several features that would later distinguish the emerging civilizations of Mesoamerica.]

In [26]:
# To allign the summary text
final_summary= [word.text for word in summary]

In [27]:
summary= ' '.join(final_summary)

In [28]:
print(summary)

Within the framework of this gradual evolution or cultural
progress the Preclassic horizon has been divided into Lower, Middle and Upper periods, to which can be added a transitional or Protoclassic period with several features that would later distinguish the emerging civilizations of Mesoamerica.


In [29]:
len(text)

478

In [30]:
len(summary)

299