In [1]:
import spacy
from spacy.lang.en import STOP_WORDS
from string import punctuation

In [2]:
text="""A potential issue with this encoder-decoder approach is that a neural network needs to be able to compress all the necessary information of a source sentence into a fixed-length vector. 
        This may make it difficult for the neural network to cope with long sentences. The performance of a basic encoder-decoder deteriorates rapidly as the length of an input sentence increases."""

In [25]:
stopwords=list(STOP_WORDS)
pun=punctuation+'\n'
pun=list(pun)


In [4]:
nlp=spacy.load('en_core_web_md')

In [5]:
doc=nlp(text)

In [26]:
tokens=[token.text for token in doc]
print(tokens)

['A', 'potential', 'issue', 'with', 'this', 'encoder', '-', 'decoder', 'approach', 'is', 'that', 'a', 'neural', 'network', 'needs', 'to', 'be', 'able', 'to', 'compress', 'all', 'the', 'necessary', 'information', 'of', 'a', 'source', 'sentence', 'into', 'a', 'fixed', '-', 'length', 'vector', '.', '\n        ', 'This', 'may', 'make', 'it', 'difficult', 'for', 'the', 'neural', 'network', 'to', 'cope', 'with', 'long', 'sentences', '.', 'The', 'performance', 'of', 'a', 'basic', 'encoder', '-', 'decoder', 'deteriorates', 'rapidly', 'as', 'the', 'length', 'of', 'an', 'input', 'sentence', 'increases', '.']


In [30]:
tokens=[w.lower() for w in tokens if w not in stopwords and w not in pun]
print(tokens)

['potential', 'issue', 'encoder', 'decoder', 'approach', 'neural', 'network', 'needs', 'able', 'compress', 'necessary', 'information', 'source', 'sentence', 'fixed', 'length', 'vector', '\n        ', 'difficult', 'neural', 'network', 'cope', 'long', 'sentences', 'performance', 'basic', 'encoder', 'decoder', 'deteriorates', 'rapidly', 'length', 'input', 'sentence', 'increases']


In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [33]:
TOKENS=Tokenizer()
TOKENS.fit_on_texts(tokens)

In [42]:
# count max frequencies of words
max_freq=max(TOKENS.word_counts.values())

In [43]:
# normalise the word frequencies 

for word in TOKENS.word_counts:
    TOKENS.word_counts[word]=TOKENS.word_counts[word]/max_freq

In [45]:
print(TOKENS.word_counts)

OrderedDict([('potential', 0.5), ('issue', 0.5), ('encoder', 1.0), ('decoder', 1.0), ('approach', 0.5), ('neural', 1.0), ('network', 1.0), ('needs', 0.5), ('able', 0.5), ('compress', 0.5), ('necessary', 0.5), ('information', 0.5), ('source', 0.5), ('sentence', 1.0), ('fixed', 0.5), ('length', 1.0), ('vector', 0.5), ('difficult', 0.5), ('cope', 0.5), ('long', 0.5), ('sentences', 0.5), ('performance', 0.5), ('basic', 0.5), ('deteriorates', 0.5), ('rapidly', 0.5), ('input', 0.5), ('increases', 0.5)])


In [46]:
sentences=[sent for sent in doc.sents]
print(sentences)

[A potential issue with this encoder-decoder approach is that a neural network needs to be able to compress all the necessary information of a source sentence into a fixed-length vector. 
        , This may make it difficult for the neural network to cope with long sentences., The performance of a basic encoder-decoder deteriorates rapidly as the length of an input sentence increases.]


In [47]:
sentence_scores={}
for sent in sentences:
    for word in sent:
        if word.text.lower() in TOKENS.word_counts.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent]=TOKENS.word_counts[word.text.lower()]
            else:
                sentence_scores[sent]+=TOKENS.word_counts[word.text.lower()]

In [48]:
sentence_scores

{A potential issue with this encoder-decoder approach is that a neural network needs to be able to compress all the necessary information of a source sentence into a fixed-length vector. 
         : 11.5,
 This may make it difficult for the neural network to cope with long sentences.: 4.0,
 The performance of a basic encoder-decoder deteriorates rapidly as the length of an input sentence increases.: 7.0}

In [49]:
from heapq import nlargest

In [50]:
# here i select 40% of total sentences length
# we can select as much of our criteria
select_length=int(len(sentences)*0.4)
select_length

1

In [51]:
summary=nlargest(select_length,sentence_scores,key=sentence_scores.get)
summary

[A potential issue with this encoder-decoder approach is that a neural network needs to be able to compress all the necessary information of a source sentence into a fixed-length vector. 
         ]

In [52]:
finale_summary=''.join([w.text for w in summary])
finale_summary

'A potential issue with this encoder-decoder approach is that a neural network needs to be able to compress all the necessary information of a source sentence into a fixed-length vector. \n        '