[Reference](https://medium.com/1-hour-blog-series/automatic-text-summarization-made-simpler-using-python-577e7622c57a)

# 1 Spacy


In [1]:
import spacy

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [3]:
stopwords = list(STOP_WORDS)


In [4]:
document1 ="""Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data, known as “training data”, in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics."""
document2 = """Our Father who art in heaven, hallowed be thy name. Thy kingdom come. Thy will be done, on earth as it is in heaven. Give us this day our daily bread; and forgive us our trespasses, as we forgive those who trespass against us; and lead us not into temptation, but deliver us from evil
"""
nlp = spacy.load('en')

In [5]:
docx = nlp(document1)

In [6]:
mytokens = [token.text for token in docx]

In [7]:
word_frequencies = {}
for word in docx:
    if word.text not in stopwords:
        if word.text not in word_frequencies.keys():
            word_frequencies[word.text] = 1
        else:
            word_frequencies[word.text] += 1
word_frequencies

{'(': 1,
 ')': 1,
 ',': 9,
 '.': 7,
 'Data': 1,
 'In': 1,
 'ML': 1,
 'Machine': 4,
 'The': 1,
 'algorithm': 1,
 'algorithms': 3,
 'analysis': 1,
 'analytics': 1,
 'application': 2,
 'applications': 1,
 'build': 1,
 'business': 1,
 'closely': 1,
 'computational': 1,
 'computer': 2,
 'computers': 1,
 'data': 3,
 'decisions': 1,
 'delivers': 1,
 'detection': 1,
 'develop': 1,
 'domains': 1,
 'email': 1,
 'explicitly': 1,
 'exploratory': 1,
 'field': 2,
 'filtering': 1,
 'focuses': 2,
 'improve': 1,
 'infeasible': 1,
 'instructions': 1,
 'intruders': 1,
 'known': 1,
 'learning': 8,
 'machine': 3,
 'making': 1,
 'mathematical': 2,
 'methods': 1,
 'mining': 1,
 'model': 1,
 'models': 1,
 'network': 1,
 'optimization': 1,
 'order': 1,
 'perform': 1,
 'performance': 1,
 'performing': 1,
 'predictions': 2,
 'predictive': 1,
 'problems': 1,
 'programmed': 1,
 'progressively': 1,
 'referred': 1,
 'related': 1,
 'sample': 1,
 'scientific': 1,
 'specific': 2,
 'statistical': 1,
 'statistics': 1,
 '

In [10]:
maximum_frequency = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

In [11]:
word_frequencies

{'(': 0.1111111111111111,
 ')': 0.1111111111111111,
 ',': 1.0,
 '.': 0.7777777777777778,
 'Data': 0.1111111111111111,
 'In': 0.1111111111111111,
 'ML': 0.1111111111111111,
 'Machine': 0.4444444444444444,
 'The': 0.1111111111111111,
 'algorithm': 0.1111111111111111,
 'algorithms': 0.3333333333333333,
 'analysis': 0.1111111111111111,
 'analytics': 0.1111111111111111,
 'application': 0.2222222222222222,
 'applications': 0.1111111111111111,
 'build': 0.1111111111111111,
 'business': 0.1111111111111111,
 'closely': 0.1111111111111111,
 'computational': 0.1111111111111111,
 'computer': 0.2222222222222222,
 'computers': 0.1111111111111111,
 'data': 0.3333333333333333,
 'decisions': 0.1111111111111111,
 'delivers': 0.1111111111111111,
 'detection': 0.1111111111111111,
 'develop': 0.1111111111111111,
 'domains': 0.1111111111111111,
 'email': 0.1111111111111111,
 'explicitly': 0.1111111111111111,
 'exploratory': 0.1111111111111111,
 'field': 0.2222222222222222,
 'filtering': 0.1111111111111111,


In [12]:
sentence_list = [ sentence for sentence in docx.sents ]

In [14]:
sentence_scores = {}
for sent in sentence_list:
    for word in sent:
        if word.text.lower() in word_frequencies.keys():
            if len(sent.text.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent] += word_frequencies[word.text.lower()]

In [15]:
print (sentence_scores)

{Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task.: 4.555555555555556, Machine learning algorithms build a mathematical model of sample data, known as “training data”, in order to make predictions or decisions without being explicitly programmed to perform the task.: 7.111111111111109, Machine learning is closely related to computational statistics, which focuses on making predictions using computers.: 4.111111111111112, The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.: 4.555555555555556, Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.: 5.777777777777778, In its application across business problems, machine learning is also referred to as predictive analytics.: 3.7777777777777777}


In [41]:
# Threshold =0.6
# for i in sentence_scores:
#     print(i)
#     if (i > Threshold):
#         print(sent)

# 2 Gensim


In [18]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

In [20]:
summary=summarize(document1, word_count=50)

In [21]:
print(summary)

Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task.
Machine learning algorithms build a mathematical model of sample data, known as “training data”, in order to make predictions or decisions without being explicitly programmed to perform the task.


In [22]:
print(summarize(document1, ratio=0.5))


Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task.
Machine learning algorithms build a mathematical model of sample data, known as “training data”, in order to make predictions or decisions without being explicitly programmed to perform the task.
Machine learning is closely related to computational statistics, which focuses on making predictions using computers.


In [23]:
keywords(document1)

'data\nmachine learning\nfiltering\npredictions\npredictive\nmethods\nstatistics\nstatistical models\nspecific\nmodel'

# 3 py summarization


In [26]:
pip install pysummarization

Collecting pysummarization
  Downloading pysummarization-1.1.8.tar.gz (51 kB)
[?25l[K     |██████▍                         | 10 kB 20.6 MB/s eta 0:00:01[K     |████████████▊                   | 20 kB 10.5 MB/s eta 0:00:01[K     |███████████████████             | 30 kB 8.4 MB/s eta 0:00:01[K     |█████████████████████████▌      | 40 kB 7.7 MB/s eta 0:00:01[K     |███████████████████████████████▉| 51 kB 4.2 MB/s eta 0:00:01[K     |████████████████████████████████| 51 kB 193 kB/s 
Building wheels for collected packages: pysummarization
  Building wheel for pysummarization (setup.py) ... [?25l[?25hdone
  Created wheel for pysummarization: filename=pysummarization-1.1.8-py3-none-any.whl size=59453 sha256=134c1c3c578986d364a7d58f064b5101514e5ad4ae2c686b2285769397c3a1b0
  Stored in directory: /root/.cache/pip/wheels/da/14/3e/02d15001af23ca877c5149b66280a605e5cdbbe76972598afa
Successfully built pysummarization
Installing collected packages: pysummarization
Successfully installed

In [28]:
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

In [32]:
document = "Natural language generation (NLG) is the natural language processing task of generating natural language from a machine representation system such as a knowledge base or a logical form. Psycholinguists prefer the term language production when such formal representations are interpreted as models for mental representations."

# Object of automatic summarization.
auto_abstractor = AutoAbstractor()
# Set tokenizer.
auto_abstractor.tokenizable_doc = SimpleTokenizer()
# Set delimiter for making a list of sentence.
auto_abstractor.delimiter_list = [".", "\n"]
# Object of abstracting and filtering document.
abstractable_doc = TopNRankAbstractor()
# Summarize document.
result_dict = auto_abstractor.summarize(document, abstractable_doc)

In [33]:
for sentence in result_dict["summarize_result"]:
    print(sentence)

Natural language generation (NLG) is the natural language processing task of generating natural language from a machine representation system such as a knowledge base or a logical form.

 Psycholinguists prefer the term language production when such formal representations are interpreted as models for mental representations.

