# Import Libraries

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from pprint import pprint

# Load DATA

In [3]:
def read_article(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences

In [4]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [5]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [6]:
def ranked_sentence(file_name):
    stop_words = stopwords.words('english')
    summarize_text = []

    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []

    for sentence in article:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    print("\n\n Indexes of top ranked_sentence order are \n\n")
    pprint(ranked_sentence)

In [7]:
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("\n\n Summarize Text: \n\n", ". ".join(summarize_text))

In [8]:
ranked_sentence('text.txt')



 Indexes of top ranked_sentence order are 


[(0.15083257041122708,
  ['Envisioned',
   'as',
   'a',
   'three-year',
   'collaborative',
   'program,',
   'Intelligent',
   'Cloud',
   'Hub',
   'will',
   'support',
   'around',
   '100',
   'institutions',
   'with',
   'AI',
   'infrastructure,',
   'course',
   'content',
   'and',
   'curriculum,',
   'developer',
   'support,',
   'development',
   'tools',
   'and',
   'give',
   'students',
   'access',
   'to',
   'cloud',
   'and',
   'AI',
   'services']),
 (0.13161201335715553,
  ['The',
   'company',
   'will',
   'provide',
   'AI',
   'development',
   'tools',
   'and',
   'Azure',
   'AI',
   'services',
   'such',
   'as',
   'Microsoft',
   'Cognitive',
   'Services,',
   'Bot',
   'Services',
   'and',
   'Azure',
   'Machine',
   'Learning.According',
   'to',
   'Manish',
   'Prakash,',
   'Country',
   'General',
   'Manager-PS,',
   'Health',
   'and',
   'Education,',
   'Microsoft',
   'India,',
   'said,'

In [9]:
generate_summary('text.txt', 1)

In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills
Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transforming lives and industry and 

In [10]:
generate_summary('text.txt', 2)

In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills
Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transforming lives and industry and 

In [11]:
x = generate_summary('text.txt', 3)
print(x)

In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills
Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transforming lives and industry and 

In [12]:
text = 'For some, emoji have caused frustration for users (how the heck are you \
        supposed to use the 🙃 emoji?). Yet for many others, emoji has opened up \
        a fascinating new medium of communication. There are even emoji charade-esque \
        “games” where users can guess a movie title based on a series of emoji.\
        (try these: 💉💎 or 👦🏻👓⚡). But what happens when you push emoji a step further?'

In [13]:
import emot

emot.emoji(text)

{'value': ['🙃', '💉', '💎', '👦', '🏻', '👓', '⚡'],
 'mean': [':upside-down_face:',
  ':syringe:',
  ':gem_stone:',
  ':boy:',
  ':light_skin_tone:',
  ':glasses:',
  ':high_voltage:'],
 'location': [[100, 100],
  [337, 337],
  [338, 338],
  [343, 343],
  [344, 344],
  [345, 345],
  [346, 346]],
 'flag': True}

In [14]:
import spacy

import en_core_web_sm
nlp = en_core_web_sm.load()

def explain_text_entities(text):
    doc = nlp(text)
    for ent in doc.ents:
        print(f'Entity: {ent}, Label: {ent.label_}, {spacy.explain(ent.label_)}')
        
for i in range(1, 2):
    one_sentence = text
    doc = nlp(one_sentence)
    spacy.displacy.render(doc, style='ent',jupyter=True)

In [15]:
import gensim

In [16]:
help(gensim)

Help on package gensim:

NAME
    gensim

DESCRIPTION
    This package contains interfaces and functionality to compute pair-wise document similarities within a corpus
    of documents.

PACKAGE CONTENTS
    _matutils
    corpora (package)
    decorator
    downloader
    interfaces
    matutils
    models (package)
    nosy
    parsing (package)
    scripts (package)
    similarities (package)
    sklearn_api (package)
    summarization (package)
    test (package)
    topic_coherence (package)
    utils
    viz (package)

DATA

VERSION
    3.8.0

FILE
    c:\users\dell\anaconda3\envs\tensorflow env\lib\site-packages\gensim\__init__.py




In [17]:
help(gensim.summarization)

Help on package gensim.summarization in gensim:

NAME
    gensim.summarization - # bring model classes directly into package namespace, to save some typing

PACKAGE CONTENTS
    bm25
    commons
    graph
    keywords
    mz_entropy
    pagerank_weighted
    summarizer
    syntactic_unit
    textcleaner

FILE
    c:\users\dell\anaconda3\envs\tensorflow env\lib\site-packages\gensim\summarization\__init__.py




In [18]:
help(gensim.summarization.summarize)

Help on function summarize in module gensim.summarization.summarizer:

summarize(text, ratio=0.2, word_count=None, split=False)
    Get a summarized version of the given text.
    
    The output summary will consist of the most representative sentences
    and will be returned as a string, divided by newlines.
    
    Note
    ----
    The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH`
    sentences for the summary to make sense.
    The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.texcleaner`
    module. Note that newlines divide sentences.
    
    
    Parameters
    ----------
    text : str
        Given text.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words will the

In [19]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

In [20]:
print(summarize(text = 'For some, emoji have caused frustration for users (how the heck are you supposed to use the 🙃 emoji?).\
                Yet for many others, emoji PERSON has opened up a fascinating new medium of communication. There are \
                even emoji charade-esque PERSON “games” where users can guess a movie title based on a series of emoji. \
                (try these: 💉💎 or 👦🏻👓⚡). But what happens when you push emoji a step further?',
                ratio = 0.5,
                word_count = 15))

print(keywords('Computing has constantly been at the heart of technology advancement, and is also the pivotal \
force to propel the intelligent world. Huawei Intelligent Computing has been strategically invested in general \
and AI computing, and builds the innovative Kunpeng, Ascend, and x86 computing platforms to unlock the ultimate \
computing power. Huawei Intelligent Computing provides full-stack, all-scenario solutions for the cloud-edge-device \
to catalyze the intelligent transformation of traditional data centers and industries, leading the way forward \
to a fully connected, intelligent world.'))

There are                 even emoji charade-esque PERSON “games” where users can guess a movie title based on a series of emoji.
computing
intelligent
data
kunpeng
huawei


In [21]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [22]:
stopwords = list(STOP_WORDS)

In [23]:
document1 ="""Machine learning (ML) is the scientific study of algorithms and statistical models that computer 
systems use to progressively improve their performance on a specific task. Machine learning algorithms build a 
mathematical model of sample data, known as "training data", in order to make predictions or decisions without 
being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of 
email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an 
algorithm of specific instructions for performing the task. Machine learning is closely related to computational
statistics, which focuses on making predictions using computers. The study of mathematical optimization deliver
methods, theory and application domains to the field of machine learning. Data mining is a field of study within
machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across 
business problems, machine learning is also referred to as predictive analytics."""

# second document
document2 = """Our Father who art in heaven, hallowed be thy name. Thy kingdom come. Thy will be done, on 
earth as it is in heaven. Give us this day our daily bread; and forgive us our trespasses, as we forgive
those who trespass against us; and lead us not into temptation, but deliver us from evil
"""

In [24]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [25]:
docx = nlp(document1)

In [26]:
mytokens = [token.text for token in docx]

In [27]:
word_frequencies = {}
for word in docx:
    if word.text not in stopwords:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1
            else:
                word_frequencies[word.text] += 1
                
# lets print these word frequencies
print(word_frequencies)

{'Machine': 4, 'learning': 8, '(': 1, 'ML': 1, ')': 1, 'scientific': 1, 'study': 3, 'algorithms': 3, 'statistical': 1, 'models': 1, 'computer': 2, '\n': 9, 'systems': 1, 'use': 1, 'progressively': 1, 'improve': 1, 'performance': 1, 'specific': 2, 'task': 3, '.': 7, 'build': 1, 'mathematical': 2, 'model': 1, 'sample': 1, 'data': 3, ',': 9, 'known': 1, '"': 2, 'training': 1, 'order': 1, 'predictions': 2, 'decisions': 1, 'explicitly': 1, 'programmed': 1, 'perform': 1, 'applications': 1, 'email': 1, 'filtering': 1, 'detection': 1, 'network': 1, 'intruders': 1, 'vision': 1, 'infeasible': 1, 'develop': 1, 'algorithm': 1, 'instructions': 1, 'performing': 1, 'closely': 1, 'related': 1, 'computational': 1, 'statistics': 1, 'focuses': 2, 'making': 1, 'computers': 1, 'The': 1, 'optimization': 1, 'deliver': 1, 'methods': 1, 'theory': 1, 'application': 2, 'domains': 1, 'field': 2, 'machine': 3, 'Data': 1, 'mining': 1, 'exploratory': 1, 'analysis': 1, 'unsupervised': 1, 'In': 1, 'business': 1, 'prob

In [28]:
maximum_frequency = max(word_frequencies.values())
print(maximum_frequency)

9


In [29]:
for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequency)

# Frequency Table
word_frequencies

{'Machine': 0.4444444444444444,
 'learning': 0.8888888888888888,
 '(': 0.1111111111111111,
 'ML': 0.1111111111111111,
 ')': 0.1111111111111111,
 'scientific': 0.1111111111111111,
 'study': 0.3333333333333333,
 'algorithms': 0.3333333333333333,
 'statistical': 0.1111111111111111,
 'models': 0.1111111111111111,
 'computer': 0.2222222222222222,
 '\n': 1.0,
 'systems': 0.1111111111111111,
 'use': 0.1111111111111111,
 'progressively': 0.1111111111111111,
 'improve': 0.1111111111111111,
 'performance': 0.1111111111111111,
 'specific': 0.2222222222222222,
 'task': 0.3333333333333333,
 '.': 0.7777777777777778,
 'build': 0.1111111111111111,
 'mathematical': 0.2222222222222222,
 'model': 0.1111111111111111,
 'sample': 0.1111111111111111,
 'data': 0.3333333333333333,
 ',': 1.0,
 'known': 0.1111111111111111,
 '"': 0.2222222222222222,
 'training': 0.1111111111111111,
 'order': 0.1111111111111111,
 'predictions': 0.2222222222222222,
 'decisions': 0.1111111111111111,
 'explicitly': 0.1111111111111111

In [30]:
sentence_list = [ sentence for sentence in docx.sents ]

# Sentence Score via comparrng each word with sentence
sentence_scores = {}  
for sent in sentence_list:  
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if len(sent.text.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]
                        
# printing the scores of importance for the sentences
print(sentence_scores)

{Machine learning (ML) is the scientific study of algorithms and statistical models that computer 
systems use to progressively improve their performance on a specific task.: 5.5555555555555545, Machine learning algorithms build a 
mathematical model of sample data, known as "training data", in order to make predictions or decisions without 
being explicitly programmed to perform the task.: 9.333333333333332, Machine learning is closely related to computational
statistics, which focuses on making predictions using computers.: 5.111111111111111, The study of mathematical optimization deliver
methods, theory and application domains to the field of machine learning.: 5.555555555555556, Data mining is a field of study within
machine learning, and focuses on exploratory data analysis through unsupervised learning.: 6.777777777777776, In its application across 
business problems, machine learning is also referred to as predictive analytics.: 4.777777777777778}


In [31]:
from heapq import nlargest

summarized_sentences = nlargest(7, sentence_scores, key=sentence_scores.get)
print(summarized_sentences)

[Machine learning algorithms build a 
mathematical model of sample data, known as "training data", in order to make predictions or decisions without 
being explicitly programmed to perform the task., Data mining is a field of study within
machine learning, and focuses on exploratory data analysis through unsupervised learning., The study of mathematical optimization deliver
methods, theory and application domains to the field of machine learning., Machine learning (ML) is the scientific study of algorithms and statistical models that computer 
systems use to progressively improve their performance on a specific task., Machine learning is closely related to computational
statistics, which focuses on making predictions using computers., In its application across 
business problems, machine learning is also referred to as predictive analytics.]


In [32]:
for w in summarized_sentences:
    print(w.text)

# List Comprehension of Sentences Converted From Spacy.span to strings
final_sentences = [ w.text for w in summarized_sentences ]

# lets join these sentences
summary = ' '.join(final_sentences)

# lets print the summary
print("\n\n Summarized Text:", summary)

Machine learning algorithms build a 
mathematical model of sample data, known as "training data", in order to make predictions or decisions without 
being explicitly programmed to perform the task.
Data mining is a field of study within
machine learning, and focuses on exploratory data analysis through unsupervised learning.
The study of mathematical optimization deliver
methods, theory and application domains to the field of machine learning.
Machine learning (ML) is the scientific study of algorithms and statistical models that computer 
systems use to progressively improve their performance on a specific task.
Machine learning is closely related to computational
statistics, which focuses on making predictions using computers.
In its application across 
business problems, machine learning is also referred to as predictive analytics.


 Summarized Text: Machine learning algorithms build a 
mathematical model of sample data, known as "training data", in order to make predictions or dec