In [1]:
import nltk    
# NLTK is a leading platform for building Python programs to work with human language data
# nltk.corpus: The modules in this package provide functions that can be used to read corpus files in a variety of formats.
from nltk.corpus import stopwords
#  A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore
from nltk.cluster.util import cosine_distance
# Returns 1 minus the cosine of the angle between vectors v and u. This is equal to ``1 - (u.v / |u||v|)``.

import pyforest as py
import networkx as nx


# NetworkX is a Python package for the creation, manipulation,
#and study of the structure, dynamics, and functions of complex networks.

In [2]:
def read_article(file_name):
    file = open(file_name, 'r')
    filedata = file.readlines()
    article = filedata[0].split('. ') 
    sentences = []
    
    for sentence in article:
        print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(' '))
    sentences.pop()
    
    return sentences

In [3]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
        
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    
    all_words = list(set(sent1+sent2))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
        
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
        
        
    return 1 - cosine_distance(vector1, vector2)

In [4]:
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1==idx2:
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
            
    return similarity_matrix

In [5]:
def generate_summary(file_name, top_n=5):
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
    summary_text = []
    
    sentences = read_article(file_name)
    
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print('Indexes of top ranked sentence order are ', ranked_sentence)
    
    for i in range(top_n):
        summary_text.append(" ".join(ranked_sentence[i][1]))
        
    print("Summarize Text: \n", ". ".join(summary_text))
    
    

In [7]:
generate_summary('msft.txt', 5)

In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills
Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transforming lives and industry and 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maverick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

Indexes of top ranked sentence order are  [(0.15083257041122708, ['Envisioned', 'as', 'a', 'three-year', 'collaborative', 'program,', 'Intelligent', 'Cloud', 'Hub', 'will', 'support', 'around', '100', 'institutions', 'with', 'AI', 'infrastructure,', 'course', 'content', 'and', 'curriculum,', 'developer', 'support,', 'development', 'tools', 'and', 'give', 'students', 'access', 'to', 'cloud', 'and', 'AI', 'services']), (0.13161201335715553, ['The', 'company', 'will', 'provide', 'AI', 'development', 'tools', 'and', 'Azure', 'AI', 'services', 'such', 'as', 'Microsoft', 'Cognitive', 'Services,', 'Bot', 'Services', 'and', 'Azure', 'Machine', 'Learning.According', 'to', 'Manish', 'Prakash,', 'Country', 'General', 'Manager-PS,', 'Health', 'and', 'Education,', 'Microsoft', 'India,', 'said,', '"With', 'AI', 'being', 'the', 'defining', 'technology', 'of', 'our', 'time,', 'it', 'is', 'transforming', 'lives', 'and', 'industry', 'and', 'the', 'jobs', 'of', 'tomorrow', 'will', 'require', 'a', 'differ

In [8]:
generate_summary('random_article.txt', 2)

Many languages do not use articles ("a," "an," and "the"), or if they do exist, the way they are used may be different than in English
Multilingual writers often find article usage to be one of the most difficult concepts to learn
Although there are some rules about article usage to help, there are also quite a few exceptions
Therefore, learning to use articles accurately takes a long time
To master article usage, it is necessary to do a great deal of reading, notice how articles are used in published texts, and take notes that can apply back to your own writing.



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maverick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

Indexes of top ranked sentence order are  [(0.25, ['Therefore,', 'learning', 'to', 'use', 'articles', 'accurately', 'takes', 'a', 'long', 'time']), (0.25, ['Multilingual', 'writers', 'often', 'find', 'article', 'usage', 'to', 'be', 'one', 'of', 'the', 'most', 'difficult', 'concepts', 'to', 'learn']), (0.25, ['Many', 'languages', 'do', 'not', 'use', 'articles', '("a,"', '"an,"', 'and', '"the"),', 'or', 'if', 'they', 'do', 'exist,', 'the', 'way', 'they', 'are', 'used', 'may', 'be', 'different', 'than', 'in', 'English']), (0.25, ['Although', 'there', 'are', 'some', 'rules', 'about', 'article', 'usage', 'to', 'help,', 'there', 'are', 'also', 'quite', 'a', 'few', 'exceptions'])]
Summarize Text: 
 Therefore, learning to use articles accurately takes a long time. Multilingual writers often find article usage to be one of the most difficult concepts to learn
