#                                         SUMMARIZATION

## Importing Necessary Packages 

In [1]:
# Importing Libraries
from nltk.corpus import stopwords #you can remove stop words for speed
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

## Opening File & Splitting Into Sentences

In [2]:
# Loading file
file = open("/Users/dileepkumarkatla/Downloads/Text1.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(". ") #Just do the first paragraph

sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

It was the best of times
It was the worst of times
It was the age of wisdom
It was the age of foolishness
What is the importance of age
This is the best example.


## Printing List of Sentences

In [3]:
print("Sentences are ", sentences)

Sentences are  [['It', 'was', 'the', 'best', 'of', 'times'], ['It', 'was', 'the', 'worst', 'of', 'times'], ['It', 'was', 'the', 'age', 'of', 'wisdom'], ['It', 'was', 'the', 'age', 'of', 'foolishness'], ['What', 'is', 'the', 'importance', 'of', 'age'], ['This', 'is', 'the', 'best', 'example.']]


## Function To Calculate Similarity

In [4]:
# Defining sentence similarity & building vectors 
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
     # build the vector for the first sentence
    for w in sent1:
          vector1[all_words.index(w)] += 1
     # build the vector for the second sentence
    for w in sent2:
          vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Creating Similarity Matrix

In [5]:
# Building Similarity Matrix 
similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
             if idx1 == idx2: #ignore if both are same sentences
                continue 
             similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.83333333 0.66666667 0.66666667 0.33333333 0.36514837]
 [0.83333333 0.         0.66666667 0.66666667 0.33333333 0.18257419]
 [0.66666667 0.66666667 0.         0.83333333 0.5        0.18257419]
 [0.66666667 0.66666667 0.83333333 0.         0.5        0.18257419]
 [0.33333333 0.33333333 0.5        0.5        0.         0.36514837]
 [0.36514837 0.18257419 0.18257419 0.18257419 0.36514837 0.        ]]


## Getting Pagerank Scores

In [6]:
# Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.19306449754902083, 1: 0.18095893645850156, 2: 0.1911855225552033, 3: 0.1911855225552033, 4: 0.14434636291527997, 5: 0.09925915796679063}


## Sorting Sentences by Pagerank

In [7]:
# Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
print("Indexes of top ranked_sentence order are \n\n", ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.19306449754902083, ['It', 'was', 'the', 'best', 'of', 'times']), (0.1911855225552033, ['It', 'was', 'the', 'age', 'of', 'wisdom']), (0.1911855225552033, ['It', 'was', 'the', 'age', 'of', 'foolishness']), (0.18095893645850156, ['It', 'was', 'the', 'worst', 'of', 'times']), (0.14434636291527997, ['What', 'is', 'the', 'importance', 'of', 'age']), (0.09925915796679063, ['This', 'is', 'the', 'best', 'example.'])]


## Picking Top 'N' Sentences

In [8]:
# How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary? 2


## Printing Sentences

In [9]:
# Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 It was the best of times. It was the age of wisdom


# Summarization on Text File 3

## Opening File & Splitting Into Sentences

In [10]:
# Loading file
file = open("/Users/dileepkumarkatla/Downloads/Text3.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(". ") #Just do the first paragraph

sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

As an institution of higher learning, Sacred Heart University places special emphasis on academic integrity, which is a commitment to the fundamental values of honesty, trust, fairness, respect, and responsibility
Only when these values are widely respected and practiced by all members of the University students, faculty, administrators, and staff can the University maintain a culture that promotes free exploration of knowledge, constructive debate, genuine learning, effective research, fair assessment of student progress, and development of members characters
These aims of the University require that its members exercise mutual responsibilities
At its core, academic integrity is secured by a principled commitment to carry out these responsibilities, not by rules and penalties
Students and faculty should strive to create an academic environment that is honest, fair, and respectful of all
They do this by evaluating others work fairly, by responding to others ideas critically yet courteo

## Printing List of Sentences

In [11]:
print("Sentences are ", sentences)

Sentences are  [['As', 'an', 'institution', 'of', 'higher', 'learning,', 'Sacred', 'Heart', 'University', 'places', 'special', 'emphasis', 'on', 'academic', 'integrity,', 'which', 'is', 'a', 'commitment', 'to', 'the', 'fundamental', 'values', 'of', 'honesty,', 'trust,', 'fairness,', 'respect,', 'and', 'responsibility'], ['Only', 'when', 'these', 'values', 'are', 'widely', 'respected', 'and', 'practiced', 'by', 'all', 'members', 'of', 'the', 'University', 'students,', 'faculty,', 'administrators,', 'and', 'staff', 'can', 'the', 'University', 'maintain', 'a', 'culture', 'that', 'promotes', 'free', 'exploration', 'of', 'knowledge,', 'constructive', 'debate,', 'genuine', 'learning,', 'effective', 'research,', 'fair', 'assessment', 'of', 'student', 'progress,', 'and', 'development', 'of', 'members', 'characters'], ['These', 'aims', 'of', 'the', 'University', 'require', 'that', 'its', 'members', 'exercise', 'mutual', 'responsibilities'], ['At', 'its', 'core,', 'academic', 'integrity', 'is', 

## Function To Calculate Similarity

In [13]:
# Defining sentence similarity & building vectors 
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
     # build the vector for the first sentence
    for w in sent1:
          vector1[all_words.index(w)] += 1
     # build the vector for the second sentence
    for w in sent2:
          vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Creating Similarity Matrix

In [14]:
# Building Similarity Matrix 
similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
             if idx1 == idx2: #ignore if both are same sentences
                continue 
             similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.375      0.20412415 0.22116293 0.31622777 0.25315802
  0.30439039 0.33348648 0.26382243 0.25259074 0.3006689 ]
 [0.375      0.         0.40824829 0.17201562 0.31622777 0.36822985
  0.32780503 0.26274693 0.22613351 0.3127314  0.33407655]
 [0.20412415 0.40824829 0.         0.12038585 0.12909944 0.15032921
  0.22941573 0.14852213 0.18463724 0.29462783 0.21821789]
 [0.22116293 0.17201562 0.12038585 0.         0.2331262  0.35290144
  0.16571045 0.17879963 0.13336627 0.         0.11821656]
 [0.31622777 0.31622777 0.12909944 0.2331262  0.         0.26200013
  0.26655699 0.26843775 0.14301939 0.09128709 0.25354628]
 [0.25315802 0.36822985 0.15032921 0.35290144 0.26200013 0.
  0.27590308 0.22327214 0.19429458 0.1860229  0.22143052]
 [0.30439039 0.32780503 0.22941573 0.16571045 0.26655699 0.27590308
  0.         0.3180176  0.25415212 0.32444284 0.37546963]
 [0.33348648 0.26274693 0.14852213 0.17879963 0.26843775 0.22327214
  0.3180176  0.         0.32907259 0.31

## Getting Pagerank Scores

In [15]:
# Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.10052808533762261, 1: 0.1092617813187325, 2: 0.07743541734389628, 3: 0.06555131921078501, 4: 0.08353031285994028, 5: 0.09032582020361654, 6: 0.10094508912146115, 7: 0.10059598487085707, 8: 0.08058955768995534, 9: 0.08737172856802625, 10: 0.10386490347510732}


## Sorting Sentences by Pagerank

In [16]:
# Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
print("Indexes of top ranked_sentence order are \n\n", ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.1092617813187325, ['Only', 'when', 'these', 'values', 'are', 'widely', 'respected', 'and', 'practiced', 'by', 'all', 'members', 'of', 'the', 'University', 'students,', 'faculty,', 'administrators,', 'and', 'staff', 'can', 'the', 'University', 'maintain', 'a', 'culture', 'that', 'promotes', 'free', 'exploration', 'of', 'knowledge,', 'constructive', 'debate,', 'genuine', 'learning,', 'effective', 'research,', 'fair', 'assessment', 'of', 'student', 'progress,', 'and', 'development', 'of', 'members', 'characters']), (0.10386490347510732, ['All', 'matriculated', 'students', 'will', 'be', 'provided', 'with', 'a', 'full', 'description', 'of', 'the', 'University', 'standards', 'for', 'academic', 'integrity,', 'consequences', 'for', 'violations,', 'and', 'the', 'appeals', 'procedure.']), (0.10094508912146115, ['Appropriate', 'disciplinary', 'action', 'will', 'be', 'taken', 'for', 'violations', 'of', 'academic', 'integrity,', 'including', 'plagiari

## Picking Top 'N' Sentences

In [17]:
# How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary? 3


## Printing Sentences

In [18]:
# Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 Only when these values are widely respected and practiced by all members of the University students, faculty, administrators, and staff can the University maintain a culture that promotes free exploration of knowledge, constructive debate, genuine learning, effective research, fair assessment of student progress, and development of members characters. All matriculated students will be provided with a full description of the University standards for academic integrity, consequences for violations, and the appeals procedure.. Appropriate disciplinary action will be taken for violations of academic integrity, including plagiarism, cheating, any use of materials for an assignment or exam that is not permitted by the instructor, and theft or mutilation of intellectual materials or other University equipment


# Summarization on Text File 5

## Opening File & Splitting Into Sentences

In [19]:
# Loading file
file = open("/Users/dileepkumarkatla/Downloads/Text5.txt", "r")
#This file contains one paragraph of multiple sentences
filedata = file.readlines()
article = filedata[0].split(". ") #Just do the first paragraph

sentences = []
for sentence in article:
    print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))

In an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills
Envisioned as a three-year collaborative program, Intelligent Cloud Hub will support around 100 institutions with AI infrastructure, course content and curriculum, developer support, development tools and give students access to cloud and AI services
As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transIn an attempt to build an AI-re

## Printing List of Sentences

In [20]:
print("Sentences are ", sentences)

Sentences are  [['In', 'an', 'attempt', 'to', 'build', 'an', 'AI-ready', 'workforce,', 'Microsoft', 'announced', 'Intelligent', 'Cloud', 'Hub', 'which', 'has', 'been', 'launched', 'to', 'empower', 'the', 'next', 'generation', 'of', 'students', 'with', 'AI-ready', 'skills'], ['Envisioned', 'as', 'a', 'three-year', 'collaborative', 'program,', 'Intelligent', 'Cloud', 'Hub', 'will', 'support', 'around', '100', 'institutions', 'with', 'AI', 'infrastructure,', 'course', 'content', 'and', 'curriculum,', 'developer', 'support,', 'development', 'tools', 'and', 'give', 'students', 'access', 'to', 'cloud', 'and', 'AI', 'services'], ['As', 'part', 'of', 'the', 'program,', 'the', 'Redmond', 'giant', 'which', 'wants', 'to', 'expand', 'its', 'reach', 'and', 'is', 'planning', 'to', 'build', 'a', 'strong', 'developer', 'ecosystem', 'in', 'India', 'with', 'the', 'program', 'will', 'set', 'up', 'the', 'core', 'AI', 'infrastructure', 'and', 'IoT', 'Hub', 'for', 'the', 'selected', 'campuses'], ['The', 'co

## Function To Calculate Similarity

In [21]:
# Defining sentence similarity & building vectors 
def sentence_similarity(sent1, sent2 ):
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
     # build the vector for the first sentence
    for w in sent1:
          vector1[all_words.index(w)] += 1
     # build the vector for the second sentence
    for w in sent2:
          vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)

## Creating Similarity Matrix

In [22]:
# Building Similarity Matrix 
similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
             if idx1 == idx2: #ignore if both are same sentences
                continue 
             similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2])

print("Smilarity matrix \n", similarity_matrix)

Smilarity matrix 
 [[0.         0.20994555 0.32141217 0.6415029  0.20994555 0.32141217
  0.15589237 0.04828045 0.15974461 0.40146253 0.27852425 0.33009387
  0.15569979]
 [0.20994555 0.         0.31546459 0.42735216 1.         0.31546459
  0.4500225  0.41812101 0.31127151 0.18964186 0.15075567 0.30785965
  0.20225996]
 [0.32141217 0.31546459 0.         0.45361105 0.31546459 1.
  0.45317826 0.23897606 0.16943475 0.64517472 0.44312937 0.412959
  0.22019275]
 [0.6415029  0.42735216 0.45361105 0.         0.42735216 0.45361105
  0.78978629 0.28827833 0.26013299 0.46555195 0.34016803 0.39970544
  0.25354628]
 [0.20994555 1.         0.31546459 0.42735216 0.         0.31546459
  0.4500225  0.41812101 0.31127151 0.18964186 0.15075567 0.30785965
  0.20225996]
 [0.32141217 0.31546459 1.         0.45361105 0.31546459 0.
  0.45317826 0.23897606 0.16943475 0.64517472 0.44312937 0.412959
  0.22019275]
 [0.15589237 0.4500225  0.45317826 0.78978629 0.4500225  0.45317826
  0.         0.44155786 0.2282771

## Getting Pagerank Scores

In [23]:
# Rank sentences in similarity martix
sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(sentence_similarity_graph)
print("scores", scores)

scores {0: 0.06505377280671527, 1: 0.08331029356108051, 2: 0.0944374740565271, 3: 0.0984332545355071, 4: 0.08331029356108051, 5: 0.0944374740565271, 6: 0.08956736183704041, 7: 0.06144739693431779, 8: 0.05275205184695904, 9: 0.08067100597330357, 10: 0.0654765113515673, 11: 0.07702790911312701, 12: 0.054075200366247446}


## Sorting Sentences by Pagerank

In [24]:
#  Sort the rank and pick top sentences
ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
print("Indexes of top ranked_sentence order are \n\n", ranked_sentence)

Indexes of top ranked_sentence order are 

 [(0.0984332545355071, ['The', 'company', 'will', 'provide', 'AI', 'development', 'tools', 'and', 'Azure', 'AI', 'services', 'such', 'as', 'Microsoft', 'Cognitive', 'Services,', 'Bot', 'Services', 'and', 'Azure', 'Machine', 'Learning.According', 'to', 'Manish', 'Prakash,', 'Country', 'General', 'Manager-PS,', 'Health', 'and', 'Education,', 'Microsoft', 'India,', 'said,', '"With', 'AI', 'being', 'the', 'defining', 'technology', 'of', 'our', 'time,', 'it', 'is', 'transIn', 'an', 'attempt', 'to', 'build', 'an', 'AI-ready', 'workforce,', 'Microsoft', 'announced', 'Intelligent', 'Cloud', 'Hub', 'which', 'has', 'been', 'launched', 'to', 'empower', 'the', 'next', 'generation', 'of', 'students', 'with', 'AI-ready', 'skills']), (0.0944374740565271, ['As', 'part', 'of', 'the', 'program,', 'the', 'Redmond', 'giant', 'which', 'wants', 'to', 'expand', 'its', 'reach', 'and', 'is', 'planning', 'to', 'build', 'a', 'strong', 'developer', 'ecosystem', 'in', 'In

## Picking Top 'N' Sentences

In [25]:
# How many sentences to pick
n = int(input("How many sentences do you want in the summary? "))
#n=2
summarize_text = []
for i in range(n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

How many sentences do you want in the summary? 2


## Printing Sentences

In [26]:
#  Offcourse, output the summarize text
print("Summarize Text: \n", ". ".join(summarize_text))

Summarize Text: 
 The company will provide AI development tools and Azure AI services such as Microsoft Cognitive Services, Bot Services and Azure Machine Learning.According to Manish Prakash, Country General Manager-PS, Health and Education, Microsoft India, said, "With AI being the defining technology of our time, it is transIn an attempt to build an AI-ready workforce, Microsoft announced Intelligent Cloud Hub which has been launched to empower the next generation of students with AI-ready skills. As part of the program, the Redmond giant which wants to expand its reach and is planning to build a strong developer ecosystem in India with the program will set up the core AI infrastructure and IoT Hub for the selected campuses
