In [None]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
import nltk
import docx
import re
from tika import parser
import continuous_threading as thread
from stop_words import get_stop_words
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
# Fetch the NLTK 'punkt' resource.
# NOTE(review): no visible cell calls nltk after this line - confirm it is still needed.
nltk.download('punkt')
# Load the small English spaCy pipeline; later cells call `nlp(...)` to split sentences.
nlp = spacy.load("en_core_web_sm")

In [None]:
def getText(filename):
    """Return the plain text of a PDF or Word document.

    Parameters
    ----------
    filename : str
        Path to the document. A name ending in "pdf" (any letter case) is
        read through tika's ``parser.from_file``; anything else is opened
        with ``docx.Document``.

    Returns
    -------
    str
        For a PDF, the ``'content'`` entry of the parser result, or an empty
        string when that entry is missing or ``None``. For a Word file, the
        paragraph texts joined with newlines.
    """
    # Compare the extension case-insensitively so "X.PDF" is not sent to docx.
    if filename.lower().endswith("pdf"):
        raw = parser.from_file(filename)
        # Callers apply string operations to the result, so never hand back None.
        return raw.get('content') or ''

    doc = docx.Document(filename)
    return '\n'.join(para.text for para in doc.paragraphs)

In [None]:
path_to_file = "n_thakur_petition.docx"
raw_text = getText(path_to_file)

# Split into paragraphs while the newlines are still present. Flattening the
# newlines first would leave a single "paragraph" and make the empty-paragraph
# filter below a no-op.
paras = [i.replace('\t', ' ') for i in raw_text.split('\n') if i != '']

inp_to_spacy = " ".join(paras) # create string from paras list
doc = nlp(inp_to_spacy) # a spacy doc object it has everything

# getting sentences out of doc, then dropping ellipsis characters and runs of dots
sentences = [str(sentence) for sentence in doc.sents]
sentences = [re.sub("…", "", sentence) for sentence in sentences]
sentences = [re.sub("[.][.]+", "", sentence) for sentence in sentences]

# again we input the data in spacy
input_to_spacy = " ".join(sentences)
doc = nlp(input_to_spacy)

# Drop tokens that are stop words according to either spaCy or the stop_words
# package. Both filters are applied in a single pass so neither is discarded.
stop_words = get_stop_words('en')
stop_word_set = set(stop_words)
tokens = [
    token for token in doc
    if not token.is_stop and str(token) not in stop_word_set
]

word_embeddings = {}

# Glove files can be downloaded from the link given below.
# https://drive.google.com/open?id=12v__AwX9CjQhXkIsghDCVEjUNklj_7j1

dimension_of_embedding = 100
Glove = 'Gloves/' + str(dimension_of_embedding) + '.txt'
# `with` guarantees the file is closed even if a line fails to parse.
with open(Glove, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        word_embeddings[word] = coefs

In [None]:
# create sentence vectors
# Each sentence is represented by the sum of its word embeddings divided by
# (word count + 0.001); words missing from `word_embeddings` contribute zeros.
# NOTE(review): the lookup is case-sensitive - confirm whether the embedding
# vocabulary is lower-cased and whether words should be normalised first.
sentence_vectors = []
for i in sentences:
    # Look up whole words; indexing the sentence string would yield characters.
    words = i.split()
    if words:
        missing = np.zeros((dimension_of_embedding,))
        v = sum(word_embeddings.get(w, missing) for w in words) / (len(words) + 0.001)
    else:
        # Empty or whitespace-only sentence: keep a vector so shapes stay uniform.
        v = np.zeros((dimension_of_embedding,))
    sentence_vectors.append(v)

In [None]:
def Similarity(i, j):
    """Store the cosine similarity of sentences ``i`` and ``j`` in ``sim_mat[i][j]``.

    Reads the module-level ``sentence_vectors`` and writes into the
    module-level ``sim_mat``; nothing is returned.

    Parameters
    ----------
    i, j : int
        Row and column index; also the positions of the two vectors in
        ``sentence_vectors``.
    """
    # reshape(1, -1) turns each vector into a single row without assuming a
    # fixed embedding width.
    row_i = sentence_vectors[i].reshape(1, -1)
    row_j = sentence_vectors[j].reshape(1, -1)
    sim_mat[i][j] = cosine_similarity(row_i, row_j)[0, 0]

In [None]:
# similarity matrix
# sim_mat[i][j] holds the cosine similarity between sentence i and sentence j;
# the diagonal stays 0 so a sentence is never compared with itself.
# All pairs are computed in one cosine_similarity call on the stacked vectors
# rather than dispatching each (i, j) pair to a worker thread.
import time

t1 = time.time()
loop = len(sentences)

if loop:
    # Assigning rows into a zero matrix lets a scalar 0.0 entry (which the
    # vector-building cell can produce for a whitespace-only sentence)
    # broadcast to a zero row instead of breaking the stack.
    width = max(np.size(vec) for vec in sentence_vectors)
    stacked_vectors = np.zeros((loop, width))
    for row, vec in enumerate(sentence_vectors):
        stacked_vectors[row] = vec
    sim_mat = cosine_similarity(stacked_vectors)
    np.fill_diagonal(sim_mat, 0.0)
else:
    # No sentences: keep an empty square matrix.
    sim_mat = np.zeros([0, 0])

print(time.time() - t1)

# Restart Script From here

In [None]:
# Build a weighted graph from the similarity matrix and rank sentences by PageRank.
# NetworkX 3.0 removed from_numpy_matrix; prefer from_numpy_array and fall back
# to the old name only when the newer one is unavailable.
_graph_from_array = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
nx_graph = _graph_from_array(sim_mat)
scores = nx.pagerank(nx_graph)
# Tuples are (score, sentence text, original position), highest score first.
ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)

In [None]:
summary = []
# Candidate budget: one fifth (20%) of the ranked sentences, but at least 10,
# and never more than the number of sentences available.
Length = int(len(ranked_sentences)/5)
limit = min(max(Length, 10), len(ranked_sentences))
# A sentence must score at least 0.1% of the top sentence's score.
# Guarded so an empty document yields an empty summary instead of an IndexError.
Threshold = ranked_sentences[0][0]/1000 if ranked_sentences else 0

for score, sentence, position in ranked_sentences[:limit]:
    # The list is sorted by descending score, so the first sentence that falls
    # below the threshold (or is non-positive) ends the selection.
    if score < Threshold or score <= 0:
        break
    # Keep the original position first so the summary can be re-sorted into
    # document order later.
    summary.append([position, sentence])

In [None]:
# Put the selected [position, sentence] pairs back into ascending position order.
sorted_summary = list(summary)
sorted_summary.sort()
# Starts empty; a later cell appends the sentences to keep.
sorted_summary_final = list()

In [None]:
# Keep the first occurrence of each sentence, comparing case-insensitively.
# The seen-set holds lower-cased text; comparing a lower-cased sentence with
# the original-case list would almost never match. Seeding it from the current
# list also stops a re-run of this cell from appending everything twice.
seen_lowercase = {kept.lower() for kept in sorted_summary_final}
for ind, sent in sorted_summary:
    key = sent.lower()
    if key not in seen_lowercase:
        seen_lowercase.add(key)
        sorted_summary_final.append(sent)

In [None]:
def _build_summary_document(bullet_texts, output_path):
    """Create a Word document titled 'Summary' with one bullet per text and save it."""
    document = docx.Document()
    document.add_heading('Summary', 0)
    for text in bullet_texts:
        document.add_paragraph(text, style = 'List Bullet')
    document.save(output_path)
    return document


# Write the final summary sentences to Summary.docx as a bulleted list.
d = _build_summary_document(sorted_summary_final, 'Summary.docx')

In [None]:
# Show the final summary sentences as this cell's output.
sorted_summary_final