In [3]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
import nltk
import re
from stop_words import get_stop_words

In [4]:
def getText(filename):
    """Extract the plain text of a PDF or Word (.docx) document.

    Files whose name ends in ``.pdf`` (case-insensitive) are parsed with
    ``tika.parser.from_file``; every other file is opened with
    ``docx.Document`` and the text of its paragraphs is joined with newlines.

    Args:
        filename: Path of the document to read.

    Returns:
        str: The extracted text. For PDFs this is the ``'content'`` entry of
        the Tika result, or an empty string when that entry is missing or
        ``None`` (so callers can always treat the result as a string).
    """
    # Case-insensitive, dot-anchored check: "REPORT.PDF" is a PDF, "notespdf" is not.
    if filename.lower().endswith(".pdf"):
        # Imported lazily so the PDF dependency is only needed for PDF input.
        from tika import parser
        raw = parser.from_file(filename)
        return raw.get('content') or ''

    # Imported lazily so the docx dependency is only needed for non-PDF input.
    import docx
    doc = docx.Document(filename)
    return '\n'.join(para.text for para in doc.paragraphs)

In [5]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
# Temp-file path for the model.
# NOTE(review): `path` is never used below -- the model is saved to the literal
# "word2vec.model" instead. Confirm which location is intended.
path = get_tmpfile("word2vec.model")
# Train a 100-dimensional Word2Vec model on gensim's bundled `common_texts` corpus.
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
# Persist the model so the next cell can reload it by the same file name.
model.save("word2vec.model")



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [6]:
# Reload the model saved by the previous cell.
model = Word2Vec.load("word2vec.model")
# Continue training on a single two-word sentence for one epoch.
# The recorded result was (0, 2) -- presumably 0 effective words out of 2 raw
# words, i.e. neither word was in the model's vocabulary; confirm against the
# gensim `train` documentation.
model.train([["hello", "world"]], total_examples=1, epochs=1)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


(0, 2)

In [7]:
vector = model.wv['computer']  # embedding (numpy vector) of the word 'computer' from the model's keyed vectors

In [8]:
from gensim.test.utils import common_texts
from gensim.models import Phrases
from gensim.models import Word2Vec, Phrases

# Fit a phrase (bigram) detector on the toy corpus.
bigram_transformer = Phrases(common_texts)
# Train Word2Vec on the phrase-transformed corpus.
# Note: this rebinds `model`, replacing the model loaded in the earlier cell.
model = Word2Vec(bigram_transformer[common_texts], min_count=1)


In [9]:
from gensim.models import word2vec
from gensim.models.phrases import Phrases,Phraser
from gensim.models.word2vec import Text8Corpus
# from gensim.models.utils import datapath


def bigram2vec(unigrams, bigram_to_search):
    """Train a bigram-aware Word2Vec model and look up one token's vector.

    A ``Phrases`` model is fitted on ``unigrams``, the corpus is passed through
    it, and a ``Word2Vec`` model with default settings is trained on the
    transformed corpus.

    Args:
        unigrams: The training corpus. It is handed unchanged to ``Phrases``
            and then, phrase-transformed, to ``Word2Vec``.
            NOTE(review): gensim presumably expects an iterable of token
            lists (one list per sentence), not a flat list of strings --
            confirm against the gensim documentation.
        bigram_to_search: Token to look up, e.g. ``"new_york"``.

    Returns:
        The vector stored for ``bigram_to_search``, or ``None`` when the
        token is not in the trained model's vocabulary.
    """
    bigrams = Phrases(unigrams)
    print(bigrams)
    model = word2vec.Word2Vec(bigrams[unigrams])

    # Go through the KeyedVectors object (`model.wv`) for both the membership
    # test and the lookup, rather than the model-level `vocab` / `__getitem__`
    # accessors the original relied on.
    keyed_vectors = model.wv
    if bigram_to_search in keyed_vectors:
        return keyed_vectors[bigram_to_search]
    return None

In [10]:
# NOTE(review): the recorded run of this cell ended in
# "RuntimeError: you must first build vocabulary before training the model".
# The corpus passed here is a flat list of strings and is tiny; presumably the
# gensim models expect a list of token lists and, with default frequency
# cut-offs, end up with an empty vocabulary -- confirm and replace with a
# realistic corpus.
bigram2vec(['this','is','these','may','are'], "this_is")

Phrases<19 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


RuntimeError: you must first build vocabulary before training the model

In [13]:
# Load the petition text and the spaCy English pipeline.
raw_text = getText("n_thakur_petition.docx")

nlp = spacy.load("en_core_web_sm")

# Drop empty lines and replace tabs with spaces, then flatten to one string.
paras = [i.replace('\t', ' ') for i in raw_text.split('\n') if i != '']

inp_to_spacy = " ".join(paras)  # create string from paras list

doc = nlp(inp_to_spacy)  # first pass: used only for sentence segmentation

# Sentences as plain strings, with ellipsis characters and runs of two or
# more dots removed.
sentences = [str(sentence) for sentence in doc.sents]
sentences = [re.sub("…", "", sentence) for sentence in sentences]
sentences = [re.sub("[.][.]+", "", sentence) for sentence in sentences]

# Second pass: re-parse the cleaned text to obtain tokens.
input_to_spacy = " ".join(sentences)
doc = nlp(input_to_spacy)

# Filter 1: spaCy's own stop-word flag.
tokens = [token for token in doc if not nlp.vocab[str(token)].is_stop]

# Filter 2: the `stop_words` package list, applied on top of filter 1.
# (The original iterated `doc` again here, which silently threw away the
# result of filter 1.)
# NOTE(review): the original comment says this step should also drop tokens
# like ", . -"; whether get_stop_words('en') contains punctuation is not
# visible here -- confirm, or filter on `token.is_punct` as well.
stop_words = get_stop_words('en')
tokens = [token for token in tokens if str(token) not in stop_words]

# Result from the above cells:
# 1. `sentences`
# 2. `tokens`

#nltk.download('punkt') # one time execution

# ## Required download: GloVe embeddings
# The code below reads glove.6B.100d.txt from the working directory; it is part of
# http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip

# Map each word in the GloVe file to its embedding vector (float32 array).
# Each line of the file is read as: <word> <coef_1> <coef_2> ...
word_embeddings = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [14]:
word_embeddings.get("by")

#get_stop_words('')

array([-0.20875  , -0.1174   ,  0.26478  , -0.28339  ,  0.19584  ,
        0.7446   , -0.03887  ,  0.028499 , -0.44252  , -0.30426  ,
        0.27133  , -0.51907  ,  0.52183  , -0.76648  ,  0.28043  ,
       -0.48344  , -0.15626  , -0.49705  , -0.51024  , -0.03652  ,
        0.20579  , -0.6136   ,  0.46388  ,  0.73497  ,  0.66813  ,
       -0.4443   , -0.17603  , -0.5478   , -0.013521 ,  0.16333  ,
        0.28148  ,  0.054223 , -0.19906  , -0.1907   , -0.43179  ,
        0.14781  ,  0.27555  ,  0.18571  , -0.40776  , -0.15415  ,
       -0.5885   , -0.0085281, -0.14178  ,  0.7061   ,  0.54031  ,
       -0.43305  ,  0.17497  , -0.46208  , -0.31372  , -0.34039  ,
       -0.25128  ,  0.68228  ,  0.33576  ,  1.5862   , -0.39427  ,
       -2.9938   , -0.29773  ,  0.04213  ,  1.9075   , -0.072628 ,
       -0.092191 ,  0.66133  ,  0.13868  ,  0.78774  ,  0.69307  ,
       -0.22185  ,  0.71705  ,  1.1453   ,  1.2153   ,  0.14196  ,
       -0.79914  ,  0.16965  , -0.34532  , -0.51742  , -0.1564

In [None]:


# Create one 100-dimensional vector per sentence: the sum of the embeddings of
# its whitespace-separated words divided by (word count + 0.001). Words that
# are missing from `word_embeddings` contribute a zero vector.
sentence_vectors = []
for i in sentences:
    words = i.split()
    if words:
        v = sum(word_embeddings.get(w, np.zeros((100,))) for w in words) / (len(words) + 0.001)
    else:
        # Empty or whitespace-only sentence: use an explicit zero vector.
        # (Testing len(i) alone let whitespace-only strings through, which
        # produced a scalar 0.0 instead of an array.)
        v = np.zeros((100,))
    sentence_vectors.append(v)



In [6]:
(sentence_vectors)

[array([-0.04853449,  0.02991144,  0.06620197, -0.09254678, -0.05481645,
         0.00543265,  0.02446365,  0.02282245,  0.06658906, -0.00273804,
         0.05924725, -0.04906299,  0.03838309,  0.00637623,  0.06017855,
        -0.05860877,  0.02208113,  0.00317655, -0.09234823,  0.03607484,
         0.00616141, -0.0277746 ,  0.06644194,  0.0652064 ,  0.09796886,
         0.01304028,  0.03124554, -0.10048707,  0.02397515, -0.0501057 ,
        -0.01804599,  0.09482074, -0.03688044,  0.00521954, -0.01943294,
         0.05749607,  0.02040994,  0.0544665 , -0.01754464, -0.06554207,
        -0.03611198, -0.04346808, -0.01601914, -0.03739751, -0.03211256,
        -0.06363948,  0.04272247, -0.12228539, -0.02071561, -0.07011284,
         0.00118516, -0.02498357,  0.03931438,  0.20569918, -0.0303371 ,
        -0.40615625, -0.03993429, -0.06530781,  0.23405228,  0.11256677,
        -0.07893444,  0.09284388,  0.01234481,  0.05572347,  0.15186402,
        -0.05053421,  0.06903014,  0.04942151,  0.1

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity matrix: sim_mat[i][j] is the cosine similarity between sentence
# vectors i and j, with the diagonal forced to 0 so a sentence does not count
# as similar to itself.
#
# All pairs are computed in a single vectorised call instead of one
# cosine_similarity call per (i, j) pair; values agree with the pairwise loop
# up to floating-point rounding.
#
# NOTE(review): cosine similarity can be negative, and negative entries are
# kept as-is. The recorded PageRank scores further down include negative
# values -- consider whether negative similarities should be clipped to 0.
if len(sentence_vectors) > 0:
    sim_mat = cosine_similarity(np.vstack(sentence_vectors).astype(np.float64))
    np.fill_diagonal(sim_mat, 0)
else:
    # No sentences: keep the empty 0x0 matrix the nested loop would have produced.
    sim_mat = np.zeros([0, 0])



In [8]:
import networkx as nx

# Build a weighted graph whose nodes are sentence indices and whose edge
# weights are the similarities in `sim_mat`, then rank the nodes with PageRank.
# `from_numpy_array` replaces `from_numpy_matrix`, which newer NetworkX
# releases no longer provide.
nx_graph = nx.from_numpy_array(sim_mat)
# `scores` maps sentence index -> PageRank score.
scores = nx.pagerank(nx_graph)


In [9]:
scores

{0: 0.009888378915392643,
 1: 0.0012786569870795162,
 2: 0.0012786569870795162,
 3: 0.0012786569870795162,
 4: 0.0012786569870795162,
 5: 0.005786148698844055,
 6: 0.009888378915392643,
 7: 0.0012786569870795162,
 8: 0.0012786569870795162,
 9: 0.010347821627503955,
 10: 0.0012786569870795162,
 11: 0.0012786569870795162,
 12: 0.0012786569870795162,
 13: 0.0012786569870795162,
 14: 0.005786148698844055,
 15: 0.0012786569870795162,
 16: 0.0012786569870795162,
 17: 0.0012786569870795162,
 18: -0.001659722193985991,
 19: 0.009888378915392643,
 20: -0.002721381110347538,
 21: 0.00903774013907537,
 22: -0.0037140490279505135,
 23: 0.0012786569870795162,
 24: 0.0012786569870795162,
 25: 0.009479614416729721,
 26: 0.0012786569870795162,
 27: 0.0012786569870795162,
 28: -0.003738712663067401,
 29: 0.0012786569870795162,
 30: 0.0012786569870795162,
 31: 0.0012786569870795162,
 32: 0.0012786569870795162,
 33: 0.0012786569870795162,
 34: 0.0012786569870795162,
 35: 0.0012786569870795162,
 36: 0.001

In [10]:

# Pair every sentence with its PageRank score, then order the pairs from the
# highest score to the lowest (ties fall back to comparing the sentence text).
ranked_sentences = [(scores[position], text) for position, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)
summary = []

In [11]:
ranked_sentences

[(0.010435869946486005,
  'Any other or order/relief in favour of the petitioner and against the respondent No.1 to 3 may also be passed in the interest of justice. .'),
 (0.010431456283288428,
  '1 and 2 did not care for the same and closed their eyes from the side of the respondent No.3.  '),
 (0.010421175123763234,
  'That on each plot, the structure is raised upto third floor, First Floor, Second Floor and Third Floor, which shows that there are 3324 properties/floors more have been shown in the total plots of 831, whereas the aforementioned plots/properties are not existing in the Pockets C-4, C-5 and E-4, Sector 16, Rohini, Delhi, and the same have falsely been mentioned in the list.'),
 (0.010411198728532802,
  'It is submitted that the petitioner also submitted a letter dated 4.4.2008 to the respondent No.1 for the use of vehicles in the canvassing  with which the goods are transported, but the concerned officials did not care for the requests of the petitioner and the dumped h

In [12]:
# i=1
# for i in range(1, len(ranked_sentences)):
#     if ranked_sentences[i][0]==ranked_sentences[i-1][0]:
#         ranked_sentences.pop(i)
# print(ranked_sentences)

In [13]:
summary = []
# Print only the sentence text of each (score, sentence) pair, best first.
for _score, text in ranked_sentences:
    print(text)

Any other or order/relief in favour of the petitioner and against the respondent No.1 to 3 may also be passed in the interest of justice. .
1 and 2 did not care for the same and closed their eyes from the side of the respondent No.3.  
That on each plot, the structure is raised upto third floor, First Floor, Second Floor and Third Floor, which shows that there are 3324 properties/floors more have been shown in the total plots of 831, whereas the aforementioned plots/properties are not existing in the Pockets C-4, C-5 and E-4, Sector 16, Rohini, Delhi, and the same have falsely been mentioned in the list.
It is submitted that the petitioner also submitted a letter dated 4.4.2008 to the respondent No.1 for the use of vehicles in the canvassing  with which the goods are transported, but the concerned officials did not care for the requests of the petitioner and the dumped her complaints for the reasons best known to the respondent No.1.  
That the petitioner has filed the above titled pet

In [14]:
# Select the summary: walk the ranked sentences best-first and keep unique ones.
summary = []

# Target size: a tenth of the ranked sentences, but never fewer than 5.
max_count = max(len(ranked_sentences) // 10, 5)

# Cut-off relative to the best score: anything below 1/5000 of it is dropped.
# Guarded so an empty ranking yields an empty summary instead of an IndexError.
min_score = ranked_sentences[0][0] / 5000 if ranked_sentences else 0

for score, sentence in ranked_sentences:
    if len(summary) >= max_count:
        break
    # The list is sorted by descending score, so once one sentence fails a
    # score test every later one fails too.
    if score < min_score or score <= 0:
        break
    if sentence in summary:
        # Duplicate text: skip it without using up a summary slot. (The
        # original `continue`d here without advancing its index, so it would
        # have looped forever on the first duplicate.)
        continue
    summary.append(sentence)
    

# Put the selected sentences back into document order, dropping sentences that
# differ from an earlier one only by letter case.
ordered_summary = []
ordered_set = set()  # lower-cased sentences already emitted
summary_lookup = set(summary)  # constant-time membership test
for i in sentences:
    lowered = i.lower()
    if i in summary_lookup and lowered not in ordered_set:
        ordered_summary.append(i)
        ordered_set.add(lowered)
        
import docx

# Assemble the summary as a Word document: a top-level 'Summary' heading
# followed by one bullet per selected sentence. The document only exists in
# memory because the save call below is commented out.
d = docx.Document()
d.add_heading('Summary', 0)
for i in ordered_summary:
    d.add_paragraph(i, style='List Bullet')

#d.save('_summarized_2.docx')

In [15]:
ordered_summary

['Advocate Counsel for the Petitioner IN THE COUERT OF DISTRICT JUDGE: .',
 'That Election for Municipal Councillors for  the Municipal Corporation of  were declared to be held on .',
 'The true fact are that there are no pocket C-4, C-5 and B-4 in the Sector No.16, Rohini, .',
 'That on each plot, the structure is raised upto third floor, First Floor, Second Floor and Third Floor, which shows that there are 3324 properties/floors more have been shown in the total plots of 831, whereas the aforementioned plots/properties are not existing in the Pockets C-4, C-5 and E-4, Sector 16, Rohini, Delhi, and the same have falsely been mentioned in the list.',
 'the activities of the activities of the respondent No.3, but it did not care for the requests of the petitioner and turned a deaf ear to her requests.  ',
 'It is submitted that the petitioner also submitted a letter dated 4.4.2008 to the respondent No.1 for the use of vehicles in the canvassing  with which the goods are transported, but