In [1]:
import json

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

In [2]:
# Load the 'en_core_web_sm' trained pipeline.
# The pipeline package has to be downloaded first (see https://spacy.io/usage):
#   python -m spacy download en_core_web_sm

nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the sample file into `data`.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open('json_sample_files/legislative_details.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [4]:
# Number of top-level entries in the loaded JSON.
len(data)

10

In [5]:
# Title of the first entry. `or {}` covers a missing or null 'details' value,
# so the lookup yields None instead of raising AttributeError on None.
(data[0].get('details') or {}).get('title')

'Forming Open and Robust University Minds (FORUM) Act; enact'

In [6]:
# Pull the 'document_text' value of the first entry; .get() gives None if the key is absent.
doc = data[0].get('document_text')

In [7]:
# Run the spaCy pipeline over the extracted text.
# NOTE: this rebinds `doc` from the pipeline's input to its output, so
# re-running this cell on its own passes the previous result back into nlp().
doc = nlp(doc)

In [8]:
# Count the sentences spaCy detected, without building an intermediate list.
sum(1 for _ in doc.sents)

64

In [9]:
# Collect keyword candidates: tokens that are not stop words, not punctuation,
# and whose part-of-speech tag is one of the content-bearing tags below.
stopwords = list(STOP_WORDS)  # name and list type retained from the original cell
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
wanted_pos = frozenset(pos_tag)

# The stop-word test uses STOP_WORDS directly rather than its list copy, which
# avoids scanning the whole list for every token; the result is the same.
# Both text tests compare token.text as-is (case-sensitive), and the
# punctuation test is a substring check against string.punctuation.
keyword = [
    token.text
    for token in doc
    if token.text not in STOP_WORDS
    and token.text not in punctuation
    and token.pos_ in wanted_pos
]

In [10]:
# Tally occurrences of each retained token and show the five most frequent.
freq_word = Counter()
freq_word.update(keyword)
print(freq_word.most_common(5))

[('education', 52), ('public', 47), ('higher', 47), ('campus', 39), ('expressive', 30)]


In [11]:
# Confirm the container type of the frequency table.
type(freq_word)

collections.Counter

In [12]:
# Scale every keyword count by the highest count, so the top keyword gets 1.0.
# The weights are rebuilt from `keyword` on each run instead of dividing
# `freq_word` in place, so re-executing this cell cannot shrink them again.
raw_counts = Counter(keyword)
# default=0 keeps an empty keyword list from raising; no division happens then.
max_freq = max(raw_counts.values(), default=0)
freq_word = Counter({word: count / max_freq for word, count in raw_counts.items()})
freq_word.most_common(5)

[('education', 1.0),
 ('public', 0.9038461538461539),
 ('higher', 0.9038461538461539),
 ('campus', 0.75),
 ('expressive', 0.5769230769230769)]

In [13]:
# Score each sentence as the sum of the weights of its tokens that appear in
# the frequency table. Sentences with no such token get no entry at all.
sent_strength = {}
for sentence in doc.sents:
    weights = [freq_word[tok.text] for tok in sentence if tok.text in freq_word]
    if weights:
        sent_strength[sentence] = sum(weights)
print(sent_strength)

{LC 49 0477S: 0.38461538461538464, The House Committee on Higher Education offers the following substitute to HB 1: A BILL TO BE ENTITLED AN ACT To amend Title 20 of the Official Code of Georgia Annotated, relating to education generally, so as to provide for public forums at public institutions of higher education within the University System of Georgia and the Technical College System of Georgia for the campus community; to prevent the creation of "free speech zones" at such public institutions of higher education; to allow for reasonable, content- and viewpoint-neutral, and narrowly-tailored time, place, and manner restrictions on expressive activity at public institutions of higher education; to prohibit material and substantial disruption of protected expressive activity at public institutions of higher education; to prohibit such public institutions of higher education from denying benefits to or otherwise discriminating against a student organization on the basis of the student 

In [14]:
# Keep the three sentences with the highest accumulated score, best first.
summarized_sentences = nlargest(
    3,
    sent_strength.keys(),
    key=lambda sentence: sent_strength[sentence],
)
print(summarized_sentences)

[The House Committee on Higher Education offers the following substitute to HB 1: A BILL TO BE ENTITLED AN ACT To amend Title 20 of the Official Code of Georgia Annotated, relating to education generally, so as to provide for public forums at public institutions of higher education within the University System of Georgia and the Technical College System of Georgia for the campus community; to prevent the creation of "free speech zones" at such public institutions of higher education; to allow for reasonable, content- and viewpoint-neutral, and narrowly-tailored time, place, and manner restrictions on expressive activity at public institutions of higher education; to prohibit material and substantial disruption of protected expressive activity at public institutions of higher education; to prohibit such public institutions of higher education from denying benefits to or otherwise discriminating against a student organization on the basis of the student organization's religious, politica

In [15]:
# Convert the selected spans to plain strings and join them into one summary.
final_sentences = list(map(lambda span: span.text, summarized_sentences))
summary = ' '.join(final_sentences)
print(summary)

The House Committee on Higher Education offers the following substitute to HB 1: A BILL TO BE ENTITLED AN ACT To amend Title 20 of the Official Code of Georgia Annotated, relating to education generally, so as to provide for public forums at public institutions of higher education within the University System of Georgia and the Technical College System of Georgia for the campus community; to prevent the creation of "free speech zones" at such public institutions of higher education; to allow for reasonable, content- and viewpoint-neutral, and narrowly-tailored time, place, and manner restrictions on expressive activity at public institutions of higher education; to prohibit material and substantial disruption of protected expressive activity at public institutions of higher education; to prohibit such public institutions of higher education from denying benefits to or otherwise discriminating against a student organization on the basis of the student organization's religious, political