In [None]:
# Mount Google Drive at /content/drive. The google.colab module is only
# importable inside a Google Colab runtime.
# NOTE(review): the CSV loaded in the next cell is read from /content/, not
# from /content/drive - confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import pandas as pd
# Load the sentence table for the Act; later cells read its "Sentence" column.
data = pd.read_csv("/content/The Air (Prevention and Control of Pollution) Act of 1981 - The Air (Prevention and Control of Pollution) Act of 1981.csv")

In [None]:
# spaCy version

In [None]:
import spacy
from spacy import displacy

In [None]:
# Load spaCy's small English pipeline and append the 'merge_noun_chunks'
# component, so every noun chunk is handled as one token downstream.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('merge_noun_chunks')

In [None]:
# Long, semicolon-separated definitions passage used as the worked example
# for the clause-extraction cells below.
sentence = "In this Act, the definitions are as follows: air pollutant means any solid, liquid, or gaseous substance present in the atmosphere in such concentration as may be or tend to be injurious to human beings or other living creatures or plants or property or environment; air pollution means the presence in the atmosphere of any air pollutant; approved appliance means any equipment or gadget used for bringing any combustible material or generating or consuming any fume, gas of particulate matter and approved by the State Board for the purposes of this Act; approved fuel means any fuel approved by the State Board for the purposes of this Act; automobile means any vehicle powered either by an internal combustion engine or by any method of generating power to drive such vehicle by burning fuel; Board means the Central Board or a State Board; Central Board means the Central Pollution Control Board constituted under section 3 of the Water (Prevention and Control of Pollution) Act, 1974 (6 of 1974); chimney includes any structure with an opening or outlet from or through which any air pollutant may be emitted; control equipment means any apparatus, device, equipment, or system to control the quality and manner of emission of any air pollutant and includes any device used for securing the efficient operation of any industrial plant."

In [None]:
# Run the example sentence through the pipeline configured above.
doc = nlp(sentence)

In [None]:
# Print one tab-separated line per ROOT token of the parse: text, token index,
# part of speech, dependency label, ancestor texts and direct-child texts.
for token in doc:
    if token.dep_ != "ROOT":
        continue
    ancestors = [t.text for t in token.ancestors]
    children = [t.text for t in token.children]
    fields = (token.text, token.i, token.pos_, token.dep_, ancestors, children)
    # " \t " between fields reproduces print(a, "\t", b, ...) with the default separator.
    print(*fields, sep=" \t ")

means 	 151 	 VERB 	 ROOT 	 [] 	 ['includes', ';', 'control equipment', 'control', '.']


In [None]:
def find_root_of_sentence(doc):
    """Return the ROOT token of a parsed document.

    Args:
        doc: iterable of tokens, each exposing a ``dep_`` attribute.

    Returns:
        The last token whose ``dep_`` equals ``"ROOT"``, or ``None`` when the
        document contains no such token.
    """
    roots = [candidate for candidate in doc if candidate.dep_ == "ROOT"]
    # When several ROOT tokens exist, the final one wins.
    return roots[-1] if roots else None

In [None]:
# ROOT token of the example sentence (None if the parse has no ROOT).
root_token = find_root_of_sentence(doc)

In [None]:
def find_other_verbs(doc, root_token):
    """Collect the verbs that hang directly off the root.

    Args:
        doc: iterable of tokens exposing ``pos_`` and ``ancestors``.
        root_token: the token treated as the root of the sentence.

    Returns:
        A list, in document order, of every token tagged ``"VERB"`` whose
        only ancestor is ``root_token``.
    """
    def is_direct_verb_of_root(candidate):
        chain = list(candidate.ancestors)
        return (
            candidate.pos_ == "VERB"
            and len(chain) == 1
            and chain[0] == root_token
        )

    return [candidate for candidate in doc if is_direct_verb_of_root(candidate)]
# Verbs of the example sentence whose only ancestor is the root.
other_verbs = find_other_verbs(doc, root_token)

In [None]:
def get_clause_token_span_for_verb(verb, doc, all_verbs):
    """Find the token-index range covered by a verb's non-verb children.

    Args:
        verb: token exposing ``children``; each child exposes an index ``i``.
        doc: the parsed document (only its length is used).
        all_verbs: tokens treated as clause verbs; children found in this
            collection are ignored.

    Returns:
        ``(first, last)``: the smallest and largest ``i`` among the direct
        children of ``verb`` that are not in ``all_verbs``. With no such
        child the result is ``(len(doc), 0)``, i.e. first > last.
    """
    positions = [kid.i for kid in verb.children if kid not in all_verbs]
    # The sentinels take part in min/max so an empty child list falls back to them.
    lowest = min(positions + [len(doc)])
    highest = max(positions + [0])
    return (lowest, highest)

In [None]:
# One (first_child_index, last_child_index) pair per clause verb, with the
# root verb first and the other verbs following in document order.
all_verbs = [root_token, *other_verbs]
token_spans = [
    get_clause_token_span_for_verb(clause_verb, doc, all_verbs)
    for clause_verb in all_verbs
]

In [None]:
# Turn every usable (start, end) pair into a slice of the document, then order
# the slices by their first token. Pairs with start >= end (including the
# "no children" sentinel) are skipped.
# NOTE(review): doc[start:end] stops before the token at index `end`, so the
# last child token itself is not part of the clause - confirm this is intended.
sentence_clauses = [
    doc[start:end]
    for start, end in token_spans
    if start < end
]
sentence_clauses.sort(key=lambda tup: tup[0])

In [None]:
# Plain-text form of each extracted clause; reused by the cells below.
clauses_text = [clause.text for clause in sentence_clauses]
print(clauses_text)

['means the Central Pollution Control Board constituted under section 3 of the Water (Prevention and Control of Pollution) Act, 1974 (6 of 1974); chimney includes', '; control equipment means any apparatus, device, equipment, or system to control the quality and manner of emission of any air pollutant and includes any device used for securing the efficient operation of any industrial plant', 'any apparatus, device, equipment, or system to control the quality and manner of emission of any air pollutant and']


In [None]:
# text = "Every rule made under this Act shall be laid, as soon as maybe after it is made, before each House of Parliament,
# while it is in session, for a total period of thirty days which may be comprised in one Session or in two or more successive sessions,
# and if, before the expiry of the session immediately following the session or the successive sessions aforesaid.
# both Houses agree in making any modification in the rule or both Houses agree that the rule should not be made,
# the rule shall thereafter have effect only in such modified form or be of no effect, as the case may be; so, however,
# that any such modification or annulment shall be without prejudice to the validity of anything previously done under that rule."
# doc = nlp(text)
# Scan the extracted clauses and remember the most recent hit for:
#   attribute - text of the first NOUN/PROPN token of a clause
#   aim       - text of that token's syntactic head
#   deontic   - text of an AUX token whose head has the same text as the aim
attribute = ""
deontic = ""
aim = ""
for clause_text in clauses_text:
    # Parsed under its own name so the notebook-level `doc` (the full example
    # sentence) is not overwritten by the last clause.
    clause_doc = nlp(clause_text)

    clause_aim = None
    for entity in clause_doc:
        print(entity, entity.pos_, entity.dep_)
        if entity.pos_ in ("PROPN", "NOUN"):
            attribute = str(entity)
            clause_aim = str(entity.head.text)
            aim = clause_aim
            break

    if clause_aim is None:
        # No noun in this clause: do not match auxiliaries against an aim
        # left over from an earlier clause.
        continue

    for entity in clause_doc:
        if entity.pos_ == "AUX" and entity.head.text == clause_aim:
            deontic = str(entity)

print("attribute: " + attribute)
print("aim: " + aim)
print("deontic: " + deontic)

means VERB ccomp
the Central Pollution Control Board PROPN nsubj
; PUNCT punct
control equipment NOUN nsubj
any apparatus NOUN ROOT
attribute: any apparatus
aim: any apparatus
deontic: 


In [None]:
def _clause_texts(sentence_doc):
    """Return the text of each clause of a parsed sentence, in document order."""
    root_token = find_root_of_sentence(sentence_doc)
    if root_token is None:
        # Empty or root-less parse: there is nothing to split.
        return []
    all_verbs = [root_token] + find_other_verbs(sentence_doc, root_token)
    spans = [
        get_clause_token_span_for_verb(verb, sentence_doc, all_verbs)
        for verb in all_verbs
    ]
    # start >= end covers the "verb has no usable children" sentinel.
    clauses = [sentence_doc[start:end] for start, end in spans if start < end]
    clauses.sort(key=lambda clause: clause.start)
    return [clause.text for clause in clauses]


def _deontic_in_clause(clause_doc, verbose):
    """Return the last AUX (other than "be") attached to the clause's aim, or ""."""
    # The aim is the head of the first noun / proper noun of the clause.
    aim = None
    for token in clause_doc:
        if token.pos_ in ("PROPN", "NOUN"):
            aim = token.head.text
            break
    if aim is None:
        return ""

    found = ""
    for token in clause_doc:
        if token.pos_ == "AUX" and token.head.text == aim and token.text != "be":
            found = str(token)
            if verbose:
                print(found + " " + token.head.text)
    return found


def create_deontic_vocab(data, deontic_column="deontic", verbose=True):
    """Extract one deontic auxiliary per sentence and store it in ``data``.

    Each value of ``data["Sentence"]`` is parsed with the notebook-level
    ``nlp`` pipeline and split into clauses. The last matching auxiliary
    found across its clauses is written, in place, to ``deontic_column`` of
    the same row. Rows with no match are left unchanged.

    Args:
        data: DataFrame with a "Sentence" column. Modified in place.
        deontic_column: name of the column receiving the result. It is
            created (filled with None) when missing.
        verbose: when True, print every "<auxiliary> <head>" match.

    Returns:
        set[str]: the distinct auxiliaries written across all rows.
    """
    if deontic_column not in data.columns:
        data[deontic_column] = None
    # Rows are addressed by position so a non-default index cannot misdirect writes.
    column_position = data.columns.get_loc(deontic_column)

    # Built once for the whole frame and returned to the caller.
    all_deontics = set()
    for row_position, sentence in enumerate(data["Sentence"]):
        if not isinstance(sentence, str) or not sentence.strip():
            # Missing (NaN) or blank cell: nothing to parse.
            continue

        deontic_here = ""
        for clause_text in _clause_texts(nlp(sentence)):
            clause_deontic = _deontic_in_clause(nlp(clause_text), verbose)
            if clause_deontic:
                deontic_here = clause_deontic

        if deontic_here:
            data.iloc[row_position, column_position] = deontic_here
            all_deontics.add(deontic_here)
    return all_deontics

In [None]:
# Run the deontic extraction over every sentence; results are written into `data` in place.
create_deontic_vocab(data)

In [None]:
# Inspect the extracted values.
data["deontic"]

Unnamed: 0,deontic
0,may
1,be
2,be
3,has
4,be
...,...
132,shall
133,be
134,be
135,shall


In [None]:
# Show the whole frame, including both the "Deontic" and "deontic" columns.
data

Unnamed: 0,Type,Deontic,Sentence,deontic
0,Flexible,may,"Short title, extent and commencement.—(1) This...",may
1,Rigid,shall,\n(3) It shall come into force on such date1as...,be
2,Flexible,may,"Definitions.—In this Act, unless the context o...",be
3,Flexible,may,4 \n(j) “emission” means any solid or liquid o...,has
4,Rigid,shall,Central Pollution Control Board.—The Central P...,be
...,...,...,...,...
132,Rigid,shall,\n(2) Every rule made by the Central Governmen...,shall
133,Flexible,may,Power of State Government to make rules.—(1) S...,be
134,Rigid,shall,"\n(2) In particular, and without prejudice to ...",be
135,Rigid,shall,22 \n(k) the manner in which any area or areas...,shall


NameError: name 'all_deontics' is not defined