In [49]:
import os
import re
import spacy
from spacy.matcher import Matcher
#import en_core_web_lg
import en_core_web_sm
!pip install tika
!pip install xlwt



In [50]:
from tika import parser
from xlwt import Workbook 

In [3]:
# Load the `en_core_web_sm` spaCy pipeline once; the extraction functions
# defined in this notebook all call this shared module-level `nlp` object.
nlp = en_core_web_sm.load()

In [47]:
def parse_file(file_path):
  """Extract the plain-text content of a document with Tika.

  Args:
    file_path: Path of the document to parse.

  Returns:
    The extracted text as a str. An empty string is returned when the Tika
    result has no 'content' entry or the entry is None, so callers can
    always use string methods on the result.
  """
  raw = parser.from_file(file_path)
  # Tika leaves 'content' out (or sets it to None) when no text was extracted.
  return raw.get('content') or ''

def preprocess(pt):
  """Strip document-conversion noise from a chunk of order text.

  Removes bracketed and parenthesised spans, "Order%...%" and
  "Microsoft Word" / "...docx" residue and page markers ("Page 3",
  "Page 12 of 20"), turns newlines, '/' and '`' into spaces, and trims the
  result.

  Args:
    pt: Raw text.

  Returns:
    The cleaned text without leading or trailing whitespace.
  """
  # Raw strings throughout: '\[' or '\s' inside a plain literal is an invalid
  # escape sequence that newer Python versions warn about.

  # These two run while newlines are still present, so each greedy match is
  # confined to a single line.
  pt = re.sub(r'\[.*\]', ' ', pt)
  pt = re.sub(r'\(.*\)', ' ', pt)
  # Every newline becomes one space. (A separate '\n\n' substitution after
  # this one could never match, so it has been dropped.)
  pt = re.sub(r'\n', ' ', pt)
  pt = re.sub(r'Order\%.*\%.', "", pt)
  pt = re.sub(r'Microsoft Word', "", pt)
  # Page markers may carry multi-digit numbers ("Page 12 of 20").
  pt = re.sub(r'\sPage\s[0-9]+\sof\s[0-9]+\s', '', pt)
  # NOTE(review): the text is a single line by now, so this removes
  # everything up to the last "docx" - confirm that is intended.
  pt = re.sub(r'.*docx', '', pt)
  pt = re.sub(r'/', ' ', pt)
  pt = re.sub(r'`', ' ', pt)
  # Remaining bare page markers ("Page 7").
  pt = re.sub(r'\sPage\s[0-9]+', '', pt)
  return pt.strip()

def split_header(text):
  """Split an order into header and body at the first known separator.

  The separators are tried in priority order: 'Per :', 'Per:',
  'Presiding Officer', 'ORDER', 'JUDGMENT'. The split happens at the first
  occurrence of the first separator that is present; later occurrences of
  that separator stay in the body.

  Args:
    text: Full document text. None is treated as an empty string.

  Returns:
    A tuple (header, body). When no separator is found a warning is printed
    and ('', text) is returned, i.e. the whole document is treated as body.
  """
  text = text or ''
  for separator in ('Per :', 'Per:', 'Presiding Officer', 'ORDER', 'JUDGMENT'):
    # partition() cuts at the first occurrence only, so a separator that is
    # repeated further down does not truncate the body.
    header_extracted, found, body_sep = text.partition(separator)
    if found:
      return header_extracted, body_sep

  print("WARNING: Could not split header correctly")
  return '', text

def header_extraction(header):
  """Extract the people, bench, appellants and organisations named in a header.

  Two header layouts are handled:
    * headers containing "versus" (any case): the text before the first
      "versus" is the appellant segment, and the bench is read from the text
      following "coram";
    * all other headers: the bench is read from the text following "bench",
      organisations come from the whole header, and every person who is not
      on the bench is reported as an appellant.

  Args:
    header: Header text of an order.

  Returns:
    A tuple (all_header_names, bench_names, appellant_names, org_names) of
    lists of entity strings. A list is empty when nothing was found.
  """
  def entity_texts(doc, label):
    # Texts of every entity in `doc` that carries the given NER label.
    return [ent.text for ent in doc.ents if ent.label_ == label]

  def persons_after(keyword):
    # PERSON entities in the header text that follows the first
    # case-insensitive occurrence of `keyword`; empty when it is absent.
    match = re.search(keyword, header, re.I)
    if match is None:
      print("WARNING: '" + keyword + "' not found in header")
      return []
    return entity_texts(nlp(preprocess(header[match.end():])), "PERSON")

  appellant_names = []
  org_names = []

  # Run NER once on the whole header; the result is reused below.
  header_doc = nlp(preprocess(header))
  all_header_names = entity_texts(header_doc, "PERSON")

  if 'versus' in header.lower():
    # Split case-insensitively so the split agrees with the check above
    # ("VERSUS" and "versus" are handled like "Versus").
    appellant_split = re.split('versus', header, flags=re.I)
    if len(appellant_split)>2:
      print('Appellant Exception ',end='\n\n\n')
    # Headers that list several appellants are only reported, not parsed.
    if 'appellants' in appellant_split[0].lower():
      print("Multiple Appellants",end='\n\n\n')
    else:
      appellant_doc = nlp(preprocess(appellant_split[0]))
      appellant_names = entity_texts(appellant_doc, "PERSON")
      org_names = entity_texts(appellant_doc, "ORG")
    bench_names = persons_after('coram')
  else:
    bench_names = persons_after('bench')
    org_names = entity_texts(header_doc, "ORG")
    # Everyone named in the header who is not on the bench.
    appellant_names = [name for name in all_header_names if name not in bench_names]

  return all_header_names,bench_names,appellant_names,org_names

def split_pointwise(text):
  """Split the body of an order into its numbered points.

  A point starts at a line break followed by a one- or two-digit number, a
  full stop and one whitespace character. The markers themselves are
  removed from the output.

  Args:
    text: Body text of an order.

  Returns:
    A tuple (points_split, total_points). The first element of points_split
    is whatever precedes the first marker; total_points is the length of
    the list.
  """
  # Raw string: '\.' and '\s' are invalid escapes in a plain literal.
  points_split = re.split(r"\n[0-9]?[0-9]\.\s", text)
  return points_split, len(points_split)


def extract_all_entites(points):
  """Collect named entities, grouped by label, from every point but the first.

  Args:
    points: List of point texts; the element at index 0 is skipped.

  Returns:
    A tuple of seven lists of entity strings, in this order:
    DATE, GPE, LAW, MONEY, ORG, PERSON, CARDINAL.
  """
  # Output order of the returned tuple.
  wanted_labels = ("DATE", "GPE", "LAW", "MONEY", "ORG", "PERSON", "CARDINAL")
  found = {label: [] for label in wanted_labels}

  for raw_point in points[1:]:
    for entity in nlp(preprocess(raw_point)).ents:
      bucket = found.get(entity.label_)
      # Entities with any other label are ignored.
      if bucket is not None:
        bucket.append(entity.text)

  return tuple(found[label] for label in wanted_labels)
#Rule based extraction of penalty and money using spacy matcher and regex 

def rule_based_matcher_penalty(points):
  """Find rupee amounts with spaCy's rule-based Matcher.

  Two families of token patterns are registered:
    * Rule1-Rule5: <non-alphabetic token> <CARDINAL> <unit>
    * Rule6-Rule9: "rs" [optional punctuation] <CARDINAL> <unit>
  where <unit> is matched on the lower-cased token text.

  Args:
    points: List of point texts; the element at index 0 is skipped.

  Returns:
    A tuple (all_penalty, all_money) of lists of strings. Each string is
    the matched tokens joined by spaces with the first token of the match
    dropped. A match goes to all_penalty when the raw point contains
    "penalty" (any case), otherwise to all_money.
  """
  # NOTE(review): 'lakh' is only present in the first family - confirm
  # whether "rs ... lakh" was left out on purpose.
  leading_token_units = ('lacs', 'lakh', 'lakhs', 'crore', 'crores')
  rupee_prefix_units = ('lacs', 'lakhs', 'crore', 'crores')

  patterns = [
      [{'IS_ALPHA': False}, {'ENT_TYPE': 'CARDINAL'}, {'LOWER': unit}]
      for unit in leading_token_units
  ]
  patterns += [
      [{'LOWER': 'rs'}, {'IS_PUNCT': True, 'OP': '?'},
       {'ENT_TYPE': 'CARDINAL'}, {'LOWER': unit}]
      for unit in rupee_prefix_units
  ]

  matcher = Matcher(nlp.vocab)
  for rule_number, pattern in enumerate(patterns, start=1):
    # Patterns are passed as a list: the legacy add(name, None, pattern)
    # call is rejected by spaCy v3.
    matcher.add("Rule" + str(rule_number), [pattern])

  all_penalty = []
  all_money = []

  for point in points[1:]:
    doc = nlp(preprocess(point))
    # The penalty check looks at the raw point, not the cleaned text.
    target = all_penalty if 'penalty' in point.lower() else all_money
    for match_id, start, end in matcher(doc):
      # Skip the first matched token (the leading non-alphabetic token / "rs").
      target.append(" ".join(token.text for token in doc[start+1:end]))

  return all_penalty,all_money

def rule_based_regex_penalty(points):
  """Find rupee amounts such as "Rs. 5 lakhs" with a regular expression.

  Args:
    points: List of point texts; the element at index 0 is skipped.

  Returns:
    A tuple (all_penalty, all_money) of lists of (prefix, amount, unit)
    tuples. Amounts found in a point whose raw text contains "penalty"
    (any case) go to all_penalty, all others to all_money.
  """
  # Compiled once, outside the loop. Compared with a plain
  # '(Rs\.?|rs.)...(lacs|lakhs|crore|crores)' pattern:
  #   * the leading word boundary and the escaped dot stop "rs" inside words
  #     such as "years 5 crore" from being read as a currency prefix;
  #   * 'crores?' captures the full unit instead of stopping at "crore";
  #   * matching ignores case, and the singular "lac"/"lakh" are accepted.
  indian_curr = re.compile(
      r'\b(rs\.?)\s*(\d+(?:[.,]\d+)*)\s*(lacs?|lakhs?|crores?)\b',
      re.IGNORECASE)

  all_penalty = []
  all_money = []

  for point in points[1:]:
    amounts = indian_curr.findall(preprocess(point))
    # The penalty check looks at the raw point, not the cleaned text.
    if 'penalty' in point.lower():
      all_penalty += amounts
    else:
      all_money += amounts

  return all_penalty,all_money

def decision(points):
  """Return the cleaned text of the last point, cut at the signature marker.

  Args:
    points: List of point texts; only the last element is used.

  Returns:
    The preprocessed last point. When it contains "sd -" (any case), only
    the text before the first such marker is returned.
  """
  final_text = preprocess(points[-1])
  signature = re.search('sd -', final_text, re.I)
  if signature is None:
    return final_text
  # Keep only what comes before the signature marker.
  return final_text[:signature.start()]
     

In [59]:
# Create the output workbook and write the column titles into row 0.
wb = Workbook()
sheet1 = wb.add_sheet('Sheet 1')

# Titles in column order; the position in this list is the column index.
column_titles = [
    'File Name',
    'Header Names S-NER',
    'Header Bench S-NER',
    'Header Appellant S-NER',
    'Header Orgs S-NER',
    'Dates S-NER',
    'GPE S-NER',
    'Laws S-NER',
    'Money S-NER',
    'Orgs S-NER',
    'People S-NER',
    'Cardinals S-NER',
    'Penalty SM',
    'Money SM',
    'Penalty Regex',
    'Money Regex',
]
for column_index, column_title in enumerate(column_titles):
  sheet1.write(0, column_index, column_title)
# No 'Decision' column (it would be index 16): the decision text exceeds the
# allowed character count per cell.



In [61]:
# Directory that holds the order documents to process.
orders_dir = '/content/drive/My Drive/SEBI /SAT orders/IndianKanoon/'
# First output row for results; row 0 holds the column titles.
file_count = 2

# Collect (directory, file name) pairs from every directory that is walked.
# Accumulating here, rather than overwriting on each iteration, keeps files
# from all directories instead of only the last one visited.
list_of_files = []
for root, _dirs, files in os.walk(orders_dir):
  list_of_files.extend((root, file_name) for file_name in files)

for root_name, file_name in list_of_files:
  # os.path.join works whether or not the directory ends with a separator.
  file_path = os.path.join(root_name, file_name)
  text = parse_file(file_path)
  # Split the document into header and body.
  header, rest = split_header(text)
  # Names, bench, appellants and organisations from the header.
  all_names, bench_names, appellant_names, org_names = header_extraction(header)
  # Split the remaining document into numbered points.
  points, no_of_points = split_pointwise(rest)
  # All entity types found in the points.
  all_DATE,all_GPE,all_LAW,all_MONEY,all_ORG,all_PER,all_CARDINAL = extract_all_entites(points)
  penalty_extracted_matcher, money_extracted_matcher = rule_based_matcher_penalty(points)
  penalty_extracted_regex, money_extracted_regex = rule_based_regex_penalty(points)
  # Computed but not written: the text exceeds the per-cell character limit.
  verdict = decision(points)

  # Write the results for this file; the list position is the column index.
  row_values = [
      file_name,
      ','.join(all_names),
      ','.join(bench_names),
      ','.join(appellant_names),
      ','.join(org_names),
      ','.join(all_DATE),
      ','.join(all_GPE),
      ','.join(all_LAW),
      ','.join(all_MONEY),
      ','.join(all_ORG),
      ','.join(all_PER),
      ','.join(all_CARDINAL),
      ','.join(penalty_extracted_matcher),
      ','.join(money_extracted_matcher),
      # The regex results are lists of tuples, so they are stored as their repr.
      str(penalty_extracted_regex),
      str(money_extracted_regex),
  ]
  for column_index, value in enumerate(row_values):
    sheet1.write(file_count, column_index, value)
  file_count = file_count + 1
  print('Done',end='\n')
wb.save("SAT_Order_First_Run.xls")