<a href="https://colab.research.google.com/github/AbhilashDatta/Role-Identification-in-Law-Documents/blob/main/Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NER using spaCy

In [None]:
# Install spaCy and download the small English model that is loaded later as `nlp`.
# NOTE(review): the spaCy version is not pinned. The pipeline setup further down
# passes a plain function to nlp.add_pipe, which looks like the spaCy v2 calling
# convention -- confirm the installed version before re-running.
!pip install spacy
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 23.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
# Remove any previously downloaded documents so the folder starts empty.
!rm -rf case_docs

In [None]:
# Create the folder that gather_docs() and the preprocessing cells write into.
!mkdir case_docs

In [None]:
# Document ids to fetch; each one is substituted into the `doc/<id>/` API URL
# by gather_docs() and reused as the file stem under case_docs/.
docids = [
    192122552, 143104128, 134946367, 107445344, 48386631,
    14063857, 46028237, 50547742, 130610553, 86443827,
    183266961, 72498871, 9743362, 132206422, 27448625,
    130196424, 184861797, 160648177, 83838189, 194809881,
    90439252, 199441852, 149259890, 139915188, 13618343,
    58138931, 161392433, 62854486, 173783033, 124821870,
]

In [None]:
# Gather Docs
import os
import json
from tqdm.auto import tqdm
import requests
from urllib.parse import urljoin
from datetime import datetime

def gather_docs(docids, auth_token=None, save_folder="case_docs/"):
    """Download documents from the Indian Kanoon API as JSON and HTML files.

    For every id in ``docids`` the API response is stored as
    ``<save_folder>/<docid>.json`` and the value of its ``doc`` field as
    ``<save_folder>/<docid>.html``.

    Only the ids passed in are converted. The previous implementation
    re-parsed every file found in the folder, which failed as soon as the
    folder also contained ``.html`` or ``*_preprocessed.json`` files.

    Args:
        docids: iterable of document ids (ints or strings).
        auth_token: API token. When omitted, the ``INDIANKANOON_API_TOKEN``
            environment variable is used, falling back to the token that was
            previously embedded in this notebook.
        save_folder: existing directory that receives the files.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        KeyError: if a response carries no ``doc`` field.
    """
    base_url = "https://api.indiankanoon.org/"

    if auth_token is None:
        # SECURITY: a token committed to a notebook is effectively public.
        # Prefer the environment variable; the literal fallback is kept only so
        # existing runs keep working and should be rotated and removed.
        auth_token = os.environ.get(
            "INDIANKANOON_API_TOKEN",
            "c7f20d514294b8bc1a8353e1008742d97fe9986d",
        )

    # The session is closed deterministically once all documents are fetched.
    with requests.Session() as api_session:
        api_session.headers.update({
            'authorization': "Token {}".format(auth_token),
            'cache-control': "no-cache",
        })

        for did in tqdm(docids):
            response = api_session.post(urljoin(base_url, 'doc/{}/?maxcites=50&maxcitedby=50'.format(did)))
            response.raise_for_status()
            payload = response.json()

            with open(os.path.join(save_folder, str(did) + ".json"), 'w') as fw:
                json.dump(payload, fw)

            # The judgment's HTML body lives under the 'doc' key of the response.
            with open(os.path.join(save_folder, str(did) + ".html"), 'w', encoding='utf-8') as fw:
                fw.write(payload['doc'])
  
# Fetch every document listed in `docids` into case_docs/ (needs network access).
gather_docs(docids)

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Whitespace-separated abbreviations (each ending in a period) that are later
# registered with the tokenizer as special cases. "Govt." appears twice; the
# duplicate is harmless.
abbreviation_string = 'No. no. S. v. Act. Rs. Mr. Art. u/s. Ltd. Nos. nos. vs. Dr. DR. u/ss. sub-s. cl. etc. i.e. viz. C.A. Govt. u/art. Ex. M/s. Smt. p.m. Hrs. Cl. V. I.P.C. Cr.P.C. Ss. a.m. D. L. ss. Cr.A. G. Sub-s. Sec. art. Pvt. Govt. Ms. Ext. sq. dt. Dt. w.e.f. I.A. I. cls. P.W. D.W. u/cl. Mrs. A.C. U. F.I.R. Exh. Etc. Mohd. km.'
# One list entry per abbreviation.
abbv = abbreviation_string.split()

In [None]:
import spacy
from spacy.attrs import ORTH
import re
import glob
from bs4 import BeautifulSoup
import os
import json

def custom_sentencizer(doc):
    """Restrict possible sentence starts to tokens that follow a period.

    Every token whose predecessor is not "." is explicitly marked as not
    starting a sentence. Tokens that follow a "." are left untouched, so a
    later component can still decide about them. The scan covers ``doc[:-2]``
    only, so the final token of the document is never modified.

    Returns the same ``doc`` object that was passed in.
    """
    for position, current in enumerate(doc[:-2]):
        if current.text != ".":
            # Not preceded by a period: forbid a sentence boundary here.
            doc[position + 1].is_sent_start = False
    return doc

# Load the small English pipeline and run the period-only boundary filter
# ahead of the dependency parser.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(custom_sentencizer, before = "parser")

# Register every abbreviation as a tokenizer special case that maps to itself
# (presumably so the trailing period stays attached to the abbreviation and is
# not seen as a stand-alone "." token -- confirm against the tokenizer output).
special_cases = dict(zip(abbv, abbv))

for case in special_cases:
    nlp.tokenizer.add_special_case(case, [{ORTH: special_cases[case]}])


def getSentList(Text):
    """Return the cleaned sentences contained in one HTML element.

    The element text is stripped, newlines and runs of whitespace are
    collapsed to single spaces, and the result is split into sentences with
    the module-level ``nlp`` pipeline.

    Args:
        Text: object exposing a ``.text`` string (getPointText passes the
            elements returned by BeautifulSoup's ``find_all``).

    Returns:
        list[str]: sentence texts, skipping sentences of two tokens or fewer
        and sentences that end in "<digits>." (e.g. a bare point number).
    """
    cleanText = re.sub(r"\s+", " ", Text.text.strip().replace('\n', ' ')).strip()
    parsed = nlp(cleanText)

    sentList = []
    for sent in parsed.sents:
        # Raw string: the same pattern as before, without the invalid "\." escape.
        ends_with_number = re.search(r'"*[0-9]+\.$', sent.text.strip()) is not None
        if len(sent) > 2 and not ends_with_number:
            sentList.append(sent.text)
    return sentList


def getPointText(d, pointText, pointNumber):
    """Record the sentences of one numbered point in ``d``.

    ``pointText`` is parsed as HTML. For each matching element the sentence
    list from getSentList is stored, if non-empty, under
    ``d[pointNumber][<element id>]``. ``d[pointNumber]`` is always reset to a
    fresh dict first, and ``d`` is modified in place; nothing is returned.
    """
    parsed_html = BeautifulSoup(pointText, 'html.parser')
    # NOTE(review): the tag-name pattern is unanchored, so it may also match
    # tag names that merely contain "p" -- confirm whether that is intended.
    tag_pattern = re.compile(r'(p|span|blockquote)')

    sentences_by_id = {}
    d[pointNumber] = sentences_by_id
    for element in parsed_html.find_all(tag_pattern):
        sentences = getSentList(element)
        if sentences:
            # Elements without an id attribute all share the key None.
            sentences_by_id[element.get('id')] = sentences


In [None]:
# Files should be stored in the format of docid.html
#
# Each HTML file is read line by line and cut into numbered "points": a new
# point begins on a line whose <p id="p_N"> paragraph contains the expected
# point number followed by ". ". The text gathered for the previous point is
# handed to getPointText(), and the result is saved as
# <docid>_preprocessed.json.

SRC_FOLDER = "case_docs/"

# Raw strings keep the regex escapes (\s) intact without relying on Python's
# handling of invalid string escape sequences.
POINT_PREFIX = r'(.*?)<p id="p_[0-9]+">(((?!\s*“\s*).)*?)'
POINT_SUFFIX = r'[.]\s(.*?)'
# The expected number and the next three are tried in order, so up to three
# missing point numbers are tolerated.
MAX_POINT_LOOKAHEAD = 4

for docid in tqdm(docids):
    docid = str(docid)+'.html'

    with open(SRC_FOLDER + docid,'r',encoding='ascii',errors='ignore') as f:
        count = 1          # next point number we expect to see
        pointText = ""     # HTML collected for the point currently being read
        d = {}
        for line in f:
            for skipped in range(MAX_POINT_LOOKAHEAD):
                pattern = POINT_PREFIX + str(count + skipped) + POINT_SUFFIX
                if re.search(pattern, line) is not None:
                    # Flush the point that just ended, then start the new one.
                    getPointText(d, pointText, count-1)
                    pointText = line
                    count = count + skipped + 1
                    break
            else:
                # No point number found: the line belongs to the current point.
                pointText = pointText + line
    # Flush whatever was collected after the last point marker.
    getPointText(d, pointText, count-1)

    with open(SRC_FOLDER + docid[:-5] + "_preprocessed.json", 'w', encoding='ascii',errors='ignore') as f:
        json.dump(d, f, indent=4)

    print(docid, end='\r')

  0%|          | 0/30 [00:00<?, ?it/s]



In [None]:
# Install xlsxwriter for the Excel exports below (version not pinned).
! pip install xlsxwriter --q

[?25l[K     |██▏                             | 10 kB 33.5 MB/s eta 0:00:01[K     |████▍                           | 20 kB 39.5 MB/s eta 0:00:01[K     |██████▌                         | 30 kB 44.0 MB/s eta 0:00:01[K     |████████▊                       | 40 kB 28.7 MB/s eta 0:00:01[K     |███████████                     | 51 kB 16.4 MB/s eta 0:00:01[K     |█████████████                   | 61 kB 18.8 MB/s eta 0:00:01[K     |███████████████▎                | 71 kB 20.6 MB/s eta 0:00:01[K     |█████████████████▌              | 81 kB 22.2 MB/s eta 0:00:01[K     |███████████████████▋            | 92 kB 24.3 MB/s eta 0:00:01[K     |█████████████████████▉          | 102 kB 26.2 MB/s eta 0:00:01[K     |████████████████████████        | 112 kB 26.2 MB/s eta 0:00:01[K     |██████████████████████████▏     | 122 kB 26.2 MB/s eta 0:00:01[K     |████████████████████████████▍   | 133 kB 26.2 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 143 kB 26.2 MB/s eta 0:

In [None]:
import re

def Process(Doc):
    """Load a preprocessed document and return one text string per entry.

    The file ``case_docs/<Doc>`` is expected to hold a two-level JSON object
    (outer key -> inner key -> list of sentences), as written by the
    preprocessing cell. For every inner list the sentences are concatenated,
    each one prefixed by a single space.

    The file is parsed with ``json.load`` instead of ``eval`` so that its
    contents are never executed as Python code, and the handle is closed as
    soon as parsing is done.

    Args:
        Doc: file name inside ``case_docs/``, e.g. ``"123_preprocessed.json"``.

    Returns:
        list[str]: the joined sentence strings, in file order.
    """
    import json  # local import: the surrounding cell only imports `re`

    with open('case_docs/' + Doc) as file:
        doc = json.load(file)

    points = []
    for para in doc.values():
        for point in para.values():
            points.append(''.join(' ' + sent for sent in point))
    return points


# Making Excel

In [None]:
import os
import subprocess
from tqdm.auto import tqdm
import pandas as pd

# Parallel summary columns: one entry per processed document.
Docid = []
Num_Entities = []
Num_Tokens = []

for Doc in tqdm(docids):
    data = Process(str(Doc) + '_preprocessed.json')

    entities = []
    labels = []
    num_tokens = 0

    # Run NER on every point, collecting all entity mentions and token counts.
    for pt in data:
        doc = nlp(pt)
        num_tokens += len(doc)
        entities.extend(doc.ents)
        labels.extend(ent.label_ for ent in doc.ents)

    df = pd.DataFrame({'Entities':entities,'Labels':labels})

    # Only PERSON / ORG / GPE mentions count towards the entity total.
    df4 = df[df['Labels'].isin(['PERSON', 'ORG', 'GPE'])]
    num_entities = len(df4)

    Docid.append(str(Doc))
    Num_Entities.append(num_entities)
    Num_Tokens.append(num_tokens)

    # The running summary (all documents so far) is written to <Doc>.xlsx.
    df = pd.DataFrame({'Docid':Docid, 'Num_Entities':Num_Entities, 'Num_Tokens':Num_Tokens})
    df.to_excel(str(Doc)+'.xlsx')

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Save the per-document entity and token counts as a single CSV file.
summary_columns = {
    'Docid': Docid,
    'Num_Entities': Num_Entities,
    'Num_Tokens': Num_Tokens,
}
df = pd.DataFrame(summary_columns)
df.to_csv('obs.csv')

In [None]:
import os
import subprocess
from tqdm.auto import tqdm
import pandas as pd

# Write one workbook per document (ner/<docid>.xlsx) listing its distinct
# PERSON / ORG / GPE entity strings, their label, and how often each string
# was recognised.
#
# The cell used to open pd.ExcelWriter('/content/ner_list.xlsx') without ever
# writing to, saving or closing it. That leaked an open workbook on the very
# path the Analysis section later reads, so the writer has been removed.

# Idempotent replacement for `!mkdir ner`, which complains on a re-run.
os.makedirs('ner', exist_ok=True)

KEPT_LABELS = ('PERSON', 'ORG', 'GPE')

for Doc in tqdm(docids):
    data = Process(str(Doc) + '_preprocessed.json')

    entities = []   # distinct entity strings, in order of first appearance
    labels = []     # label seen on the first appearance of each string
    freq = dict()   # entity string -> number of recognised mentions

    for pt in data:
        doc = nlp(pt)

        for ent in doc.ents:
            surface = str(ent)
            if surface not in freq:
                # Store plain text rather than the spaCy Span so the frame
                # holds ordinary strings when it is written to Excel.
                entities.append(surface)
                labels.append(ent.label_)
            freq[surface] = freq.get(surface, 0) + 1

    df = pd.DataFrame({'Entities':entities,'Labels':labels})

    # Keep the PERSON block first, then ORG, then GPE; the row order defines
    # the index written to the workbook.
    df4 = pd.concat([df[df['Labels'] == label] for label in KEPT_LABELS], ignore_index = True)
    df4['frequency'] = [freq[surface] for surface in df4['Entities']]

    df4.to_excel('ner/'+str(Doc)+'.xlsx')

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
import shutil
# Bundle the contents of the ner/ folder into ner_list.zip.
shutil.make_archive('ner_list', 'zip', 'ner')

'/content/ner_list.zip'

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

# Analysis

In [None]:
import pandas as pd
import re

# Load the workbook with one sheet per document id. keep_default_na=False
# keeps empty cells as '' instead of NaN.
df_main = pd.read_excel('/content/ner_list.xlsx', sheet_name = None, keep_default_na=False)

# NOTE: this rebinds the notebook-level `docids` to the sheet names (strings).
# The matching *_preprocessed.json files must already exist in case_docs/.
docids = [k for k in df_main.keys()]

for did in docids:
  df = df_main[did]
  pp = str(did) + '_preprocessed.json'
  data = Process(pp)
  lowered_points = [p.lower() for p in data]

  freq = []
  start_idx = []
  end_idx = []

  for ent in df['Entities']:
    # str() guards against cells that Excel returns as numbers.
    ent = str(ent).lower()
    # The entity is literal text, not a regex: escape it so names containing
    # '.', '(', '/' and similar characters are matched exactly. Previously
    # such names could mis-match or raise inside a bare `except`, leaving
    # COUNT and the offset lists inconsistent.
    literal = re.compile(re.escape(ent))

    # Offsets are relative to the start of the point they were found in;
    # offsets from all points are concatenated into one list.
    s_id = [m.start() for p in lowered_points for m in literal.finditer(p)]
    e_id = [s + len(ent) for s in s_id]

    freq.append(len(s_id))
    start_idx.append(s_id)
    end_idx.append(e_id)

  # Columns are added to the sheet held in df_main (df is not a copy).
  df['COUNT'] = freq
  df['START_IDX'] = start_idx
  df['END_IDX'] = end_idx
  
  # print(df[['COUNT','START_IDX','END_IDX']])



In [None]:
# Display the sheet of one document, including the COUNT / START_IDX / END_IDX columns.
df_main['91090158']

Unnamed: 0.1,Unnamed: 0,Entities,Labels,frequency,variant,role,COUNT,START_IDX,END_IDX
0,0,Sahodara Bai,PERSON,1.0,,VIC,1,[38],[50]
1,1,Kishore Kumar,PERSON,3.0,,P.WITNESS,3,"[254, 568, 325]","[267, 581, 338]"
2,2,18:45:14 IST,PERSON,1.0,,,1,[424],[436]
3,3,District Kawargha,GPE,1.0,,OTHER,1,[94],[111]
4,4,Lalchand,PERSON,6.0,,ACC,7,"[172, 1508, 808, 1240, 132, 424, 591]","[180, 1516, 816, 1248, 140, 432, 599]"
5,5,Ahiman Bai,PERSON,2.0,,ACC,2,"[197, 1686]","[207, 1696]"
6,6,Paikara,PERSON,1.0,,P.WITNESS,2,"[422, 437]","[429, 444]"
7,7,marg,PERSON,1.0,,,2,"[177, 508]","[181, 512]"
8,8,Mukund,PERSON,1.0,,P.WITNESS,1,[155],[161]
9,9,Pitambar Verma,PERSON,1.0,,P.WITNESS,1,[185],[199]


# Grouping Variants

In [None]:
import numpy as np

# Fold every row marked as a variant into the row it points to.
#
# A row whose 'variant' cell is a float v with 0 < v < len(df) is treated as a
# variant of the row at position int(v): its entity text, offsets and COUNT
# are added to that row, and the variant row itself is dropped.
# NOTE(review): because of the `> 0` test, row 0 can never be a merge target,
# and integer-typed cells are ignored -- confirm both are intended.
# NOTE: the sheets in df_main are modified in place (the 'variant' and
# 'Entities' columns are removed), so this cell can only run once per load.
for did in tqdm(df_main.keys()):

  df = df_main[did]
  count_col = df.columns.get_loc('COUNT')
  ents = {i: [] for i in range(len(df))}

  ids = []   # positions of variant rows to drop afterwards
  for i in range(len(df)):
    row = df.iloc[i]
    variant = row['variant']

    if isinstance(variant, float) and variant > 0 and variant < len(df):
      target = int(variant)
      if i != target:
        ents[target].append(row['Entities'])

        # The list objects stored in the cells are shared with the frame, so
        # appending to them updates the target row directly.
        target_row = df.iloc[target]
        for s_id in row['START_IDX']:
          target_row['START_IDX'].append(s_id)
          target_row['END_IDX'].append(s_id + len(row['Entities']))

        # Scalars must be written through a single .iloc call. The previous
        # `df.iloc[target]['COUNT'] += ...` updated a temporary copy of the
        # row, so the merged count was silently lost.
        df.iloc[target, count_col] += row['COUNT']
        ids.append(i)

    ents[i].append(row['Entities'])

  df['Merged_Entities'] = ents.values()
  df.drop(ids, inplace=True)
  df.reset_index(inplace=True)
  df.drop(columns=['index','Unnamed: 0', 'variant', 'Entities'],inplace=True)
# print(df)
# print(df[['Merged_Entities', 'COUNT']])

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Display the same document's sheet again, now with the Merged_Entities column.
df_main['91090158']

Unnamed: 0,Labels,frequency,role,COUNT,START_IDX,END_IDX,Merged_Entities
0,PERSON,1.0,VIC,1,[38],[50],"(Sahodara Bai,)"
1,PERSON,3.0,P.WITNESS,3,"[254, 568, 325]","[267, 581, 338]","(Kishore Kumar,)"
2,PERSON,1.0,,1,[424],[436],"(18:45:14 IST,)"
3,GPE,1.0,OTHER,1,[94],[111],"(District Kawargha,)"
4,PERSON,6.0,ACC,7,"[172, 1508, 808, 1240, 132, 424, 591]","[180, 1516, 816, 1248, 140, 432, 599]","(Lalchand,)"
5,PERSON,2.0,ACC,2,"[197, 1686]","[207, 1696]","(Ahiman Bai,)"
6,PERSON,1.0,P.WITNESS,2,"[422, 437]","[429, 444]","(Paikara,)"
7,PERSON,1.0,,2,"[177, 508]","[181, 512]","(marg,)"
8,PERSON,1.0,P.WITNESS,1,[155],[161],"(Mukund,)"
9,PERSON,1.0,P.WITNESS,1,[185],[199],"(Pitambar Verma,)"


# Role Frequency

In [None]:
# For every document, count how many rows carry each value of the 'role' column.
role_freq = dict()

for did in tqdm(df_main.keys()):
  df = df_main[did]
  freq = dict()
  for role in df['role']:
    freq[role] = freq.get(role, 0) + 1

  role_freq[did] = freq


  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Show the role counts of one document.
# NOTE(review): `did` is whatever value the preceding loop left behind (its
# last key), so the document shown depends on the sheet order -- consider
# naming the document id explicitly.
role_freq[did]

{'': 35,
 'A.COUNSEL': 6,
 'ACC': 6,
 'APP': 9,
 'APP, ACC': 1,
 'AUTH': 2,
 'COURT': 2,
 'JUDGE(CC)': 3,
 'NA': 78,
 'OTHER': 24,
 'PREC': 5,
 'PREC(APP)': 23,
 'PREC(COURT)': 1,
 'PREC(JUDGE)': 1,
 'PREC(PREC)': 1,
 'PREC(RESP)': 17,
 'R.COUNSEL': 7,
 'RESP': 2}

# Frequency of NA

In [None]:
# Number of rows labelled 'NA' per document; a document with no such row
# counts as 0 instead of raising KeyError.
na_freq = {did: counts.get('NA', 0) for did, counts in role_freq.items()}

na_freq

{'10104667': 56,
 '111507500': 14,
 '121357872': 22,
 '127273457': 25,
 '14560127': 7,
 '160467640': 19,
 '166859104': 25,
 '178619490': 35,
 '189745935': 36,
 '195489804': 35,
 '58889922': 78,
 '66145267': 83,
 '69972738': 14,
 '90251163': 40,
 '91090158': 8}

# Extra Entities Count

In [None]:
# Per document, count the rows whose 'frequency' cell is the empty string.
xent = {
    did: sum(1 for f in sheet['frequency'] if f == '')
    for did, sheet in df_main.items()
}

xent

{'10104667': 2,
 '111507500': 0,
 '121357872': 7,
 '127273457': 1,
 '14560127': 7,
 '160467640': 3,
 '166859104': 4,
 '178619490': 2,
 '189745935': 8,
 '195489804': 5,
 '58889922': 0,
 '66145267': 9,
 '69972738': 5,
 '90251163': 2,
 '91090158': 7}