In [1]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
from pdf2image import convert_from_path
import glob
from natsort import natsorted

from opennyai import Pipeline
from opennyai.utils import Data
from opennyai.ner import get_unique_provision_count

import pandas as pd
from tqdm import tqdm
import json


import shutil
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Sample POCSO Judgement

In [None]:
pdf = "sample.pdf"
pages = convert_from_path(pdf, 350)

i = 1
for page in pages:
    image_name = "Page_" + str(i) + ".jpg"  
    page.save(image_name, "JPEG")
    i = i+1   

In [None]:
pics = glob.glob("*.jpg")
pics = natsorted(pics)

In [None]:
judgement_text = ''
for pic in pics:
    text = pytesseract.image_to_string(Image.open(pic))
    #os.remove(pic)
    judgement_text = judgement_text + text

In [None]:
#print(judgement_text)

In [None]:
# you can also load your text files directly into this
judgement_text = open('sample.txt').read()

texts_to_process = [judgement_text]

In [None]:
# create Data object for data  preprocessing before running ML models
data = Data(texts_to_process, preprocessing_nlp_model='en_core_web_trf')

In [None]:
# If you have access to GPU then set this to True else False
use_gpu = False

In [None]:
# Load NER model
pipeline = Pipeline(components=['NER'], use_gpu=use_gpu, verbose=True,
                   ner_model_name='en_legal_ner_trf',
                   ner_mini_batch_size=40000,
                   ner_do_sentence_level=True,
                   ner_do_postprocess=True,
                   ner_statute_shortforms_path='')

In [None]:
#Apply NER pipeline over the data
results = pipeline(data)

In [None]:
# Export the results to a json file
import json
with open('output.json','w') as f:
     json.dump(results,f,indent=4)

In [None]:
ner_doc_1 = pipeline._ner_model_output[0]

In [None]:
# Frequency count of all provisions in a judgement
provisions=get_unique_provision_count(ner_doc_1)
provisions

In [None]:
# Frequency count of all statues in a judgement
from opennyai.ner import get_unique_statute_count
statutes=get_unique_statute_count(ner_doc_1)
statutes

In [None]:
# Saving NER Results
from opennyai.ner import get_csv
get_csv(ner_doc_1,'sample_judgement1_NER.csv', save_path=r"/home/krishna/NLP_Justice/POCSO/sample_judgement1_NER.csv")

In [None]:
# Frequency count of all precendents in a judgement
from opennyai.ner import get_unique_precedent_count
import opennyai.ner as InLegalNER

precedents=InLegalNER.get_unique_precedent_count(ner_doc_1)
precedents

In [None]:
ner_doc_1 = pipeline._ner_model_output[0]
identified_entites = [(ent, ent.label_) for ent in ner_doc_1.ents]
identified_entites

In [None]:
ner_doc_1.user_data['precedent_clusters']

In [None]:
with open('sample.txt', 'w') as f:
    f.write(judgement_text)

# All judgements

In [None]:
judgement_paths_assam = []
path ="Assam"
for root, dirs, files in os.walk(path):
    for file in files:
        if(file.endswith(".pdf")):
            judgement_paths_assam.append(os.path.join(root,file))

In [None]:
judgements = []
for judgement in judgement_paths_assam:
    if ('judgment' in judgement.lower()):
        judgements.append(judgement)

In [None]:
len(judgements)

In [None]:
## Convert Judgment PDFs into txts
#converted_cases = []
for judgement in tqdm(judgements):
    case_id = judgement.split(r'/')[-1].split('_')[0]
    if case_id in converted_cases:
        continue
        
    folder_path = judgement.split(r'/')
    folder_path.pop()
    folder_path = r"/".join(folder_path)+"/"

    pages = convert_from_path(judgement, 350, output_folder='/home/krishna/NLP_Justice/POCSO/tmp')
    
    i = 1
    for page in pages:
        image_name = "Page_" + str(i) + ".jpg"  
        page.save(image_name, "JPEG")
        i = i+1
    shutil.rmtree('tmp')
    os.makedirs('tmp')
        
    pics = glob.glob("*.jpg")
    pics = natsorted(pics)
    
    judgement_text = ''
    for pic in pics:
        text = pytesseract.image_to_string(Image.open(pic))
        os.remove(pic)
        judgement_text = judgement_text + text
    
    with open(folder_path+case_id+'.txt', 'w') as f:
        f.write(judgement_text)
        
    converted_cases.append(case_id)

In [5]:
judgement_paths_assam = []
path ="Assam"
for root, dirs, files in os.walk(path):
    for file in files:
        if(file.endswith(".txt")):
            judgement_paths_assam.append(os.path.join(root,file))

In [9]:
case_dfs = []
case_dfs.append(pd.read_csv('POCSO.csv'))

In [12]:
case_dfs[0].case_id.nunique()

9

In [13]:
for judgement in tqdm(judgement_paths_assam[case_dfs[0].case_id.nunique():]):
    judgement_text = open(judgement).read()
    #Get Case_ID 
    case_id = judgement.split(r'/')[-1].split('.txt')[0]
    
    # load your text files directly into this
    texts_to_process = [judgement_text]
    # create Data object for data  preprocessing before running ML models
    data = Data(texts_to_process, preprocessing_nlp_model='en_core_web_trf')
    
    # Load NER model
    pipeline = Pipeline(components=['NER'], use_gpu=False, verbose=True,
                   ner_model_name='en_legal_ner_trf',
                   ner_mini_batch_size=40000,
                   ner_do_sentence_level=True,
                   ner_do_postprocess=True,
                   ner_statute_shortforms_path='')
    
    results = pipeline(data)
    ner_doc_1 = pipeline._ner_model_output[0]
    
    #Get Provisions recognised in the judgement
    provisions=get_unique_provision_count(ner_doc_1)
    provisions_df = pd.DataFrame(provisions.items())
    provisions_df.columns = ['provision_opennyai','freq']
    
    # Clean Provisions and Statute text
    provisions_df['provision_opennyai'] = provisions_df['provision_opennyai'].str.lower().replace('section', 's', regex=True)
    provisions_df = provisions_df.groupby('provision_opennyai')[['freq']].sum().reset_index()
    provisions_df['statute_opennyai'] = ''
    
    for index, row in provisions_df.iterrows():
        statute = row['provision_opennyai'].split('of',1)[-1].strip().title()
        statute = statute.replace('Protection Of Children From Sexual Offences','POCSO')    
        provision = row['provision_opennyai'].split('of',1)[0].strip().title()
        provision = provision.replace('As Well As',r'/')
        provisions_df.loc[index,'statute_opennyai'] = statute
        provisions_df.loc[index,'provision_opennyai'] = provision.replace('S.','').strip()
        provisions_df.loc[index,'provision_opennyai'] = provision.replace('S','').strip()
    
    #Concatenate all provisions of a statute 
    provisions_df = provisions_df.groupby(['statute_opennyai'])['provision_opennyai'].apply(list).reset_index()
    provisions_df['case_id'] = case_id
    provisions_df = provisions_df[['case_id', 'statute_opennyai', 'provision_opennyai']]
    
    # Get Provisions and Statute information from parsed meta data
    folder_path = judgement.split(r'/')
    folder_path.pop()
    case_metadata_json = r"/".join(folder_path)+"/"+case_id+"_parsed.json"
    with open(case_metadata_json) as file:
        data = json.load(file)
    
    sections_df = pd.DataFrame(data['acts'])
    sections_df.columns = ['statute_metadata','provision_metadata']
    sections_df['case_id_meta'] = case_id
    
    # Concatenate Opennyai data and metadata
    df = pd.concat([provisions_df,sections_df], axis=1)
    df = df.drop('case_id_meta',axis=1)
    case_dfs.append(df)
    pd.concat(case_dfs).to_csv('POCSO.csv',index=False)

  0%|                                                  | 0/1152 [00:00<?, ?it/s]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:19<00:00, 19.93s/it][A
  0%|                                        | 1/1152 [00:24<7:49:50, 24.49s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.49s/it][A
  0%|                                        | 2/1152 [00:33<4:50:35, 15.16s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.29s/it][A
  0%|                                        | 3/1152 [00:42<4:01:10, 12.59s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.70s/it][A
  0%|▏                                       | 4/1152 [01:04<5:09:57, 16.20s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:17<00:00, 17.56s/it][A
  0%|▏                                       | 5/1152 [01:26<5:47:03, 18.15s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:30<00:00, 30.50s/it][A
  1%|▏                                       | 6/1152 [02:00<7:31:56, 23.66s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
628656e784631f6e1b205914cf2ac828ea10fc19889a3c4678df038a7ad7a1ed.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:26<00:00, 26.88s/it][A
  1%|▏                                       | 7/1152 [02:31<8:15:27, 25.96s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.21s/it][A
  1%|▎                                       | 8/1152 [02:48<7:21:22, 23.15s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.58s/it][A
  1%|▎                                       | 9/1152 [03:05<6:48:30, 21.44s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:51<00:00, 51.81s/it][A
  1%|▎                                     | 10/1152 [04:01<10:11:36, 32.13s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:30<00:00, 30.49s/it][A
  1%|▎                                     | 11/1152 [04:36<10:25:02, 32.87s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.31s/it][A
  1%|▍                                      | 12/1152 [04:53<8:55:15, 28.17s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.25s/it][A
  1%|▍                                      | 13/1152 [05:13<8:03:52, 25.49s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
757283765157cdc8bf073b20fc0963089ae8ee9885278e0425df3ba1151fc4a3.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.03s/it][A
  1%|▍                                      | 14/1152 [05:26<6:52:05, 21.73s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.74s/it][A
  1%|▌                                      | 15/1152 [05:40<6:07:20, 19.39s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.47s/it][A
  1%|▌                                      | 16/1152 [05:52<5:27:24, 17.29s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.05s/it][A
  1%|▌                                      | 17/1152 [06:01<4:41:46, 14.90s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:19<00:00, 19.22s/it][A
  2%|▌                                      | 18/1152 [06:25<5:32:24, 17.59s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:19<00:00, 19.80s/it][A
  2%|▋                                      | 19/1152 [06:49<6:09:06, 19.55s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
8ffce53c3bc7092148ce0f8303fe9f38c579bcade432bbb712e25b0a8ca69ad7.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.82s/it][A
  2%|▋                                      | 20/1152 [07:01<5:24:56, 17.22s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:26<00:00, 26.64s/it][A
  2%|▋                                      | 21/1152 [07:32<6:42:00, 21.33s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.47s/it][A
  2%|▋                                      | 22/1152 [07:41<5:29:10, 17.48s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [01:01<00:00, 61.31s/it][A
  2%|▊                                     | 23/1152 [08:46<10:00:16, 31.90s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.10s/it][A
  2%|▊                                      | 24/1152 [08:57<8:02:21, 25.66s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.02s/it][A
  2%|▊                                      | 25/1152 [09:11<6:52:02, 21.94s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:17<00:00, 17.05s/it][A
  2%|▉                                      | 26/1152 [09:32<6:50:53, 21.89s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.35s/it][A
  2%|▉                                      | 27/1152 [10:02<7:35:13, 24.28s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.19s/it][A
  2%|▉                                      | 28/1152 [10:15<6:28:17, 20.73s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.93s/it][A
  3%|▉                                      | 29/1152 [10:30<5:57:23, 19.10s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.74s/it][A
  3%|█                                      | 30/1152 [10:40<5:05:54, 16.36s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:22<00:00, 22.55s/it][A
  3%|█                                      | 31/1152 [11:06<6:02:49, 19.42s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.43s/it][A
  3%|█                                      | 32/1152 [11:15<5:02:07, 16.19s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:24<00:00, 24.75s/it][A
  3%|█                                      | 33/1152 [11:44<6:13:40, 20.04s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:17<00:00, 17.23s/it][A
  3%|█▏                                     | 34/1152 [12:05<6:20:57, 20.45s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:24<00:00, 24.27s/it][A
  3%|█▏                                     | 35/1152 [12:34<7:05:08, 22.84s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:35<00:00, 35.06s/it][A
  3%|█▏                                     | 36/1152 [13:13<8:36:06, 27.75s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.61s/it][A
  3%|█▎                                     | 37/1152 [13:23<6:54:37, 22.31s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:31<00:00, 31.96s/it][A
  3%|█▎                                     | 38/1152 [13:59<8:10:39, 26.43s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.46s/it][A
  3%|█▎                                     | 39/1152 [14:11<6:52:19, 22.23s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.27s/it][A
  3%|█▎                                     | 40/1152 [14:29<6:24:59, 20.77s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.94s/it][A
  4%|█▍                                     | 41/1152 [14:52<6:38:35, 21.53s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.08s/it][A
  4%|█▍                                     | 42/1152 [15:09<6:13:41, 20.20s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.94s/it][A
  4%|█▍                                     | 43/1152 [15:27<6:01:34, 19.56s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:42<00:00, 42.84s/it][A
  4%|█▍                                     | 44/1152 [16:14<8:32:37, 27.76s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.12s/it][A
  4%|█▌                                     | 45/1152 [16:31<7:34:13, 24.62s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.74s/it][A
  4%|█▌                                     | 46/1152 [16:47<6:44:29, 21.94s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.33s/it][A
  4%|█▌                                     | 47/1152 [17:01<6:02:44, 19.70s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.06s/it][A
  4%|█▋                                     | 48/1152 [17:20<5:55:48, 19.34s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.15s/it][A
  4%|█▋                                     | 49/1152 [17:34<5:27:43, 17.83s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.11s/it][A
  4%|█▋                                     | 50/1152 [17:46<4:52:06, 15.90s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.08s/it][A
  4%|█▋                                     | 51/1152 [17:55<4:17:10, 14.02s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:27<00:00, 27.74s/it][A
  5%|█▊                                     | 52/1152 [18:28<5:57:49, 19.52s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
c422f49332e8527f5c75503baa8c671596d137bddd1072d756007598685e1405.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.62s/it][A
  5%|█▊                                     | 53/1152 [18:40<5:20:23, 17.49s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:12<00:00, 12.69s/it][A
  5%|█▊                                     | 54/1152 [18:57<5:17:28, 17.35s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.32s/it][A
  5%|█▊                                     | 55/1152 [19:20<5:45:37, 18.90s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.57s/it][A
  5%|█▉                                     | 56/1152 [19:34<5:16:51, 17.35s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.93s/it][A
  5%|█▉                                     | 57/1152 [19:57<5:50:06, 19.18s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
cff557c8028622db5c7c8d0a408d2c0797fae41d244a920d001c0c2e637bcc16.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:23<00:00, 23.51s/it][A
  5%|█▉                                     | 58/1152 [20:25<6:36:02, 21.72s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.59s/it][A
  5%|█▉                                     | 59/1152 [20:45<6:29:50, 21.40s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.38s/it][A
  5%|██                                     | 60/1152 [21:06<6:24:45, 21.14s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:47<00:00, 47.23s/it][A
  5%|██                                     | 61/1152 [21:57<9:10:06, 30.25s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.68s/it][A
  5%|██                                     | 62/1152 [22:12<7:45:52, 25.64s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.13s/it][A
  5%|██▏                                    | 63/1152 [22:31<7:05:14, 23.43s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.13s/it][A
  6%|██▏                                    | 64/1152 [22:45<6:16:19, 20.75s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.16s/it][A
  6%|██▏                                    | 65/1152 [22:56<5:25:10, 17.95s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:51<00:00, 51.96s/it][A
  6%|██▏                                    | 66/1152 [23:53<8:54:22, 29.52s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:12<00:00, 12.52s/it][A
  6%|██▎                                    | 67/1152 [24:10<7:45:45, 25.76s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:20<00:00, 20.65s/it][A
  6%|██▎                                    | 68/1152 [24:35<7:42:01, 25.57s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
066ac9ca5d713fb0ce8ce8d06e022108563ad10b66eb46c960c6595b48b8321d.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.35s/it][A
  6%|██▎                                    | 69/1152 [24:51<6:48:04, 22.61s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:05<?, ?it/s][A
  6%|██▎                                    | 69/1152 [25:01<6:32:42, 21.76s/it]


KeyboardInterrupt: 

In [None]:
# Frequency count of all statues in a judgement
from opennyai.ner import get_unique_statute_count
statutes=get_unique_statute_count(ner_doc_1)
statutes

In [None]:
# Frequency count of all provisions in a judgement
from opennyai.ner import get_unique_provision_count
provisions=get_unique_provision_count(ner_doc_1)
provisions

In [None]:
provisions_df = pd.DataFrame(provisions.items())
provisions_df.columns = ['provision','freq']
provisions_df['provision'] = provisions_df['provision'].str.lower().replace('section', 's', regex=True)

In [None]:
provisions_df = provisions_df.groupby('provision')[['freq']].sum().reset_index()
provisions_df['statute'] = ''

In [None]:
for index, row in provisions_df.iterrows():
    statute = row['provision'].split('of',1)[-1].strip().title()
    statute = statute.replace('Protection Of Children From Sexual Offences','POCSO')
    
    provision = row['provision'].split('of',1)[0].strip().title()
    provision = provision.replace('As Well As',r'/')
    provisions_df.loc[index,'statute'] = statute
    provisions_df.loc[index,'provision'] = provision

In [None]:
provisions_df

In [None]:
provisions_df = provisions_df.groupby(['statute'])['provision'].apply(list).reset_index()
provisions_df

In [None]:
provisions_df['case_id'] = case_id

In [None]:
provisions_df

In [None]:
l = judgment.split(r'/')
l.pop()
case_metadata_json = r"/".join(l)+"/"+case_id+"_parsed.json"

import json
with open(case_metadata_json) as file:
    data = json.load(file)

In [None]:
sections_df = pd.DataFrame(data['acts'])
sections_df.columns = ['statute_meta','provision_meta']
sections_df['case_id_meta'] = case_id

In [None]:
k = pd.concat([provisions_df,sections_df], axis=1)

In [None]:
k.drop('case_id_meta',axis=1)

# Limitations
1. The judgements are converted from PDFs to txt formats. The PDFs often contains official stamps and other symbols, which when converted into txt become a dirty group of letters. This made a couple of sentences in each page incomprehensible. The recognition of entities could be affected in these sentences.