In [1]:
import glob

from opennyai import Pipeline
from opennyai.utils import Data
from opennyai.ner import get_unique_provision_count

import pandas as pd
from tqdm import tqdm
import json


import shutil
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Sample POCSO Judgement

In [None]:
pdf = "sample.pdf"
pages = convert_from_path(pdf, 350)

i = 1
for page in pages:
    image_name = "Page_" + str(i) + ".jpg"  
    page.save(image_name, "JPEG")
    i = i+1   

In [None]:
pics = glob.glob("*.jpg")
pics = natsorted(pics)

In [None]:
judgement_text = ''
for pic in pics:
    text = pytesseract.image_to_string(Image.open(pic))
    #os.remove(pic)
    judgement_text = judgement_text + text

In [None]:
#print(judgement_text)

In [None]:
# you can also load your text files directly into this
judgement_text = open('sample.txt').read()

texts_to_process = [judgement_text]

In [None]:
# create Data object for data  preprocessing before running ML models
data = Data(texts_to_process, preprocessing_nlp_model='en_core_web_trf')

In [None]:
# If you have access to GPU then set this to True else False
use_gpu = False

In [None]:
# Load NER model
pipeline = Pipeline(components=['NER'], use_gpu=use_gpu, verbose=True,
                   ner_model_name='en_legal_ner_trf',
                   ner_mini_batch_size=40000,
                   ner_do_sentence_level=True,
                   ner_do_postprocess=True,
                   ner_statute_shortforms_path='')

In [None]:
#Apply NER pipeline over the data
results = pipeline(data)

In [None]:
# Export the results to a json file
import json
with open('output.json','w') as f:
     json.dump(results,f,indent=4)

In [None]:
ner_doc_1 = pipeline._ner_model_output[0]

In [None]:
# Frequency count of all provisions in a judgement
provisions=get_unique_provision_count(ner_doc_1)
provisions

In [None]:
# Frequency count of all statues in a judgement
from opennyai.ner import get_unique_statute_count
statutes=get_unique_statute_count(ner_doc_1)
statutes

In [None]:
# Saving NER Results
from opennyai.ner import get_csv
get_csv(ner_doc_1,'sample_judgement1_NER.csv', save_path=r"/home/krishna/NLP_Justice/POCSO/sample_judgement1_NER.csv")

In [None]:
# Frequency count of all precendents in a judgement
from opennyai.ner import get_unique_precedent_count
import opennyai.ner as InLegalNER

precedents=InLegalNER.get_unique_precedent_count(ner_doc_1)
precedents

In [None]:
ner_doc_1 = pipeline._ner_model_output[0]
identified_entites = [(ent, ent.label_) for ent in ner_doc_1.ents]
identified_entites

In [None]:
ner_doc_1.user_data['precedent_clusters']

In [None]:
with open('sample.txt', 'w') as f:
    f.write(judgement_text)

# All judgements

50 judgements were filtered on which NER has to be experimented.

In [2]:
filtered_cases_cino = pd.read_csv('POCSO_Filter2.csv')['cino'].to_list()

In [3]:
# List of all POCSO judgements in Assam
judgement_paths_assam = []
path ="Assam"
for root, dirs, files in os.walk(path):
    for file in files:
        if(file.endswith(".txt")):
            judgement_paths_assam.append(os.path.join(root,file))

In [4]:
# List of 50 filtered POCSO judgements in Assam
judgements_filtered = []
for judgement in judgement_paths_assam:
    #Get Case_ID 
    case_id = judgement.split(r'/')[-1].split('.txt')[0]
    
    if case_id in filtered_cases_cino:
        judgements_filtered.append(judgement)

In [5]:
len(judgements_filtered)

51

In [6]:
case_dfs = []
case_dfs.append(pd.read_csv('POCSO_NER_Template.csv'))

In [35]:
#NER
import warnings
warnings.filterwarnings("ignore")

for judgement in tqdm(judgements_filtered[case_dfs[0].case_id.nunique():]):
    judgement_text = open(judgement).read()
    #Get Case_ID 
    case_id = judgement.split(r'/')[-1].split('.txt')[0]
    
    # load your text files directly into this
    texts_to_process = [judgement_text]
    # create Data object for data  preprocessing before running ML models
    data = Data(texts_to_process, preprocessing_nlp_model='en_core_web_trf')
    
    # Load NER model
    pipeline = Pipeline(components=['NER'], use_gpu=False, verbose=True,
                   ner_model_name='en_legal_ner_trf',
                   ner_mini_batch_size=40000,
                   ner_do_sentence_level=True,
                   ner_do_postprocess=True,
                   ner_statute_shortforms_path='')
    
    results = pipeline(data)
    ner_doc_1 = pipeline._ner_model_output[0]
    
    #Get Provisions recognised in the judgement
    provisions=get_unique_provision_count(ner_doc_1)
    provisions_df = pd.DataFrame(provisions.items())
    provisions_df.columns = ['provision_opennyai','freq']
    
    # Clean Provisions and Statute text
    provisions_df['provision_opennyai'] = provisions_df['provision_opennyai'].str.lower().replace('section', 's', regex=True)
    provisions_df = provisions_df.groupby('provision_opennyai')[['freq']].sum().reset_index()
    provisions_df['statute_opennyai'] = ''
    
    for index, row in provisions_df.iterrows():
        statute = row['provision_opennyai'].split('of',1)[-1].strip().title()
        statute = statute.replace('Protection Of Children From Sexual Offences','POCSO')    
        provision = row['provision_opennyai'].split('of',1)[0].strip().title()
        provision = provision.replace('As Well As',r'/')
        provisions_df.loc[index,'statute_opennyai'] = statute
        provisions_df.loc[index,'provision_opennyai'] = provision.replace('S.','').strip()
        provisions_df.loc[index,'provision_opennyai'] = provision.replace('S','').strip()
    
    #Concatenate all provisions of a statute 
    provisions_df = provisions_df.groupby(['statute_opennyai'])['provision_opennyai'].apply(list).reset_index()
    provisions_df['case_id'] = case_id
    provisions_df = provisions_df[['case_id', 'statute_opennyai', 'provision_opennyai']]
    
    # Get Provisions and Statute information from parsed meta data
    folder_path = judgement.split(r'/')
    folder_path.pop()
    case_metadata_json = r"/".join(folder_path)+"/"+case_id+"_parsed.json"
    with open(case_metadata_json) as file:
        data = json.load(file)
    
    sections_df = pd.DataFrame(data['acts'])
    sections_df.columns = ['statute_metadata','provision_metadata']
    sections_df['case_id_meta'] = case_id
    
    # Concatenate Opennyai data and metadata
    df = pd.concat([provisions_df,sections_df], axis=1)
    df = df.drop('case_id_meta',axis=1)
    case_dfs.append(df)
    pd.concat(case_dfs).to_csv('POCSO.csv',index=False)

  0%|                                                    | 0/51 [00:00<?, ?it/s]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████| 1/1 [03:38<00:00, 218.77s/it][A

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
722476655a55d67cb66f16df00680a78422403f85d3e4a2af0f4881f73503eb0.
Some of postprocessing info may be absent because of this in doc.[0m



  2%|▊                                        | 1/51 [03:43<3:05:53, 223.08s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.86s/it][A
  4%|█▌                                       | 2/51 [03:57<1:21:52, 100.25s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.32s/it][A
  6%|██▌                                         | 3/51 [04:12<48:58, 61.22s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.74s/it][A
  8%|███▍                                        | 4/51 [04:32<35:17, 45.05s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:46<00:00, 46.11s/it][A
 10%|████▎                                       | 5/51 [05:23<36:07, 47.12s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
7c2792b079a6cbaeac8476b0b7cf303b3203224dd6cbb5af29762d441b1827bd.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.33s/it][A
 12%|█████▏                                      | 6/51 [05:34<26:14, 34.99s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.74s/it][A
 14%|██████                                      | 7/51 [05:47<20:23, 27.81s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.66s/it][A
 16%|██████▉                                     | 8/51 [06:01<16:38, 23.23s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.13s/it][A
 18%|███████▊                                    | 9/51 [06:13<13:56, 19.92s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.76s/it][A
 20%|████████▍                                  | 10/51 [06:25<11:55, 17.44s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:39<00:00, 39.30s/it][A
 22%|█████████▎                                 | 11/51 [07:09<17:00, 25.52s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.35s/it][A
 24%|██████████                                 | 12/51 [07:20<13:49, 21.26s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:17<00:00, 17.88s/it][A
 25%|██████████▉                                | 13/51 [07:43<13:37, 21.52s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.31s/it][A
 27%|███████████▊                               | 14/51 [08:05<13:31, 21.94s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
f6a0737b09bc0e07dd9f8c218a892a30fec7e515f4c06c40e113802ff9f3fc23.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:37<00:00, 37.48s/it][A
 29%|████████████▋                              | 15/51 [08:47<16:44, 27.91s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:16<00:00, 16.39s/it][A
 31%|█████████████▍                             | 16/51 [09:08<15:01, 25.74s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.20s/it][A
 33%|██████████████▎                            | 17/51 [09:25<13:10, 23.26s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.35s/it][A
 35%|███████████████▏                           | 18/51 [09:55<13:55, 25.31s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:25<00:00, 25.20s/it][A
 37%|████████████████                           | 19/51 [10:25<14:08, 26.53s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.91s/it][A
 39%|████████████████▊                          | 20/51 [10:48<13:11, 25.52s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:06<00:00,  6.30s/it][A
 41%|█████████████████▋                         | 21/51 [10:59<10:34, 21.14s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.23s/it][A
 43%|██████████████████▌                        | 22/51 [11:19<10:01, 20.73s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.75s/it][A
 45%|███████████████████▍                       | 23/51 [11:39<09:35, 20.56s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.38s/it][A
 47%|████████████████████▏                      | 24/51 [11:51<08:05, 17.97s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.36s/it][A
 49%|█████████████████████                      | 25/51 [12:09<07:46, 17.92s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.05s/it][A
 51%|█████████████████████▉                     | 26/51 [12:20<06:39, 15.97s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.79s/it][A
 53%|██████████████████████▊                    | 27/51 [12:39<06:47, 16.98s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.24s/it][A
 55%|███████████████████████▌                   | 28/51 [12:53<06:07, 15.96s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.90s/it][A
 57%|████████████████████████▍                  | 29/51 [13:08<05:47, 15.78s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.17s/it][A
 59%|█████████████████████████▎                 | 30/51 [13:24<05:30, 15.73s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:45<00:00, 45.30s/it][A
 61%|██████████████████████████▏                | 31/51 [14:14<08:39, 25.98s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
6e3e85e838f9d68952d5a7edb08742810d82f16f83985c4279bc78b2e3f2b952.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.97s/it][A
 63%|██████████████████████████▉                | 32/51 [14:27<07:00, 22.16s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:42<00:00, 42.14s/it][A
 65%|███████████████████████████▊               | 33/51 [15:14<08:51, 29.50s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
f699dd00e7e9ba51c6f08e65f8e729c6f0a57009f4335ab0df54d1d6acce8dd9.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.63s/it][A
 67%|████████████████████████████▋              | 34/51 [15:26<06:52, 24.28s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.40s/it][A
 69%|█████████████████████████████▌             | 35/51 [15:46<06:07, 22.94s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.87s/it][A
 71%|██████████████████████████████▎            | 36/51 [15:58<04:55, 19.69s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:44<00:00, 44.42s/it][A
 73%|███████████████████████████████▏           | 37/51 [16:46<06:37, 28.41s/it]

[38;5;3m⚠ There was some issue while performing postprocessing for doc id
84812a94ddbf010f6964cc54d84278f6484605c94acd03c6bf50826b6a42b73b.
Some of postprocessing info may be absent because of this in doc.[0m
[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.55s/it][A
 75%|████████████████████████████████           | 38/51 [17:07<05:37, 25.97s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:06<00:00,  6.42s/it][A
 76%|████████████████████████████████▉          | 39/51 [17:18<04:18, 21.50s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.22s/it][A
 78%|█████████████████████████████████▋         | 40/51 [17:33<03:34, 19.48s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:06<00:00,  6.84s/it][A
 80%|██████████████████████████████████▌        | 41/51 [17:44<02:50, 17.05s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:10<00:00, 10.78s/it][A
 82%|███████████████████████████████████▍       | 42/51 [17:59<02:28, 16.55s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:08<00:00,  8.53s/it][A
 84%|████████████████████████████████████▎      | 43/51 [18:12<02:03, 15.47s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.02s/it][A
 86%|█████████████████████████████████████      | 44/51 [18:30<01:52, 16.10s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:21<00:00, 21.58s/it][A
 88%|█████████████████████████████████████▉     | 45/51 [18:56<01:55, 19.20s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:06<00:00,  6.79s/it][A
 90%|██████████████████████████████████████▊    | 46/51 [19:08<01:24, 16.84s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:15<00:00, 15.29s/it][A
 92%|███████████████████████████████████████▋   | 47/51 [19:27<01:10, 17.73s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.89s/it][A
 94%|████████████████████████████████████████▍  | 48/51 [19:40<00:48, 16.19s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.88s/it][A
 96%|█████████████████████████████████████████▎ | 49/51 [19:55<00:31, 15.83s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:26<00:00, 26.16s/it][A
 98%|██████████████████████████████████████████▏| 50/51 [20:26<00:20, 20.27s/it]

[38;5;4mℹ Pre-processing will happen on CPU![0m
[38;5;4mℹ Loading NER...[0m
[38;5;4mℹ NER will run on CPU![0m
[38;5;4mℹ Processing documents with Legal NER!!![0m



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.45s/it][A
100%|███████████████████████████████████████████| 51/51 [20:40<00:00, 24.32s/it]


In [None]:
# Frequency count of all statues in a judgement
from opennyai.ner import get_unique_statute_count
statutes=get_unique_statute_count(ner_doc_1)
statutes

In [None]:
# Frequency count of all provisions in a judgement
from opennyai.ner import get_unique_provision_count
provisions=get_unique_provision_count(ner_doc_1)
provisions

In [None]:
provisions_df = pd.DataFrame(provisions.items())
provisions_df.columns = ['provision','freq']
provisions_df['provision'] = provisions_df['provision'].str.lower().replace('section', 's', regex=True)

In [None]:
provisions_df = provisions_df.groupby('provision')[['freq']].sum().reset_index()
provisions_df['statute'] = ''

In [None]:
for index, row in provisions_df.iterrows():
    statute = row['provision'].split('of',1)[-1].strip().title()
    statute = statute.replace('Protection Of Children From Sexual Offences','POCSO')
    
    provision = row['provision'].split('of',1)[0].strip().title()
    provision = provision.replace('As Well As',r'/')
    provisions_df.loc[index,'statute'] = statute
    provisions_df.loc[index,'provision'] = provision

In [None]:
provisions_df

In [None]:
provisions_df = provisions_df.groupby(['statute'])['provision'].apply(list).reset_index()
provisions_df

In [None]:
provisions_df['case_id'] = case_id

In [None]:
provisions_df

In [None]:
l = judgment.split(r'/')
l.pop()
case_metadata_json = r"/".join(l)+"/"+case_id+"_parsed.json"

import json
with open(case_metadata_json) as file:
    data = json.load(file)

In [None]:
sections_df = pd.DataFrame(data['acts'])
sections_df.columns = ['statute_meta','provision_meta']
sections_df['case_id_meta'] = case_id

# Limitations
1. The judgements are converted from PDFs to txt formats. The PDFs often contains official stamps and other symbols, which when converted into txt become a dirty group of letters. This made a couple of sentences in each page incomprehensible. The recognition of entities could be affected in these sentences.

In [3]:
filtered_cases_cino = pd.read_csv('POCSO_Filter2.csv')['cino'].to_list()

In [8]:
# List of all POCSO judgements in Assam
judgement_paths_assam = []
path ="Assam"
for root, dirs, files in os.walk(path):
    for file in files:
        if(file.endswith(".pdf")):
            judgement_paths_assam.append(os.path.join(root,file))

In [18]:
# List of 50 filtered POCSO judgements in Assam
judgements_filtered = []
for judgement in judgement_paths_assam:
    #Get Case_ID 
    case_id = judgement.split(r'/')[-1].split('_Judgment_')[0]
    
    if case_id in filtered_cases_cino:
        judgements_filtered.append(judgement)

In [19]:
len(judgements_filtered)

51

In [17]:
f = 'Assam/Dhubri/additional_district_and_sessions_judge_ASDU100003992019/ASDU100003992019_Judgment_23-10-2019.pdf'

f.split(r'/')[-1].split('_Judgment_')[0]

'ASDU100003992019'

In [23]:
import shutil
for judgement in judgements_filtered:
    shutil.copy2(judgement, 'POCSO_Experiment_Judgements')  