In [2]:
import re
failed_records = []
from bs4 import BeautifulSoup
import json



def get_issue_for_consideration(raw_string):
    """Return the text node that directly follows the first <br> after the judge label.

    The HTML is parsed with BeautifulSoup, the first <strong> element whose
    string contains "Judge" is located, and the node immediately after the
    next <br> element is returned with surrounding whitespace stripped.

    Args:
        raw_string: HTML fragment describing a single case.

    Returns:
        The stripped text, or "" when the judge label is absent, no <br>
        follows it, or the <br> is not directly followed by a text node.
    """
    # Parse the raw HTML string with BeautifulSoup
    soup = BeautifulSoup(raw_string, 'html.parser')

    # Find the <strong> tag with the Judge information
    judge_section = soup.find('strong', string=lambda t: t and 'Judge' in t)
    if judge_section is None:
        return ""

    # Find the first <br> tag after the Judge section; there may be none.
    line_break = judge_section.find_next('br')
    if line_break is None:
        return ""

    # next_sibling is None when the <br> is the last node and is a tag (not
    # text) when another element follows directly; only text nodes, which are
    # str subclasses, can be stripped.
    following = line_break.next_sibling
    if not isinstance(following, str):
        return ""

    return following.strip()

def _search_group(pattern, text):
    """Return capture group 1 of the first match of pattern in text, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def extract_metadata(raw_string):
    """Extract case metadata fields from one raw HTML string.

    Every field is extracted independently, so a field that cannot be found
    does not prevent the remaining fields from being filled in.

    Args:
        raw_string: HTML fragment describing a single case.

    Returns:
        A dict with the keys CaseTitle, Citation, Judges,
        IssueForConsideration, Headnotes, DecisionDate, CaseNumber,
        DisposalNature and DirectionIssue. A field that could not be
        extracted is "".

    Side effects:
        Appends raw_string to the module-level failed_records list (once)
        when raw_string is not a string, when any field other than Headnotes
        has no match, or when get_issue_for_consideration raises.
    """
    metadata = {
        'CaseTitle': "",
        'Citation': "",
        'Judges': "",
        'IssueForConsideration': "",
        'Headnotes': "",
        'DecisionDate': "",
        'CaseNumber': "",
        'DisposalNature': "",
        'DirectionIssue': "",
    }

    # re.search raises TypeError on non-string input; treat that as a failed
    # record rather than letting it propagate.
    if not isinstance(raw_string, str):
        failed_records.append(raw_string)
        return metadata

    # Fields whose absence marks the record as failed.
    required_patterns = {
        'CaseTitle': r'aria-label="(.+?) pdf"',
        'Citation': r'<span class=\'escrText\'>(.*?)<\/span>',
        'Judges': r'<strong>Judge :(.+?)</strong>',
        'DecisionDate': r'Decision Date :</span><font color=\'green\'>(.+?)</font>',
        'CaseNumber': r'Case No :</span><font color=\'green\'>(.+?)</font>',
        'DisposalNature': r'Disposal Nature :</span><font color=\'green\'>(.+?)</font>',
        'DirectionIssue': r'Direction Issue :</span><font color=\'green\'>(.+?)</font>',
    }

    failed = False
    for key, pattern in required_patterns.items():
        value = _search_group(pattern, raw_string)
        if value is None:
            failed = True
        else:
            metadata[key] = value
    metadata['Judges'] = metadata['Judges'].strip()

    # Issue for Consideration is located through the parsed HTML tree.
    try:
        metadata['IssueForConsideration'] = get_issue_for_consideration(raw_string) or ""
    except Exception:
        failed = True

    # Headnotes are optional: a missing match is not recorded as a failure.
    headnotes = _search_group(r'Headnotes(.+?)(?=<strong)', raw_string)
    if headnotes is not None:
        metadata['Headnotes'] = headnotes.strip()

    if failed:
        failed_records.append(raw_string)

    return metadata


def extract_metadata_from_file(file_name):
    """Load one raw metadata JSON file and extract metadata for each of its rows.

    The file must contain a JSON object with a list at
    data['reportrow']['aaData']; the second element of every row is passed to
    extract_metadata.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        A list with one metadata dict per row, in file order.

    Side effects:
        Prints the number of extracted records followed by the records
        themselves as indented JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_name, "r", encoding="utf-8") as file:
        data = json.load(file)

    rows = data['reportrow']['aaData']

    metadata = [extract_metadata(row[1]) for row in rows]

    print(len(metadata))
    print(json.dumps(metadata, indent=4))
    return metadata

# Metadata records gathered from every file in the raw metadata directory.
# Re-created on each run so re-executing the cell does not duplicate records.
escr_consolidated_metadata = []

import os
raw_metadata_path = "raw-metadata/"
# os.listdir returns entries in arbitrary order and includes sub-directories;
# sort for a reproducible record order and skip anything that is not a
# regular file, since it cannot be opened as a JSON file.
for file in sorted(os.listdir(raw_metadata_path)):
    file_path = os.path.join(raw_metadata_path, file)
    if not os.path.isfile(file_path):
        continue
    escr_consolidated_metadata.extend(extract_metadata_from_file(file_path))


10
[
    {
        "CaseTitle": " Subodh Kumar Singh Rathour  Vs The Chief Executive Officer & Ors.",
        "Citation": "[2024] 7 S.C.R. 532",
        "Judges": "D.Y. CHANDRACHUD,J.B. PARDIWALA,Manoj Misra",
        "IssueForConsideration": "Issue for Consideration Scope of judicial review of the actions of the State in the matters relating to contract/tender disputes under writ jurisdiction; whether the action on the part of the respondent in cancelling the tender was amenable to the High Court; if so, whether the said action could be termed as arbitrary or unfair and in consequence of violation of Article 14 of the Constitution of India. Headnotes\u2020 Contract/tender disputes \u2013 Judicial review \u2013 Scope \u2013 Tender awarded to the appellant on Public-Private",
        "Headnotes": "&dagger; Contract/tender disputes &ndash; Judicial review &ndash; Scope &ndash; Tender awarded to the appellant on Public-Private<br>",
        "DecisionDate": " 09-07-2024",
        "CaseNumb

In [3]:
import pandas as pd 
# Build one DataFrame row per extracted metadata record; the dict keys
# produced by extract_metadata become the column names.
data = pd.DataFrame(escr_consolidated_metadata)

In [5]:
# Show rows flagged by DataFrame.duplicated(), ordered by DecisionDate.
# NOTE(review): DecisionDate appears to be stored as text such as
# " 09-07-2024", so this sort would be lexicographic, not chronological — confirm.
data[data.duplicated()].sort_values(by="DecisionDate")

Unnamed: 0,CaseTitle,Citation,Judges,IssueForConsideration,Headnotes,DecisionDate,CaseNumber,DisposalNature,DirectionIssue


In [7]:
# Write each case's metadata to <root>/<case number>/<case number>.json.
for i, case in data.iterrows():
    case_metadata = case.to_dict()
    # "/" would act as a path separator inside the name, so replace it.
    base_name = case_metadata['CaseNumber'].replace("/", "_").lstrip()
    if not base_name:
        # An empty case number would resolve to "<root>/.json", and every such
        # record would overwrite the previous one; skip it and say so.
        print(f"Skipping row {i}: empty CaseNumber")
        continue
    # Create a directory for the case's metadata file
    output_dir = os.path.join("escr-judgements-dataset/judgements_metadata/", base_name)
    os.makedirs(output_dir, exist_ok=True)
    # Define the path for the metadata JSON file
    combined_file_path = os.path.join(output_dir, f"{base_name}.json")

    print(combined_file_path)

    # Write the dictionary to a JSON file
    with open(combined_file_path, 'w', encoding='utf-8') as file:
        json.dump(case_metadata, file, indent=4)

escr-judgements-dataset/judgements_metadata/CIVIL APPEAL_6741_2024\CIVIL APPEAL_6741_2024.json
escr-judgements-dataset/judgements_metadata/CIVIL APPEAL_7230_2024\CIVIL APPEAL_7230_2024.json
escr-judgements-dataset/judgements_metadata/CIVIL APPEAL_6135_2024\CIVIL APPEAL_6135_2024.json
escr-judgements-dataset/judgements_metadata/CIVIL APPEAL_1389_2024\CIVIL APPEAL_1389_2024.json
escr-judgements-dataset/judgements_metadata/CIVIL APPEAL_4603_2024\CIVIL APPEAL_4603_2024.json
escr-judgements-dataset/judgements_metadata/CRIMINAL APPEAL_437_2015\CRIMINAL APPEAL_437_2015.json
escr-judgements-dataset/judgements_metadata/WRIT PETITION (CIVIL)_255_2024\WRIT PETITION (CIVIL)_255_2024.json
escr-judgements-dataset/judgements_metadata/CIVIL APPEAL_5194_2024\CIVIL APPEAL_5194_2024.json
escr-judgements-dataset/judgements_metadata/DIARYNO AND DIARYYR_8208_2024\DIARYNO AND DIARYYR_8208_2024.json
escr-judgements-dataset/judgements_metadata/CURATIVE PETITION (CIVIL)_108_2022\CURATIVE PETITION (CIVIL)_108_20

In [105]:
# Display the (rows, columns) shape of the metadata frame.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above, so its recorded output may come from an earlier session — re-run to confirm.
data.shape

(29, 8)