In [2]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import json
import os
import pandas as pd
from tqdm import tqdm
from utils import *

In [4]:
# Load the two JSON dumps of the case collection. Later cells iterate `st`
# and `ust` in parallel and assert that records at the same position share
# the same `itemid`.
# NOTE(review): `load_json` is not defined in this notebook; presumably it
# comes from the `from utils import *` star import - confirm.
structured_path = './echr-data/echr_2_0_0_structured_cases.json'
st = load_json(structured_path)

unstructured_path = './echr-data/echr_2_0_0_unstructured_cases.json'
ust= load_json(unstructured_path)

In [5]:
# Sanity check: both dumps must have the same number of records, because the
# processing loop pairs them positionally with zip().
len(st), len(ust)

(16096, 16096)

In [6]:
def clean_text(text):
    """Return a normalised copy of ``text``.

    Single entry point for text cleaning; the work is delegated to
    ``replace_unicode_with_space``.

    Args:
        text: The raw string to clean.

    Returns:
        Whatever ``replace_unicode_with_space`` returns for ``text``.
    """
    cleaned = replace_unicode_with_space(text)
    return cleaned

import re
def replace_unicode_with_space(text):
    """Blank out non-ASCII characters and normalise whitespace.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every character outside the ASCII range
        (``\\x00``-``\\x7F``) replaced by a space, every run of whitespace
        (including tabs and newlines) collapsed to a single space, and
        leading/trailing whitespace removed.
    """
    # Only non-ASCII characters are replaced; ASCII text is left untouched.
    ascii_only = re.sub(r'[^\x00-\x7F]', ' ', text)
    # `\s+` covers any whitespace run, so line breaks are flattened as well.
    single_spaced = re.sub(r'\s+', ' ', ascii_only)
    return single_spaced.strip()


import re

def extract_special_format(text):
    """Collect every ``dddd/dd`` token that occurs in ``text``.

    A token is exactly four digits, a slash, and exactly two digits, delimited
    by word boundaries on both sides; longer digit runs such as ``12345/67``
    or ``1234/567`` are therefore not matched.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched substrings in order of appearance (duplicates
        are kept); an empty list when nothing matches.
    """
    special_format = re.compile(r'\b\d{4}/\d{2}\b')
    return special_format.findall(text)

In [7]:
def flatten_elements(elements_list):
    """Flatten a nested list of elements into one newline-separated string.

    Each element is expected to be a mapping with a ``'content'`` string and
    an ``'elements'`` list of child elements in the same shape. The tree is
    walked depth-first: an element's cleaned content comes first, followed by
    the flattened text of its children.

    Args:
        elements_list: Iterable of element mappings.

    Returns:
        The cleaned contents joined with ``'\\n'``; an empty string when
        ``elements_list`` is empty.
    """
    pieces = []
    for node in elements_list:
        pieces.append(clean_text(node['content']))
        children = node['elements']
        if len(children) != 0:
            # Children are flattened recursively and added as a single piece.
            pieces.append(flatten_elements(children))
    return '\n'.join(pieces)
 

In [8]:
def get_content(sections):
    """Build a mapping from section name to that section's flattened text.

    Args:
        sections: Iterable of section mappings. Each must provide an
            ``'elements'`` list (passed to ``flatten_elements``) and may
            provide a ``'section_name'``; sections without a name are filed
            under ``'other'``.

    Returns:
        A dict mapping each section name to its text. When several sections
        share a name, their texts are merged in order of appearance and
        separated by a newline, the same separator ``flatten_elements`` uses
        between paragraphs, so the last paragraph of one section is not fused
        with the first paragraph of the next.
    """
    merged = {}
    for section in sections:
        section_name = section.get('section_name', 'other')
        text = flatten_elements(section['elements'])

        previous = merged.get(section_name)
        if previous is None:
            merged[section_name] = text
        elif previous and text:
            merged[section_name] = previous + '\n' + text
        else:
            # One side is empty: plain concatenation avoids a stray newline.
            merged[section_name] = previous + text
    return merged

In [9]:
# Destination folder for the per-case JSON files written by the processing
# loop (one `<itemid>.json` file per case).
output_directory = './echr-processed/'

In [12]:
# Build one flat JSON record per case by combining metadata from the
# structured and unstructured dumps with the flattened section texts, and
# write it to `output_directory` as `<itemid>.json`.

# Make sure the destination exists before the first file is written.
os.makedirs(output_directory, exist_ok=True)

# Metadata fields copied verbatim; the order fixes the key order of the record.
structured_fields = ('itemid', 'importance', 'judgementdate')
unstructured_fields = (
    '__articles', '__conclusion', 'appno', 'article',
    'extractedappno', 'docname', 'parties', 'rank',
)

# Section names seen across all cases (with repeats); inspected in a later cell.
keys = list()

# zip() has no length, so tqdm needs an explicit total to show a progress bar.
for structured_case, unstructured_case in tqdm(zip(st, ust), total=len(st)):
    # Both dumps are paired by position; fail fast if they ever diverge.
    assert structured_case['itemid'] == unstructured_case['itemid'], (
        structured_case['itemid'], unstructured_case['itemid'])

    # Only the first document stored under `content` is used.
    docx_key = next(iter(unstructured_case['content']))
    sections = unstructured_case['content'][docx_key]

    case = {field: structured_case[field] for field in structured_fields}
    case.update((field, unstructured_case[field]) for field in unstructured_fields)

    section_texts = get_content(sections)

    # Every dddd/dd token found in any section text is recorded under 'cite'.
    cited = []
    for text in section_texts.values():
        cited.extend(extract_special_format(text))
    case['cite'] = cited

    keys.extend(section_texts.keys())
    case.update(section_texts)

    save_json(case, os.path.join(output_directory, structured_case['itemid'] + '.json'))

16096it [00:51, 313.81it/s]


In [15]:
# Distinct section names collected across all processed cases.
set(keys)

{'abbreviations',
 'appendix',
 'conclusion',
 'facts',
 'introduction',
 'law',
 'opinion',
 'other',
 'procedure',
 'relevant_law',
 'schedule',
 'submission',
 'toc'}

In [10]:
# Print the itemid of every case whose `importance` is the string '1'.
# `keys` is deliberately left untouched here so that the section names
# collected by the processing cell are not wiped when this cell runs.
for structured_case, unstructured_case in tqdm(zip(st, ust), total=len(st)):
    # Both dumps are paired by position; fail fast if they ever diverge.
    assert structured_case['itemid'] == unstructured_case['itemid']

    if structured_case['importance'] == '1':
        print(structured_case['itemid'])
    

16096it [00:00, 95123.89it/s]

001-104911
001-77745
001-72700
001-60732
001-81356
001-78425
001-126982
001-155353
001-207757
001-60821
001-61828
001-59125
001-96453
001-189641
001-61549
001-67580
001-77694
001-161530
001-61317
001-83256
001-58736
001-90651
001-145389
001-72925
001-87354
001-206515
001-68548
001-60035
001-69629
001-146501
001-100293
001-116441
001-58763
001-58374
001-186828
001-177429
001-58404
001-98669
001-111634
001-59560
001-61058
001-178753
001-61185
001-102940
001-77522
001-67100
001-67538
001-176769
001-219333
001-220960
001-107325
001-72929
001-60974
001-200442
001-201353
001-58914
001-203165
001-75454
001-58900
001-82338
001-170347
001-83951
001-203169
001-58338
001-194523
001-90051
001-168972
001-61196
001-61637
001-59102
001-69564
001-60638
001-60684
001-208877
001-103904
001-209520
001-58908
001-58226
001-140005
001-67930
001-61572
001-58306
001-208326
001-177082
001-58451
001-61188
001-58336
001-148367
001-60891
001-61521
001-99817
001-61886
001-180486
001-77791
001-57974
001-159070
001-


