In [18]:
import pymupdf
import re
import json

In [20]:
import pymupdf
import os

def extract_page_range(pdf_path):
    doc = pymupdf.open(pdf_path)
    start_page = 4
    start_flag = False
    end_page = None
    
    for idx, page in enumerate(doc):
        text = page.get_text()
        if not start_flag and 'Cases by Article' in text:
            start_page = idx + 1
            start_flag = True
        if ('Cases by respondent State' in text) or ('Cases by applicant' in text) or ('Cases by State' in text):
            end_page = idx + 1
            break
    
    return start_page, end_page-1

def process_pdf(pdf_path):
        
    # Get the page range
    start_page, end_page = extract_page_range(pdf_path)
    if not start_page or not end_page:
        print("Could not find start or end page markers")
        return
    
    print(f"Processing pages {start_page} to {end_page}")
    
    # Open the PDF
    doc = pymupdf.open(pdf_path)
    all_cases = []
    all_links = []
    all_links_2 = []
    
    # Process each page
    for page_num in range(start_page - 1, end_page):
        page = doc[page_num]
                
        # Save text content
        text = page.get_text()#.encode("utf8")
        text = re.sub(r'Cases by Article\s*\n\s*\n', '', text) 
        text = re.sub(r'\d+\s*\nKey cases 2024[^\n]*\n', '', text)
        text = re.sub(r'\| Legal summary', '| Legal summary\n***', text)
        text = re.sub(r'\*\*\*\s*$', '***', text)
        cases = [case.strip() for case in text.split('***') if case.strip()]

        if len(cases)*2 != len(page.get_links()): # skip this page
            print(f'The page has been skipped: {page_num + 1}, {len(cases)} cases and {len(page.get_links())} links')
            continue
        
        assert len(cases)*2 == len(page.get_links()), f'Number of cases and links do not match on page {page_num + 1}, {len(cases)} cases and {len(page.get_links())} links'
        
        # Save links
        linkes = []
        linkes_2 = []
        for link in page.get_links():
            if "uri" in link:
                if '001-' in link['uri']:
                    linkes.append(link['uri'])
                else:
                    linkes_2.append(link['uri'])

        cases = [case.strip() for case in text.split('***') if case.strip()]
        links = linkes
        print(len(cases), len(links), len(linkes_2))
        all_cases.extend(cases)
        all_links.extend(links)
        all_links_2.extend(linkes_2)


        all_cases_linked = list(zip(all_cases, all_links, all_links_2))
        
        print(f"Processed page {page_num + 1}")
    return all_cases_linked


def create_case_objects(case_link_tuples, file_name):
    cases_list = []
    for case_text, link, link_2 in case_link_tuples:
        case_obj = {
            "case_text": case_text.strip(),
            "link": link.strip(),
            "link_2": link_2.strip()
        }
        cases_list.append(case_obj)
    
    # Write to JSON file with proper formatting
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump({"cases": cases_list}, f, indent=2, ensure_ascii=False)

In [10]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2024_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2024_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 11
3 3 3
Processed page 4
4 4 4
Processed page 5
5 5 5
Processed page 6
6 6 6
Processed page 7
4 4 4
Processed page 8
3 3 3
Processed page 9
3 3 3
Processed page 10
1 1 1
Processed page 11


In [11]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2023_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2023_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 15
4 4 4
Processed page 4
4 2 6
Processed page 5
3 3 3
Processed page 6
6 6 6
Processed page 7
6 5 7
Processed page 8
5 4 6
Processed page 9
6 6 6
Processed page 10
4 4 4
Processed page 11
4 4 4
Processed page 12
5 5 5
Processed page 13
4 3 5
Processed page 14
2 1 3
Processed page 15


In [21]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2022_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2022_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 14
4 4 4
Processed page 4
5 5 5
Processed page 5
6 6 6
Processed page 6
5 5 5
Processed page 7
The page has been skipped: 8, 5 cases and 8 links
6 6 6
Processed page 9
6 6 6
Processed page 10
5 5 5
Processed page 11
5 4 6
Processed page 12
4 2 6
Processed page 13
2 0 4
Processed page 14


In [22]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2021_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2021_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 13
The page has been skipped: 4, 5 cases and 12 links
The page has been skipped: 5, 6 cases and 14 links
7 7 7
Processed page 6
The page has been skipped: 7, 2 cases and 12 links
The page has been skipped: 8, 7 cases and 18 links
The page has been skipped: 9, 3 cases and 14 links
The page has been skipped: 10, 4 cases and 14 links
The page has been skipped: 11, 5 cases and 12 links
The page has been skipped: 12, 4 cases and 10 links
The page has been skipped: 13, 1 cases and 4 links


In [14]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2020_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2020_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 12
5 5 5
Processed page 4
6 6 6
Processed page 5
6 6 6
Processed page 6
7 6 8
Processed page 7
7 7 7
Processed page 8
6 6 6
Processed page 9
6 6 6
Processed page 10
6 6 6
Processed page 11
2 1 3
Processed page 12


In [23]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2019_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2019_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 8
6 6 6
Processed page 4
The page has been skipped: 5, 6 cases and 10 links
The page has been skipped: 6, 8 cases and 14 links
The page has been skipped: 7, 6 cases and 14 links
3 3 3
Processed page 8


In [16]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2018_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2018_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 10
5 5 5
Processed page 4
7 7 7
Processed page 5
6 6 6
Processed page 6
7 7 7
Processed page 7
7 7 7
Processed page 8
6 6 6
Processed page 9
3 3 3
Processed page 10


In [17]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2017_ENG.pdf"
output_dir = "./key_cases_meta_data/kc_2017_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 10
5 5 5
Processed page 4
6 6 6
Processed page 5
7 7 7
Processed page 6
7 7 7
Processed page 7
7 7 7
Processed page 8


AssertionError: Number of cases and links do not match on page 9, 6 cases and 10 links

In [84]:
# Usage example
pdf_path = "./key_cases_pdf/Cases_list_2016_ENG.pdf"
output_dir = "kc_2016_parsed.json"

all_cases_linked = process_pdf(pdf_path)
create_case_objects(all_cases_linked, output_dir)

Processing pages 4 to 11
5 10
Processed page 4
7 14
Processed page 5


AssertionError: Number of cases and links do not match on page 6, 6 cases and 10 links