# 1. load dataset

In [1]:
import os
import pandas as pd
from datasets import load_dataset


print("start loading dataset...")

# Accessing this dataset requires a Hugging Face login (e.g. `huggingface-cli login`).
ds = load_dataset("isaacus/open-australian-legal-corpus")

available_splits = list(ds.keys())
print("✓ Dataset loaded successfully!")
print(f"Available splits: {available_splits}")

# Work with whichever split is listed first (corpus, train, etc.).
split_name = available_splits[0]
dataset = ds[split_name]
print(f"Using split: '{split_name}', Total rows: {len(dataset)}")

# Persist the split under data/raw as JSON, creating the directory if needed.
output_path = "../data/raw/1raw_corpus.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
dataset.to_json(output_path, force_ascii=False)

print("✓ Data saved successfully!")

  from .autonotebook import tqdm as notebook_tqdm


start loading dataset...




✓ Dataset loaded successfully!
Available splits: ['corpus']
Using split: 'corpus', Total rows: 232560


Creating json from Arrow format: 100%|██████████| 233/233 [01:29<00:00,  2.60ba/s]

✓ Data saved successfully!





Select only rows where type = primary_legislation and jurisdiction = new_south_wales

In [9]:
import pandas as pd

print("Reading in chunks...")
chunks = []
# NOTE(review): read_json presumably parses the 'date' column into datetimes by
# default and to_json then writes them back as epoch values — confirm that this
# is the intended on-disk format for downstream steps.
for chunk in pd.read_json("../data/raw/1raw_corpus.jsonl", lines=True, chunksize=10000):
    # Keep only NSW primary legislation rows from this chunk.
    filtered_chunk = chunk[(chunk['type'] == 'primary_legislation') & (chunk['jurisdiction'] == 'new_south_wales')]
    chunks.append(filtered_chunk)

# Many chunks contain no matching rows. Concatenating empty frames is deprecated
# in recent pandas (it raises a FutureWarning and will change dtype inference),
# so drop them first. If nothing matched at all, fall back to one empty slice so
# the result still carries the corpus columns instead of failing.
non_empty_chunks = [part for part in chunks if not part.empty]
filtered_dataset = pd.concat(non_empty_chunks or chunks[:1], ignore_index=True)

# Newest documents first.
filtered_dataset = filtered_dataset.sort_values('date', ascending=False)

print(f"Filtered rows: {len(filtered_dataset)}")

# Save as JSON Lines (one record per line).
filtered_dataset.to_json("../data/raw/2primary_legislation_new_south_wales.jsonl", 
                         orient='records', lines=True, force_ascii=False)
print("✓ Done!")

Reading in chunks...


  filtered_dataset = pd.concat(chunks, ignore_index=True)


Filtered rows: 1418
✓ Done!


Select 100 records at random

In [11]:
# Select 100 records at random from the filtered NSW primary legislation file.
SAMPLE_SIZE = 100

print("Reading in chunks...")
chunks = list(
    pd.read_json("../data/raw/2primary_legislation_new_south_wales.jsonl", lines=True, chunksize=140)
)

# Sample once over the whole population rather than 10 rows per chunk.
# Per-chunk sampling returned 10 rows for every chunk (including a short final
# chunk, so the total depended on the row count rather than on SAMPLE_SIZE),
# raised ValueError whenever the final chunk held fewer than 10 rows, and reused
# the same seed for every chunk, so the same positions were drawn each time.
population = pd.concat(chunks, ignore_index=True)
random_sample = (
    population
    .sample(n=min(SAMPLE_SIZE, len(population)), random_state=42)  # fixed seed for reproducibility
    .reset_index(drop=True)
)

random_sample.to_json("../data/raw/4primary_legislation_new_south_wales_random_sample2.json", 
                         orient='records', lines=False, force_ascii=False)
print("✓ Random sample saved!")

Reading in chunks...
✓ Random sample saved!


only citation and url

In [13]:
import pandas as pd

print("Reading in chunks...")
chunks = []
for chunk in pd.read_json("../data/raw/1raw_corpus.jsonl", lines=True, chunksize=10000):
    # Rows: NSW primary legislation. Columns: only citation and url.
    is_nsw_primary = (chunk['type'] == 'primary_legislation') & (chunk['jurisdiction'] == 'new_south_wales')
    chunks.append(chunk.loc[is_nsw_primary, ['citation', 'url']])

# Stitch the per-chunk selections into one frame with a fresh index.
filtered_dataset = pd.concat(chunks, ignore_index=True)

print(f"Filtered rows: {len(filtered_dataset)}")

# Write a single JSON array of {citation, url} records.
filtered_dataset.to_json(
    "../data/raw/2primary_legislation_new_south_wales_filtered.json",
    orient='records',
    lines=False,
    force_ascii=False,
)
print("✓ Done!")

Reading in chunks...
Filtered rows: 1418
✓ Done!


In [19]:
import pandas as pd

print("Reading in chunks...")

# Exact citations of the Acts to extract from the raw corpus.
target_citations = [
    'Children (Protection and Parental Responsibility) Act 1997 (NSW)',
    'Children (Education and Care Services National Law Application) Act 2010 (NSW)',
    'Children (Education and Care Services) Supplementary Provisions Act 2011 (NSW)',
]

chunks = []
for chunk in pd.read_json("../data/raw/1raw_corpus.jsonl", lines=True, chunksize=10000):
    # Element-wise membership test; equivalent to OR-ing one `==` comparison per
    # citation with `|` (Python's `or` cannot be used on a Series).
    filtered_chunk = chunk[chunk['citation'].isin(target_citations)]
    chunks.append(filtered_chunk)

# Almost every chunk is empty after filtering. Concatenating empty frames is
# deprecated in recent pandas (FutureWarning), so drop them first; keep one
# empty slice as a fallback so the columns survive when nothing matches.
non_empty_chunks = [part for part in chunks if not part.empty]
filtered_dataset = pd.concat(non_empty_chunks or chunks[:1], ignore_index=True)

print(f"Filtered rows: {len(filtered_dataset)}")

# Save as a pretty-printed JSON array.
filtered_dataset.to_json("../data/raw/primary_legislation_new_south_wales.json", 
                         orient='records', lines=False, force_ascii=False, indent=2)
print("✓ Done!")

Reading in chunks...
Filtered rows: 3
✓ Done!
Filtered rows: 3
✓ Done!


  filtered_dataset = pd.concat(chunks, ignore_index=True)


data preprocessing: data cleaning

In [25]:
import json
import re
from pathlib import Path
from typing import Any, Dict, List, Union

# Heading patterns, compiled once. The keyword is matched case-insensitively;
# the optional letter suffix (up to three capitals, e.g. "2A", "29CA") is
# accepted for Parts and Divisions as well as for sections, so that headings
# such as "Part 2A" or "Division 1A" are recognised.
_PART_PAT = re.compile(r"^(?i:Part)\s+\d+[A-Z]{0,3}\b")
_DIVISION_PAT = re.compile(r"^(?i:Division)\s+\d+[A-Z]{0,3}\b")
# Sections like "1 Short title", "9A Application ...", "29CA APRA may request ..."
_SECTION_PAT = re.compile(r"^(?:\d+[A-Z]{0,3})\b")

# Checked in order; the first pattern that matches decides the tag.
_HEADING_TAGS = (
    ("[PART]", _PART_PAT),
    ("[DIVISION]", _DIVISION_PAT),
    ("[SECTION]", _SECTION_PAT),
)


def normalize_legislation_text(raw_text: str) -> str:
    """
    Flatten legislation text into a single line with structural tags.

    Rules (applied per non-blank line, after stripping surrounding whitespace):
    1) A line beginning with "Part <num>" (optionally lettered, e.g. "Part 2A")
       is prefixed with "[PART] ".
    2) A line beginning with "Division <num>" (optionally lettered) is prefixed
       with "[DIVISION] ".
    3) A line beginning with a number, optionally followed by up to three
       capital letters (e.g. 9A, 29CA), is prefixed with "[SECTION] ".
    4) All newlines are removed and runs of whitespace are collapsed to a
       single space in the final output.

    Indentation is not treated specially: indented lines are stripped and joined
    like any other line (no ':' separator is inserted before them).

    Args:
        raw_text: The raw legislation text. Falsy values (empty string, None)
            are accepted.

    Returns:
        The normalized single-line text, or "" when raw_text is falsy.
    """
    if not raw_text:
        return ""

    # Normalize line endings so Windows / old-Mac newlines split the same way.
    text = raw_text.replace("\r\n", "\n").replace("\r", "\n")

    # Strip every line and drop the blank ones.
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]

    out_lines: List[str] = []
    for ln in lines:
        for tag, pattern in _HEADING_TAGS:
            if pattern.match(ln):
                out_lines.append(f"{tag} {ln}")
                break
        else:
            # Not a recognised heading: keep the line as plain body text.
            out_lines.append(ln)

    # Join and collapse any remaining whitespace (tabs, double spaces, ...).
    normalized = " ".join(out_lines)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    return normalized


def process_json_file(
    input_path: Union[str, Path],
    output_path: Union[str, Path],
    text_key: str = "text",
    output_key: str = "text_normalized",
) -> None:
    """
    Add a normalized-text field to every record of a JSON file.

    Accepted root shapes:
      - a list of record dicts;
      - a dict holding such a list under "data" or, failing that, "records";
      - a single record dict that itself contains text_key.

    Each record gets record[output_key] set to the normalized form of
    record[text_key] (a missing key is treated as an empty string). The whole
    structure is then written to output_path as indented UTF-8 JSON, creating
    parent directories as needed.

    Raises:
        ValueError: if the root is neither a list nor a dict, or is a dict
            matching none of the accepted shapes.
    """
    src = Path(input_path)
    dst = Path(output_path)

    payload: Any = json.loads(src.read_text(encoding="utf-8"))

    # Work out which records need annotating.
    records: List[Dict[str, Any]]
    if isinstance(payload, list):
        records = payload
    elif isinstance(payload, dict):
        # Prefer a wrapped list ("data" first, then "records"); otherwise the
        # dict itself is the one and only record, provided it has the text key.
        wrapped = next(
            (payload[key] for key in ("data", "records") if isinstance(payload.get(key), list)),
            None,
        )
        if wrapped is not None:
            records = wrapped
        elif text_key in payload:
            records = [payload]
        else:
            raise ValueError(
                f"Unrecognized JSON structure. Expected list or dict with key '{text_key}' "
                f"or list under 'data'/'records'. Got keys: {list(payload.keys())[:30]}"
            )
    else:
        raise ValueError(f"Unsupported JSON root type: {type(payload)}")

    # Records are mutated in place, so `payload` picks up the new field.
    for record in records:
        record[output_key] = normalize_legislation_text(record.get(text_key, ""))

    dst.parent.mkdir(parents=True, exist_ok=True)
    with dst.open("w", encoding="utf-8") as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=2)


# Example usage:
# process_json_file(
#     input_path="/Users/yaz/Desktop/free_topic/Free_topic_true/data/raw/4primary_legislation_random_sample.json",
#     output_path="/Users/yaz/Desktop/free_topic/Free_topic_true/data/raw/4primary_legislation_random_sample.normalized.json",
#     text_key="text",
#     output_key="text_normalized",
# )
# Normalize each Act in turn; the raw input and processed output paths are both
# keyed by the Act's id.
for act_id in ("act-1997-078", "act-2010-104", "act-2011-070"):
    process_json_file(
        input_path=f"../data/raw/{act_id}/primary_legislation_new_south_wales.json",
        output_path=f"../data/processed/{act_id}/{act_id}_normalized.json",
        text_key="text",
        output_key="text_normalized",
    )

read csv and transform into excel

In [10]:
# read csv and transform into excel
import pandas as pd
def _csv_to_xlsx(csv_path):
    """Read csv_path, write it next to itself as .xlsx (no index column), and return the frame."""
    frame = pd.read_csv(csv_path)
    frame.to_excel(csv_path[:-len('.csv')] + '.xlsx', index=False)
    return frame


# Convert the graph exports one after the other: nodes first, then relationships.
node = _csv_to_xlsx('../data/neo4j_data/Document_Structure_Graph3/nodes.csv')
rel = _csv_to_xlsx('../data/neo4j_data/Document_Structure_Graph3/relationships.csv')