# Opinion Chunking

This notebook loads the scraped opinions, cleans the text, and splits it into chunks while preserving metadata.

In [1]:
import json
import re
import os
from typing import List, Dict, Any

# Paths of the scraped input and of the chunked output written later in the notebook.
INPUT_FILE = 'opinions.json'
OUTPUT_FILE = 'opinion_chunks.json'

print(f"Loading data from {INPUT_FILE}...")
# Pin the encoding: without it open() falls back to the platform's locale encoding
# (e.g. cp1252 on Windows), so non-ASCII characters in the opinions could be
# mis-decoded or raise UnicodeDecodeError depending on where the notebook runs.
# NOTE(review): assumes the scraper wrote UTF-8 (or ASCII-escaped) JSON -- confirm.
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    opinions = json.load(f)
print(f"Loaded {len(opinions)} opinions.")

Loading data from opinions.json...
Loaded 100 opinions.


In [2]:
# Opening/closing tags of block-level HTML elements. Removing one of these must leave a
# word break behind, because the rendered page shows the surrounding text on separate lines.
_BLOCK_TAG_RE = re.compile(
    r'</?(?:address|article|aside|blockquote|br|center|dd|div|dl|dt|footer|h[1-6]|header|hr|li|'
    r'ol|p|pre|section|table|tbody|td|tfoot|th|thead|tr|ul)\b',
    re.IGNORECASE,
)


def _tag_replacement(match) -> str:
    """Return ' ' for a block-level tag and '' for any other (inline) tag."""
    return ' ' if _BLOCK_TAG_RE.match(match.group(0)) else ''


def clean_text(text: str) -> str:
    """
    Strip markup and layout noise from raw opinion text.

    Steps, in order:
      1. Remove every ``<...>`` tag. Block-level tags (``<p>``, ``<br>``, ``<div>``, ...)
         are replaced by a space; all other tags are removed without a trace.
      2. Delete lines that consist solely of digits (bare page / line numbers).
      3. Collapse each run of whitespace to a single space and trim both ends.

    HTML entities (``&amp;``, ``&nbsp;``, ...) are left untouched.

    Args:
        text: Raw opinion text, plain or HTML. ``None`` and "" are accepted.

    Returns:
        The cleaned, single-line text; "" when ``text`` is falsy.
    """
    if not text:
        return ""
    # Block tags become a space rather than "": otherwise "<p>One.</p><p>Two.</p>"
    # collapses to "One.Two.", gluing words together and hiding the sentence boundary
    # from a splitter that looks for whitespace after punctuation. Inline tags
    # (<i>, <b>, <span>, ...) still vanish so "<i>Smith</i>," stays "Smith,".
    text = re.sub(r'<[^>]+>', _tag_replacement, text)
    # Drop lines holding nothing but a number. Numbers that share a line with
    # other text (including "Page 3"-style headers) are NOT removed by this pattern.
    text = re.sub(r'(?m)^\s*\d+\s*$', '', text)
    # Normalize whitespace (this also flattens newlines into spaces)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def _hard_split(sentence: str, max_chars: int, overlap: int) -> List[str]:
    """Cut one over-long sentence into windows of at most ``max_chars`` sharing ``overlap`` chars."""
    step = max_chars - overlap
    if not 0 < step <= max_chars:
        # step == 0 would make range() fail with an unrelated message, step < 0 would
        # yield no windows at all (the sentence would vanish), and a negative overlap
        # would leave gaps between windows. Fail loudly instead.
        raise ValueError(
            "overlap must satisfy 0 <= overlap < max_chars to hard-split a long sentence "
            f"(got overlap={overlap}, max_chars={max_chars})"
        )
    # A window starting at `start` ends at start + max_chars. Once start + overlap
    # reaches the end of the sentence, the previous window already covered everything,
    # so another window would contain nothing but repeated text.
    last_start_bound = max(len(sentence) - overlap, 1)
    return [sentence[start:start + max_chars] for start in range(0, last_start_bound, step)]


def recursive_split(text: str, max_chars: int = 1500, overlap: int = 100) -> List[str]:
    """
    Split text into chunks of at most ``max_chars`` characters along sentence boundaries.

    Sentences are packed greedily, joined by single spaces, and consecutive chunks
    built this way do not overlap. Only a single sentence longer than ``max_chars``
    is cut mid-sentence, into fixed-size windows that overlap by ``overlap`` characters.

    Sentence boundaries are detected as '.', '!' or '?' followed by whitespace. This does
    not recognise abbreviations ("v.", "U.S.", "No."), so a chunk may end right after one.

    Args:
        text: Text to split; ``None`` or "" yields an empty list.
        max_chars: Maximum length of a chunk, in characters.
        overlap: Characters shared by consecutive windows of a hard-split sentence.
            Ignored unless some sentence exceeds ``max_chars``.

    Returns:
        The chunks in document order, each stripped of surrounding whitespace.

    Raises:
        ValueError: If ``max_chars`` < 1, or if a sentence has to be hard-split and
            ``overlap`` is not in the range ``0 <= overlap < max_chars``.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be at least 1 (got {max_chars})")
    if not text:
        return []

    # Splits on period/question/exclamation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks: List[str] = []
    pending: List[str] = []  # sentences collected for the chunk under construction
    pending_len = 0          # len(" ".join(pending)), maintained incrementally

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Length of the chunk if this sentence were appended; only separators between
        # sentences count, so a chunk of exactly max_chars characters is accepted.
        joined_len = pending_len + len(sentence) + (1 if pending else 0)
        if joined_len <= max_chars:
            pending.append(sentence)
            pending_len = joined_len
            continue

        # The sentence does not fit: close the chunk built so far.
        if pending:
            chunks.append(" ".join(pending))
            pending = []
            pending_len = 0

        if len(sentence) > max_chars:
            # The sentence alone exceeds the limit, so it is cut by character count.
            chunks.extend(_hard_split(sentence, max_chars, overlap))
        else:
            # Start the next chunk with this sentence (no overlap with the previous chunk).
            pending = [sentence]
            pending_len = len(sentence)

    if pending:
        chunks.append(" ".join(pending))

    return chunks

# Keys that carry an opinion's full text. The order is the fallback priority used to pick
# the text to chunk; all of these keys are removed from the metadata copied onto chunks.
_TEXT_FIELDS = ('plain_text', 'html', 'html_lawbox', 'html_columbia', 'html_anon_2020', 'html_with_citations')


def create_chunks(opinions: List[Dict[str, Any]], max_chars: int = 1500, overlap: int = 100) -> List[Dict[str, Any]]:
    """
    Turn opinion records into chunk records that keep the opinion's metadata.

    For every opinion the first non-empty full-text field (in ``_TEXT_FIELDS`` order) is
    cleaned with ``clean_text`` and split with ``recursive_split``. Each resulting chunk
    is a dict holding all of the opinion's other keys plus:

      - ``text``: the chunk text
      - ``chunk_index``: 0-based position of the chunk within its opinion
      - ``total_chunks``: number of chunks produced for that opinion

    Opinions without any non-empty text field produce no chunks. Metadata values are
    copied by reference (shallow), so nested lists are shared between an opinion's chunks.

    Args:
        opinions: Opinion records as loaded from the scraped JSON.
        max_chars: Chunk size limit passed to ``recursive_split``. The default of 1500
            characters is roughly 250-300 words (original author's estimate).
        overlap: Overlap passed to ``recursive_split``.

    Returns:
        A flat list of chunk dicts, in opinion order and then chunk order.
    """
    all_chunks = []

    for op in opinions:
        # Prefer plain text, then fall back through every known full-text field so an
        # opinion whose text only exists in one of the later fields is not dropped.
        raw_text = next((op[field] for field in _TEXT_FIELDS if op.get(field)), "")
        if not raw_text:
            continue

        text_chunks = recursive_split(clean_text(raw_text), max_chars=max_chars, overlap=overlap)
        total_chunks = len(text_chunks)

        # Build the metadata once per opinion, without the full-text fields
        # (keeps chunk records small and avoids confusion with 'text').
        metadata = {key: value for key, value in op.items() if key not in _TEXT_FIELDS}

        for i, chunk_text in enumerate(text_chunks):
            all_chunks.append({
                **metadata,
                'text': chunk_text,
                'chunk_index': i,
                'total_chunks': total_chunks,
            })

    return all_chunks

In [3]:
# Chunk every loaded opinion and report how many chunks came out.
chunks = create_chunks(opinions)
print(f"Created {len(chunks)} chunks from {len(opinions)} opinions.")

# Serialize first, then write the pretty-printed JSON (2-space indent) in one go.
serialized_chunks = json.dumps(chunks, indent=2)
with open(OUTPUT_FILE, 'w') as out_file:
    out_file.write(serialized_chunks)

print(f"Saved to {OUTPUT_FILE}")

Created 670 chunks from 100 opinions.
Saved to opinion_chunks.json


In [4]:
# Verification
# Fail with a readable message instead of an IndexError when nothing was chunked.
assert chunks, "No chunks were created; nothing to verify"
sample_chunk = chunks[0]
print(f"Sample chunk:\n{json.dumps(sample_chunk, indent=2)}")
assert 'text' in sample_chunk
assert 'date_created' in sample_chunk # Verify metadata preservation

# Check text cleaning quality: preview the first chunk, then scan every chunk for tags
sample_text = sample_chunk['text']
print("\nText snippet:", sample_text[:200])
# Look for the same tag shape that clean_text strips. A bare '<' or '>' (e.g. in
# "a < b") is legitimate text and is not treated as leftover markup.
leftover_tag = re.compile(r'<[^>]+>')
chunks_with_tags = [pos for pos, chunk in enumerate(chunks) if leftover_tag.search(chunk['text'])]
assert not chunks_with_tags, f"HTML tags might remain (positions in chunks: {chunks_with_tags[:10]})"
# Verify sentence ending if possible (rough heuristic)
if len(sample_text) > 50 and sample_text[-1] not in ['.', '!', '?', '"', "'"]:
    print("Warning: Chunk might not end with a sentence punctuation (could be end of file or mid-sentence split).")
else:
    print("Chunk ends with punctuation (likely sentence boundary preserved).")

Sample chunk:
{
  "resource_uri": "https://www.courtlistener.com/api/rest/v3/opinions/11224238/",
  "id": 11224238,
  "absolute_url": "/opinion/10757653/prescila-lovell-for-herself-as-a-private-attorney-general-and-on-behalf/",
  "cluster_id": 10757653,
  "cluster": "https://www.courtlistener.com/api/rest/v3/clusters/10757653/",
  "author_id": null,
  "author": null,
  "joined_by": [],
  "date_created": "2025-12-13T11:56:49.962743-08:00",
  "date_modified": "2025-12-13T11:56:50.187131-08:00",
  "author_str": "",
  "per_curiam": false,
  "joined_by_str": "",
  "type": "100trialcourt",
  "sha1": "04ffbb08b99c24bd9f9805ab4a9676934db5fee8",
  "page_count": 3,
  "download_url": null,
  "local_path": "recap/gov.uscourts.caed.475980/gov.uscourts.caed.475980.16.0.pdf",
  "xml_harvard": "",
  "extracted_by_ocr": true,
  "ordering_key": null,
  "main_version": null,
  "opinions_cited": [
    "https://www.courtlistener.com/api/rest/v3/opinions/590987/"
  ],
  "text": "1 Stephanie Sheridan (CA 135