# CourtListener Opinions Scraper

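This notebook fetches the most recently filed opinions from the CourtListener REST API (v3), newest first, until `TARGET_COUNT` records (100 by default) are collected, then saves them to `opinions.json` and runs a few sanity checks on the saved file.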

In [1]:
import os
import requests
import json
import time
from dotenv import load_dotenv

# Read settings from the .env file one directory above the working directory.
# NOTE(review): the previous comment described this file as scripts/.env — confirm
# that matches where the notebook is actually run from.
load_dotenv('../.env')

# The CourtListener token is mandatory: stop right away when it is unset or empty.
API_KEY = os.environ.get('COURT_API_KEY')
if API_KEY is None or len(API_KEY) == 0:
    raise ValueError("COURT_API_KEY not found in environment variables")

# Scrape settings consumed by the cells below.
TARGET_COUNT = 100              # number of opinions to collect
OUTPUT_FILE = 'opinions.json'   # where the collected opinions are written
BASE_URL = "https://www.courtlistener.com/api/rest/v3/opinions/"
HEADERS = dict(Authorization=f'Token {API_KEY}')

print("API Key loaded. Ready to scrape.")

API Key loaded. Ready to scrape.


In [2]:
# Collect opinions newest-first, following the API's pagination links until
# TARGET_COUNT records are gathered or the API reports no further page.
REQUEST_TIMEOUT = 30        # seconds; without a timeout a stalled connection blocks forever
DETAIL_DELAY_SECONDS = 0.5  # pause after each per-opinion detail request
PAGE_DELAY_SECONDS = 1      # pause between list pages
# Failures worth reporting and recovering from: network/HTTP errors raised by
# requests, and ValueError from a response body that is not valid JSON.
# Anything else is a programming error and is allowed to surface.
FETCH_ERRORS = (requests.RequestException, ValueError)

opinions = []
next_url = f"{BASE_URL}?order_by=-date_filed"

print(f"Starting scrape from: {next_url}")

# A single Session reuses the underlying connection and carries the auth
# header on every request.
with requests.Session() as session:
    session.headers.update(HEADERS)

    while len(opinions) < TARGET_COUNT and next_url:
        print(f"Fetching: {next_url}")
        try:
            response = session.get(next_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except FETCH_ERRORS as e:
            # Keep whatever was collected so far and stop paginating.
            print(f"Error fetching page: {e}")
            break

        results = data.get('results', [])
        print(f"Found {len(results)} items in this page.")

        for item in results:
            # Check if we have substantial text content
            text_content = (
                item.get('plain_text')
                or item.get('html')
                or item.get('html_lawbox')
                or item.get('html_columbia')
                or item.get('html_anon_2020')
            )

            if not text_content:
                # No text in the list response: fall back to the detail endpoint.
                resource_uri = item.get('resource_uri')
                if resource_uri:
                    print(f"Fetching detail for {item.get('id')}...")
                    try:
                        detail_response = session.get(resource_uri, timeout=REQUEST_TIMEOUT)
                        detail_response.raise_for_status()
                        item = detail_response.json()
                    except FETCH_ERRORS as e:
                        # `item` is still the list entry here, so its id is available.
                        print(f"Failed to fetch detail for {item.get('id')}: {e}")
                        continue
                    # Add a small delay to be polite
                    time.sleep(DETAIL_DELAY_SECONDS)

            opinions.append(item)
            if len(opinions) >= TARGET_COUNT:
                break

        next_url = data.get('next')
        # Polite delay between pages; skipped when no further request will be made.
        if next_url and len(opinions) < TARGET_COUNT:
            time.sleep(PAGE_DELAY_SECONDS)

print(f"Scraping complete. Collected {len(opinions)} opinions.")

Starting scrape from: https://www.courtlistener.com/api/rest/v3/opinions/?order_by=-date_filed
Fetching: https://www.courtlistener.com/api/rest/v3/opinions/?order_by=-date_filed
Found 20 items in this page.
Fetching: https://www.courtlistener.com/api/rest/v3/opinions/?order_by=-date_filed&page=2
Found 20 items in this page.
Fetching: https://www.courtlistener.com/api/rest/v3/opinions/?order_by=-date_filed&page=3
Found 20 items in this page.
Fetching: https://www.courtlistener.com/api/rest/v3/opinions/?order_by=-date_filed&page=4
Found 20 items in this page.
Fetching: https://www.courtlistener.com/api/rest/v3/opinions/?order_by=-date_filed&page=5
Found 20 items in this page.
Scraping complete. Collected 100 opinions.


In [3]:
# Persist the collected opinions to OUTPUT_FILE as JSON with two-space indentation.
with open(OUTPUT_FILE, mode='w') as out_file:
    json.dump(obj=opinions, fp=out_file, indent=2)

print(f"Saved to {OUTPUT_FILE}")

Saved to opinions.json


In [4]:
# Verification
import json
from datetime import datetime

# Reload the saved file so the checks run against what is on disk rather than
# the in-memory list.
with open(OUTPUT_FILE, 'r') as f:
    saved_data = json.load(f)

print(f"Total records: {len(saved_data)}")
# Compare against the configured target so the check stays correct when
# TARGET_COUNT is changed.
assert len(saved_data) >= TARGET_COUNT, f"Failed to collect {TARGET_COUNT} opinions"

# Check for text content, using the same fields the scraping loop accepts as text.
TEXT_FIELDS = ('plain_text', 'html', 'html_lawbox', 'html_columbia', 'html_anon_2020')
text_count = sum(
    1 for item in saved_data
    if any(item.get(field) for field in TEXT_FIELDS)
)
print(f"Records with text content: {text_count}")

# Check dates on the first 10 records (date_created is the fallback when
# date_filed is missing or empty).
dates = []
for item in saved_data[:10]:
    date_val = item.get('date_filed') or item.get('date_created')
    if date_val:
        dates.append(date_val)

print(f"First 10 dates (filed or created): {dates}")

# Simple check for descending order (allowing for some batching/same timestamps).
# The values are compared as raw strings, so this is only meaningful when they
# all share one format and UTC offset — NOTE(review): confirm for mixed
# date_filed/date_created values.
if dates == sorted(dates, reverse=True):
    print("Dates appear to be in descending order.")
else:
    print("Dates are recent but might not be strictly sorted by creation time (which is expected for batched updates).")

Total records: 100
Records with text content: 100
First 10 dates (filed or created): ['2025-12-13T11:56:49.962743-08:00', '2025-12-13T11:56:18.037148-08:00', '2025-12-13T11:55:58.804303-08:00', '2025-12-13T11:55:51.108410-08:00', '2025-12-13T11:55:39.097370-08:00', '2025-12-13T11:55:36.832155-08:00', '2025-12-13T11:55:34.409135-08:00', '2025-12-13T11:55:32.408994-08:00', '2025-12-13T11:55:26.188733-08:00', '2025-12-13T11:55:22.538209-08:00']
Dates appear to be in descending order.
