In [None]:
import orjson
import json
import uuid
import random

In [2]:

def generate_json_file(filename, target_size_mb):
    """
    Generate a JSON file of approximately ``target_size_mb`` megabytes.

    The file contains one JSON array of synthetic "user" records. Each record
    is serialized and written on its own, so memory usage stays flat no matter
    how large the requested file is.

    Args:
        filename: Path of the file to create. An existing file is overwritten.
        target_size_mb: Desired size in units of 1024 * 1024 bytes. If the
            target is not larger than the opening ``[`` line, an empty array
            is written.

    Notes:
        The on-disk size is re-measured only every 100 records. The final
        record count is therefore always a multiple of 100 and the file can
        overshoot the target by up to one batch of records.
    """
    # Imported at function scope so the definition does not rely on some other
    # notebook cell having put ``os`` into the kernel namespace; without this
    # a fresh kernel fails with NameError at the first size check.
    import os

    target_bytes = target_size_mb * 1024 * 1024

    # Loop-invariant pools and filler text, built once instead of per record.
    roles = ['admin', 'user', 'editor', 'guest', 'support']
    first_names = ['John', 'Jane', 'Alex', 'Max', 'Sarah']
    last_names = ['Smith', 'Doe', 'Johnson', 'Brown', 'Lee']
    bio = "This is a repeating string used to fill up space and simulate a longer text field in a database. " * 3

    # How many records are written between two on-disk size measurements.
    size_check_interval = 100

    print(f"Generating {filename} ({target_size_mb} MB)...")

    with open(filename, 'w', encoding='utf-8') as f:
        f.write('[\n')  # Start JSON array

        current_size = f.tell()
        record_count = 0

        while current_size < target_bytes:
            # Generate a "CRUD" style database record
            record = {
                "id": record_count,
                "uuid": str(uuid.uuid4()),
                "username": f"user_name_{record_count}",
                "email": f"contact_{record_count}@example-domain.com",
                "profile": {
                    "first_name": random.choice(first_names),
                    "last_name": random.choice(last_names),
                    "bio": bio,
                },
                "role": random.choice(roles),
                "is_active": random.choice([True, False]),
                "permissions": ["read", "write", "delete"] if record_count % 10 == 0 else ["read"],
                "created_at": "2023-10-27T10:00:00Z",
                "updated_at": "2024-01-15T14:30:22Z"
            }

            # Every record except the first is preceded by a separator, which
            # keeps the array valid JSON without a trailing comma.
            if record_count:
                f.write(',\n')
            f.write(json.dumps(record, indent=2))

            record_count += 1

            # Measuring the real file size requires flushing the write buffer,
            # so it is only done once per batch rather than once per record.
            if record_count % size_check_interval == 0:
                f.flush()
                current_size = os.path.getsize(filename)

        f.write('\n]')  # End JSON array

    final_size = os.path.getsize(filename) / (1024 * 1024)
    print(f"Finished! Actual size: {final_size:.2f} MB | Total Records: {record_count}\n")

In [3]:
# Write the 5000 MB benchmark file into the current working directory.
generate_json_file(filename='large_data.json', target_size_mb=5000)

Generating large_data.json (5000 MB)...
Finished! Actual size: 5000.05 MB | Total Records: 7405800

