# From JSON to SQL in a second

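This notebook shows how to read a JSONL file (one JSON document per line), convert it to a compressed, columnar Parquet file, and query it instantly with DuckDB. Note that the whole JSONL file is loaded into memory before conversion, so the input must fit in RAM.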

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
import duckdb
import json
from pathlib import Path

# Input and output locations, relative to the notebook's working directory.
jsonl_file = Path('../data/input/events.jsonl')
parquet_file = Path('../data/output/demo_events.parquet')

# Read Cumulocity events: one JSON document per line, all loaded into memory.
print(f"Reading JSONL data from {jsonl_file}...")
# Pin the encoding: JSON text is UTF-8, and without this the decoding would
# depend on the platform's locale default (e.g. cp1252 on Windows).
with jsonl_file.open('r', encoding='utf-8') as f:
    # Skip blank / whitespace-only lines (e.g. an extra empty line at the end
    # of the file); json.loads would raise JSONDecodeError on them.
    data = [json.loads(line) for line in f if line.strip()]

# Write to Parquet.
print(f"Writing {len(data):,} records to {parquet_file}...")
# Build an Arrow table from the list of parsed records (one dict per event).
table = pa.Table.from_pylist(data)
# The output directory may not exist yet (e.g. on a fresh checkout); create it
# first so the write below does not fail on a missing parent directory.
parquet_file.parent.mkdir(parents=True, exist_ok=True)
pq.write_table(table, parquet_file, compression='SNAPPY')

# Query the Parquet file in place with DuckDB: select every column and display
# the resulting relation.
select_all_sql = f"""
SELECT *
FROM '{parquet_file.as_posix()}'
"""
result = duckdb.sql(select_all_sql)
result.show()

# Aggregate in SQL: count events per `type`, order by that count descending,
# and keep the first ten rows.
top_types_sql = f"""
SELECT type, COUNT(*) AS event_count
FROM '{parquet_file.as_posix()}'
GROUP BY type
ORDER BY event_count DESC
LIMIT 10;
"""
result = duckdb.sql(top_types_sql)
print("Top 10 event types:")
result.show()