In [None]:
pip install requests orjson

This benchmark compares the performance of `json.loads()` and `orjson.loads()` when deserializing a JSON document obtained from the SEC (the XBRL company-facts API). The results show that orjson is significantly faster than the standard `json` library, making it the preferable choice for our application, which requires efficient processing of large volumes of JSON data.

In [None]:
import time
import requests
import json
import orjson

# Benchmark: parse one SEC company-facts JSON payload with the standard-library
# `json` module and with `orjson`, then check that both parsers agree.
url = "https://data.sec.gov/api/xbrl/companyfacts/CIK0001318605.json"
# NOTE(review): the contact address below looks like a redaction placeholder —
# confirm a real contact address is sent before running against the live API.
headers = {
    "User-Agent": "FinDrum Contact <[email protected]>"
}

# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()  # fail loudly rather than benchmark an error body
content_bytes = response.content
# Decode once, outside the timed region: json.loads is timed on the str,
# orjson.loads on the raw bytes.
content_str = content_bytes.decode('utf-8')

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() can jump if the system clock changes.
start = time.perf_counter()
data_json = json.loads(content_str)
elapsed_json = time.perf_counter() - start
print(f"json.loads: {elapsed_json:.4f} seconds")

start = time.perf_counter()
data_orjson = orjson.loads(content_bytes)
elapsed_orjson = time.perf_counter() - start
print(f"orjson.loads: {elapsed_orjson:.4f} seconds")

# Sanity check: both parsers must produce the same Python object graph.
print(f"Are the results equal? {data_json == data_orjson}")

In [1]:
pip install pyarrow fastparquet

Collecting pyarrow
  Using cached pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting fastparquet
  Using cached fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Using cached cramjam-2.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Collecting fsspec (from fastparquet)
  Using cached fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Using cached pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.3 MB)
Using cached fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
Using cached cramjam-2.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
Using cached fsspec-2025.3.2-py3-none-any.whl (194 kB)
Installing collected packages: pyarrow, fsspec, cramjam, fastparquet
Successfully installed cramjam-2.10.0 fastparquet-2024.11.0 fsspec-2025.3.2 pyarrow-20.0.0

[1m[[0m[34;4

In [15]:
import pandas as pd
import numpy as np
import time
import io

# Benchmark: time how long each Parquet engine takes to serialize the same
# synthetic DataFrame into an in-memory buffer.
SEED = 42            # fixed seed so every run benchmarks identical data
N_ROWS = 1_000_000   # number of rows in the synthetic frame
rng = np.random.default_rng(SEED)  # local generator; leaves NumPy's global RNG state alone

# Three columns of different kinds: integers, floats in [0, 1), and a
# four-value string column.
df = pd.DataFrame({
    "col1": rng.integers(0, 1000000, size=N_ROWS),
    "col2": rng.random(N_ROWS),
    "col3": rng.choice(["A", "B", "C", "D"], size=N_ROWS)
})

engines = ["pyarrow", "fastparquet"]

# Maps engine name -> elapsed seconds, or None when that engine failed.
results = {}

print(df.describe(include="all"))

for engine in engines:
    # Fresh buffer per engine so each write starts from an empty target;
    # it is created outside the timed region.
    buffer = io.BytesIO()
    try:
        # perf_counter() is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        df.to_parquet(buffer, engine=engine, index=False)
        elapsed = time.perf_counter() - start
        results[engine] = elapsed
        print(f"{engine:<12} → {elapsed:.4f} seconds")
    except Exception as e:
        # Best effort: a missing or failing engine is reported and recorded as
        # None so the remaining engines are still measured.
        results[engine] = None
        print(f"{engine:<12} → FAILED ({e})")

                  col1          col2     col3
count   1000000.000000  1.000000e+06  1000000
unique             NaN           NaN        4
top                NaN           NaN        D
freq               NaN           NaN   251252
mean     499683.011860  4.998688e-01      NaN
std      289013.014023  2.890450e-01      NaN
min           0.000000  1.905448e-08      NaN
25%      248999.000000  2.493063e-01      NaN
50%      499277.000000  5.000773e-01      NaN
75%      750504.500000  7.507578e-01      NaN
max      999997.000000  9.999978e-01      NaN
pyarrow      → 0.1264 seconds
fastparquet  → 0.0973 seconds
