In [None]:
pip install requests orjson

This benchmark compares the performance of `json.loads()` and `orjson.loads()` when deserializing JSON data obtained from the SEC. The results show that orjson is significantly faster than the standard `json` library, making it the preferable choice for our application, which requires efficient processing of large volumes of JSON data.

In [None]:
import time
import requests
import json
import orjson

# SEC XBRL "companyfacts" endpoint for CIK 0001318605; the payload is the
# JSON document that gets parsed repeatedly by the benchmark.
url = "https://data.sec.gov/api/xbrl/companyfacts/CIK0001318605.json"
# NOTE(review): the address inside the User-Agent looks like a redaction
# placeholder; presumably a real contact e-mail is expected here — confirm
# before running.
headers = {
    "User-Agent": "FinDrum Contact <[email protected]>"
}

# Without a timeout, requests waits forever on a stalled connection and the
# notebook cell never finishes; bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of benchmarking an error page.
response.raise_for_status()
# Keep both representations: raw bytes for orjson.loads, decoded text for
# json.loads, so neither parser pays for a conversion inside the timed loop.
content_bytes = response.content
content_str = content_bytes.decode('utf-8')

# Number of timed parse rounds per library.
n_iterations = 100
# Per-round elapsed seconds for each parser.
times_json = []
times_orjson = []

# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock that can be coarse and can jump, which distorts
# (or zeroes out) per-call timings.
for i in range(n_iterations):
    # The parsed object is discarded right away; only the elapsed time is kept.
    start = time.perf_counter()
    json.loads(content_str)
    times_json.append(time.perf_counter() - start)

    start = time.perf_counter()
    orjson.loads(content_bytes)
    times_orjson.append(time.perf_counter() - start)

# Average over the samples actually collected.
mean_json = sum(times_json) / len(times_json)
mean_orjson = sum(times_orjson) / len(times_orjson)

print(f"\nAverage json.loads:   {mean_json:.6f} seconds")
print(f"Average orjson.loads: {mean_orjson:.6f} seconds")
print(f"Speedup: {mean_json / mean_orjson:.2f}x faster with orjson")

In [None]:
pip install pyarrow fastparquet

In [None]:
import pandas as pd
import numpy as np
import time
import io

# Synthetic benchmark table: one integer, one float and one low-cardinality
# string column. A seeded generator makes every run benchmark the same data,
# and the row count lives in one place instead of being repeated per column.
n_rows = 1_000_000
rng = np.random.default_rng(42)

df = pd.DataFrame({
    "col1": rng.integers(0, 1_000_000, size=n_rows),
    "col2": rng.random(n_rows),
    "col3": rng.choice(["A", "B", "C", "D"], size=n_rows)
})

# Parquet backends to benchmark, in the order they are run.
engines = ["pyarrow", "fastparquet"]
# Timed write/read rounds per backend.
n_iterations = 100
# Timing store keyed by backend name; every backend gets its own independent
# "write" and "read" lists of elapsed seconds.
results = dict(
    (name, {"write": [], "read": []})
    for name in engines
)

# Time Parquet serialization and deserialization of `df` for each engine.
# time.perf_counter() is used because it is the clock meant for measuring
# short durations; time.time() is a wall clock and can be coarse or jump.
for engine in engines:
    print(f"\n--- Benchmark for engine: {engine} ---")

    # Aliases for this engine's sample lists (same objects as in `results`).
    write_times = results[engine]["write"]
    read_times = results[engine]["read"]

    for i in range(n_iterations):
        # A fresh in-memory buffer per round: nothing is written to disk and
        # buffer creation stays outside the timed region.
        buffer = io.BytesIO()

        try:
            start = time.perf_counter()
            df.to_parquet(buffer, engine=engine, index=False)
            write_times.append(time.perf_counter() - start)
        except Exception as e:
            # Report and stop this engine; the other engine still runs.
            print(f"Write FAILED on iteration {i}: {e}")
            break

        try:
            # Rewind before the clock starts so only the read is measured.
            buffer.seek(0)
            start = time.perf_counter()
            df_read = pd.read_parquet(buffer, engine=engine)
            read_times.append(time.perf_counter() - start)
        except Exception as e:
            print(f"Read FAILED on iteration {i}: {e}")
            break

    # Averages are reported only if at least one write and one read finished;
    # the printed run counts show how many samples each mean is based on.
    if write_times and read_times:
        mean_write = sum(write_times) / len(write_times)
        mean_read = sum(read_times) / len(read_times)
        results[engine]["mean_write"] = mean_write
        results[engine]["mean_read"] = mean_read
        print(f"Average write time over {len(write_times)} runs: {mean_write:.6f} sec")
        print(f"Average read  time over {len(read_times)} runs: {mean_read:.6f} sec")
    else:
        print(f"{engine} failed before completing {n_iterations} iterations.")

# Cross-engine comparison: skipped unless every engine recorded its averages.
# Ratios are fastparquet time divided by pyarrow time.
if not any("mean_write" not in results[eng] for eng in engines):
    fastparquet_stats = results["fastparquet"]
    pyarrow_stats = results["pyarrow"]

    write_speedup = fastparquet_stats["mean_write"] / pyarrow_stats["mean_write"]
    read_speedup = fastparquet_stats["mean_read"] / pyarrow_stats["mean_read"]

    print("\nSpeedup (pyarrow vs fastparquet):")
    print(f"Write speedup: {write_speedup:.2f}x faster using pyarrow")
    print(f"Read  speedup: {read_speedup:.2f}x faster using pyarrow")


In [None]:
import pandas as pd
import numpy as np
import io

# Parquet backend used when measuring serialized sizes.
engine = "fastparquet"
# How many random frames to generate and measure.
n_iterations = 100

# Serialized size in bytes per iteration, for the unsorted and sorted frames.
unsorted_sizes, sorted_sizes = [], []

def get_parquet_size(dataframe: pd.DataFrame) -> int:
    """Return the size in bytes of `dataframe` serialized as Parquet.

    The frame is written to an in-memory buffer (no file is created) with the
    module-level `engine` and without its index; the buffer's byte count is
    returned.
    """
    sink = io.BytesIO()
    dataframe.to_parquet(sink, index=False, engine=engine)
    written = sink.getbuffer()
    return written.nbytes

# Seeded generator so repeated runs measure the same sequence of frames.
rng = np.random.default_rng(42)

for i in range(n_iterations):
    # Fresh random frame each round so the averages are not tied to one sample.
    df = pd.DataFrame({
        "col1": rng.integers(0, 10000, size=1_000_000),
        "col2": rng.random(1_000_000),
        "col3": rng.choice(["A", "B", "C", "D"], size=1_000_000)
    })

    # Same rows ordered by col1 then col3, with a fresh 0..n-1 index.
    df_sorted = df.sort_values(by=["col1", "col3"]).reset_index(drop=True)

    try:
        # Measure both variants before recording either one, so a failure on
        # the second write cannot leave the two lists with different lengths.
        size_unsorted = get_parquet_size(df)
        size_sorted = get_parquet_size(df_sorted)
    except Exception as e:
        print(f"Iteration {i} FAILED: {e}")
        break

    unsorted_sizes.append(size_unsorted)
    sorted_sizes.append(size_sorted)

# Bytes -> kilobytes for reporting.
unsorted_sizes_kb = [s / 1024 for s in unsorted_sizes]
sorted_sizes_kb = [s / 1024 for s in sorted_sizes]

if unsorted_sizes_kb:
    mean_unsorted = sum(unsorted_sizes_kb) / len(unsorted_sizes_kb)
    mean_sorted = sum(sorted_sizes_kb) / len(sorted_sizes_kb)
    # Percentage by which the sorted file is smaller than the unsorted one.
    reduction = (1 - mean_sorted / mean_unsorted) * 100

    print(f"\nAverage Unsorted Parquet size: {mean_unsorted:.2f} KB")
    print(f"Average Sorted   Parquet size: {mean_sorted:.2f} KB")
    # Report the number of rounds that actually completed, which is smaller
    # than n_iterations when the loop stopped early.
    print(f"Average Reduction: {reduction:.2f}% over {len(unsorted_sizes_kb)} iterations")
else:
    # Every round failed: there is nothing to average.
    print("No iteration completed; no size statistics to report.")
