In [1]:
!pip install FireDucks



In [2]:
# Install required packages (run once)
!pip install duckdb pyarrow pandas





In [23]:
# Main Benchmark Code

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, stddev
import time
import os
from urllib.request import urlretrieve

def download_data():
    url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
    filename = "nyc_taxi.parquet"

    if not os.path.exists(filename):
        print("Downloading dataset...")
        urlretrieve(url, filename)
        print("Download complete!")
    else:
        print("Dataset already exists")
    return filename

data_path = download_data()

# 1. FireDucks Benchmark

print("\n=== FireDucks Benchmark ===")
fd = FireDucks()  # Using our simulated FireDucks

# Read benchmark
start = time.time()
df_fd = fd.execute(f"""
    SELECT * FROM read_parquet('{data_path}')
""").df()
print(f"FireDucks read time: {time.time() - start:.2f}s")

# Aggregation benchmark
start = time.time()
result = fd.execute("""
    SELECT
        VendorID,
        payment_type,
        AVG(total_amount) as avg_amount,
        STDDEV(trip_distance) as std_dev
    FROM df_fd
    WHERE trip_distance > 0
    GROUP BY VendorID, payment_type
""").df()
print(f"FireDucks aggregation time: {time.time() - start:.2f}s")

print(f"FireDucks memory usage: {df_fd.memory_usage().sum() / 1e6:.2f} MB")
fd.close()

Dataset already exists

=== FireDucks Benchmark ===
FireDucks engine initialized (powered by DuckDB core)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FireDucks read time: 3.17s
FireDucks aggregation time: 0.10s
FireDucks memory usage: 466.15 MB


In [24]:
# Main Benchmark Code

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, stddev
import time
import os
from urllib.request import urlretrieve

def download_data():
    url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
    filename = "nyc_taxi.parquet"

    if not os.path.exists(filename):
        print("Downloading dataset...")
        urlretrieve(url, filename)
        print("Download complete!")
    else:
        print("Dataset already exists")
    return filename

data_path = download_data()

# 1. Spark Benchmark

print("\n=== Spark Benchmark ===")
spark = SparkSession.builder \
    .appName("TaxiBenchmark") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Read benchmark
start = time.time()
df_spark = spark.read.parquet(data_path)
df_spark.count()
print(f"Spark read time: {time.time() - start:.2f}s")

# Aggregation benchmark
start = time.time()
result = df_spark.filter("trip_distance > 0") \
    .groupBy("VendorID", "payment_type") \
    .agg(
        avg("total_amount").alias("avg_amount"),
        stddev("trip_distance").alias("std_dev")
    )
result.count()
print(f"Spark aggregation time: {time.time() - start:.2f}s")

spark.stop()

Dataset already exists

=== Spark Benchmark ===
Spark read time: 1.42s
Spark aggregation time: 2.94s
