# FireDucks vs Pandas: Benchmarking for Large-Scale Data Processing

In [None]:
import fireducks.pandas as fd
import pandas as pd
import numpy as np
import time

In [None]:
!pip install fireducks



Collecting fireducks
  Downloading fireducks-1.2.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting firefw==1.2.6 (from fireducks)
  Downloading firefw-1.2.6-py3-none-any.whl.metadata (818 bytes)
Collecting pyarrow<19.1,>=19.0 (from fireducks)
  Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading fireducks-1.2.6-cp311-cp311-manylinux_2_28_x86_64.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading firefw-1.2.6-py3-none-any.whl (12 kB)
Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl (42.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, firefw, fireducks
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pya

## Load Data using FireDucks

In [7]:
# Benchmark mode makes FireDucks execute each operation eagerly, so timings
# taken around a call measure the real work rather than lazy plan building.
FIREDUCKS_FLAGS = "--benchmark-mode"

# Assigning a Python variable alone does not configure FireDucks: the
# FIREDUCKS_FLAGS setting is an *environment variable*. Inside an already
# running kernel, switch benchmark mode on through the options API instead.
from fireducks.core import get_fireducks_options

get_fireducks_options().set_benchmark_mode(True)

## Create Mock Data, Load with FireDucks, and Convert to Pandas for Unsupported Operations

In [10]:
import pandas as pd
import numpy as np

# Create mock data: a synthetic invoice table written to CSV so the load
# step below has a file to read.
N_ROWS = 10_000                      # number of synthetic invoice lines
CSV_PATH = "/content/fake_data.csv"  # location the load cell reads from

# Seed the legacy global NumPy RNG so every run generates the same table.
# The column order below is also the order in which random numbers are drawn.
np.random.seed(42)
data = {
    "InvoiceNo": np.random.randint(10000, 99999, N_ROWS),
    "StockCode": np.random.choice(["A100", "B200", "C300"], N_ROWS),
    "Description": np.random.choice(["Widget", "Gadget", "Thingy"], N_ROWS),
    "Quantity": np.random.randint(1, 50, N_ROWS),
    # One invoice per minute starting at 2021-01-01 00:00.
    "InvoiceDate": pd.date_range(start='2021-01-01', periods=N_ROWS, freq='min'),
    "UnitPrice": np.random.uniform(1.0, 100.0, N_ROWS),
    "CustomerID": np.random.randint(1000, 2000, N_ROWS),
    "Country": np.random.choice(["UK", "Germany", "France", "Spain"], N_ROWS)
}

df = pd.DataFrame(data)

# Save to CSV (no index column, so the file has exactly the eight fields above)
df.to_csv(CSV_PATH, index=False)
print(f"Fake CSV file created at {CSV_PATH} 🎉")



Fake CSV file created at /content/fake_data.csv 🎉


In [13]:
import fireducks as fpd
# `import fireducks` does not by itself guarantee the `pandas` submodule is
# bound on the package; import it explicitly so `fpd.pandas` always resolves.
import fireducks.pandas
import time

path = "/content/fake_data.csv"

# FireDucks is lazy by default: read_csv may return before any data is read.
# Force evaluation inside the timed region so the number printed is the real
# load time, and use perf_counter (monotonic, high resolution) for the interval.
start = time.perf_counter()
df_fd = fpd.pandas.read_csv(path, encoding="ISO-8859-1")
df_fd._evaluate()
print("Data Loaded in", time.perf_counter() - start, "seconds")

# Materialise a plain pandas DataFrame; the remaining cells operate on it.
df_fd_pd = df_fd.to_pandas()


Data Loaded in 0.026114463806152344 seconds


In [14]:
# Un-timed pass over the full pipeline.
# NOTE(review): this rebinds/mutates `df_fd_pd`, and the timed cells below run
# on the result (e.g. a later cell drops the "FakeColumn" created here, and
# "Description" is already gone by the time its timed drop is reached).

# Double the rows, remove "Description", then order by invoice timestamp.
df_fd_pd = (
    pd.concat([df_fd_pd, df_fd_pd])
    .drop(columns=["Description"])
    .sort_values(by=["InvoiceDate"])
)

# Total quantity per country.
df_fd_pd_grouped = df_fd_pd.groupby("Country")["Quantity"].sum()

# One random integer in [1, 100) per row.
df_fd_pd["FakeColumn"] = np.random.randint(1, 100, len(df_fd_pd))

# Render the invoice number as text and tag it with a "_FD" suffix.
df_fd_pd["InvoiceNo"] = df_fd_pd["InvoiceNo"].astype(str).add("_FD")


## Dataset Expansion

In [15]:
# Time doubling the frame by concatenating it with itself.
# perf_counter is a monotonic, high-resolution clock, which matters for
# millisecond-scale intervals; time.time() can jump with system clock changes.
# NOTE(review): if the un-timed pipeline cell above has run, the frame was
# already doubled once there, so this doubles it a second time.
start = time.perf_counter()
df_fd_pd = pd.concat([df_fd_pd] * 2)
print("Dataset Expanded in", time.perf_counter() - start, "seconds")

Dataset Expanded in 0.005506038665771484 seconds


## Dropping Columns

In [17]:
print("Available columns:", df_fd_pd.columns.tolist())

# Drop only if 'Description' exists, so the cell does not raise when the
# column has already been removed by an earlier cell.
if "Description" in df_fd_pd.columns:
    # perf_counter: monotonic, high-resolution clock suited to short intervals.
    start = time.perf_counter()
    # Rebind instead of inplace=True: same resulting frame, no hidden mutation.
    df_fd_pd = df_fd_pd.drop(columns=["Description"])
    print("Column 'Description' dropped in", time.perf_counter() - start, "seconds")
else:
    print("'Description' column not found — skipping drop step.")



Available columns: ['InvoiceNo', 'StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'FakeColumn']
'Description' column not found — skipping drop step.


In [21]:
# Show the current column labels of the working frame as a plain list.
print("Columns in df_fd_pd:", list(df_fd_pd.columns))


Columns in df_fd_pd: ['InvoiceNo', 'StockCode', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'FakeColumn']


In [22]:
# Time dropping a single column. "FakeColumn" must already exist on the frame
# (it is added by the un-timed pipeline cell earlier in the notebook).
# perf_counter: monotonic, high-resolution clock suited to short intervals.
start = time.perf_counter()
# Rebind instead of inplace=True: same resulting frame, no hidden mutation.
df_fd_pd = df_fd_pd.drop(columns=["FakeColumn"])  # or choose any column you want
print("Columns Dropped in", time.perf_counter() - start, "seconds")


Columns Dropped in 0.007134914398193359 seconds


## Sorting

In [23]:
# Time sorting the whole frame by the "InvoiceDate" column.
# perf_counter: monotonic, high-resolution clock suited to short intervals.
start = time.perf_counter()
df_fd_pd = df_fd_pd.sort_values(by=["InvoiceDate"])
print("Sorting Completed in", time.perf_counter() - start, "seconds")

Sorting Completed in 0.06834578514099121 seconds


## Grouping and Aggregation

In [24]:
# Time a group-by aggregation: total "Quantity" per "Country".
# perf_counter: monotonic, high-resolution clock suited to short intervals.
start = time.perf_counter()
df_fd_pd_grouped = df_fd_pd.groupby("Country")["Quantity"].sum()
print("Grouping Completed in", time.perf_counter() - start, "seconds")

Grouping Completed in 0.006224393844604492 seconds


## Fake Data Generation

In [25]:
# Time adding a column of random integers in [1, 100), one value per row.
# perf_counter: monotonic, high-resolution clock suited to short intervals.
start = time.perf_counter()
df_fd_pd["FakeColumn"] = np.random.randint(1, 100, df_fd_pd.shape[0])
print("Fake Data Generated in", time.perf_counter() - start, "seconds")

Fake Data Generated in 0.0020291805267333984 seconds


## String Transformation

In [26]:
# Time a string transformation: cast "InvoiceNo" to text and append "_FD".
# perf_counter: monotonic, high-resolution clock suited to short intervals.
# NOTE(review): if the un-timed pipeline cell earlier in the notebook has run,
# "InvoiceNo" already carries one "_FD" suffix, so this appends a second one.
start = time.perf_counter()
df_fd_pd["InvoiceNo"] = df_fd_pd["InvoiceNo"].astype(str) + "_FD"
print("String Transformation Completed in", time.perf_counter() - start, "seconds")

String Transformation Completed in 0.008248090744018555 seconds
