In [None]:
%pip install fireducks boto3 memory_profiler matplotlib seaborn pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
# generating dataset

import pandas as pds
import numpy as np
from datetime import datetime, timedelta

# Generate synthetic transaction data and write it to ../data/sample-dataset.csv.
# NOTE: this cell imports pandas under the alias `pds`; the bare name `pd` is
# only bound by a later cell, so referring to `pd` here raises NameError on a
# fresh kernel (Restart & Run All).
from pathlib import Path

np.random.seed(42)  # seed the legacy global RNG used by the draws below
num_records = 1000000  # 1 million records
user_ids = np.random.randint(1, 10000, size=num_records)
transaction_amounts = np.random.uniform(10, 5000, size=num_records)

# Draw every day offset (0..364) in one vectorized call and subtract it from a
# single reference time, instead of calling datetime.now() and randint once
# per record in a Python-level loop.
reference_time = datetime.now()
day_offsets = np.random.randint(0, 365, size=num_records)
timestamps = pds.Timestamp(reference_time) - pds.to_timedelta(day_offsets, unit="D")
locations = np.random.choice(["New York", "London", "Tokyo", "Paris", "Berlin"], size=num_records)

# Create DataFrame
data = {
    "Transaction_ID": np.arange(1, num_records + 1),
    "User_ID": user_ids,
    "Transaction_Amount": transaction_amounts,
    "Timestamp": timestamps,
    "Location": locations
}
df = pds.DataFrame(data)

# Save to CSV; create the target directory first so a fresh checkout without
# ../data does not make to_csv fail.
output_path = Path("../data/sample-dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("Dataset generated and saved to sample-dataset.csv")

Dataset generated and saved to sample-dataset.csv


In [None]:
# benchmarking dataset load time and memory usage (pandas vs. FireDucks)

import time
from memory_profiler import memory_usage
import pandas as pd
import fireducks.pandas as fd

# Benchmark reading the generated CSV with pandas and with FireDucks:
# elapsed load time and peak memory for each library.
csv_path = "../data/sample-dataset.csv"


def _load_fireducks(path):
    """Read `path` with FireDucks and force the read to actually execute.

    FireDucks uses a lazy execution model, so a bare `fd.read_csv` call may
    return before any data has been loaded; `_evaluate()` is the hook its
    documentation describes for triggering execution explicitly.
    """
    frame = fd.read_csv(path)
    frame._evaluate()
    return frame


# Measure pandas load time (perf_counter is monotonic, unlike time.time)
start_time = time.perf_counter()
df_pandas = pd.read_csv(csv_path)
pandas_load_time = time.perf_counter() - start_time

# Measure FireDucks load time, including the forced evaluation
start_time = time.perf_counter()
df_fireducks = _load_fireducks(csv_path)
fireducks_load_time = time.perf_counter() - start_time

# Measure memory usage of a fresh load with each library
pandas_memory = memory_usage((pd.read_csv, [csv_path]), max_usage=True)
fireducks_memory = memory_usage((_load_fireducks, [csv_path]), max_usage=True)

# Print results
print(f"Pandas Load Time: {pandas_load_time:.2f} sec")
print(f"FireDucks Load Time: {fireducks_load_time:.2f} sec")
print(f"Pandas Memory Usage: {pandas_memory:.2f} MB")
print(f"FireDucks Memory Usage: {fireducks_memory:.2f} MB")

In [None]:
# visualizing the result difference

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.animation import FuncAnimation

# Data for visualization: one bar per library, height = measured load time.
tools = ["Pandas", "FireDucks"]
times = [pandas_load_time, fireducks_load_time]

# Create a bar chart
fig, ax = plt.subplots()
bar = ax.bar(tools, times, color=["blue", "green"])
ax.set_ylabel("Load Time (sec)")
ax.set_title("Performance Comparison")
# Pin the y-range: bars are zeroed while hidden during the animation, and the
# axis should keep the scale of the full-height bars throughout.
ax.set_ylim(0, (max(times) * 1.1) or 1)


def animate(i):
    """Draw frame `i`: bars 0..i at their measured height, later bars at zero.

    Setting every bar on every frame makes each frame self-contained, so the
    bars appear one after another instead of all frames looking identical.
    Returns the bar container holding the updated rectangles.
    """
    for idx, (rect, h) in enumerate(zip(bar, times)):
        rect.set_height(h if idx <= i else 0)
    return bar


ani = FuncAnimation(fig, animate, frames=len(times), interval=1000, repeat=False)

# Save as GIF before showing: plt.show() may close the figure (or block in a
# script), so the animation is written while the figure is still live.
ani.save("../performance_comparison.gif", writer="imagemagick")
plt.show()