In [None]:
# Data can be downloaded from here:
!wget -q https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet -q

In [None]:
!pip install fireducks -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.12.0 requires pyarrow<19.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 19.0.0 which is incompatible.
pylibcudf-cu12 24.12.0 requires pyarrow<19.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 19.0.0 which is incompatible.[0m[31m
[0m

In [None]:
%load_ext fireducks.pandas
import pandas as pd

In [None]:
import fireducks.pandas as pd

df = pd.DataFrame({
    'category': ['A', 'B', 'C', 'D'] * 250_000,
    'value': range(1_000_000)
})

In [None]:
df

Unnamed: 0,category,value
0,A,0
1,B,1
2,C,2
3,D,3
4,A,4
...,...,...
999995,D,999995
999996,A,999996
999997,B,999997
999998,C,999998


In [None]:
import pandas as ab

df1 = ab.DataFrame({
    'category': ['A', 'B', 'C', 'D'] * 250_000,
    'value': range(1_000_000)
})

In [None]:
df1.head()

Unnamed: 0,category,value
0,A,0
1,B,1
2,C,2
3,D,3
4,A,4


# Running the code with some sample data along with time comparison between fireducks vs pandas


In [3]:
import time
import timeit

# Measure the time taken to import the libraries
start_import_fireducks = time.time()
import fireducks.pandas as pd
end_import_fireducks = time.time()
fireducks_import_time = end_import_fireducks - start_import_fireducks

start_import_pandas = time.time()
import pandas as ab
end_import_pandas = time.time()
pandas_import_time = end_import_pandas - start_import_pandas

# Function to create the DataFrame using fireducks.pandas
def create_fireducks_df():
    return pd.DataFrame({
        'category': ['A', 'B', 'C', 'D'] * 250_000,
        'value': range(1_000_000)
    })

# Function to create the DataFrame using pandas
def create_pandas_df():
    return ab.DataFrame({
        'category': ['A', 'B', 'C', 'D'] * 250_000,
        'value': range(1_000_000)
    })

# Measure execution time for DataFrame creation using timeit
fireducks_creation_time = timeit.timeit(create_fireducks_df, number=1)
pandas_creation_time = timeit.timeit(create_pandas_df, number=1)

# Print results
print(f"Time taken to import fireducks.pandas: {fireducks_import_time:.6f} seconds")
print(f"Time taken to import pandas: {pandas_import_time:.6f} seconds")
print(f"Time taken to create DataFrame with fireducks.pandas: {fireducks_creation_time:.6f} seconds")
print(f"Time taken to create DataFrame with pandas: {pandas_creation_time:.6f} seconds")

Time taken to import fireducks.pandas: 0.000052 seconds
Time taken to import pandas: 0.000039 seconds
Time taken to create DataFrame with fireducks.pandas: 0.360670 seconds
Time taken to create DataFrame with pandas: 0.044766 seconds


# Running the code to load the parquet file downloaded [link](https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet) along with time comparison between fireducks vs pandas


In [None]:
import time
import timeit
import pandas as pd
import fireducks.pandas as fpd

# Local path of the large Parquet file
PARQUET_PATH = "/content/nyc_parking_violations_2022.parquet"

# Measure the time taken to import the libraries
def measure_import_time(module_name):
    start_time = time.time()
    __import__(module_name)
    return time.time() - start_time

fireducks_import_time = measure_import_time("fireducks.pandas")
pandas_import_time = measure_import_time("pandas")

# Function to load the Parquet file using fireducks.pandas
def load_fireducks_parquet():
    return fpd.read_parquet(PARQUET_PATH)

# Function to load the Parquet file using pandas
def load_pandas_parquet():
    return pd.read_parquet(PARQUET_PATH)

# Measure execution time for loading Parquet file
fireducks_load_time = timeit.timeit(load_fireducks_parquet, number=1)
pandas_load_time = timeit.timeit(load_pandas_parquet, number=1)

# Print results
print(f"Time taken to import fireducks.pandas: {fireducks_import_time:.6f} seconds")
print(f"Time taken to import pandas: {pandas_import_time:.6f} seconds")
print(f"Time taken to load Parquet file with fireducks.pandas: {fireducks_load_time:.6f} seconds")
print(f"Time taken to load Parquet file with pandas: {pandas_load_time:.6f} seconds")

Time taken to import fireducks.pandas: 0.000002 seconds
Time taken to import pandas: 0.000001 seconds
Time taken to load Parquet file with fireducks.pandas: 0.102790 seconds
Time taken to load Parquet file with pandas: 36.340968 seconds
