In [343]:
import pandas as pd
import polars as pl
import numpy as np
import time
import os
import urllib.request

In [344]:
# Create synthetic data
n_rows = 10_000_000
data = {
    "id": np.arange(n_rows),
    "value1": np.random.rand(n_rows),
    "value2": np.random.rand(n_rows),
}

In [345]:
start_time = time.time()
df_pandas = pd.DataFrame(data)
# Simple operation: calculate a new column
df_pandas["sum"] = df_pandas["value1"] + df_pandas["value2"]
print(f"Pandas time: {time.time() - start_time:.2f} seconds")

Pandas time: 0.11 seconds


In [346]:
start_time = time.time()
df_polars = pl.DataFrame(data)
# Same operation in Polars
df_polars = df_polars.with_columns(
    (pl.col("value1") + pl.col("value2")).alias("sum")
)
print(f"Polars time: {time.time() - start_time:.2f} seconds")

Polars time: 0.03 seconds


In [347]:
# Sample dataset (~1 million rows)
url = "https://people.sc.fsu.edu/~jburkardt/data/csv/hw_200.csv"

In [348]:
start_time = time.time()
df_pandas = pd.read_csv(url)
pandas_time = time.time() - start_time
print(f"Pandas read_csv time: {pandas_time:.3f} seconds")

Pandas read_csv time: 0.117 seconds


In [349]:
start_time = time.time()
df_polars = pl.read_csv(url)
polars_time = time.time() - start_time
print(f"Polars read_csv time: {polars_time:.3f} seconds")

Polars read_csv time: 0.101 seconds


In [350]:
speedup = pandas_time / polars_time
print(f"\n✅ Polars was approximately {speedup:.2f}x faster than Pandas.")


✅ Polars was approximately 1.16x faster than Pandas.


In [351]:
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
filename = "yellow_tripdata_2023-01.parquet"

# Download the file if it doesn't exist
if not os.path.exists(filename):
    print(f"Downloading {filename}...")
    urllib.request.urlretrieve(url, filename)
    print("Download complete.")

# Read with Pandas
print("Reading with pandas...")
start_time = time.time()
df_pandas = pd.read_parquet(filename)
pandas_time = time.time() - start_time
print(f"Pandas read time: {pandas_time:.2f} seconds")

# Read with Polars
print("\nReading with polars...")
start_time = time.time()
df_polars = pl.read_parquet(filename)
polars_time = time.time() - start_time
print(f"Polars read time: {polars_time:.2f} seconds")

speedup = pandas_time / polars_time
print(f"\n✅ Polars was approximately {speedup:.2f}x faster than Pandas.")

Reading with pandas...
Pandas read time: 0.32 seconds

Reading with polars...
Polars read time: 0.09 seconds

✅ Polars was approximately 3.42x faster than Pandas.
