In [0]:
%pip install polars matplotlib numpy

In [0]:
import time
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# Define a function to benchmark operations
def benchmark_operations(df_pandas, df_polars):
    """
    Time the same three operations (filter, group-by mean, sort) in Pandas and Polars.

    Each library's three operations are run once, back to back, and the total
    elapsed time of that batch is reported. Results of the operations are
    discarded; only the durations are returned.

    Parameters
    ----------
    df_pandas : pandas.DataFrame
        Frame containing a "col1" column (compared against 50, averaged and
        sorted on) and a "col2" column (used as the group key).
    df_polars : polars.DataFrame
        Frame containing the same "col1" and "col2" columns.

    Returns
    -------
    tuple[float, float]
        ``(pandas_time, polars_time)``: elapsed seconds for the Pandas batch
        and the Polars batch respectively.
    """
    operations_pandas = [
        lambda df: df[df["col1"] > 50],  # Filtering
        lambda df: df.groupby("col2").agg({"col1": "mean"}),  # Aggregation
        lambda df: df.sort_values("col1"),  # Sorting
    ]

    operations_polars = [
        lambda df: df.filter(pl.col("col1") > 50),  # Filtering in Polars
        lambda df: df.group_by("col2").agg(pl.col("col1").mean().alias("mean_col1")),  # Aggregation
        lambda df: df.sort("col1"),  # Sorting
    ]

    # time.perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals. time.time() tracks the system wall clock, which can
    # be adjusted while the benchmark runs and may tick too coarsely for the
    # smaller datasets.

    # Measure Pandas execution time
    start = time.perf_counter()
    for op in operations_pandas:
        op(df_pandas)
    pandas_time = time.perf_counter() - start

    # Measure Polars execution time
    start = time.perf_counter()
    for op in operations_polars:
        op(df_polars)
    polars_time = time.perf_counter() - start

    return pandas_time, polars_time

In [0]:
if __name__ == "__main__":
    # Row counts to benchmark, smallest to largest.
    dataset_sizes = [100_000, 1_000_000, 10_000_000]
    # Long-format accumulator: one entry per (dataset size, library) pair.
    results = {"Dataset Size": [], "Library": [], "Execution Time (s)": []}

    for size in dataset_sizes:
        # Re-seed before building each dataset so generation is repeatable.
        np.random.seed(42)
        col1_values = np.random.randint(0, 100, size)
        col2_values = np.random.choice(["A", "B", "C", "D"], size)
        data = {"col1": col1_values, "col2": col2_values}

        # Both libraries receive the same underlying columns.
        df_pandas = pd.DataFrame(data)
        df_polars = pl.DataFrame(data)

        pandas_time, polars_time = benchmark_operations(df_pandas, df_polars)

        # Record one row per library for this dataset size.
        for library, elapsed in (("Pandas", pandas_time), ("Polars", polars_time)):
            results["Dataset Size"].append(size)
            results["Library"].append(library)
            results["Execution Time (s)"].append(elapsed)

    # Tabular view of the timings, used for plotting below.
    df_results = pd.DataFrame(results)

    # One line per library, each point annotated with its measured time.
    fig, ax = plt.subplots(figsize=(8, 6))
    series_styles = [("Pandas", "blue", "o"), ("Polars", "red", "s")]
    for library, color, marker in series_styles:
        subset = df_results.loc[df_results["Library"] == library]
        sizes = subset["Dataset Size"]
        times = subset["Execution Time (s)"]
        ax.plot(sizes, times, marker=marker, linestyle="-", color=color, label=library)
        for x, y in zip(sizes, times):
            ax.text(
                x,
                y,
                f"{y:.2f}s",
                fontsize=10,
                ha="center",
                bbox=dict(facecolor=color, alpha=0.7, edgecolor="none", boxstyle="round,pad=0.3"),
            )

    # Dataset sizes span orders of magnitude, so the x axis is logarithmic.
    ax.set_xscale("log")
    ax.set_xlabel("Dataset Size")
    ax.set_ylabel("Execution Time (seconds)")
    ax.set_title("⚡ Pandas vs Polars Performance Scaling ⚡")
    ax.legend()
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
    plt.show()