# Python + Rust: Pandas vs Polars
## Comparative Analysis: Small to Large Datasets

Comparing three fundamentally different approaches across small, medium, and large datasets:
- **Pandas (NumPy)**: Python + Monothread C (eager evaluation)
- **Pandas + PyArrow**: Python + Columnar Memory (eager evaluation)
- **Polars (Rust)**: Rust + Multithreading + Lazy Evaluation + Query Optimization

*Note: For extreme scale analysis (100M+ rows), see `analysis-xlarge.ipynb`*

In [11]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from pathlib import Path
import warnings
import numpy as np

warnings.filterwarnings('ignore')

# Load results
results_dir = Path('results')

# Load only files that exist
dfs = []
engines_present = []

if (results_dir / 'pandas.csv').exists():
    pandas_df = pd.read_csv(results_dir / 'pandas.csv')
    pandas_df['engine'] = 'Pandas (NumPy)'
    dfs.append(pandas_df)
    engines_present.append('Pandas (NumPy)')

if (results_dir / 'pandas-pyarrow.csv').exists():
    pandas_pyarrow_df = pd.read_csv(results_dir / 'pandas-pyarrow.csv')
    pandas_pyarrow_df['engine'] = 'Pandas + PyArrow'
    dfs.append(pandas_pyarrow_df)
    engines_present.append('Pandas + PyArrow')

if (results_dir / 'polars.csv').exists():
    polars_df = pd.read_csv(results_dir / 'polars.csv')
    polars_df['engine'] = 'Polars (Rust)'
    dfs.append(polars_df)
    engines_present.append('Polars (Rust)')

if not dfs:
    raise FileNotFoundError("Nenhum arquivo CSV encontrado em results/. Execute 'uv run demo benchmark' primeiro.")

# Combine all results
combined_df = pd.concat(dfs, ignore_index=True)

# EXCLUDE xlarge - focus on small/medium/large
combined_df = combined_df[combined_df['scenario'] != 'xlarge'].copy()

if combined_df.empty:
    raise ValueError("No data found for small/medium/large scenarios. Run benchmarks including these sizes.")

# Scenario mapping
all_scenarios = ['small', 'medium', 'large', 'xlarge']
all_labels = ['Small (1K)', 'Medium (1M)', 'Large (10M)', 'XLarge (100M)']

# Filter to only scenarios that exist in data
present_scenarios = combined_df['scenario'].unique()
scenario_order = [s for s in all_scenarios if s in present_scenarios]
scenario_labels = [all_labels[all_scenarios.index(s)] for s in scenario_order]
scenario_map = dict(zip(scenario_order, scenario_labels))
combined_df['scenario_label'] = combined_df['scenario'].map(scenario_map)

print(f"Engines encontrados: {', '.join(engines_present)}")
print(f"Cenários encontrados: {', '.join(scenario_order)}")
print("\nResultados combinados:")
print(combined_df.to_string(index=False))

Engines encontrados: Pandas (NumPy), Pandas + PyArrow, Polars (Rust)
Cenários encontrados: small, medium, large

Resultados combinados:
scenario  time_seconds  memory_mb           engine scenario_label
   small        0.5059      61.41   Pandas (NumPy)     Small (1K)
  medium        1.4546     252.58   Pandas (NumPy)    Medium (1M)
   large       10.6597    2091.20   Pandas (NumPy)    Large (10M)
   small        0.2158      58.94 Pandas + PyArrow     Small (1K)
  medium        0.7234     211.08 Pandas + PyArrow    Medium (1M)
   large        3.5732    1440.26 Pandas + PyArrow    Large (10M)
   small        0.0200       0.02    Polars (Rust)     Small (1K)
  medium        0.0561       0.00    Polars (Rust)    Medium (1M)
   large        0.3069       0.03    Polars (Rust)    Large (10M)


## 1. The Polars Takeoff: Execution Time (Logarithmic Scale)

**Why Logarithmic?** With Polars being 31x faster, a linear scale would make the differences invisible. The log scale reveals the **architectural inflection point**: from Medium (1M) onwards, Polars separates from the pack. At 100M rows, it's an insurmountable gap.

**Key Message**: Only Rust/Multithreading can scale this way.

In [12]:
# Execution time with LOG SCALE - shows the architectural difference
fig_time_log = px.bar(
    combined_df.sort_values(['scenario', 'engine']),
    x='scenario_label',
    y='time_seconds',
    color='engine',
    barmode='group',
    title='Execution Time: The Polars Architectural Advantage (Log Scale)',
    labels={'time_seconds': 'Time (seconds, log scale)', 'scenario_label': 'Dataset Size'},
    color_discrete_map={
        'Pandas (NumPy)': '#1f77b4',
        'Pandas + PyArrow': '#ff7f0e',
        'Polars (Rust)': '#2ca02c'
    },
    category_orders={'scenario_label': scenario_labels},
    hover_data={'time_seconds': ':.4f', 'scenario': False}
)

fig_time_log.update_yaxes(type='log')

fig_time_log.update_layout(
    height=700,
    template='plotly_white',
    font=dict(size=13),
    hovermode='x unified',
    showlegend=True,
    legend=dict(x=0.02, y=0.98, bgcolor='rgba(255,255,255,0.8)'),
    margin=dict(t=120)
)

# Add annotation
fig_time_log.add_annotation(
    text="<b>At 100M rows: Polars is 31x faster than Pandas</b><br>Multithreading + Lazy Evaluation wins",
    xref="paper", yref="paper",
    x=0.5, y=1.08, showarrow=False,
    font=dict(size=13, color='#2ca02c'),
    bgcolor='rgba(255,255,200,0.8)',
    bordercolor='#2ca02c',
    borderwidth=2,
    borderpad=10
)

fig_time_log.show()

## 2. Memory Usage Across Dataset Sizes

**Key Observation**: Watch how memory requirements grow differently for each engine as dataset size increases. Polars' lazy evaluation keeps memory usage minimal across all sizes, while Pandas' eager model forces exponential growth.

In [13]:
# Memory usage across all scenarios
memory_data = combined_df.copy()
memory_data['memory_gb'] = memory_data['memory_mb'] / 1024

fig_memory = px.bar(
    memory_data.sort_values(['scenario', 'engine']),
    x='scenario_label',
    y='memory_gb',
    color='engine',
    barmode='group',
    title='Peak Memory Usage Across Dataset Sizes',
    labels={'memory_gb': 'Memory (GB)', 'scenario_label': 'Dataset Size'},
    color_discrete_map={
        'Pandas (NumPy)': '#1f77b4',
        'Pandas + PyArrow': '#ff7f0e',
        'Polars (Rust)': '#2ca02c'
    },
    category_orders={'scenario_label': scenario_labels},
    hover_data={'memory_gb': ':.3f', 'memory_mb': ':.0f'}
)

fig_memory.update_layout(
    height=700,
    template='plotly_white',
    font=dict(size=13),
    hovermode='x unified',
    showlegend=True,
    legend=dict(x=0.02, y=0.98, bgcolor='rgba(255,255,255,0.8)'),
    margin=dict(t=100)
)

fig_memory.show()

## 3. The Hidden Cost: RAM Overhead Factor

**What is Overhead Factor?** The ratio of peak RAM used ÷ file size on disk.

As datasets grow, Polars maintains constant overhead through lazy evaluation, while Pandas requires proportionally more memory for larger files.

In [14]:
# Calculate overhead factor for all scenarios
overhead_data = combined_df.copy()
overhead_data['overhead_factor'] = None

for scenario in scenario_order:
    scenario_rows = overhead_data[overhead_data['scenario'] == scenario]
    fact_file = Path('data') / f'fact_content_performance_{scenario}.parquet'
    
    if fact_file.exists():
        file_size_mb = fact_file.stat().st_size / (1024 * 1024)
        overhead_data.loc[scenario_rows.index, 'overhead_factor'] = scenario_rows['memory_mb'] / file_size_mb
        
        scenario_label = scenario_map.get(scenario, scenario)
        print(f"{scenario_label}: {file_size_mb:.1f} MB file")

# Remove rows where we couldn't calculate overhead
overhead_data = overhead_data[overhead_data['overhead_factor'].notna()].copy()

if not overhead_data.empty:
    fig_overhead = px.bar(
        overhead_data.sort_values(['scenario', 'engine']),
        x='scenario_label',
        y='overhead_factor',
        color='engine',
        barmode='group',
        title='RAM Overhead Factor: Memory Used ÷ File Size',
        labels={'overhead_factor': 'Overhead Factor (multiplier)', 'scenario_label': 'Dataset Size'},
        color_discrete_map={
            'Pandas (NumPy)': '#1f77b4',
            'Pandas + PyArrow': '#ff7f0e',
            'Polars (Rust)': '#2ca02c'
        },
        category_orders={'scenario_label': scenario_labels},
        hover_data={'overhead_factor': ':.1f', 'memory_mb': ':.0f'}
    )

    fig_overhead.update_layout(
        height=700,
        template='plotly_white',
        font=dict(size=13),
        hovermode='x unified',
        showlegend=True,
        legend=dict(x=0.02, y=0.98, bgcolor='rgba(255,255,255,0.8)'),
        margin=dict(t=100)
    )

    fig_overhead.show()
else:
    print("No parquet files found for overhead calculation")

Small (1K): 0.0 MB file
Medium (1M): 5.1 MB file
Large (10M): 51.0 MB file


## Summary: Performance Across Dataset Sizes

**Key Insights**:
- **Execution Speed**: Polars consistently outperforms both Pandas variants, with advantages growing as datasets get larger
- **Memory Usage**: Polars maintains near-constant memory regardless of size, while Pandas memory grows with dataset
- **Overhead Factor**: Shows how much memory is wasted compared to actual file size. Polars' lazy evaluation minimizes this overhead across all sizes

For extreme scale behavior (100M+ rows), see `analysis-xlarge.ipynb` which includes dedicated analysis of the inflection point where architecture differences become most pronounced.

In [15]:
# Final performance summary for all scenarios
print("\n" + "="*110)
print("PERFORMANCE COMPARISON ACROSS ALL SCENARIOS")
print("="*110)

for scenario in scenario_order:
    scenario_data = combined_df[combined_df['scenario'] == scenario].copy()
    scenario_label = scenario_map.get(scenario, scenario)
    
    if scenario_data.empty:
        continue
    
    scenario_data_sorted = scenario_data.sort_values('time_seconds')
    
    print(f"\n### {scenario_label} ###")
    
    fastest_time = scenario_data_sorted['time_seconds'].iloc[0]
    
    for _, row in scenario_data_sorted.iterrows():
        speedup = fastest_time / row['time_seconds'] if row['time_seconds'] > 0 else float('inf')
        
        memory_str = f"{row['memory_mb']:7.0f} MB"
        
        if row['memory_mb'] > 0:
            file_scenario = f"fact_content_performance_{scenario}.parquet"
            fact_file_scenario = Path('data') / file_scenario
            if fact_file_scenario.exists():
                file_size_mb_scenario = fact_file_scenario.stat().st_size / (1024 * 1024)
                overhead = row['memory_mb'] / file_size_mb_scenario
                overhead_str = f"({overhead:5.1f}x)"
            else:
                overhead_str = ""
        else:
            overhead_str = "(negligible)"
        
        print(f"  {row['engine']:20} | Time: {row['time_seconds']:7.1f}s ({speedup:5.1f}x) | Memory: {memory_str} {overhead_str}")

print("\n" + "="*110)


PERFORMANCE COMPARISON ACROSS ALL SCENARIOS

### Small (1K) ###
  Polars (Rust)        | Time:     0.0s (  1.0x) | Memory:       0 MB (  2.0x)
  Pandas + PyArrow     | Time:     0.2s (  0.1x) | Memory:      59 MB (5924.9x)
  Pandas (NumPy)       | Time:     0.5s (  0.0x) | Memory:      61 MB (6173.2x)

### Medium (1M) ###
  Polars (Rust)        | Time:     0.1s (  1.0x) | Memory:       0 MB (negligible)
  Pandas + PyArrow     | Time:     0.7s (  0.1x) | Memory:     211 MB ( 41.6x)
  Pandas (NumPy)       | Time:     1.5s (  0.0x) | Memory:     253 MB ( 49.8x)

### Large (10M) ###
  Polars (Rust)        | Time:     0.3s (  1.0x) | Memory:       0 MB (  0.0x)
  Pandas + PyArrow     | Time:     3.6s (  0.1x) | Memory:    1440 MB ( 28.3x)
  Pandas (NumPy)       | Time:    10.7s (  0.0x) | Memory:    2091 MB ( 41.0x)

