In [None]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install narwhals pandas polars pyarrow


In [2]:
import time

import narwhals as nw
import numpy as np
import pandas as pd
import polars as pl

In [None]:
# Create synthetic dataset using pandas (as a native DataFrame).
# NOTE: at 200M rows `pdf_sales` alone takes about 3 GB (two 8-byte columns);
# lower `num_rows` for a quick run on a memory-constrained machine.
num_rows = 200_000_000  # 200M rows for demonstration
rng = np.random.default_rng(seed=42)  # fixed seed keeps the generated data reproducible
pdf_sales = pd.DataFrame({
    "id": rng.integers(0, 20_000, size=num_rows),        # many repeat IDs to allow groupby
    "value": rng.normal(loc=100.0, scale=50.0, size=num_rows).round(2),  # numeric value, can be negative
})

# Create a region mapping: each id gets a region label (e.g., "North","South","East","West").
# The one-draw-per-id loop is kept on purpose so the random stream, and therefore
# the generated labels, stay exactly as before.
unique_ids = pdf_sales["id"].unique()
regions = ["North", "South", "East", "West"]
id_to_region = {id_val: rng.choice(regions) for id_val in unique_ids}
pdf_regions = pd.DataFrame({
    "id": list(id_to_region.keys()),
    "region": list(id_to_region.values()),  # same order as the keys above
})

# Quick peek at data shape
print(pdf_sales.shape, pdf_regions.shape)
print(pdf_sales.head(3), "\n", pdf_regions.head(3))
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
pdf_sales.info()

(200000000, 2) (20000, 2)
      id   value
0   1785   54.31
1  15479  123.47
2  13091  -15.55 
       id region
0   1785  North
1  15479   West
2  13091  North
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000000 entries, 0 to 199999999
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   value   float64
dtypes: float64(1), int64(1)
memory usage: 3.0 GB
None


In [4]:
# DataFrame.info() prints its report itself and returns None; calling it bare
# avoids the trailing "None" line that print(...) would add.
pdf_regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20000 non-null  int64 
 1   region  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB
None


In [5]:
# Build Polars copies of both pandas frames so the same data can also be run through Polars
pl_sales, pl_regions = (pl.DataFrame(frame) for frame in (pdf_sales, pdf_regions))

In [6]:
# Put both pandas frames behind the Narwhals API (sales first, then regions)
df_sales_nw, df_regions_nw = map(nw.from_native, (pdf_sales, pdf_regions))

In [7]:
# Helper that prints a comprehensive overview of a frame, similar to pandas .info()
def polars_info(df):
    """Print a short, pandas-``.info()``-style report about ``df``.

    The report lists the frame's shape, its estimated memory footprint in MB
    (``df.estimated_size()`` divided by 1024**2, two decimals) and one
    ``name: dtype`` line per schema entry.

    Parameters
    ----------
    df
        Any frame-like object exposing ``shape``, ``estimated_size()`` and a
        ``schema`` mapping of column name to dtype.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    bytes_per_mb = 1024 ** 2

    print("Shape: {}".format(df.shape))
    size_in_mb = df.estimated_size() / bytes_per_mb
    print("Memory usage: {:.2f} MB".format(size_in_mb))

    # Blank separator line, then the per-column listing.
    print()
    print("Column info:")
    for column, column_dtype in df.schema.items():
        print("  " + f"{column}: {column_dtype}")

In [8]:
# Report on the Narwhals-wrapped sales frame first, then the regions frame
for wrapped_frame in (df_sales_nw, df_regions_nw):
    polars_info(wrapped_frame)

Shape: (200000000, 2)
Memory usage: 3051.76 MB

Column info:
  id: Int64
  value: Float64
Shape: (20000, 2)
Memory usage: 1.63 MB

Column info:
  id: Int64
  region: String


In [None]:
# Narwhals pipeline over the pandas-backed frames: join -> derive -> filter -> aggregate.
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()

# 1. Join sales with region labels on 'id'
df_joined = df_sales_nw.join(df_regions_nw, on="id", how="inner")
print("After join, columns:", df_joined.columns)
print("Sample row (native):", nw.to_native(df_joined).iloc[0].to_dict())

# 2. Add a derived column: value normalized by the overall mean of all sales.
# The mean is taken through the Narwhals Series API instead of the native
# pandas frame, so the pipeline itself does not depend on the backing library.
overall_mean = df_sales_nw["value"].mean()
df_joined = df_joined.with_columns(
    (nw.col("value") / overall_mean).alias("value_norm")
)

# 3. Filter rows: keep only transactions with positive value
df_joined = df_joined.filter(nw.col("value") > 0)
print("Post-filter shape (native):", nw.to_native(df_joined).shape)

# 4. Group by region and aggregate total and average value
df_summary = df_joined.group_by("region").agg(
    nw.col("value").sum().alias("total_value"),
    nw.col("value").mean().alias("avg_value"),
    nw.len().alias("transaction_count"),
)

# Convert result to native pandas for display
summary_native = nw.to_native(df_summary)
print("Summary by region (pandas):\n", summary_native)

end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.4f} seconds")

After join, columns: ['id', 'value', 'region']
Sample row (native): {'id': 1785, 'value': 54.31, 'region': 'North'}
Post-filter shape (native): (195448682, 4)
Summary by region (pandas):
   region   total_value   avg_value  transaction_count
0  North  5.038577e+09  102.766448           49029396
1   West  5.101366e+09  102.751330           49647685
2   East  4.878919e+09  102.760417           47478583
3  South  5.065127e+09  102.755460           49293018
Execution time: 39.0191 seconds


In [15]:
# Show the wrapper's type: the recorded output reports a Narwhals DataFrame, not a pandas one.
print(type(df_sales_nw))

<class 'narwhals.dataframe.DataFrame'>


In [None]:
start_time = time.perf_counter()  # monotonic clock intended for measuring elapsed time
# Verification using direct pandas (should match summary_native)
check = (
    pdf_sales.merge(pdf_regions, on="id")
    .query("value > 0")
    .groupby("region")["value"]
    .agg(["sum", "mean", "count"])
)
print(check.reset_index())
end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.4f} seconds")

# Actually enforce the "should match" claim instead of relying on eyeballing
# the two printouts. Done after the timing so it does not distort the figure.
# Rows are aligned on region because the two results come back in different orders.
expected = summary_native.set_index("region").loc[check.index]
np.testing.assert_allclose(check["sum"], expected["total_value"])
np.testing.assert_allclose(check["mean"], expected["avg_value"])
np.testing.assert_array_equal(check["count"], expected["transaction_count"])


  region           sum        mean     count
0   East  4.878919e+09  102.760417  47478583
1  North  5.038577e+09  102.766448  49029396
2  South  5.065127e+09  102.755460  49293018
3   West  5.101366e+09  102.751330  49647685
Execution time: 34.1956 seconds


In [16]:
# `check` was built with plain pandas; the recorded output reports a pandas DataFrame.
print(type(check))

<class 'pandas.core.frame.DataFrame'>


In [None]:
start_time = time.perf_counter()  # monotonic clock intended for measuring elapsed time
# Using Polars directly (no Narwhals) for comparison:
# value_norm divides by the mean of the full sales table, as the Narwhals
# pipelines do. `pl.col("value").mean()` placed after the filter would instead
# average only the rows that survived it, giving a different normalization.
overall_mean_pl = pl_sales["value"].mean()
pl_summary = (
    pl_sales.join(pl_regions, on="id", how="inner")
    .filter(pl.col("value") > 0)
    .with_columns((pl.col("value") / overall_mean_pl).alias("value_norm"))
    .group_by("region")
    .agg([
        pl.col("value").sum().alias("total_value"),
        pl.col("value").mean().alias("avg_value"),
        pl.col("value").len().alias("transaction_count"),
    ])
)
print("Summary by region (Polars):\n", pl_summary)
end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.4f} seconds")

Summary by region (Polars):
 shape: (4, 4)
┌────────┬─────────────┬────────────┬───────────────────┐
│ region ┆ total_value ┆ avg_value  ┆ transaction_count │
│ ---    ┆ ---         ┆ ---        ┆ ---               │
│ str    ┆ f64         ┆ f64        ┆ u32               │
╞════════╪═════════════╪════════════╪═══════════════════╡
│ South  ┆ 5.0651e9    ┆ 102.75546  ┆ 49293018          │
│ North  ┆ 5.0386e9    ┆ 102.766448 ┆ 49029396          │
│ West   ┆ 5.1014e9    ┆ 102.75133  ┆ 49647685          │
│ East   ┆ 4.8789e9    ┆ 102.760417 ┆ 47478583          │
└────────┴─────────────┴────────────┴───────────────────┘
Execution time: 7.5713 seconds


In [17]:
# The direct Polars pipeline yields a native Polars DataFrame (see recorded output).
print(type(pl_summary))

<class 'polars.dataframe.frame.DataFrame'>


In [None]:
start_time = time.perf_counter()  # monotonic clock intended for measuring elapsed time
# Wrap Polars dataframes with Narwhals and reuse the same transformation pipeline
df_sales_nw_pl = nw.from_native(pl_sales)
df_regions_nw_pl = nw.from_native(pl_regions)

# Overall mean of all sales, read through the Narwhals Series API rather than
# from the native Polars frame, so nothing below is Polars-specific.
overall_mean_nw_pl = df_sales_nw_pl["value"].mean()

df_summary_pl = (
    df_sales_nw_pl.join(df_regions_nw_pl, on="id", how="inner")
    .filter(nw.col("value") > 0)
    .with_columns((nw.col("value") / overall_mean_nw_pl).alias("value_norm"))
    .group_by("region")
    .agg(
        nw.col("value").sum().alias("total_value"),
        nw.col("value").mean().alias("avg_value"),
        nw.len().alias("transaction_count"),
    )
)
summary_pl_native = nw.to_native(df_summary_pl)  # this will be a Polars DataFrame
print("Summary by region (via Narwhals on Polars):\n", summary_pl_native)
end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.4f} seconds")

Summary by region (via Narwhals on Polars):
 shape: (4, 4)
┌────────┬─────────────┬────────────┬───────────────────┐
│ region ┆ total_value ┆ avg_value  ┆ transaction_count │
│ ---    ┆ ---         ┆ ---        ┆ ---               │
│ str    ┆ f64         ┆ f64        ┆ u32               │
╞════════╪═════════════╪════════════╪═══════════════════╡
│ West   ┆ 5.1014e9    ┆ 102.75133  ┆ 49647685          │
│ East   ┆ 4.8789e9    ┆ 102.760417 ┆ 47478583          │
│ South  ┆ 5.0651e9    ┆ 102.75546  ┆ 49293018          │
│ North  ┆ 5.0386e9    ┆ 102.766448 ┆ 49029396          │
└────────┴─────────────┴────────────┴───────────────────┘
Execution time: 10.4196 seconds


In [18]:
# `df_summary_pl` is still the Narwhals wrapper; `summary_pl_native` holds the unwrapped Polars frame.
print(type(df_summary_pl))

<class 'narwhals.dataframe.DataFrame'>
