## Comments

PolaRS is considerably slower than Pandas here, but still faster than all the other frameworks. Note that we're only doing 100 iterations; the same 100 iterations in the other frameworks would take _a lot_ more than the ~3s PolaRS needs below.

In [1]:
import os

# `!export VAR=...` runs in a throwaway subshell, so the variable never reaches
# the kernel process. Set it in this process's environment instead.
# This must run before `import polars`: per the Polars docs the thread-pool size
# is fixed when the library is first loaded — TODO confirm for the pinned version.
os.environ["POLARS_MAX_THREADS"] = "12"

In [2]:
import pandas as pd
import numpy as np
import polars as pl
import utils

# Exclusive upper bound of the row range the benchmark loops below iterate over
# (they run `range(1, NUM_ITERATIONS)`).
NUM_ITERATIONS = 100

# Read the same CSV once per framework so each benchmark cell works on its own frame.
pandas_df = pd.read_csv("../datasets/yellow_tripdata_2015-01.csv")
polars_df = pl.read_csv("../datasets/yellow_tripdata_2015-01.csv")

In [3]:
%%time_cell

# Taken from: erikbruin/nlp-on-student-writing-eda
#
# For rows 1 .. NUM_ITERATIONS-1, store in 'discourse_nr' the length of the
# current run of identical consecutive 'VendorID' values. Every other row keeps
# the initial value 1. The loop is kept row-by-row to match the Pandas cell it
# is benchmarked against.

# The PolaRS API starts being a bit weird here.
# `with_columns` is used instead of `with_column`, which is deprecated and
# absent from recent Polars releases.
polars_df = polars_df.with_columns(pl.lit(1).alias('discourse_nr'))
counter = 1

for i in range(1, NUM_ITERATIONS):
  # Extend the run when the vendor repeats; otherwise start a new run at 1.
  if polars_df[i, 'VendorID'] == polars_df[i-1, 'VendorID']:
    counter += 1
  else:
    counter = 1
  # Single-cell write through (row, column) indexing.
  polars_df[i, 'discourse_nr'] = counter

In [4]:
# Keep the duration of the PolaRS cell under its own name before the next timed
# cell runs.
# NOTE(review): _TIMED_CELL is presumably set by the %%time_cell magic, in
# seconds — confirm in utils.
polars_time = _TIMED_CELL
print(f"PolaRS time: {polars_time:.1f}s")

PolaRS time: 3.1s


In [5]:
%%time_cell

# Taken from: erikbruin/nlp-on-student-writing-eda
#
# Pandas version of the run-length labelling: for rows 1 .. NUM_ITERATIONS-1,
# 'discourse_nr' receives the length of the current run of identical
# consecutive 'VendorID' values; all other rows keep the initial 1.

pandas_df['discourse_nr'] = 1
counter = 1

for i in range(1, NUM_ITERATIONS):
  # Compare this row's vendor with the previous row's.
  same_vendor = pandas_df.loc[i, 'VendorID'] == pandas_df.loc[i-1, 'VendorID']
  # Grow the run on a repeat, restart it at 1 on a change.
  counter = counter + 1 if same_vendor else 1
  pandas_df.loc[i, 'discourse_nr'] = counter

In [6]:
# Keep the duration of the Pandas cell under its own name.
# NOTE(review): _TIMED_CELL is presumably set by the %%time_cell magic, in
# seconds — confirm in utils.
pandas_time = _TIMED_CELL
# NOTE(review): one decimal place rounds this measurement to "0.0s" in the
# recorded output; the unrounded value is what the slowdown ratio uses.
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [7]:
# Ratio of the two measured durations; a value above 1 means the PolaRS cell
# took longer than the Pandas cell.
# NOTE(review): this would fail (or yield inf) if pandas_time were exactly 0 —
# confirm the timer's resolution makes that impossible.
slowdown = polars_time / pandas_time
# NOTE(review): utils.print_md presumably renders the string as Markdown —
# confirm in utils.
utils.print_md(f"### PolaRS is {slowdown:.1f}x slower.")

### PolaRS is 99.7x slower.

In [8]:
# Sanity check: both frameworks must have produced the same 'discourse_nr' values.
# The Polars column is converted with .to_pandas() so it can be compared
# element-wise against the Pandas column.
polars_result = polars_df['discourse_nr'].to_pandas()
pandas_result = pandas_df['discourse_nr']
assert (polars_result == pandas_result).all()