In [1]:
from datetime import datetime

import numpy as np
import polars as pl
from pathlib import Path
from polars import col, lit
import pandas as pd
from pandas.io.common import get_handle

## Polars

* [Python Docs](https://pola-rs.github.io/polars-book/user-guide/introduction.html)
* [Github](https://github.com/pola-rs/polars)
* [PyPI](https://pypi.org/project/polars/)
* Features:
    * Lazy & Eager computation
    * Rust implementation
    * Arrow Memory Format
    * Easy and transparent parallelisation using multithreading
    * PySpark-like Syntax
    * Supports real NA values in contrast to Pandas
    * Easily deal with complex data types, e.g. list of strings/floats
    * Copy-On-Write (COW) semantics in contrast to Pandas where you kind of never know

In [2]:
pl.__version__

'0.7.16'

In [3]:
# Download a huge csv as a test. Takes a while and only needed once...
big_csv = Path("./big.csv")
csv_url = "http://sdm.lbl.gov/fastbit/data/star2002-full.csv.gz"

if not big_csv.exists():
    # Write to a sibling ".part" file and rename it only after the copy has
    # finished. Otherwise an interrupted download would leave a truncated
    # `big.csv` behind, and the `exists()` guard above would treat it as complete.
    partial_csv = big_csv.with_name(big_csv.name + ".part")
    with get_handle(csv_url, compression="gzip", mode="r") as fh_in, open(partial_csv, "bw") as fh_out:
        # `.buffer` is the binary stream underneath the text handle; copy it in
        # 1 MiB chunks instead of holding the whole decompressed file in memory.
        decompressed = fh_in.handle.buffer
        for chunk in iter(lambda: decompressed.read(1 << 20), b""):
            fh_out.write(chunk)
    partial_csv.replace(big_csv)

## Eager Execution

In [4]:
edf = pl.read_csv(str(big_csv), has_headers=False)

In [5]:
edf.filter(col("column_1") == 1).select(["column_9"]).head()

column_9
i64
654
61
7
27
1


#### alternatively *Pandas* style (not recommended!)

In [6]:
edf[edf["column_1"] == 1][["column_9"]].head()

column_9
i64
654
61
7
27
1


Why shouldn't I use the Pandas style? Because ...

* it's much harder to read since it's not *operator chaining*,
* it's more verbose if you assign actual variable names to your dataframes and not just use `df` all the time. Check out this filtering example: `agg_metric_df[agg_metric_df["metric_1"] < 0.9]`. Using `col` to refer to the column of the current dataframe is much cleaner,
* it's not possible to switch later from eager to lazy execution

## Lazy Execution

Just switching `read_csv` to `scan_csv` is all it takes to go from eager to lazy in this example. `collect` or `fetch` is then used to trigger the execution.

In [7]:
ldf = pl.scan_csv(str(big_csv), has_headers=False)

In [8]:
# Keep the filtered query under its own name so that `ldf` still refers to the
# unfiltered scan and this cell can be re-run without stacking filters.
ldf_filtered = ldf.filter(col("column_1") == 1)
ldf_filtered.select(["column_9"]).collect().head()

column_9
i64
654
61
7
27
1


#### Pandas style fails

In [9]:
ldf = pl.scan_csv(str(big_csv), has_headers=False)
try:
    ldf[ldf["column_1"] == 1][["column_9"]].head()
except TypeError as err:
    # The failure is the point of this cell: a LazyFrame cannot be indexed with [].
    # Show the message without aborting a "Restart & Run All".
    print(f"{type(err).__name__}: {err}")

TypeError: 'LazyFrame' object is not subscriptable

## Slicing & Indexing

In [10]:
edf[1,3] # index by (row, column)

20011015.222604

In [11]:
edf[1, [3]]  # NOTE: polars 0.7.16 reads this as (column, rows) -- it returns rows [3] of column index 1 -- unlike `edf[1, 3]` above; a bug right now

Series: 'column_2' [i64]
[
	1613423
]

This works more or less like `iloc` in Pandas:

In [12]:
pdf = pd.read_csv(big_csv, header=None)

In [13]:
pdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,1613423,807,20011020.0,1613424,4518,0,0,654,1395,20011200.0,10.955403,2288071,-0.288203,0.407312,10.559091
1,1,1613423,808,20011020.0,1613424,886,0,0,61,371,20011200.0,23.326479,2288071,-0.24733,0.455916,57.810596
2,1,1613423,809,20011020.0,1613424,638,0,0,7,121,20011200.0,2.444299,2288071,-0.390961,0.589534,167.75714
3,4,1613423,810,20011020.0,1613424,4259,0,0,1024,1302,20011200.0,9.521868,2288071,-0.290154,0.446027,8.644362
4,5,1613423,811,20011020.0,1613424,3673,1,0,592,1246,20011200.0,13.560424,2288071,-0.257418,0.419689,29.02236


In [14]:
pdf.iloc[1, 3]

20011015.222604

In [15]:
pdf.iloc[1, [3]]

3    2.001102e+07
Name: 1, dtype: float64

## Dealing with missing values

In [16]:
left_df = pl.DataFrame({"a": [1, 2, 3], "b": [None, "b", "c"]})
right_df = pl.DataFrame({"a": [1, 2], "c": [42, 69]})

df = left_df.join(right_df, on="a", how="left")
df

a,b,c
i64,str,i64
1,,42.0
2,"""b""",69.0
3,"""c""",


Note that the last element of the `c` column is `null`, not `NaN` as in Pandas, and the datatype is still int and not automatically converted to float as in Pandas.

In [17]:
df.filter(col("c").is_null())

a,b,c
i64,str,i64
3,"""c""",


Pandas does something pretty scary here

In [26]:
left_pdf = left_df.to_pandas()
right_pdf = right_df.to_pandas()

In [19]:
pdf = pd.merge(left_pdf, right_pdf, on="a", how="left")

In [20]:
pdf

Unnamed: 0,a,b,c
0,1,,42.0
1,2,b,69.0
2,3,c,


Depending on the datatype, Pandas shows `None` or `NaN`, also note that the column `c` was converted from `int` to `float`!

# New columns

In [21]:
df.with_column((lit(3)*col("c")).alias("3*c"))

a,b,c,3*c
i64,str,i64,i64
1,,42.0,126.0
2,"""b""",69.0,207.0
3,"""c""",,


The same is possible in Pandas, but note that we had to type the variable name `pdf` again just to access a column!

In [22]:
pdf.assign(**{"3*c": 3*pdf["c"]})

Unnamed: 0,a,b,c,3*c
0,1,,42.0,126.0
1,2,b,69.0,207.0
2,3,c,,


# Column Expressions

In [55]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        # One frozen draw of `np.random.rand(5)`. Fixing the values keeps every
        # output in this section (and the `> 0.4` / `> 0.5` examples that
        # depend on them) reproducible across kernel restarts.
        "random": np.array(
            [
                0.4839868474002975,
                0.8492025955510352,
                0.18770790330576081,
                0.3178470681965706,
                0.6115924506977032,
            ]
        ),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.484,"""A"""
2.0,"""ham""",0.849,"""A"""
3.0,"""spam""",0.188,"""B"""
,"""egg""",0.318,"""C"""
5.0,,0.612,"""B"""


In [56]:
# and in Pandas
pdf = df.to_pandas()

#### construct a new dataframe with a sorted column and some aggregation

In [57]:
df.select([pl.sum("nrs"), pl.col("names").sort(), col("names").n_unique().alias("unique_names_1")])

nrs,names,unique_names_1
i64,str,u32
11,,5
11,"""egg""",5
11,"""foo""",5
11,"""ham""",5
11,"""spam""",5


In Pandas we create a new DataFrame and reference `pdf` several times

In [58]:
pd.DataFrame({"nrs": pdf["nrs"].sum(), "names": pdf["names"].sort_values(), "unique_names_1": pdf["names"].nunique(dropna=False)})

Unnamed: 0,nrs,names,unique_names_1
3,11.0,egg,5
0,11.0,foo,5
1,11.0,ham,5
2,11.0,spam,5
4,11.0,,5


#### Select certain elements from a column by filtering from another

In [59]:
df.select(col("names").filter(col("random") > 0.4))

names
str
"""foo"""
"""ham"""
""


Syntax in Pandas is way less readable

In [60]:
pdf.loc[pdf["random"] > 0.4][["names"]]

Unnamed: 0,names
0,foo
1,ham
4,


### Complex expressions are possible which are all *embarrassingly parallel* by design and thus parallelized

In [61]:
df.select([pl.when(col("random") > 0.5).then(0).otherwise(col("random")).alias("result") * pl.sum("nrs")])

result
f64
5.324
0.0
2.065
3.496
0.0


SQL-like `when`/`then`/`otherwise` statements are not possible in Pandas, thus we use `np.where`

In [62]:
pd.Series(np.where(pdf["random"] > 0.5, 0, pdf["random"]*pdf["nrs"].sum()), name="result").to_frame()

Unnamed: 0,result
0,5.323855
1,0.0
2,2.064787
3,3.496318
4,0.0


#### Even window expressions are possible

In [63]:
df.select([
        col("*"),  # select all
        col("random").sum().over("groups").alias("sum[random]/groups"),
        col("random").list().over("names").alias("random/name"),
    ])

nrs,names,random,groups,sum[random]/groups,random/name
i64,str,f64,str,f64,list
1.0,"""foo""",0.484,"""A""",1.333,"""[0.4839868474002975]"""
2.0,"""ham""",0.849,"""A""",1.333,"""[0.8492025955510352]"""
3.0,"""spam""",0.188,"""B""",0.799,"""[0.18770790330576081]"""
,"""egg""",0.318,"""C""",0.318,"""[0.3178470681965706]"""
5.0,,0.612,"""B""",0.799,"""[0.6115924506977032]"""


Doing the same in Pandas is a bit more complex. Also note that there is an unexpected `NaN` in the last row. This is due to the fact that when inserting `pdf.groupby(['names'], dropna=False)['random'].apply(list)` we compare `NaN` to `NaN` which is false by definition. Another subtle problem caused by the fact that Pandas uses `NaN` to express `NA`.
Also note that Polars needs no explicit index like Pandas to do operations like this, just like Spark has no way to set an index explicitly.

In [83]:
# Emulate the two Polars window expressions through index alignment: `assign`
# matches each per-group Series to the rows via the current index, so the frame
# is re-indexed by each grouping key in turn.
(pdf.set_index("groups")
    # per-group sum of `random`, indexed by `groups`
    .assign(**{"sum[random]/groups": pdf.groupby(['groups'])['random'].sum()})
    # switch the index to `names` for the second alignment; note that `groups`
    # no longer appears as a column in the displayed result
    .set_index("names")
    # dropna=False keeps the missing name as a group of its own
    .assign(**{"random/name": pdf.groupby(['names'], dropna=False)['random'].apply(list)})
    .reset_index()
)

Unnamed: 0,names,nrs,random,sum[random]/groups,random/name
0,foo,1.0,0.483987,1.333189,[0.4839868474002975]
1,ham,2.0,0.849203,1.333189,[0.8492025955510352]
2,spam,3.0,0.187708,0.7993,[0.18770790330576081]
3,egg,,0.317847,0.317847,[0.3178470681965706]
4,,5.0,0.611592,0.7993,


# GroupBy

In [30]:
df = pl.read_csv("https://theunitedstates.io/congress-legislators/legislators-current.csv")

In [31]:
# Lazy query: the five most frequent first names, with the genders collected
# into a list and the first last name seen for each.
q = (
    df
    .lazy() # allows for working only on a subset using limit
    .groupby("first_name")
    # one output column per expression; the result names them
    # `party_count`, `gender_agg_list` and `last_name_first`
    .agg([pl.count("party"), col("gender").list(), pl.first("last_name")])
    .sort("party_count", reverse=True)
    .limit(5)
)
q.collect()  # the lazy query only runs here

first_name,party_count,gender_agg_list,last_name_first
str,u32,list,str
"""John""",19,"""[M, M, ... M]""","""Barrasso"""
"""Mike""",12,"""[M, M, ... M]""","""Kelly"""
"""Michael""",11,"""[M, M, ... M]""","""Bennet"""
"""David""",11,"""[M, M, ... M]""","""Cicilline"""
"""James""",9,"""[M, M, ... M]""","""Inhofe"""


Note how easily we can deal with lists of strings by aggregating over gender using `list()`.

Even conditionals work with aggregations:

In [32]:
# Lazy query: per-state counts of Democrats ("anti") and Republicans ("pro"),
# keeping the five states with the highest "pro" count.
q = (
    df.lazy()
    .groupby("state")
    .agg(
        [
            # summing a boolean comparison counts the rows where it holds
            (col("party") == "Democrat").sum().alias("anti"),
            (col("party") == "Republican").sum().alias("pro"),
        ]
    )
    .sort("pro", reverse=True)
    .limit(5)
)
q.collect()

state,anti,pro
str,u32,u32
"""TX""",13,24
"""FL""",10,18
"""OH""",4,13
"""CA""",44,11
"""PA""",10,10


Expressions make it easy to compose more complex aggregations

In [33]:
def compute_age(reference: datetime = datetime(2021, 1, 1)) -> pl.Expr:
    """Build an expression giving the age in years for each row's `birthday`.

    Parameters
    ----------
    reference
        Point in time at which the age is measured. Defaults to 2021-01-01,
        the date that was previously hard-coded, so existing calls without
        arguments behave exactly as before.

    Returns
    -------
    pl.Expr
        `(reference - birthday)` divided by the number of milliseconds in a
        365-day year. Leap days are not accounted for.
    """
    # Date64 is time in ms
    ms_per_year = 1e3 * 3600 * 24 * 365
    return (lit(reference) - col("birthday")) / ms_per_year


def avg_age(gender: str) -> pl.Expr:
    """Mean age of the rows whose `gender` column equals ``gender``.

    The resulting expression is aliased as ``"avg <gender> age"``.
    """
    matches_gender = col("gender") == gender
    ages_of_gender = compute_age().filter(matches_gender)
    return ages_of_gender.mean().alias(f"avg {gender} age")


# Lazy query: per-state average age by gender plus head counts, composed from
# the `avg_age` helper expression and two boolean sums.
q = (
    df.lazy()
    .groupby(["state"])
    .agg(
        [
             avg_age("M"),
             avg_age("F"),
            # summing a boolean comparison counts the rows where it holds
            (col("gender") == "M").sum().alias("# male"),
            (col("gender") == "F").sum().alias("# female"),
        ]
    )
    # NOTE(review): no sort precedes this limit, so which five states are
    # returned presumably depends on the groupby's output order -- confirm
    .limit(5)
)
q.collect()

state,avg M age,avg F age,# male,# female
str,f64,f64,u32,u32
"""OH""",59.51,72.726,15,2
"""AL""",65.167,56.038,8,1
"""WV""",65.612,68.677,3,2
"""CT""",63.439,62.871,5,2
"""AZ""",60.004,59.168,8,3


# User-Defined (Aggregation) Functions

In [34]:
# Small demo frame: `foo` is 0..9, `bar` holds uniform draws from [0, 1) and
# `cls` is a random 0/1 label.
# NOTE(review): no seed is set, so these values -- and every output derived
# from them below -- change on each run.
df = pl.DataFrame({"foo": np.arange(10), "bar": np.random.rand(10), "cls": np.random.randint(2, size=10)})

In [35]:
df

foo,bar,cls
i64,f64,i64
0,0.488,0
1,0.304,1
2,0.913,0
3,0.864,1
4,0.704,1
5,0.654,0
6,0.889,1
7,0.908,1
8,0.133,1
9,0.435,1


#### Vector Operations

`map` for vector operations on a whole column

In [36]:
def my_custom_func(s: pl.Series) -> pl.Series:
    """Vectorised ``exp(s) / log(s)`` evaluated over a whole column at once."""
    exponentials = np.exp(s)
    logarithms = np.log(s)
    return exponentials / logarithms

# Wrap the function as a UDF that declares a Float64 result.
my_udf = pl.udf(my_custom_func, output_type=pl.Float64)

In [37]:
df.filter(pl.col("bar").map(my_udf) > -1)

foo,bar,cls
i64,f64,i64
8,0.133,1


`apply` for scalar operations on a cell level

In [38]:
df.select(col("bar").apply(lambda x: 3*x))

bar
f64
1.465
0.912
2.738
2.592
2.112
1.962
2.667
2.723
0.398
1.306


#### Aggregation Operations

In [39]:
df.groupby(["cls"]).agg([col("bar").apply(lambda a: a.sum()* 3)])

cls,bar
i64,f64
1,12.711
0,6.165
