[Reference](https://medium.com/@pyzone.dev/polars-a-dataframe-library-faster-than-pandas-c1267315af0e)

In [1]:
pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting polars
  Downloading polars-0.16.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.16.1


In [2]:
import polars as pl
import numpy as np

In [3]:
# Small demo frame mixing two kinds of missing values: None is shown as null
# in the printed table, while np.nan stays a float NaN.
sample_columns = {
    'col_str': list('abcde'),
    'col_int': [1, None, 3, 4, 5],
    'col_float': [0.1, np.nan, 0.3, None, 0.5],
}
df = pl.DataFrame(sample_columns)
print(df)

shape: (5, 3)
┌─────────┬─────────┬───────────┐
│ col_str ┆ col_int ┆ col_float │
│ ---     ┆ ---     ┆ ---       │
│ str     ┆ i64     ┆ f64       │
╞═════════╪═════════╪═══════════╡
│ a       ┆ 1       ┆ 0.1       │
│ b       ┆ null    ┆ NaN       │
│ c       ┆ 3       ┆ 0.3       │
│ d       ┆ 4       ┆ null      │
│ e       ┆ 5       ┆ 0.5       │
└─────────┴─────────┴───────────┘


In [4]:
# Dimensions of the frame: shape is a (rows, columns) tuple, and height / width
# are its two components (5 rows, 3 columns here).
print(df.shape)
print(df.height)
print(df.width)

(5, 3)
5
3


In [5]:
# Show only the first 2 rows.
print(df.head(2))

shape: (2, 3)
┌─────────┬─────────┬───────────┐
│ col_str ┆ col_int ┆ col_float │
│ ---     ┆ ---     ┆ ---       │
│ str     ┆ i64     ┆ f64       │
╞═════════╪═════════╪═══════════╡
│ a       ┆ 1       ┆ 0.1       │
│ b       ┆ null    ┆ NaN       │
└─────────┴─────────┴───────────┘


In [6]:
# A single integer index picks the row at that position; the result is still
# a DataFrame (one row, all columns).
print(df[2])

shape: (1, 3)
┌─────────┬─────────┬───────────┐
│ col_str ┆ col_int ┆ col_float │
│ ---     ┆ ---     ┆ ---       │
│ str     ┆ i64     ┆ f64       │
╞═════════╪═════════╪═══════════╡
│ c       ┆ 3       ┆ 0.3       │
└─────────┴─────────┴───────────┘


In [7]:
# Slices work on rows: everything from row position 3 to the end.
print(df[3:])

shape: (2, 3)
┌─────────┬─────────┬───────────┐
│ col_str ┆ col_int ┆ col_float │
│ ---     ┆ ---     ┆ ---       │
│ str     ┆ i64     ┆ f64       │
╞═════════╪═════════╪═══════════╡
│ d       ┆ 4       ┆ null      │
│ e       ┆ 5       ┆ 0.5       │
└─────────┴─────────┴───────────┘


In [8]:
# [rows, column]: rows 1 and 3 of 'col_str'. With a list of rows the result is
# a one-column DataFrame.
print(df[[1, 3], 'col_str'])

shape: (2, 1)
┌─────────┐
│ col_str │
│ ---     │
│ str     │
╞═════════╡
│ b       │
│ d       │
└─────────┘


In [9]:
# [rows, columns] using integer positions for both: rows 1 and 3, columns 0 and 2.
print(df[[1, 3], [0, 2]])

shape: (2, 2)
┌─────────┬───────────┐
│ col_str ┆ col_float │
│ ---     ┆ ---       │
│ str     ┆ f64       │
╞═════════╪═══════════╡
│ b       ┆ NaN       │
│ d       ┆ null      │
└─────────┴───────────┘


In [10]:
# A list of column names keeps just those columns, for every row.
print(df[['col_str', 'col_float']])

shape: (5, 2)
┌─────────┬───────────┐
│ col_str ┆ col_float │
│ ---     ┆ ---       │
│ str     ┆ f64       │
╞═════════╪═══════════╡
│ a       ┆ 0.1       │
│ b       ┆ NaN       │
│ c       ┆ 0.3       │
│ d       ┆ null      │
│ e       ┆ 0.5       │
└─────────┴───────────┘


In [11]:
# A single column name returns a Series rather than a DataFrame.
print(df['col_int'])  # df.col_int is given as an alternative spelling — TODO confirm it works on the installed polars version

shape: (5,)
Series: 'col_int' [i64]
[
	1
	null
	3
	4
	5
]


In [12]:
# [row, column] with two scalars returns the plain value; -1 addresses the last row.
print(df[-1, 'col_float'])

0.5


In [15]:
# df['col_bool'] = [True, True, False, False, True]

In [14]:
# Add a boolean column (the expression-API counterpart of the commented-out
# item assignment df['col_bool'] = [...]); the result is assigned back to df.
# with_columns is used because with_column is deprecated and emits a warning.
df = df.with_columns([
    pl.Series('col_bool', [True, True, False, False, True]),
])

  df = df.with_column(


In [16]:
# Rename col_float to col_flt via an {old_name: new_name} mapping; the result
# is assigned back to df.
df = df.rename({'col_float': 'col_flt'})

In [18]:
# df['col_flt'] = df['col_flt'].cast(pl.Float32)

In [19]:
# Remove the col_bool column; the result is assigned back to df.
df = df.drop('col_bool')

In [20]:
# Convert to pandas and to NumPy. In both results the polars null and the
# float NaN are shown as NaN/nan, and col_int shows up as floats (1.0, 3.0, ...).
print(df.to_pandas())
print(df.to_numpy())

  col_str  col_int  col_flt
0       a      1.0      0.1
1       b      NaN      NaN
2       c      3.0      0.3
3       d      4.0      NaN
4       e      5.0      0.5
[['a' 1.0 0.1]
 ['b' nan nan]
 ['c' 3.0 0.3]
 ['d' 4.0 nan]
 ['e' 5.0 0.5]]


In [21]:
# Adjust how printed tables are truncated — presumably at most 20 rows and
# 10 columns; confirm against the polars Config documentation.
# The last call's return value (the Config class) is echoed as the cell output.
pl.Config.set_tbl_rows(20)
pl.Config.set_tbl_cols(10)

polars.cfg.Config

In [22]:
# Fresh three-row frame; the None in col_int becomes a null entry.
small_columns = {'col_str': ['a', 'b', 'c'], 'col_int': [1, 2, None]}
df = pl.DataFrame(small_columns)

In [24]:
# df['col_int_div_2'] = df['col_int'].apply(lambda x: x / 2)
# df = df.with_columns([
#     pl.col('col_int')
#     .is_in([1, 2])
#     .is_not()
#     .alias('col_int_not_in_1_2'),
#     # when / then / otherwise
#     pl.when(pl.col('col_int_div_2') >= 1)
#     .then(1)
#     .otherwise(pl.Series([11, 12, 13]))
#     .alias('wto'),
# ])
# print(df)

In [26]:
# from datetime import datetime

# df_2 = pl.DataFrame({
#     'col_str': ['a', 'c', 'd'],
#     'col_datetime': [
#         datetime.strptime(
#             f'2021-10-{i} 11:22:33 +0900',
#             '%Y-%m-%d %H:%M:%S %z'
#         ) for i in [12, 15, 17]
#     ],
# })
# df_join = df[['col_str', 'col_int']].join(
#     df_2, on='col_str', how='left')
# print(df_join)

In [27]:
# Append three more rows: vstack places a frame with the same columns
# underneath the existing one.
extra_rows = pl.DataFrame({
    'col_str': ['x', 'y', 'z'],
    'col_int': [7, 8, 9],
})
base_rows = df[['col_str', 'col_int']]
df = base_rows.vstack(extra_rows)
print(df)

shape: (6, 2)
┌─────────┬─────────┐
│ col_str ┆ col_int │
│ ---     ┆ ---     │
│ str     ┆ i64     │
╞═════════╪═════════╡
│ a       ┆ 1       │
│ b       ┆ 2       │
│ c       ┆ null    │
│ x       ┆ 7       │
│ y       ┆ 8       │
│ z       ┆ 9       │
└─────────┴─────────┘


In [28]:
# Keep rows with 1 <= col_int <= 7 (the row whose col_int is null drops out
# as well), then order them from largest to smallest col_int.
col_int = pl.col('col_int')
in_range = (col_int >= 1) & (col_int <= 7)
df = df.filter(in_range).sort('col_int', reverse=True)
print(df)

shape: (3, 2)
┌─────────┬─────────┐
│ col_str ┆ col_int │
│ ---     ┆ ---     │
│ str     ┆ i64     │
╞═════════╪═════════╡
│ x       ┆ 7       │
│ b       ┆ 2       │
│ a       ┆ 1       │
└─────────┴─────────┘


In [30]:
# df['col_int_shifted'] = df['col_int'].shift(1)
# print(df)

In [31]:
# Ten-row frame with a repeated key column (col_str) for the aggregation
# and window examples.
df = pl.DataFrame(
    {
        'col_str': list('abcabcabac'),
        'col_int': [1, 3, 2, 6, 5, 3, 1, 4, 2, 1],
        'col_float': [0.2, 0.4, 0.1, 0.5, 0.6, 0.8, 0.9, 0.1, 0.5, 0.2],
    }
)

In [32]:
# Summary statistics for every column; the string column gets null for
# mean, std and median.
print(df.describe())

shape: (7, 4)
┌────────────┬─────────┬─────────┬───────────┐
│ describe   ┆ col_str ┆ col_int ┆ col_float │
│ ---        ┆ ---     ┆ ---     ┆ ---       │
│ str        ┆ str     ┆ f64     ┆ f64       │
╞════════════╪═════════╪═════════╪═══════════╡
│ count      ┆ 10      ┆ 10.0    ┆ 10.0      │
│ null_count ┆ 0       ┆ 0.0     ┆ 0.0       │
│ mean       ┆ null    ┆ 2.8     ┆ 0.43      │
│ std        ┆ null    ┆ 1.75119 ┆ 0.283039  │
│ min        ┆ a       ┆ 1.0     ┆ 0.1       │
│ max        ┆ c       ┆ 6.0     ┆ 0.9       │
│ median     ┆ null    ┆ 2.5     ┆ 0.45      │
└────────────┴─────────┴─────────┴───────────┘


In [33]:
# Per-group maximum of the remaining columns, one row per col_str value.
# The groups come back as a, c, b rather than in input order — groupby does not
# appear to preserve order by default (polars documents a maintain_order flag; confirm).
print(df.groupby('col_str').max())

shape: (3, 3)
┌─────────┬─────────┬───────────┐
│ col_str ┆ col_int ┆ col_float │
│ ---     ┆ ---     ┆ ---       │
│ str     ┆ i64     ┆ f64       │
╞═════════╪═════════╪═══════════╡
│ a       ┆ 6       ┆ 0.9       │
│ c       ┆ 3       ┆ 0.8       │
│ b       ┆ 5       ┆ 0.6       │
└─────────┴─────────┴───────────┘


In [35]:
# df_agg = df.groupby('col_str').agg([
#     pl.col('col_float').sum(),
#     pl.sum('col_int'),  # shorthand form
#     pl.sum('col_int').alias('int_sum'),  # you can name the column yourself
#     pl.col('col_int').list(),  # can also be collected into a list
#     pl.col('col_int').first(),  # also count, mean, and so on
#     (pl.col('col_int') > 2).sum().alias(
#         'col_int_gt_2_count'),  # count the rows that satisfy the condition
# ])
# print(df_agg)

In [36]:
# Window expressions: aggregate within each col_str group and broadcast the
# group's value back onto every row of that group.
max_int_by_str = (
    pl.col('col_int').max().over('col_str').alias('max_int_by_str')
)
avg_float_by_str = (
    pl.col('col_float').mean().over('col_str').alias('avg_float_by_str')
)
df_window = df.select([
    pl.all(),  # select all columns from the original df
    max_int_by_str,
    avg_float_by_str,
])
print(df_window)

shape: (10, 5)
┌─────────┬─────────┬───────────┬────────────────┬──────────────────┐
│ col_str ┆ col_int ┆ col_float ┆ max_int_by_str ┆ avg_float_by_str │
│ ---     ┆ ---     ┆ ---       ┆ ---            ┆ ---              │
│ str     ┆ i64     ┆ f64       ┆ i64            ┆ f64              │
╞═════════╪═════════╪═══════════╪════════════════╪══════════════════╡
│ a       ┆ 1       ┆ 0.2       ┆ 6              ┆ 0.525            │
│ b       ┆ 3       ┆ 0.4       ┆ 5              ┆ 0.366667         │
│ c       ┆ 2       ┆ 0.1       ┆ 3              ┆ 0.366667         │
│ a       ┆ 6       ┆ 0.5       ┆ 6              ┆ 0.525            │
│ b       ┆ 5       ┆ 0.6       ┆ 5              ┆ 0.366667         │
│ c       ┆ 3       ┆ 0.8       ┆ 3              ┆ 0.366667         │
│ a       ┆ 1       ┆ 0.9       ┆ 6              ┆ 0.525            │
│ b       ┆ 4       ┆ 0.1       ┆ 5              ┆ 0.366667         │
│ a       ┆ 2       ┆ 0.5       ┆ 6              ┆ 0.525            │
│ c  

In [39]:
# # This does not work unless the frame is first sorted by the column passed to over()
# df_window_sort = df.sort('col_str').select([
#     pl.all(),
#     pl.col('col_int')
#     .rank('min')
#     .over('col_str')
#     .flatten()
#     .alias('rank_int_by_str'),
# ])
# print(df_window_sort)

In [40]:
# Two key columns plus one value column; this is the input for the
# (commented-out) pivot example below.
df = pl.DataFrame(
    dict(
        col_str=['a', 'a', 'a', 'b', 'b'],
        col_str_2=['x', 'y', 'z', 'x', 'y'],
        col_int=[1, 3, 1, 2, 5],
    )
)

In [42]:
# df_pivot = df.groupby('col_str').pivot(
#     pivot_column='col_str_2',
#     values_column='col_int'
# ).first()
# print(df_pivot)

In [46]:
# from contextlib import contextmanager
# import time
# import numpy as np
# import pandas as pd
# import polars as pl

# @contextmanager
# def timer(name: str):
#     t0 = time.time()
#     yield
#     print(f'{name}: {time.time() - t0:.1f} s')
# np.random.seed(42)
# N = 10**8
# M = 10**4
# df_dict = {
#     'col_int': np.random.randint(0, M, N),
#     'col_float': np.random.rand(N),
# }
# df_dict_2 = {
#     'col_int': np.random.randint(0, 10**5, M),
#     'col_float': np.random.rand(M),
# }
# df_pd = pd.DataFrame(df_dict)
# df_pl = pl.DataFrame(df_dict)
# df_pd_2 = pd.DataFrame(df_dict_2)
# df_pl_2 = pl.DataFrame(df_dict_2)

# with timer('pandas groupby'):
#     df_pd.groupby('col_int').agg({'col_float': 'mean'})
# with timer('polars groupby'):
#     df_pl.groupby('col_int').agg({'col_float': 'mean'})
    
# with timer('pandas join'):
#     pd.merge(
#         df_pd, df_pd_2, on='col_int',
#         how='left',  suffixes=['', '_2']
#     )
# with timer('polars join'):
#     df_pl.join(
#         df_pl_2, on='col_int',
#         how='left', suffix='_2'
#     )
# with timer('pandas sort'):
#     df_pd.sort_values('col_float')
# with timer('polars sort'):
#     df_pl.sort('col_float')
# with timer('pandas filter'):
#     df_pd.query('col_float < 0.5')
# with timer('polars filter'):
#     df_pl.filter(pl.col('col_float') < 0.5)