In [1]:
import gc
import random

import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

### Время осознать истинную мощь Polars &#x1F4A3;

#### Reading the data

In [5]:
%%time
# Eagerly parse the CSV with Polars, then drop the frame and force a GC pass
# so it does not stay resident for the following cells. The `del` and the
# collection run inside the timed cell as well.
df = pl.read_csv("data/dummy_dataset.csv")
del df
gc.collect()

CPU times: user 9.35 s, sys: 799 ms, total: 10.2 s
Wall time: 1.59 s


0

In [6]:
%%time
# Same read with pandas for comparison; the frame is dropped and a GC pass is
# forced inside the timed cell, mirroring the Polars cell.
df_pd = pd.read_csv("data/dummy_dataset.csv")
del df_pd
gc.collect()

CPU times: user 19.2 s, sys: 1.59 s, total: 20.8 s
Wall time: 21 s


0

![](https://media1.tenor.com/m/csmPA2hAwK0AAAAC/monkey-spin.gif)

In [7]:
# Load the same file with both libraries and print each frame's in-memory size.
# Both `df` and `df_pd` are kept: later cells reuse them.
df = pl.read_csv("data/dummy_dataset.csv")
# Polars reports its own size estimate, scaled to the requested unit.
print(f"Polars memoru usage: {df.estimated_size(unit='gb'):.3f} GB")
df_pd = pd.read_csv("data/dummy_dataset.csv")
# deep=True makes pandas inspect object (string) columns instead of counting
# only the pointers; the byte total is divided by 1024**3 to express it in GB.
print(
    f"Pandas memoru usage: {df_pd.memory_usage(deep=True).sum() / (1024 * 1024 * 1024):.3f} GB"
)

Polars memoru usage: 0.997 GB
Pandas memoru usage: 4.718 GB


In [8]:
%%time
# Summary statistics for the Polars frame (displayed as the cell result).
df.describe()

CPU times: user 3.99 s, sys: 856 ms, total: 4.84 s
Wall time: 861 ms


statistic,id,name,birth_date,hair_color,is_MU_fan,iq,accession_month,balance,debt,country,city,street_name,building_number,coordinate,license_plate,vin,swift,company_name,company_suffix,job_name,credit_card_expire,credit_card_number,credit_card_security_code,currency
str,f64,str,str,str,f64,f64,f64,f64,f64,str,str,str,f64,f64,str,str,str,str,str,str,str,f64,f64,str
"""count""",5000005.0,"""5000005""","""5000005""","""5000005""",5000005.0,5000005.0,5000005.0,5000005.0,5000005.0,"""5000005""","""5000005""","""5000005""",5000005.0,5000005.0,"""5000005""","""5000005""","""5000005""","""5000005""","""5000005""","""5000005""","""5000005""",5000005.0,5000005.0,"""5000005"""
"""null_count""",0.0,"""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,"""0""","""0""","""0""",0.0,0.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,0.0,"""0"""
"""mean""",4997.99319,,,,0.500334,499.568274,6.498762,499860000.0,4999600000.0,,,,18498.602239,0.001235,,,,,,,,3.7672e+17,874.367594,
"""std""",2885.534873,,,,,288.726905,3.44521,288610000.0,2888300000.0,,,,27921.598079,103.915211,,,,,,,,1.2452e+18,1521.526748,
"""min""",0.0,"""Aaron""","""1994-08-14""","""AliceBlue""",0.0,0.0,1.0,192.502858,418.515157,"""Afghanistan""","""Aaronberg""","""Aaron Alley""",0.0,-179.99998,"""0-00001H""","""000010JS3ZKE9HRG1""","""AAAAGB08""","""Abbott Group""","""Group""","""Academic librarian""","""01/25""",60400000000.0,0.0,"""AED"""
"""25%""",2500.0,,,,,249.0,4.0,249980000.0,2496000000.0,,,,675.0,-90.022236,,,,,,,,180000000000000.0,269.0,
"""50%""",4997.0,,,,,500.0,7.0,499920000.0,4999500000.0,,,,4546.0,0.014419,,,,,,,,3512400000000000.0,540.0,
"""75%""",7495.0,,,,,750.0,9.0,749550000.0,7500800000.0,,,,25052.0,89.977051,,,,,,,,4624200000000000.0,810.0,
"""max""",9999.0,"""Zoe""","""2024-08-13""","""YellowGreen""",1.0,999.0,12.0,1000000000.0,10000000000.0,"""Zimbabwe""","""Zunigaville""","""Zuniga Wells""",99999.0,179.999937,"""ZZZ8110""","""ZZZZT77A8RXP8FH7H""","""ZZZZGBT1""","""Zuniga-Zuniga""","""and Sons""","""Youth worker""","""12/33""",5e+18,9999.0,"""ZWD"""


In [6]:
%%time
# Summary statistics for the pandas frame (displayed as the cell result).
df_pd.describe()

CPU times: user 1.34 s, sys: 252 ms, total: 1.6 s
Wall time: 1.76 s


Unnamed: 0,id,iq,accession_month,balance,debt,building_number,coordinate,credit_card_number,credit_card_security_code
count,5000005.0,5000005.0,5000005.0,5000005.0,5000005.0,5000005.0,5000005.0,5000005.0,5000005.0
mean,4997.993,499.5683,6.498762,499857300.0,4999578000.0,18498.6,0.001235371,3.767211e+17,874.3676
std,2885.535,288.7269,3.44521,288607600.0,2888256000.0,27921.6,103.9152,1.245163e+18,1521.527
min,0.0,0.0,1.0,192.5029,418.5152,0.0,-180.0,60400000000.0,0.0
25%,2500.0,249.0,4.0,249983800.0,2496024000.0,675.0,-90.02224,180000200000000.0,269.0
50%,4997.0,500.0,7.0,499920800.0,4999505000.0,4546.0,0.014419,3512421000000000.0,540.0
75%,7495.0,750.0,9.0,749547800.0,7500798000.0,25052.0,89.97705,4624171000000000.0,810.0
max,9999.0,999.0,12.0,999999700.0,10000000000.0,99999.0,179.9999,4.999998e+18,9999.0


#### Grouping

In [7]:
# Benchmark configuration: candidate grouping keys, numeric columns that get
# mean / std / median, and string columns whose most frequent value is taken.
cols_for_groupping = [
    "name",
    "hair_color",
    "accession_month",
    "credit_card_expire",
    "currency",
    "city",
]
cols_for_aggregation_num = [
    "balance",
    "debt",
    "building_number",
    "credit_card_number",
    "credit_card_security_code",
    "coordinate",
]
cols_for_aggregation_cat = ["country", "street_name", "credit_card_expire"]

# Demo on the first key only. Expressions are ordered as all means, all stds,
# all medians, then the modes, which fixes the column order of the result.
df.group_by(cols_for_groupping[0]).agg(
    [pl.col(name).mean().alias(f"{name}_avg") for name in cols_for_aggregation_num]
    + [pl.col(name).std().alias(f"{name}_std") for name in cols_for_aggregation_num]
    + [
        pl.col(name).median().alias(f"{name}_median")
        for name in cols_for_aggregation_num
    ]
    + [
        pl.col(name).mode().first().alias(f"{name}_mode")
        for name in cols_for_aggregation_cat
    ]
)

name,balance_avg,debt_avg,building_number_avg,credit_card_number_avg,credit_card_security_code_avg,coordinate_avg,balance_std,debt_std,building_number_std,credit_card_number_std,credit_card_security_code_std,coordinate_std,balance_median,debt_median,building_number_median,credit_card_number_median,credit_card_security_code_median,coordinate_median,country_mode,street_name_mode,credit_card_expire_mode
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str
"""Rickey""",5.1210e8,4.9930e9,18296.096091,4.3568e17,756.26873,-0.026451,2.8581e8,2.7637e9,27856.898073,1.3453e18,1174.245561,101.795071,5.2950e8,4.9390e9,4031.5,3.5049e15,535.5,1.681911,"""Nicaragua""","""Williams Expressway""","""10/25"""
"""Debra""",5.0311e8,4.9880e9,18859.38333,3.7167e17,850.378181,-0.090819,2.8860e8,2.9013e9,28210.803402,1.2377e18,1480.660039,103.604484,5.0896e8,5.0032e9,4668.5,3.5100e15,533.5,1.620669,"""Congo""","""Smith Square""","""03/31"""
"""Travis""",4.9522e8,4.9952e9,18449.438772,3.7177e17,892.879807,-0.267829,2.8839e8,2.8909e9,28191.760521,1.2377e18,1550.740545,103.484138,4.9205e8,5.0175e9,4185.0,3.5155e15,544.0,-0.631944,"""Congo""","""Christopher Islands""","""10/24"""
"""Whitney""",5.0952e8,4.9804e9,18844.158577,3.8525e17,836.235156,1.700718,2.8894e8,2.8860e9,28203.954169,1.2638e18,1431.933426,103.515747,5.1888e8,4.9513e9,4591.0,3.5171e15,544.0,4.723871,"""Korea""","""Smith Isle""","""05/27"""
"""Geoffrey""",5.0721e8,4.9723e9,17749.273059,3.6979e17,838.471233,-0.357873,2.9131e8,2.8701e9,27054.501021,1.2308e18,1421.421416,103.508626,5.1591e8,4.9248e9,4596.0,3.5187e15,519.0,3.113072,"""Korea""","""Michael Heights""","""06/28"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Cassandra""",5.0253e8,5.0364e9,18267.537155,3.9286e17,886.00127,0.083164,2.8759e8,2.8797e9,27709.987812,1.2702e18,1553.786291,104.213801,5.0403e8,5.0556e9,4414.5,3.5150e15,546.0,2.477512,"""Congo""","""Jeffrey Run""","""11/29"""
"""Benjamin""",5.0088e8,5.0100e9,18656.637876,3.7351e17,878.917184,-0.63615,2.8663e8,2.8906e9,28104.565321,1.2414e18,1527.218829,104.254524,5.0134e8,5.0300e9,4525.0,3.5146e15,540.0,-1.356776,"""Korea""","""Michael Well""","""01/27"""
"""Jeremy""",5.0177e8,4.9490e9,18412.446195,3.8940e17,869.173135,0.119967,2.8942e8,2.8980e9,27802.141185,1.2637e18,1501.79161,103.528885,5.0468e8,4.9541e9,4497.0,3.5141e15,545.0,0.122004,"""Korea""","""Michael Plains""","""11/26"""
"""Cassie""",4.7972e8,4.8537e9,17193.802879,4.7605e17,913.449612,7.533024,2.7785e8,2.8334e9,27084.350983,1.3848e18,1615.491842,105.363543,4.6473e8,4.8626e9,4756.0,3.5357e15,548.0,9.205185,"""Congo""","""Michelle Loop""","""03/33"""


In [8]:
# Number of distinct values in each candidate grouping key.
df.select(pl.col(cols_for_groupping).n_unique())

name,hair_color,accession_month,credit_card_expire,currency,city
u32,u32,u32,u32,u32,u32
690,140,12,121,164,116088


In [9]:
# Drop the pandas frame and force a GC pass before the Polars group-by runs.
del df_pd
gc.collect()

0

In [10]:
%%time
# Eager Polars group-by benchmark: one full aggregation per grouping key.
# The column lists are restated so the cell can be run on its own.
cols_for_groupping = [
    "name",
    "hair_color",
    "accession_month",
    "credit_card_expire",
    "currency",
    "city",
]
cols_for_aggregation_num = [
    "balance",
    "debt",
    "building_number",
    "credit_card_number",
    "credit_card_security_code",
    "coordinate",
]
cols_for_aggregation_cat = ["country", "street_name", "credit_card_expire"]

for col in tqdm(cols_for_groupping):
    # Means, then stds, then medians of the numeric columns, then the modes of
    # the string columns; the result is discarded straight away.
    temp = df.group_by(col).agg(
        [pl.col(name).mean().alias(f"{name}_avg") for name in cols_for_aggregation_num]
        + [pl.col(name).std().alias(f"{name}_std") for name in cols_for_aggregation_num]
        + [
            pl.col(name).median().alias(f"{name}_median")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).mode().first().alias(f"{name}_mode")
            for name in cols_for_aggregation_cat
        ]
    )
    del temp

  0%|          | 0/6 [00:00<?, ?it/s]

CPU times: user 27.2 s, sys: 2.23 s, total: 29.4 s
Wall time: 8.14 s


In [11]:
%%time
# Lazy Polars group-by benchmark: the same aggregations as the eager cell, but
# built as a LazyFrame query and only executed by collect().
cols_for_groupping = [
    "name",
    "hair_color",
    "accession_month",
    "credit_card_expire",
    "currency",
    "city",
]
cols_for_aggregation_num = [
    "balance",
    "debt",
    "building_number",
    "credit_card_number",
    "credit_card_security_code",
    "coordinate",
]
cols_for_aggregation_cat = ["country", "street_name", "credit_card_expire"]
for col in tqdm(cols_for_groupping):
    temp = df.lazy().group_by(col).agg(
        [pl.col(name).mean().alias(f"{name}_avg") for name in cols_for_aggregation_num]
        + [pl.col(name).std().alias(f"{name}_std") for name in cols_for_aggregation_num]
        + [
            pl.col(name).median().alias(f"{name}_median")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).mode().first().alias(f"{name}_mode")
            for name in cols_for_aggregation_cat
        ]
    )
    # Run the query; the materialised frame is not kept.
    temp.collect()
    del temp

  0%|          | 0/6 [00:00<?, ?it/s]

CPU times: user 26 s, sys: 1.86 s, total: 27.9 s
Wall time: 5.92 s


In [12]:
# Drop the Polars frame and force a GC pass before pandas reloads the data.
del df
gc.collect()

41

In [13]:
# Reload the dataset with pandas for the pandas group-by benchmark.
df_pd = pd.read_csv("data/dummy_dataset.csv")

In [14]:
%%time
# pandas group-by benchmark: one aggregation per grouping key, result discarded.
cols_for_groupping = [
    "name",
    "hair_color",
    "accession_month",
    "credit_card_expire",
    "currency",
    "city",
]
cols_for_aggregation_num = [
    "balance",
    "debt",
    "building_number",
    "credit_card_number",
    "credit_card_security_code",
    "coordinate",
]
cols_for_aggregation_cat = ["country", "street_name", "credit_card_expire"]

# The aggregation spec does not depend on the key: three statistics for every
# numeric column and the mode for every string column.
aggs = {
    **{num: ["mean", "median", "std"] for num in cols_for_aggregation_num},
    **{cat: pd.Series.mode for cat in cols_for_aggregation_cat},
}
for col in tqdm(cols_for_groupping):
    temp = df_pd.groupby(by=col).agg(aggs)
    del temp

  0%|          | 0/6 [00:00<?, ?it/s]

CPU times: user 40.6 s, sys: 1.81 s, total: 42.4 s
Wall time: 42 s


#### Joins

In [15]:
# Column lists for the join benchmarks. Compared with the group-by section the
# keys are reordered ("city" is now third), so the `range(3)` loops below use
# "name", "hair_color" and "city" as join keys.
cols_for_groupping = [
    "name",
    "hair_color",
    "city",
    "credit_card_expire",
    "currency",
    "accession_month",
]
# Numeric columns that receive mean / std / median aggregates.
cols_for_aggregation_num = [
    "balance",
    "debt",
    "building_number",
    "credit_card_number",
    "credit_card_security_code",
    "coordinate",
]
# String columns whose mode is taken.
cols_for_aggregation_cat = ["country", "street_name", "credit_card_expire"]

In [16]:
# Drop the pandas frame and force a GC pass before the Polars join benchmark.
del df_pd
gc.collect()

21

In [17]:
# Fresh Polars frame for the eager join benchmark.
df = pl.read_csv("data/dummy_dataset.csv")

In [18]:
# One aggregate table per join key (the first three entries of
# cols_for_groupping). Every output column carries the key index `i` as a
# suffix so the three tables can be joined onto `df` without name clashes.
arr = [
    df.group_by(cols_for_groupping[i]).agg(
        [
            pl.col(name).mean().alias(f"{name}_avg_{i}")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).std().alias(f"{name}_std_{i}")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).median().alias(f"{name}_median_{i}")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).mode().first().alias(f"{name}_mode_{i}")
            for name in cols_for_aggregation_cat
        ]
    )
    for i in range(3)
]

In [19]:
%%time
# Eagerly left-join each per-key aggregate table back onto the full frame.
# `df` is rebound on every iteration, so re-running this cell joins again onto
# the already widened frame.
for i in tqdm(range(3)):
    df = df.join(arr[i], on=cols_for_groupping[i], how="left")

  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 1.83 s, sys: 1.08 s, total: 2.91 s
Wall time: 1.47 s


In [20]:
# Release the aggregate tables and the joined frame before the lazy variant.
del arr
del df
gc.collect()

19

In [21]:
# Setup for the lazy join benchmark: reload the data, compute the three
# aggregate tables eagerly, then wrap both the tables and the main frame as
# LazyFrames so that only the joins are deferred to the timed cell.
df = pl.read_csv("data/dummy_dataset.csv")

arr = [
    df.group_by(cols_for_groupping[i])
    .agg(
        [
            pl.col(name).mean().alias(f"{name}_avg_{i}")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).std().alias(f"{name}_std_{i}")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).median().alias(f"{name}_median_{i}")
            for name in cols_for_aggregation_num
        ]
        + [
            pl.col(name).mode().first().alias(f"{name}_mode_{i}")
            for name in cols_for_aggregation_cat
        ]
    )
    .lazy()
    for i in range(3)
]
df = df.lazy()

In [22]:
%%time
# Chain the three left joins on the LazyFrame; the loop only builds the query
# and the work happens when collect() materialises the result.
for i in tqdm(range(3)):
    df = df.join(arr[i], on=cols_for_groupping[i], how="left")
df = df.collect()



  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 1.7 s, sys: 911 ms, total: 2.61 s
Wall time: 1.56 s


In [23]:
# Release the aggregate tables and the joined frame before the pandas variant.
del arr
del df
gc.collect()

19

In [24]:
# Setup for the pandas merge benchmark: reload the data and build one aggregate
# table per join key (the first three entries of cols_for_groupping).
df_pd = pd.read_csv("data/dummy_dataset.csv")


def _flatten_agg_columns(frame, key, index):
    """Flatten the two-level columns produced by a dict-of-lists ``agg``.

    Each column label arrives as ``(source_column, statistic)``; the grouping
    key (present because of ``as_index=False``) arrives as ``(key, "")``.
    Statistics are renamed to ``"<source_column>_<statistic>_<index>"`` and the
    key keeps its plain name so it can be used as the merge column.

    The names are deterministic, unique across the per-key tables, and keep
    the information about which source column a statistic was computed from.

    Args:
        frame: aggregated DataFrame with two-level column labels.
        key: name of the grouping column.
        index: position of the key, appended as a suffix to every statistic.

    Returns:
        The same DataFrame with flat string column labels.
    """
    frame.columns = [
        key if stat == "" else f"{source}_{stat}_{index}"
        for source, stat in frame.columns
    ]
    return frame


# The aggregation spec is the same for every key, so it is built once.
aggs = {num: ["mean", "median", "std"] for num in cols_for_aggregation_num}
aggs.update({cat: pd.Series.mode for cat in cols_for_aggregation_cat})

arr = []
for i in range(3):
    arr.append(
        _flatten_agg_columns(
            df_pd.groupby(by=cols_for_groupping[i], as_index=False).agg(aggs),
            cols_for_groupping[i],
            i,
        )
    )

In [25]:
%%time
# Left-merge each per-key aggregate table back onto the full pandas frame.
# `df_pd` is rebound on every iteration, so the frame keeps getting wider.
for i in tqdm(range(3)):
    df_pd = df_pd.merge(arr[i], on=cols_for_groupping[i], how="left")

  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 7.6 s, sys: 6.31 s, total: 13.9 s
Wall time: 17.3 s


In [26]:
# Release the aggregate tables and the merged frame before the next section.
del arr
del df_pd
gc.collect()

19

#### Assigning new columns

In [27]:
# Fresh Polars frame for the column-assignment benchmark.
df = pl.read_csv("data/dummy_dataset.csv")

In [28]:
# Split the column names by dtype using Polars selectors. These plain lists are
# reused by the Polars cells and by the pandas cell further down.
num_cols = df.select(cs.numeric()).columns
cat_cols = df.select(cs.string()).columns

In [29]:
%%time
# Eager column assignment: square root, sine and "+ 5" for every numeric
# column, plus the last three characters of every string column. The new
# frame is only displayed; `df` itself is left unchanged.
df.with_columns(
    *[pl.col(name).sqrt().alias(f"{name}_sqrt") for name in num_cols],
    *[pl.col(name).sin().alias(f"{name}_sin") for name in num_cols],
    *[(pl.col(name) + 5).alias(f"{name}_add") for name in num_cols],
    *[pl.col(name).str.tail(3).alias(f"{name}_tail") for name in cat_cols],
)

CPU times: user 2.67 s, sys: 646 ms, total: 3.32 s
Wall time: 516 ms


id,name,birth_date,hair_color,is_MU_fan,iq,accession_month,balance,debt,country,city,street_name,building_number,coordinate,license_plate,vin,swift,company_name,company_suffix,job_name,credit_card_expire,credit_card_number,credit_card_security_code,currency,id_sqrt,iq_sqrt,accession_month_sqrt,balance_sqrt,debt_sqrt,building_number_sqrt,coordinate_sqrt,credit_card_number_sqrt,credit_card_security_code_sqrt,id_sin,iq_sin,accession_month_sin,balance_sin,debt_sin,building_number_sin,coordinate_sin,credit_card_number_sin,credit_card_security_code_sin,id_add,iq_add,accession_month_add,balance_add,debt_add,building_number_add,coordinate_add,credit_card_number_add,credit_card_security_code_add,name_tail,birth_date_tail,hair_color_tail,country_tail,city_tail,street_name_tail,license_plate_tail,vin_tail,swift_tail,company_name_tail,company_suffix_tail,job_name_tail,credit_card_expire_tail,currency_tail
i64,str,str,str,bool,i64,i64,f64,f64,str,str,str,i64,f64,str,str,str,str,str,str,str,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,f64,f64,i64,f64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str
1984,"""Jennifer""","""2006-05-24""","""PaleGoldenRod""",true,906,7,9.7458e8,9.6302e9,"""Bolivia""","""Lake Jeffrey""","""Alison Underpass""",602,-134.798172,"""3CD16""","""GZMAE9F91PXWCUH9R""","""DWBDGB73""","""Salazar PLC""","""PLC""","""Biomedical scientist""","""03/28""",213141891186046,5,"""NAD""",44.542115,30.099834,2.645751,31218.311267,98133.349158,24.535688,,1.4599e7,2.236068,-0.996454,0.939551,0.656987,0.989182,0.80225,-0.926796,-0.286251,0.583621,-0.958924,1989,911,12,9.7458e8,9.6302e9,607,-129.798172,213141891186051,10,"""fer""","""-24""","""Rod""","""via""","""rey""","""ass""","""D16""","""H9R""","""B73""","""PLC""","""PLC""","""ist""","""/28""","""NAD"""
4377,"""John""","""2022-03-15""","""Sienna""",false,235,1,5.7888e8,2.9483e9,"""Egypt""","""Kellyborough""","""Heather Dale""",38153,42.986348,"""90-ZJ96""","""GUGTCFXK934FD4MM8""","""IBBRGBPC""","""Payne Group""","""PLC""","""Amenity horticulturist""","""08/32""",30234030061112,436,"""TOP""",66.1589,15.32971,1.0,24059.825451,54298.345758,195.327929,6.556397,5.4985e6,20.880613,-0.68996,0.580587,0.841471,-0.730068,0.966013,0.99741,-0.839275,-0.407383,0.629865,4382,240,6,5.7888e8,2.9483e9,38158,47.986348,30234030061117,441,"""ohn""","""-15""","""nna""","""ypt""","""ugh""","""ale""","""J96""","""MM8""","""BPC""","""oup""","""PLC""","""ist""","""/32""","""TOP"""
1668,"""April""","""2022-02-20""","""DodgerBlue""",false,387,2,3.6455e7,1.1997e9,"""Namibia""","""Port Dillonland""","""Jimmy Lock""",876,46.57002,"""GBB 810""","""RN4FHXMX723BLLLLW""","""VLPDGBAD""","""Gordon, Saunders and Jimenez""","""Group""","""Production assistant, radio""","""10/32""",4239854219348534,311,"""MXN""",40.841156,19.672316,1.414214,6037.78174,34636.080016,29.597297,6.824223,6.5114e7,17.635192,0.184634,-0.551452,0.909297,-0.326152,-0.574312,0.483239,0.525982,0.999822,0.017672,1673,392,7,3.6455e7,1.1997e9,881,51.57002,4239854219348539,316,"""ril""","""-20""","""lue""","""bia""","""and""","""ock""","""810""","""LLW""","""BAD""","""nez""","""oup""","""dio""","""/32""","""MXN"""
1304,"""Nathan""","""2013-06-07""","""MintCream""",true,320,5,7.1056e8,9.1401e8,"""Switzerland""","""Lake Nicholeville""","""Jon Streets""",214,115.053304,"""OD 93833""","""JE195THF0L8WJZCTG""","""LYEUGBTX""","""Cherry Inc""","""Inc""","""Outdoor activities/education m…","""07/33""",180075318572801,5179,"""NIS""",36.11094,17.888544,2.236068,26656.260448,30232.673363,14.628739,10.72629,1.3419e7,71.965269,-0.236779,-0.428155,-0.958924,-0.446853,-0.634985,0.363199,0.926734,0.405029,0.996431,1309,325,10,7.1056e8,9.1401e8,219,120.053304,180075318572806,5184,"""han""","""-07""","""eam""","""and""","""lle""","""ets""","""833""","""CTG""","""BTX""","""Inc""","""Inc""","""ger""","""/33""","""NIS"""
8154,"""Cynthia""","""2023-11-14""","""DarkSalmon""",false,891,9,5.9632e8,4.1042e9,"""Greece""","""South Ronaldhaven""","""Brent Prairie""",50038,55.289272,"""7I RW552""","""VF2HLEMD1690DK17P""","""AFOLGBH4""","""Cruz Ltd""","""Group""","""Engineer, automotive""","""09/28""",4993489088320546606,353,"""BAM""",90.299502,29.849623,3.0,24419.582472,64064.207644,223.691752,7.435676,2.2346e9,18.788294,-0.999993,-0.93643,0.412118,0.869352,0.440658,-0.960219,-0.951905,-0.840214,0.90931,8159,896,14,5.9632e8,4.1042e9,50043,60.289272,4993489088320546611,358,"""hia""","""-14""","""mon""","""ece""","""ven""","""rie""","""552""","""17P""","""BH4""","""Ltd""","""oup""","""ive""","""/28""","""BAM"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
4922,"""Alexis""","""2024-03-24""","""Green""",true,167,5,3.9597e8,9.3008e8,"""Saint Martin""","""Lake Tara""","""Donna Harbors""",241,-119.632217,"""MVK 678""","""6R243FV412SPK04A8""","""OEBIGBIZ""","""Gutierrez-Davidson""","""PLC""","""Clinical scientist, histocompa…","""04/34""",4317515348040111,519,"""KZT""",70.156967,12.922848,2.236068,19898.898164,30497.290579,15.524175,,6.5708e7,22.781571,0.767984,-0.47555,-0.958924,-0.99032,-0.936647,0.784962,-0.249047,-0.312088,-0.594957,4927,172,10,3.9597e8,9.3008e8,246,-114.632217,4317515348040116,524,"""xis""","""-24""","""een""","""tin""","""ara""","""ors""","""678""","""4A8""","""BIZ""","""son""","""PLC""","""ics""","""/34""","""KZT"""
7522,"""Karen""","""2017-10-31""","""Olive""",false,227,10,3.4249e8,9.2168e9,"""Belarus""","""Brownfurt""","""Frank Wall""",0,-50.525241,"""39-KM46""","""DEDZ4TY44LZT2GL3L""","""RRMSGB64""","""Klein-Miller""","""Inc""","""Forensic psychologist""","""04/28""",2233322460028841,0,"""VND""",86.729464,15.066519,3.162278,18506.433395,96004.345463,0.0,,4.7258e7,0.0,0.855848,0.721059,-0.544021,0.573631,0.630359,0.0,-0.256847,-0.66105,0.0,7527,232,15,3.4249e8,9.2168e9,5,-45.525241,2233322460028846,5,"""ren""","""-31""","""ive""","""rus""","""urt""","""all""","""M46""","""L3L""","""B64""","""ler""","""Inc""","""ist""","""/28""","""VND"""
1896,"""Danielle""","""1998-07-14""","""HotPink""",true,675,8,7.7132e8,5.3113e9,"""Saint Barthelemy""","""East Robert""","""Kayla Key""",49032,86.667822,"""9S230""","""LCW80LB322VPX1A8U""","""ZXCDGBZR""","""Walton LLC""","""Ltd""","""Tax adviser""","""10/27""",2696106733714620,420,"""TTD""",43.543082,25.980762,2.828427,27772.653048,72878.477102,221.431705,9.309555,5.1924e7,20.493902,-0.998808,0.428128,0.989358,-0.83639,0.005103,-0.918178,-0.96269,0.837288,-0.826812,1901,680,13,7.7132e8,5.3113e9,49037,91.667822,2696106733714625,425,"""lle""","""-14""","""ink""","""emy""","""ert""","""Key""","""230""","""A8U""","""BZR""","""LLC""","""Ltd""","""ser""","""/27""","""TTD"""
6060,"""Mary""","""1994-10-10""","""Aqua""",true,483,2,7.6923e8,6.7472e9,"""Israel""","""Mullenmouth""","""Derrick Rapids""",9632,-152.370592,"""ZZR 9029""","""C8HJX0SW3AGH6TJC0""","""THUVGBWB""","""Rivera, Lewis and Miller""","""LLC""","""Multimedia specialist""","""10/27""",4361042620080,380,"""ALL""",77.846002,21.977261,1.414214,27735.059944,82141.20015,98.142753,,2.0883e6,19.493589,0.131844,-0.721017,0.909297,-0.646136,-0.547439,-0.122765,-0.999994,0.852743,0.132322,6065,488,7,7.6923e8,6.7472e9,9637,-147.370592,4361042620085,385,"""ary""","""-10""","""qua""","""ael""","""uth""","""ids""","""029""","""JC0""","""BWB""","""ler""","""LLC""","""ist""","""/27""","""ALL"""


In [30]:
# Switch to a LazyFrame for the lazy variant of the assignment benchmark.
df = df.lazy()

In [31]:
%%time

# Lazy column assignment: the same expressions as the eager cell, executed
# only when collect() is called. The collected frame is displayed, not stored.
df.with_columns(
    *[pl.col(name).sqrt().alias(f"{name}_sqrt") for name in num_cols],
    *[pl.col(name).sin().alias(f"{name}_sin") for name in num_cols],
    *[(pl.col(name) + 5).alias(f"{name}_add") for name in num_cols],
    *[pl.col(name).str.tail(3).alias(f"{name}_tail") for name in cat_cols],
).collect()

CPU times: user 2.7 s, sys: 591 ms, total: 3.29 s
Wall time: 660 ms


id,name,birth_date,hair_color,is_MU_fan,iq,accession_month,balance,debt,country,city,street_name,building_number,coordinate,license_plate,vin,swift,company_name,company_suffix,job_name,credit_card_expire,credit_card_number,credit_card_security_code,currency,id_sqrt,iq_sqrt,accession_month_sqrt,balance_sqrt,debt_sqrt,building_number_sqrt,coordinate_sqrt,credit_card_number_sqrt,credit_card_security_code_sqrt,id_sin,iq_sin,accession_month_sin,balance_sin,debt_sin,building_number_sin,coordinate_sin,credit_card_number_sin,credit_card_security_code_sin,id_add,iq_add,accession_month_add,balance_add,debt_add,building_number_add,coordinate_add,credit_card_number_add,credit_card_security_code_add,name_tail,birth_date_tail,hair_color_tail,country_tail,city_tail,street_name_tail,license_plate_tail,vin_tail,swift_tail,company_name_tail,company_suffix_tail,job_name_tail,credit_card_expire_tail,currency_tail
i64,str,str,str,bool,i64,i64,f64,f64,str,str,str,i64,f64,str,str,str,str,str,str,str,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,f64,f64,i64,f64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str
1984,"""Jennifer""","""2006-05-24""","""PaleGoldenRod""",true,906,7,9.7458e8,9.6302e9,"""Bolivia""","""Lake Jeffrey""","""Alison Underpass""",602,-134.798172,"""3CD16""","""GZMAE9F91PXWCUH9R""","""DWBDGB73""","""Salazar PLC""","""PLC""","""Biomedical scientist""","""03/28""",213141891186046,5,"""NAD""",44.542115,30.099834,2.645751,31218.311267,98133.349158,24.535688,,1.4599e7,2.236068,-0.996454,0.939551,0.656987,0.989182,0.80225,-0.926796,-0.286251,0.583621,-0.958924,1989,911,12,9.7458e8,9.6302e9,607,-129.798172,213141891186051,10,"""fer""","""-24""","""Rod""","""via""","""rey""","""ass""","""D16""","""H9R""","""B73""","""PLC""","""PLC""","""ist""","""/28""","""NAD"""
4377,"""John""","""2022-03-15""","""Sienna""",false,235,1,5.7888e8,2.9483e9,"""Egypt""","""Kellyborough""","""Heather Dale""",38153,42.986348,"""90-ZJ96""","""GUGTCFXK934FD4MM8""","""IBBRGBPC""","""Payne Group""","""PLC""","""Amenity horticulturist""","""08/32""",30234030061112,436,"""TOP""",66.1589,15.32971,1.0,24059.825451,54298.345758,195.327929,6.556397,5.4985e6,20.880613,-0.68996,0.580587,0.841471,-0.730068,0.966013,0.99741,-0.839275,-0.407383,0.629865,4382,240,6,5.7888e8,2.9483e9,38158,47.986348,30234030061117,441,"""ohn""","""-15""","""nna""","""ypt""","""ugh""","""ale""","""J96""","""MM8""","""BPC""","""oup""","""PLC""","""ist""","""/32""","""TOP"""
1668,"""April""","""2022-02-20""","""DodgerBlue""",false,387,2,3.6455e7,1.1997e9,"""Namibia""","""Port Dillonland""","""Jimmy Lock""",876,46.57002,"""GBB 810""","""RN4FHXMX723BLLLLW""","""VLPDGBAD""","""Gordon, Saunders and Jimenez""","""Group""","""Production assistant, radio""","""10/32""",4239854219348534,311,"""MXN""",40.841156,19.672316,1.414214,6037.78174,34636.080016,29.597297,6.824223,6.5114e7,17.635192,0.184634,-0.551452,0.909297,-0.326152,-0.574312,0.483239,0.525982,0.999822,0.017672,1673,392,7,3.6455e7,1.1997e9,881,51.57002,4239854219348539,316,"""ril""","""-20""","""lue""","""bia""","""and""","""ock""","""810""","""LLW""","""BAD""","""nez""","""oup""","""dio""","""/32""","""MXN"""
1304,"""Nathan""","""2013-06-07""","""MintCream""",true,320,5,7.1056e8,9.1401e8,"""Switzerland""","""Lake Nicholeville""","""Jon Streets""",214,115.053304,"""OD 93833""","""JE195THF0L8WJZCTG""","""LYEUGBTX""","""Cherry Inc""","""Inc""","""Outdoor activities/education m…","""07/33""",180075318572801,5179,"""NIS""",36.11094,17.888544,2.236068,26656.260448,30232.673363,14.628739,10.72629,1.3419e7,71.965269,-0.236779,-0.428155,-0.958924,-0.446853,-0.634985,0.363199,0.926734,0.405029,0.996431,1309,325,10,7.1056e8,9.1401e8,219,120.053304,180075318572806,5184,"""han""","""-07""","""eam""","""and""","""lle""","""ets""","""833""","""CTG""","""BTX""","""Inc""","""Inc""","""ger""","""/33""","""NIS"""
8154,"""Cynthia""","""2023-11-14""","""DarkSalmon""",false,891,9,5.9632e8,4.1042e9,"""Greece""","""South Ronaldhaven""","""Brent Prairie""",50038,55.289272,"""7I RW552""","""VF2HLEMD1690DK17P""","""AFOLGBH4""","""Cruz Ltd""","""Group""","""Engineer, automotive""","""09/28""",4993489088320546606,353,"""BAM""",90.299502,29.849623,3.0,24419.582472,64064.207644,223.691752,7.435676,2.2346e9,18.788294,-0.999993,-0.93643,0.412118,0.869352,0.440658,-0.960219,-0.951905,-0.840214,0.90931,8159,896,14,5.9632e8,4.1042e9,50043,60.289272,4993489088320546611,358,"""hia""","""-14""","""mon""","""ece""","""ven""","""rie""","""552""","""17P""","""BH4""","""Ltd""","""oup""","""ive""","""/28""","""BAM"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
4922,"""Alexis""","""2024-03-24""","""Green""",true,167,5,3.9597e8,9.3008e8,"""Saint Martin""","""Lake Tara""","""Donna Harbors""",241,-119.632217,"""MVK 678""","""6R243FV412SPK04A8""","""OEBIGBIZ""","""Gutierrez-Davidson""","""PLC""","""Clinical scientist, histocompa…","""04/34""",4317515348040111,519,"""KZT""",70.156967,12.922848,2.236068,19898.898164,30497.290579,15.524175,,6.5708e7,22.781571,0.767984,-0.47555,-0.958924,-0.99032,-0.936647,0.784962,-0.249047,-0.312088,-0.594957,4927,172,10,3.9597e8,9.3008e8,246,-114.632217,4317515348040116,524,"""xis""","""-24""","""een""","""tin""","""ara""","""ors""","""678""","""4A8""","""BIZ""","""son""","""PLC""","""ics""","""/34""","""KZT"""
7522,"""Karen""","""2017-10-31""","""Olive""",false,227,10,3.4249e8,9.2168e9,"""Belarus""","""Brownfurt""","""Frank Wall""",0,-50.525241,"""39-KM46""","""DEDZ4TY44LZT2GL3L""","""RRMSGB64""","""Klein-Miller""","""Inc""","""Forensic psychologist""","""04/28""",2233322460028841,0,"""VND""",86.729464,15.066519,3.162278,18506.433395,96004.345463,0.0,,4.7258e7,0.0,0.855848,0.721059,-0.544021,0.573631,0.630359,0.0,-0.256847,-0.66105,0.0,7527,232,15,3.4249e8,9.2168e9,5,-45.525241,2233322460028846,5,"""ren""","""-31""","""ive""","""rus""","""urt""","""all""","""M46""","""L3L""","""B64""","""ler""","""Inc""","""ist""","""/28""","""VND"""
1896,"""Danielle""","""1998-07-14""","""HotPink""",true,675,8,7.7132e8,5.3113e9,"""Saint Barthelemy""","""East Robert""","""Kayla Key""",49032,86.667822,"""9S230""","""LCW80LB322VPX1A8U""","""ZXCDGBZR""","""Walton LLC""","""Ltd""","""Tax adviser""","""10/27""",2696106733714620,420,"""TTD""",43.543082,25.980762,2.828427,27772.653048,72878.477102,221.431705,9.309555,5.1924e7,20.493902,-0.998808,0.428128,0.989358,-0.83639,0.005103,-0.918178,-0.96269,0.837288,-0.826812,1901,680,13,7.7132e8,5.3113e9,49037,91.667822,2696106733714625,425,"""lle""","""-14""","""ink""","""emy""","""ert""","""Key""","""230""","""A8U""","""BZR""","""LLC""","""Ltd""","""ser""","""/27""","""TTD"""
6060,"""Mary""","""1994-10-10""","""Aqua""",true,483,2,7.6923e8,6.7472e9,"""Israel""","""Mullenmouth""","""Derrick Rapids""",9632,-152.370592,"""ZZR 9029""","""C8HJX0SW3AGH6TJC0""","""THUVGBWB""","""Rivera, Lewis and Miller""","""LLC""","""Multimedia specialist""","""10/27""",4361042620080,380,"""ALL""",77.846002,21.977261,1.414214,27735.059944,82141.20015,98.142753,,2.0883e6,19.493589,0.131844,-0.721017,0.909297,-0.646136,-0.547439,-0.122765,-0.999994,0.852743,0.132322,6065,488,7,7.6923e8,6.7472e9,9637,-147.370592,4361042620085,385,"""ary""","""-10""","""qua""","""ael""","""uth""","""ids""","""029""","""JC0""","""BWB""","""ler""","""LLC""","""ist""","""/27""","""ALL"""


In [32]:
# Drop the Polars LazyFrame before pandas reloads the data.
del df

In [33]:
# Fresh pandas frame for the column-assignment benchmark.
df_pd = pd.read_csv("data/dummy_dataset.csv")

In [34]:
%%time
# The same derived columns with pandas. The numeric transforms are written
# into df_pd in place; the string tails go through `assign`, which returns a
# new frame (the displayed cell result) and leaves df_pd without *_tail columns.
df_pd[[f"{name}_sqrt" for name in num_cols]] = np.sqrt(df_pd[num_cols])
df_pd[[f"{name}_sin" for name in num_cols]] = np.sin(df_pd[num_cols])
df_pd[[f"{name}_add" for name in num_cols]] = df_pd[num_cols].add(5)
df_pd.assign(**{f"{name}_tail": df_pd[name].str.slice(-3) for name in cat_cols})

  result = func(self.values, **kwargs)


CPU times: user 10.9 s, sys: 5.95 s, total: 16.9 s
Wall time: 19.2 s


Unnamed: 0,id,name,birth_date,hair_color,is_MU_fan,iq,accession_month,balance,debt,country,...,city_tail,street_name_tail,license_plate_tail,vin_tail,swift_tail,company_name_tail,company_suffix_tail,job_name_tail,credit_card_expire_tail,currency_tail
0,1984,Jennifer,2006-05-24,PaleGoldenRod,True,906,7,9.745830e+08,9.630154e+09,Bolivia,...,rey,ass,D16,H9R,B73,PLC,PLC,ist,/28,NAD
1,4377,John,2022-03-15,Sienna,False,235,1,5.788752e+08,2.948310e+09,Egypt,...,ugh,ale,J96,MM8,BPC,oup,PLC,ist,/32,TOP
2,1668,April,2022-02-20,DodgerBlue,False,387,2,3.645481e+07,1.199658e+09,Namibia,...,and,ock,810,LLW,BAD,nez,oup,dio,/32,MXN
3,1304,Nathan,2013-06-07,MintCream,True,320,5,7.105562e+08,9.140145e+08,Switzerland,...,lle,ets,833,CTG,BTX,Inc,Inc,ger,/33,NIS
4,8154,Cynthia,2023-11-14,DarkSalmon,False,891,9,5.963160e+08,4.104223e+09,Greece,...,ven,rie,552,17P,BH4,Ltd,oup,ive,/28,BAM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5000000,4922,Alexis,2024-03-24,Green,True,167,5,3.959661e+08,9.300847e+08,Saint Martin,...,ara,ors,678,4A8,BIZ,son,PLC,ics,/34,KZT
5000001,7522,Karen,2017-10-31,Olive,False,227,10,3.424881e+08,9.216834e+09,Belarus,...,urt,all,M46,L3L,B64,ler,Inc,ist,/28,VND
5000002,1896,Danielle,1998-07-14,HotPink,True,675,8,7.713203e+08,5.311272e+09,Saint Barthelemy,...,ert,Key,230,A8U,BZR,LLC,Ltd,ser,/27,TTD
5000003,6060,Mary,1994-10-10,Aqua,True,483,2,7.692336e+08,6.747177e+09,Israel,...,uth,ids,029,JC0,BWB,ler,LLC,ist,/27,ALL


In [35]:
# Final cleanup: drop the pandas frame and force a GC pass.
del df_pd
gc.collect()

258

**How people who use Polars feel:**

![](https://media1.tenor.com/m/fx8F06O7lQsAAAAd/monkey-monkey-funny.gif)


**How people who use pandas feel:**

![](https://media1.tenor.com/m/78_RIrgHwnkAAAAd/monkey-sad-monkey.gif)