In [148]:
import polars as pl
import pandas as pd
import numpy as np
import time 
import json
import vaex
from datetime import datetime

In [48]:
df = pl.read_csv('data.csv')

In [229]:
csv = """
0,                            Would never order again.
1,   I'm not sure it gives me any type of glow and ...
2,   Goes on smoothly a bit sticky and color is clo...
3,       Preferisco altri prodotti della stessa marca.
4,        The moisturizing advertised is non-existent.
""".encode()

(pl.read_csv(csv, has_header=False, new_columns=["idx", "lines"])
    .select(pl.col("lines").str.split(" ").alias("words"))
)

words
list[str]
"["""", """", ... ""again.""]"
"["""", """", ... ""...""]"
"["""", """", ... ""clo...""]"
"["""", """", ... ""marca.""]"
"["""", """", ... ""non-existent.""]"


## Dataframe slicing

In [49]:
df.head(1)

carat,cut,color,clarity,depth,table,price,x,y,z
f64,str,str,str,f64,f64,i64,f64,f64,f64
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43


In [50]:
df.tail(1)

carat,cut,color,clarity,depth,table,price,x,y,z
f64,str,str,str,f64,f64,i64,f64,f64,f64
0.75,"""Ideal""","""D""","""SI2""",62.2,55.0,2757,5.83,5.87,3.64


## Dataframe info

In [51]:
df.shape

(53940, 10)

In [52]:
df.columns

['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']

In [53]:
df.dtypes

[polars.datatypes.Float64,
 polars.datatypes.Utf8,
 polars.datatypes.Utf8,
 polars.datatypes.Utf8,
 polars.datatypes.Float64,
 polars.datatypes.Float64,
 polars.datatypes.Int64,
 polars.datatypes.Float64,
 polars.datatypes.Float64,
 polars.datatypes.Float64]

In [54]:
df.to_pandas().describe() # summary statistics computed through pandas; NOTE(review): the author found no built-in describe in polars - newer polars releases expose DataFrame.describe(), confirm for the installed version

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


# Handling null values

In [109]:
df = pl.DataFrame({"a": [1, None, None, 4],"b": [0.5, 4, None, 13]})

In [110]:
df.null_count()

a,b
u32,u32
2,1


In [111]:
df.with_columns([pl.col('a').fill_null('backward'), pl.col('b').fill_null('mean')])

a,b
i64,f64
1,0.5
4,4.0
4,5.833333
4,13.0


In [86]:
df.drop_nulls()

a,b
i64,f64
1,0.5
4,13.0


In [87]:
df.filter(~pl.fold(acc=True,f=lambda acc, s: acc & s.is_null(),exprs=pl.all(),)) # drop only the rows in which every value is null (the fold ANDs is_null() across all columns, ~ negates it)

a,b
i64,f64
1.0,0.5
,4.0
4.0,13.0


# Filtering

In [89]:
df = pl.DataFrame(
    {
        "id": [1, 2, 3],
        "color": ["blue", "red", "green"],
        "size": ["small", "medium", "large"],
    }
)

In [90]:
df.filter(pl.col("id") <= 2)

id,color,size
i64,str,str
1,"""blue""","""small"""
2,"""red""","""medium"""


In [96]:
df.filter((pl.col("id") <= 2) & (pl.col("size") == "small"))

{'id': {0: 1}, 'color': {0: 'blue'}, 'size': {0: 'small'}}

# String related

In [132]:
df = pl.DataFrame({"word": "The man ate a whole cake and the end".split(" ")})

In [133]:
df.with_columns(pl.col("word").str.to_uppercase().alias("uppercase"))

word,uppercase
str,str
"""The""","""THE"""
"""man""","""MAN"""
"""ate""","""ATE"""
"""a""","""A"""
"""whole""","""WHOLE"""
"""cake""","""CAKE"""
"""and""","""AND"""
"""the""","""THE"""
"""end""","""END"""


In [134]:
df.with_columns(pl.col("word").str.lengths().alias("letter_count"))

word,letter_count
str,u32
"""The""",3
"""man""",3
"""ate""",3
"""a""",1
"""whole""",5
"""cake""",4
"""and""",3
"""the""",3
"""end""",3


In [138]:
df.filter(pl.col("word").str.contains(r"(?i)^the$|^a$|Cake").is_not())

word
str
"""man"""
"""ate"""
"""whole"""
"""and"""
"""end"""


In [201]:
df = pl.DataFrame({"a": ["a", "b", "c"],"b": [1, 2, 3]})
df.select([pl.concat_str(["a", "b"]).alias('a+b')])

a+b
str
"""a1"""
"""b2"""
"""c3"""


# Datetime related

In [150]:
df = pl.DataFrame({"date": ["2020-01-01", "2020-02-03", "2020-03-04"], "index": [1, 2, 3]})
df

date,index
str,i64
"""2020-01-01""",1
"""2020-02-03""",2
"""2020-03-04""",3


In [152]:
df_time_converted = df.with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"))

In [155]:
df_time_converted.with_columns(pl.col("date").is_between(datetime(2020, 1, 1), datetime(2020, 3, 1)).alias('bt_jan_mar'))

date,index,bt_jan_mar
date,i64,bool
2020-01-01,1,False
2020-02-03,2,True
2020-03-04,3,False


In [158]:
df_time_converted.with_columns(pl.col("date").dt.month().alias("month"))

date,index,month
date,i64,u32
2020-01-01,1,1
2020-02-03,2,2
2020-03-04,3,3


# Useful expressions

In [206]:
df = pl.read_csv('data.csv')

### Unique values in a column

In [164]:
df['cut'].unique().to_list()

['Ideal', 'Fair', 'Very Good', 'Good', 'Premium']

### Sort by column

In [172]:
df.sort('price', reverse=True)

carat,cut,color,clarity,depth,table,price,x,y,z
f64,str,str,str,f64,f64,i64,f64,f64,f64
2.29,"""Premium""","""I""","""VS2""",60.8,60.0,18823,8.5,8.47,5.16
2.0,"""Very Good""","""G""","""SI1""",63.5,56.0,18818,7.9,7.97,5.04
1.51,"""Ideal""","""G""","""IF""",61.7,55.0,18806,7.37,7.41,4.56
2.07,"""Ideal""","""G""","""SI2""",62.5,55.0,18804,8.2,8.13,5.11
2.0,"""Very Good""","""H""","""SI1""",62.8,57.0,18803,7.95,8.0,5.01
2.29,"""Premium""","""I""","""SI1""",61.8,59.0,18797,8.52,8.45,5.24
2.04,"""Premium""","""H""","""SI1""",58.1,60.0,18795,8.37,8.28,4.84
2.0,"""Premium""","""I""","""VS1""",60.8,59.0,18795,8.13,8.02,4.91
1.71,"""Premium""","""F""","""VS2""",62.3,59.0,18791,7.57,7.53,4.7
2.15,"""Ideal""","""G""","""SI2""",62.6,54.0,18791,8.29,8.35,5.21


### Simple aggregation

In [173]:
df.select(
    [
        pl.sum("price").alias("sum"),
        pl.min("price").alias("min"),
        pl.max("price").alias("max"),
        pl.col("price").max().alias("other_max"),
        pl.std("price").alias("std dev"),
        pl.var("price").alias("variance"),
    ]
)

sum,min,max,other_max,std dev,variance
i64,i64,i64,i64,f64,f64
212135217,326,18823,18823,3989.439738,15916000.0


### Binary function on a column

In [178]:
df.with_columns([pl.when(pl.col("price") < 1000).then('cheap').otherwise('not cheap').alias('cheap?')])

carat,cut,color,clarity,depth,table,price,x,y,z,cheap?
f64,str,str,str,f64,f64,i64,f64,f64,f64,str
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43,"""cheap"""
0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31,"""cheap"""
0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31,"""cheap"""
0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63,"""cheap"""
0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75,"""cheap"""
0.24,"""Very Good""","""J""","""VVS2""",62.8,57.0,336,3.94,3.96,2.48,"""cheap"""
0.24,"""Very Good""","""I""","""VVS1""",62.3,57.0,336,3.95,3.98,2.47,"""cheap"""
0.26,"""Very Good""","""H""","""SI1""",61.9,55.0,337,4.07,4.11,2.53,"""cheap"""
0.22,"""Fair""","""E""","""VS2""",65.1,61.0,337,3.87,3.78,2.49,"""cheap"""
0.23,"""Very Good""","""H""","""VS1""",59.4,61.0,338,4.0,4.05,2.39,"""cheap"""


### Window function

In [222]:
lazy_df = df.lazy()

In [223]:
lazy_df

In [226]:
# Window functions: compute a per-color aggregate and broadcast it back onto
# every row of that color.
# The chain starts from `lazy_df` (created above with df.lazy()) because
# .collect() is a LazyFrame method; calling it on the eager `df` fails on a
# fresh kernel.
(
    lazy_df
    .with_columns([
        # total price of all diamonds sharing this row's color
        pl.col("price").sum().over("color").alias("sum[price]/color"),
        # every clarity value found in this row's color group, as a list
        pl.col("clarity").list().over("color").alias("clarity_list/color"),
    ])
    .collect()
)

carat,cut,color,clarity,depth,table,price,x,y,z,sum[price]/color,clarity_list/color
f64,str,str,str,f64,f64,i64,f64,f64,f64,i64,list[str]
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43,30142944,"[""SI2"", ""SI1"", ... ""VS2""]"
0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31,30142944,"[""SI2"", ""SI1"", ... ""VS2""]"
0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31,30142944,"[""SI2"", ""SI1"", ... ""VS2""]"
0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63,27608146,"[""VS2"", ""VVS1"", ... ""SI1""]"
0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75,14949281,"[""SI2"", ""VVS2"", ... ""SI1""]"
0.24,"""Very Good""","""J""","""VVS2""",62.8,57.0,336,3.94,3.96,2.48,14949281,"[""SI2"", ""VVS2"", ... ""SI1""]"
0.24,"""Very Good""","""I""","""VVS1""",62.3,57.0,336,3.95,3.98,2.47,27608146,"[""VS2"", ""VVS1"", ... ""SI1""]"
0.26,"""Very Good""","""H""","""SI1""",61.9,55.0,337,4.07,4.11,2.53,37257301,"[""SI1"", ""VS1"", ... ""SI2""]"
0.22,"""Fair""","""E""","""VS2""",65.1,61.0,337,3.87,3.78,2.49,30142944,"[""SI2"", ""SI1"", ... ""VS2""]"
0.23,"""Very Good""","""H""","""VS1""",59.4,61.0,338,4.0,4.05,2.39,37257301,"[""SI1"", ""VS1"", ... ""SI2""]"


### Groupby

In [189]:
df.groupby('color').agg([pl.col('price').max().alias('max_price'), pl.col('cut').count().alias('count')])

color,max_price,count
str,i64,u32
"""F""",18791,9542
"""J""",18710,2808
"""G""",18818,11292
"""D""",18693,6775
"""I""",18823,5422
"""H""",18803,8304
"""E""",18731,9797


In [36]:
df['cut'].value_counts()

cut,counts
str,u32
"""Ideal""",21551
"""Premium""",13791
"""Very Good""",12082
"""Good""",4906
"""Fair""",1610


### Row-wise processing

In [202]:
grades = pl.DataFrame(
    {
        "student": ["bas", "laura", "tim", "jenny"],
        "arithmetic": [10, 5, 6, 8],
        "biology": [4, 6, 2, 7],
        "geography": [8, 4, 9, 7],
    }
)
grades

student,arithmetic,biology,geography
str,i64,i64,i64
"""bas""",10,4,8
"""laura""",5,6,4
"""tim""",6,2,9
"""jenny""",8,7,7


In [204]:
grades.select([pl.concat_list(pl.all().exclude("student")).alias("all_grades")])

all_grades
list[i64]
"[10, 4, 8]"
"[5, 6, 4]"
"[6, 2, 9]"
"[8, 7, 7]"


In [198]:
df1 = pl.DataFrame(
    {
        "a": [1, 2, 3],
        "b": [10, 20, 30],
    }
)

# Row-wise (horizontal) accumulation: the fold starts from the literal 2 and
# adds each column selected by pl.col("*") in turn, so every row ends up as
# 2 + a + b (13, 24, 35 for the frame above).
out = df1.select(
    pl.fold(acc=pl.lit(2), f=lambda acc, x: acc + x, exprs=pl.col("*")).alias("sum"),
)

out

sum
i64
13
24
35


### Apply custom function

In [221]:
import collections

def add_prefix(string):
    """Prepend the literal ``'very '`` to ``string`` and return the result."""
    prefix = 'very '
    prefixed = prefix + string
    return prefixed

df.with_columns(pl.col('cut').apply(add_prefix).alias('cut_with_pre'))

carat,cut,color,clarity,depth,table,price,x,y,z,cut_with_pre
f64,str,str,str,f64,f64,i64,f64,f64,f64,str
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43,"""very Ideal"""
0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31,"""very Premium"""
0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31,"""very Good"""
0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63,"""very Premium"""
0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75,"""very Good"""
0.24,"""Very Good""","""J""","""VVS2""",62.8,57.0,336,3.94,3.96,2.48,"""very Very Good..."
0.24,"""Very Good""","""I""","""VVS1""",62.3,57.0,336,3.95,3.98,2.47,"""very Very Good..."
0.26,"""Very Good""","""H""","""SI1""",61.9,55.0,337,4.07,4.11,2.53,"""very Very Good..."
0.22,"""Fair""","""E""","""VS2""",65.1,61.0,337,3.87,3.78,2.49,"""very Fair"""
0.23,"""Very Good""","""H""","""VS1""",59.4,61.0,338,4.0,4.05,2.39,"""very Very Good..."


## Lazy related

In [101]:
lazy_select_df = pl.scan_csv("data.csv").select(['carat', 'cut', 'color'])

In [102]:
lazy_select_df

## Polars expressions (each column expression is processed in parallel)

Each item in the list represents one independent process.

`alias` creates a column with the desired column name.

In [42]:
df.select([
    pl.col('cut').n_unique().alias('numbers of unique in cut'),
    pl.col('clarity').n_unique().alias('numbers of unique in clarity')
])

numbers of unique in cut,numbers of unique in clarity
u32,u32
5,8


aggregation:

if statement

In [57]:
df.select([
    pl.col('price'),
    pl.when(pl.col('price') > 1000).then(True).otherwise(False).alias('test')
])

price,test
i64,bool
326,false
326,false
327,false
334,false
335,false
336,false
336,false
337,false
337,false
338,false


window function

In [55]:
df[
    [
        pl.col('*'), # select all
        pl.col('price').sum().over('cut').alias('sum of the correspond group'),
        pl.col('clarity').list().over('cut').alias('full list of the correspond group')
    ]
]

carat,cut,color,clarity,depth,table,price,x,y,z,sum of the correspond group,full list of the correspond group
f64,str,str,str,f64,f64,i64,f64,f64,f64,i64,list[str]
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43,74513487,"[""SI2"", ""VS1"", ... ""SI2""]"
0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31,63221498,"[""SI1"", ""VS2"", ... ""SI2""]"
0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31,19275009,"[""VS1"", ""SI2"", ... ""SI1""]"
0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63,63221498,"[""SI1"", ""VS2"", ... ""SI2""]"
0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75,19275009,"[""VS1"", ""SI2"", ... ""SI1""]"
0.24,"""Very Good""","""J""","""VVS2""",62.8,57.0,336,3.94,3.96,2.48,48107623,"[""VVS2"", ""VVS1"", ... ""SI1""]"
0.24,"""Very Good""","""I""","""VVS1""",62.3,57.0,336,3.95,3.98,2.47,48107623,"[""VVS2"", ""VVS1"", ... ""SI1""]"
0.26,"""Very Good""","""H""","""SI1""",61.9,55.0,337,4.07,4.11,2.53,48107623,"[""VVS2"", ""VVS1"", ... ""SI1""]"
0.22,"""Fair""","""E""","""VS2""",65.1,61.0,337,3.87,3.78,2.49,7017600,"[""VS2"", ""SI2"", ... ""VS1""]"
0.23,"""Very Good""","""H""","""VS1""",59.4,61.0,338,4.0,4.05,2.39,48107623,"[""VVS2"", ""VVS1"", ... ""SI1""]"


In [60]:
df.head(6)

carat,cut,color,clarity,depth,table,price,x,y,z
f64,str,str,str,f64,f64,i64,f64,f64,f64
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43
0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31
0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31
0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63
0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75
0.24,"""Very Good""","""J""","""VVS2""",62.8,57.0,336,3.94,3.96,2.48


groupby

In [61]:
df.groupby('cut').agg([
    pl.col('price').sum().suffix('_sum'),
    pl.col('price').count().alias('count of memeber'),
    pl.col('depth').min().suffix('_min')
])

cut,price_sum,count of memeber,depth_min
str,i64,u32,f64
"""Premium""",63221498,13791,58.0
"""Fair""",7017600,1610,43.0
"""Ideal""",74513487,21551,43.0
"""Good""",19275009,4906,54.3
"""Very Good""",48107623,12082,56.8


In [62]:
process = (df.lazy().groupby('cut').agg([
    pl.count(),
    pl.col('price').sum(),
    pl.col('depth').min()
]).sort('count', reverse=True).limit(3))

In [63]:
process.collect()

cut,count,price,depth
str,u32,i64,f64
"""Ideal""",21551,74513487,43.0
"""Premium""",13791,63221498,58.0
"""Very Good""",12082,48107623,56.8


horizontal accumulation

In [107]:
df.select([(pl.col('price') + pl.col('depth')).alias('p+d2')])

p+d2
f64
387.5
385.8
383.9
396.4
398.3
398.8
398.3
398.9
402.1
397.4


In [75]:
df.head(6)

carat,cut,color,clarity,depth,table,price,x,y,z
f64,str,str,str,f64,f64,i64,f64,f64,f64
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43
0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31
0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31
0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63
0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75
0.24,"""Very Good""","""J""","""VVS2""",62.8,57.0,336,3.94,3.96,2.48


In [78]:
df.filter(pl.col('color')=="E").select(['carat', 'cut', 'color']).head(5)

carat,cut,color
f64,str,str
0.23,"""Ideal""","""E"""
0.21,"""Premium""","""E"""
0.23,"""Good""","""E"""
0.22,"""Fair""","""E"""
0.2,"""Premium""","""E"""


# Pokemon dataset

In [79]:
df = pl.read_csv(
    "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"
)

In [84]:
df.head(10)

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False
3,"""VenusaurMega V...","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False
5,"""Charmeleon""","""Fire""",,405,58,64,58,80,65,80,1,False
6,"""Charizard""","""Fire""","""Flying""",534,78,84,78,109,85,100,1,False
6,"""CharizardMega ...","""Fire""","""Dragon""",634,78,130,111,130,85,100,1,False
6,"""CharizardMega ...","""Fire""","""Flying""",634,78,104,78,159,115,100,1,False
7,"""Squirtle""","""Water""",,314,44,48,65,50,64,43,1,False


In [87]:
df.sort('Type 1')

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
10,"""Caterpie""","""Bug""",,195,45,30,35,20,20,45,1,false
11,"""Metapod""","""Bug""",,205,50,20,55,25,25,30,1,false
12,"""Butterfree""","""Bug""","""Flying""",395,60,45,50,90,80,70,1,false
13,"""Weedle""","""Bug""","""Poison""",195,40,35,30,20,20,50,1,false
14,"""Kakuna""","""Bug""","""Poison""",205,45,25,50,25,25,35,1,false
15,"""Beedrill""","""Bug""","""Poison""",395,65,90,40,45,80,75,1,false
15,"""BeedrillMega B...","""Bug""","""Poison""",495,65,150,40,15,80,145,1,false
46,"""Paras""","""Bug""","""Grass""",285,35,70,55,45,55,25,1,false
47,"""Parasect""","""Bug""","""Grass""",405,60,95,80,60,80,30,1,false
48,"""Venonat""","""Bug""","""Poison""",305,60,55,50,40,55,45,1,false


In [98]:
# After sorting by "Type 1": take the first three values of the column inside
# each "Type 1" group, collect them into one list per group via
# .list().over("Type 1"), then flatten those lists back into a single column.
df.sort("Type 1").select(
    [pl.col("Type 1").head(3).list().over("Type 1").flatten(), ])

Type 1
str
"""Bug"""
"""Bug"""
"""Bug"""
"""Dragon"""
"""Dragon"""
"""Dragon"""
"""Electric"""
"""Electric"""
"""Electric"""
"""Fairy"""


In [99]:
grades = pl.DataFrame(
    {
        "student": ["bas", "laura", "tim", "jenny"],
        "arithmetic": [10, 5, 6, 8],
        "biology": [4, 6, 2, 7],
        "geography": [8, 4, 9, 7],
    }
)

In [102]:
# Append a list column holding all grades of each student.
# Uses with_columns, the method every other cell in this notebook relies on;
# the singular with_column is the deprecated spelling of the same operation.
grades.with_columns(
    # create the list of homogeneous data
    pl.concat_list(pl.all().exclude("student")).alias("all_grades")
)

student,arithmetic,biology,geography,all_grades
str,i64,i64,i64,list[i64]
"""bas""",10,4,8,"[10, 4, 8]"
"""laura""",5,6,4,"[5, 6, 4]"
"""tim""",6,2,9,"[6, 2, 9]"
"""jenny""",8,7,7,"[8, 7, 7]"


## Uncategorized

In [27]:
df['x'].apply(round)

shape: (53940,)
Series: 'x' [i64]
[
	4
	4
	4
	4
	4
	4
	4
	4
	4
	4
	4
	4
	...
	6
	6
	6
	6
	6
	6
	6
	6
	6
	6
	6
	6
]

In [28]:
df['x'].apply(lambda x: round(x))

shape: (53940,)
Series: 'x' [i64]
[
	4
	4
	4
	4
	4
	4
	4
	4
	4
	4
	4
	4
	...
	6
	6
	6
	6
	6
	6
	6
	6
	6
	6
	6
	6
]

In [29]:
# Map each distinct cut label to its position in the unique() result.
mydict = dict(
    (label, position) for position, label in enumerate(df['cut'].unique())
)

In [30]:
mydict

{'Ideal': 0, 'Premium': 1, 'Good': 2, 'Very Good': 3, 'Fair': 4}

In [31]:
# NOTE: this import fails with ModuleNotFoundError (see the error output below);
# this notebook obtains lazy frames through df.lazy() or pl.scan_csv(...) instead.
import polars.lazy as plz

ModuleNotFoundError: No module named 'polars.lazy'

In [4]:
l_df = df.lazy()

In [5]:
type(l_df)

polars.internals.lazy_frame.LazyFrame

In [6]:
l_df

In [13]:
# Total price per cut, returned as the 'price' Series.
# The original statement ended on a dangling '.', which is a syntax error; the
# aggregation is reconstructed from the displayed output (an i64 Series named
# 'price' with one sum per cut). NOTE(review): confirm this was the intended call.
df.groupby('cut').agg(pl.col('price').sum())['price']

shape: (5,)
Series: 'price' [i64]
[
	63221498
	48107623
	74513487
	7017600
	19275009
]