# A Must-Read Cheatsheet to Switch From Pandas to Lightning-Fast Polars
## N most-used Pandas operations/techniques shown in Polars
![](images/midjourney.png)

### Introduction

### 0. Reading/writing data

In [13]:
import polars as pl

df = pl.read_csv(
    source="diamonds.csv",
    has_header=True,  # Default
    columns=["price", "cut", "color"],
    separator=",",
    n_rows=10000,
    dtypes={
        "cut": str
    },  # Override datatype mappings
    null_values="N/A",  # Consider as NaN
)

In [None]:
df.write_csv("data/new_file.csv")

In [None]:
df = pl.read_parquet(
    source="data.parquet",
    columns=["list", "of", "columns"],
)

In [None]:
df.write_parquet(
    file="new_data.parquet",
    compression="zstd",  # Default
)

### 1. Creating Series and DataFrames

In [6]:
series = pl.Series(
    values=[1, 2, 3, 4],
    name="new_series",
)

df1 = pl.DataFrame(
    {
        "col1": ["a", "b", "c"],
        "col2": [1, 2, 3],
    }
)

df2 = pl.DataFrame(
    [
        {"col1": "a", "col2": 1},
        {"col1": "b", "col2": 2},
        {"col1": "c", "col2": 3},
    ]
)

### 2. Viewing data

In [14]:
df.head()

cut,color,price
str,str,i64
"""Ideal""","""E""",326
"""Premium""","""E""",326
"""Good""","""E""",327
"""Premium""","""I""",334
"""Good""","""J""",335


In [15]:
df.tail()

cut,color,price
str,str,i64
"""Very Good""","""E""",4704
"""Premium""","""E""",4704
"""Premium""","""E""",4704
"""Premium""","""D""",4704
"""Fair""","""D""",4704


In [16]:
df.sample(5)

cut,color,price
str,str,i64
"""Ideal""","""H""",3985
"""Premium""","""J""",3119
"""Good""","""H""",3584
"""Very Good""","""I""",4641
"""Ideal""","""G""",3267


In [17]:
df.describe()

describe,cut,color,price
str,str,str,f64
"""count""","""10000""","""10000""",10000.0
"""null_count""","""0""","""0""",0.0
"""mean""",,,3406.6043
"""std""",,,1110.12915
"""min""","""Fair""","""D""",326.0
"""max""","""Very Good""","""J""",4704.0
"""median""",,,3626.5


### 3. What are expressions in Polars?

### 4. Selecting data

In [26]:
price = df.select("price")
price = df.select(pl.col("price"))

two_cols = df.select(
    pl.col("price", "cut")
)
two_cols = df.select(
    [
        pl.col("price"),
        pl.col("cut"),
    ]
)

In [27]:
df.select(pl.exclude("price")).head()

cut,color
str,str
"""Ideal""","""E"""
"""Premium""","""E"""
"""Good""","""E"""
"""Premium""","""I"""
"""Good""","""J"""


In [29]:
df.select(pl.col("^c.+$")).head()

cut,color
str,str
"""Ideal""","""E"""
"""Premium""","""E"""
"""Good""","""E"""
"""Premium""","""I"""
"""Good""","""J"""


In [50]:
df.select(
    pl.col(pl.Int64, pl.Float64)
).head()

carat,depth,table,price,x,y,z
f64,f64,f64,i64,f64,f64,f64
0.23,61.5,55.0,326,3.95,3.98,2.43
0.21,59.8,61.0,326,3.89,3.84,2.31
0.23,56.9,65.0,327,4.05,4.07,2.31
0.29,62.4,58.0,334,4.2,4.23,2.63
0.31,63.3,58.0,335,4.34,4.35,2.75


### 5. Filtering data

In [33]:
df.filter(
    pl.col("price").is_between(
        500, 1000
    )
)

cut,color,price
str,str,i64
"""Ideal""","""I""",552
"""Premium""","""D""",552
"""Ideal""","""D""",552
"""Ideal""","""D""",552
"""Premium""","""I""",552


In [34]:
df.filter(
    pl.col("cut") == "Ideal"
).head()

cut,color,price
str,str,i64
"""Ideal""","""E""",326
"""Ideal""","""J""",340
"""Ideal""","""J""",344
"""Ideal""","""I""",348
"""Ideal""","""I""",403


In [36]:
df.filter(
    pl.col("color").is_in(["E", "J"])
    & pl.col("price")
    < 500
).head()

cut,color,price
str,str,i64
"""Ideal""","""E""",326
"""Premium""","""E""",326
"""Good""","""E""",327
"""Premium""","""I""",334
"""Good""","""J""",335


### 6. Creating new columns

In [39]:
df = df.with_columns(
    (pl.col("price") ** 2).alias(
        "new_col"
    )
)
df = df.with_columns(
    (
        str(pl.col("price"))
        + pl.col("cut")
    ).alias("new_col2")
)

### 7. Groupby

In [41]:
df = pl.read_csv("data/diamonds.csv")

df.groupby(
    "cut", maintain_order=True
).count()

cut,count
str,u32
"""Ideal""",21551
"""Premium""",13791
"""Good""",4906
"""Very Good""",12082
"""Fair""",1610


In [44]:
df.groupby(
    "cut", maintain_order=True
).agg(pl.col("*").count())

cut,carat,color,clarity,depth,table,price,x,y,z
str,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""Ideal""",21551,21551,21551,21551,21551,21551,21551,21551,21551
"""Premium""",13791,13791,13791,13791,13791,13791,13791,13791,13791
"""Good""",4906,4906,4906,4906,4906,4906,4906,4906,4906
"""Very Good""",12082,12082,12082,12082,12082,12082,12082,12082,12082
"""Fair""",1610,1610,1610,1610,1610,1610,1610,1610,1610


In [49]:
cols = ["price", "table", "depth"]

df.groupby(
    "cut", maintain_order=True
).agg(pl.col(cols).mean())

cut,price,table,depth
str,f64,f64,f64
"""Ideal""",3457.54197,55.951668,61.709401
"""Premium""",4584.257704,58.746095,61.264673
"""Good""",3928.864452,58.694639,62.365879
"""Very Good""",3981.759891,57.95615,61.818275
"""Fair""",4358.757764,59.053789,64.041677


### 8. Joining and concatenation

### 9. The lazy API in Polars

### Conclusion