# [Getting started](https://docs.pola.rs/user-guide/getting-started/)

In [1]:
import datetime as dt
from pathlib import Path

import polars as pl

In [4]:

# Example frame with one row per person. Columns are passed as keyword
# arguments, so their order here is the column order of the frame.
df = pl.DataFrame(
    dict(
        name=["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        birthdate=[
            dt.date(year, month, day)
            for year, month, day in [
                (1997, 1, 10),
                (1985, 2, 15),
                (1983, 3, 22),
                (1981, 4, 30),
            ]
        ],
        weight=[57.9, 72.5, 53.6, 83.1],  # kilograms
        height=[1.56, 1.77, 1.65, 1.75],  # metres
    )
)
# Plain-text rendering of the frame (shape, column names, dtypes, rows).
print(df)

shape: (4, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
└────────────────┴────────────┴────────┴────────┘


In [5]:
# A bare name as the last expression makes the notebook show the frame as the cell output.
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


In [9]:
# Round-trip the frame through a CSV file: write it out, then read it back in.
output_csv = Path("data/output.csv")
# Make sure the target folder exists so the cell also works on a fresh checkout.
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.write_csv(output_csv)
# try_parse_dates=True lets the reader turn the date text back into a date column.
df_csv = pl.read_csv(output_csv, try_parse_dates=True)
df_csv

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


## expressions

select

In [13]:
# Keep the name and derive two new columns: the year taken from the birthdate,
# and the body-mass index (weight divided by height squared).
results = df.select(
    "name",
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / (pl.col("height") ** 2),
)
results

name,birth_year,bmi
str,i32,f64
"""Alice Archer""",1997,23.791913
"""Ben Brown""",1985,23.141498
"""Chloe Cooper""",1983,19.687787
"""Daniel Donovan""",1981,27.134694


expression expansion

In [16]:
# One expression expands over both "weight" and "height": scale each to 95 %,
# round to two decimals, and append "-5%" to each original column name.
result = df.select(
    "name",
    pl.col("weight", "height").mul(0.95).round(2).name.suffix("-5%"),
)
result

name,weight-5%,height-5%
str,f64,f64
"""Alice Archer""",55.01,1.48
"""Ben Brown""",68.88,1.68
"""Chloe Cooper""",50.92,1.57
"""Daniel Donovan""",78.94,1.66


`with_columns` adds new columns

In [18]:
# Keep every existing column and append birth_year and bmi at the end.
df.with_columns(
    pl.col("birthdate").dt.year().alias("birth_year"),
    (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
)

name,birthdate,weight,height,birth_year,bmi
str,date,f64,f64,i32,f64
"""Alice Archer""",1997-01-10,57.9,1.56,1997,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,1985,23.141498
"""Chloe Cooper""",1983-03-22,53.6,1.65,1983,19.687787
"""Daniel Donovan""",1981-04-30,83.1,1.75,1981,27.134694


`filter`: creates a second dataframe out of the original one, keeping only the rows that match a condition. So selecting columns (`select`) and filtering rows (`filter`) are separate steps.

In [21]:
# Keep only the rows whose birth year is before 1990.
result = df.filter(
    pl.col("birthdate").dt.year().lt(1990)
)
result

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


Just add multiple filters for more complex queries.

In [28]:
# Each argument is its own predicate; a row is kept only if all of them hold.
result = df.filter(
    pl.col("birthdate").is_between(
        lower_bound=dt.date(1982, 12, 31),
        upper_bound=dt.date(1996, 1, 1),
    ),
    pl.col("height").gt(1.7),
)
result

name,birthdate,weight,height
str,date,f64,f64
"""Ben Brown""",1985-02-15,72.5,1.77


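`group_by`: takes expressions in the same way as `filter`, but instead of dropping rows it groups the rows that share the same value (here: the decade of birth).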

In [None]:
# Count the rows per decade of birth (birth year floored to a multiple of 10).
df.group_by(
    decade=pl.col("birthdate").dt.year() // 10 * 10,
    maintain_order=True,  # costs some speed, but keeps the group order reproducible
).len()

decade,len
i32,u32
1990,1
1980,3


add an `.agg()` to compute aggregations.

In [32]:
# Per decade of birth: number of rows, mean weight (2 decimals) and maximum height.
df.group_by(
    decade=pl.col("birthdate").dt.year() // 10 * 10,
    maintain_order=True,
).agg(
    sample_size=pl.len(),
    avg_weight=pl.col("weight").mean().round(2),
    tallest=pl.col("height").max(),
)

decade,sample_size,avg_weight,tallest
i32,u32,f64,f64
1990,1,57.9,1.56
1980,3,69.73,1.77


## More complex queries

In [34]:
# One chained query: derive helper columns, drop what is no longer needed,
# then summarise per decade of birth.
df.with_columns(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),     # decade of birth, e.g. 1985 -> 1980
    pl.col("name").str.split(by=" ").list.first(),                  # replace "name" with the first name only
).select(
    pl.all().exclude("birthdate"),                                  # drop birthdate (was misspelled "birthrate", which excluded nothing)
).group_by(
    pl.col("decade"),                                               # one group per decade
    maintain_order=True,                                            # keep the group order reproducible
).agg(
    pl.col("name"),                                                 # collect the first names of each group into a list
    pl.col("weight", "height").mean().round(2).name.prefix("avg_"), # mean weight and height, rounded to 2 decimals
)

decade,name,avg_weight,avg_height
i32,list[str],f64,f64
1990,"[""Alice""]",57.9,1.56
1980,"[""Ben"", ""Chloe"", ""Daniel""]",69.73,1.72


## combining dataframes

### joining

In [35]:
# Extra per-person attributes, keyed by the same "name" values as df
# (deliberately listed in a different row order).
df2 = pl.DataFrame(
    dict(
        name=["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        parent=[True, False, False, False],
        siblings=[1, 2, 3, 4],
    )
)

# Left join: keep every row of df and attach the matching df2 columns by name.
df.join(other=df2, on="name", how="left")

name,birthdate,weight,height,parent,siblings
str,date,f64,f64,bool,i64
"""Alice Archer""",1997-01-10,57.9,1.56,False,3
"""Ben Brown""",1985-02-15,72.5,1.77,True,1
"""Chloe Cooper""",1983-03-22,53.6,1.65,False,4
"""Daniel Donovan""",1981-04-30,83.1,1.75,False,2


### concat

In [36]:
# A second batch of people with the same four columns as df.
df3 = pl.DataFrame(
    dict(
        name=["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        birthdate=[
            dt.date(year=1977, month=5, day=10),
            dt.date(year=1975, month=6, day=23),
            dt.date(year=1973, month=7, day=22),
            dt.date(year=1971, month=8, day=3),
        ],
        weight=[67.9, 72.5, 57.6, 93.1],  # kilograms
        height=[1.76, 1.6, 1.66, 1.8],  # metres
    )
)
# Vertical concatenation stacks the rows of df3 underneath the rows of df.
pl.concat(items=[df, df3], how="vertical")

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75
"""Ethan Edwards""",1977-05-10,67.9,1.76
"""Fiona Foster""",1975-06-23,72.5,1.6
"""Grace Gibson""",1973-07-22,57.6,1.66
"""Henry Harris""",1971-08-03,93.1,1.8
