# [Expressions: Lists and arrays](https://docs.pola.rs/user-guide/expressions/lists-and-arrays/)

## Lists vs arrays

A list column is one-dimensional. The lists stored in it can each have a different length, but lists in Polars are homogeneous: every element has the same data type.

In [1]:
from datetime import datetime
import polars as pl

# Three list-typed columns: the inner lists of a column may differ in length
# (or be empty), while each column keeps a single element type.
df = pl.DataFrame(
    data=dict(
        names=[
            ["Anne", "Averill", "Adams"],
            ["Brandon", "Brooke", "Borden", "Branson"],
            ["Camila", "Campbell"],
            ["Dennis", "Doyle"],
        ],
        children_ages=[[5, 7], [], [], [8, 11, 18]],
        medical_appointments=[
            [],
            [],
            [],
            [datetime(year=2022, month=5, day=22, hour=16, minute=30)],
        ],
    )
)

df

names,children_ages,medical_appointments
list[str],list[i64],list[datetime[μs]]
"[""Anne"", ""Averill"", ""Adams""]","[5, 7]",[]
"[""Brandon"", ""Brooke"", … ""Branson""]",[],[]
"[""Camila"", ""Campbell""]",[],[]
"[""Dennis"", ""Doyle""]","[8, 11, 18]",[2022-05-22 16:30:00]


Arrays have a fixed shape, which can have an arbitrary number of dimensions.

In [2]:
# Fixed-shape columns: the schema pins each column to an Array dtype,
# a flat 5-element boolean array and a 3x3 grid of strings.
df = pl.DataFrame(
    data=dict(
        bit_flags=[
            [True, True, True, True, False],
            [False, True, True, True, True],
        ],
        tic_tac_toe=[
            [[" ", "x", "o"], [" ", "x", " "], ["o", "x", " "]],
            [["o", "x", "x"], [" ", "o", "x"], [" ", " ", "o"]],
        ],
    ),
    schema=dict(
        bit_flags=pl.Array(pl.Boolean, 5),
        tic_tac_toe=pl.Array(pl.String, (3, 3)),
    ),
)
df

bit_flags,tic_tac_toe
"array[bool, 5]","array[str, (3, 3)]"
"[true, true, … false]","[["" "", ""x"", ""o""], ["" "", ""x"", "" ""], [""o"", ""x"", "" ""]]"
"[false, true, … true]","[[""o"", ""x"", ""x""], ["" "", ""o"", ""x""], ["" "", "" "", ""o""]]"


If you are using arrays, it's best to create them with NumPy; Polars can work with that. When it's possible to choose, use arrays rather than lists, because they have better performance.

In [3]:
import numpy as np

# 120 consecutive integers laid out as a 4D block of shape (5, 2, 3, 4).
array = np.arange(120).reshape(5, 2, 3, 4)

# Show the dtype Polars assigns to a Series built from the NumPy array.
print(pl.Series(values=array).dtype)

Array(Int64, shape=(2, 3, 4))


## Working with lists

In [4]:
# Raw readings per station: one space-separated string each, mixing numbers
# with tokens such as "E1" that the later cells count as errors.
weather = pl.DataFrame(
    data=dict(
        station=[f"Station {idx}" for idx in range(1, 6)],
        temperatures=[
            "20 5 5 E1 7 13 19 9 6 20",
            "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
            "19 24 E9 16 6 12 10 22",
            "E2 E0 15 7 8 10 E1 24 17 13 6",
            "14 8 E0 16 22 24 E1",
        ],
    )
)

weather

station,temperatures
str,str
"""Station 1""","""20 5 5 E1 7 13 19 9 6 20"""
"""Station 2""","""18 8 16 11 23 E2 8 E2 E2 E2 90…"
"""Station 3""","""19 24 E9 16 6 12 10 22"""
"""Station 4""","""E2 E0 15 7 8 10 E1 24 17 13 6"""
"""Station 5""","""14 8 E0 16 22 24 E1"""


### Programmatically creating lists

In [5]:
# Split each readings string on single spaces; the named expression replaces
# the existing "temperatures" column with a list of strings.
weather = weather.with_columns(temperatures=pl.col("temperatures").str.split(" "))

# Unnest the lists: one output row per individual reading.
result = weather.explode("temperatures")

In [7]:
# Display the frame after the split: "temperatures" is now a list column.
weather

station,temperatures
str,list[str]
"""Station 1""","[""20"", ""5"", … ""20""]"
"""Station 2""","[""18"", ""8"", … ""40""]"
"""Station 3""","[""19"", ""24"", … ""22""]"
"""Station 4""","[""E2"", ""E0"", … ""6""]"
"""Station 5""","[""14"", ""8"", … ""E1""]"


In [8]:
# Display the exploded frame: one row per individual reading.
result

station,temperatures
str,str
"""Station 1""","""20"""
"""Station 1""","""5"""
"""Station 1""","""5"""
"""Station 1""","""E1"""
"""Station 1""","""7"""
…,…
"""Station 5""","""E0"""
"""Station 5""","""16"""
"""Station 5""","""22"""
"""Station 5""","""24"""


The tutorial tells me I often don't need to do this (explode the lists) in order to operate on the list elements (?).

### Operating on lists

In [11]:
# Add three sub-lists per station; the keyword names become the column names.
weather.with_columns(
    # First three readings.
    head=pl.col("temperatures").list.head(3),
    # Last three readings.
    tail=pl.col("temperatures").list.tail(3),
    # Start three from the end and take two readings.
    two_next_to_last=pl.col("temperatures").list.slice(-3, 2),
)

station,temperatures,head,tail,two_next_to_last
str,list[str],list[str],list[str],list[str]
"""Station 1""","[""20"", ""5"", … ""20""]","[""20"", ""5"", ""5""]","[""9"", ""6"", ""20""]","[""9"", ""6""]"
"""Station 2""","[""18"", ""8"", … ""40""]","[""18"", ""8"", ""16""]","[""90"", ""70"", ""40""]","[""90"", ""70""]"
"""Station 3""","[""19"", ""24"", … ""22""]","[""19"", ""24"", ""E9""]","[""12"", ""10"", ""22""]","[""12"", ""10""]"
"""Station 4""","[""E2"", ""E0"", … ""6""]","[""E2"", ""E0"", ""15""]","[""17"", ""13"", ""6""]","[""17"", ""13""]"
"""Station 5""","[""14"", ""8"", … ""E1""]","[""14"", ""8"", ""E0""]","[""22"", ""24"", ""E1""]","[""22"", ""24""]"


### Element-wise computation within lists

Note: when you use `with_columns` and don't give a column an alias, it will overwrite the existing column.

Using `list.eval` is the way to do this.

In [20]:
# Count, per station, the readings that cannot be parsed as integers.
weather.with_columns(
    errors=pl.col("temperatures")
    # A non-strict cast yields null for non-integer strings; flag those nulls.
    .list.eval(pl.element().cast(pl.Int64, strict=False).is_null())
    # True counts as 1, so the sum is the number of flagged readings.
    .list.sum()
)

station,temperatures,errors
str,list[str],u32
"""Station 1""","[""20"", ""5"", … ""20""]",1
"""Station 2""","[""18"", ""8"", … ""40""]",4
"""Station 3""","[""19"", ""24"", … ""22""]",1
"""Station 4""","[""E2"", ""E0"", … ""6""]",3
"""Station 5""","[""14"", ""8"", … ""E1""]",2


A different approach, where you check whether measurements contain a letter:

In [22]:
# Count, per station, the readings that contain a letter.
weather.with_columns(
    errors=pl.col("temperatures")
    # Case-insensitive match on any letter a-z anywhere in the reading.
    .list.eval(pl.element().str.contains("(?i)[a-z]"))
    # True counts as 1, so the sum is the number of matching readings.
    .list.sum()
)

station,temperatures,errors
str,list[str],u32
"""Station 1""","[""20"", ""5"", … ""20""]",1
"""Station 2""","[""18"", ""8"", … ""40""]",4
"""Station 3""","[""19"", ""24"", … ""22""]",1
"""Station 4""","[""E2"", ""E0"", … ""6""]",3
"""Station 5""","[""14"", ""8"", … ""E1""]",2


### Row-wise computations

In [23]:
# Wide layout: one row per station and one integer column per day.
weather_by_day = pl.DataFrame(
    data=dict(
        station=[f"Station {idx}" for idx in range(1, 11)],
        day_1=[17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
        day_2=[15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
        day_3=[16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
    )
)
print(weather_by_day)

shape: (10, 4)
┌────────────┬───────┬───────┬───────┐
│ station    ┆ day_1 ┆ day_2 ┆ day_3 │
│ ---        ┆ ---   ┆ ---   ┆ ---   │
│ str        ┆ i64   ┆ i64   ┆ i64   │
╞════════════╪═══════╪═══════╪═══════╡
│ Station 1  ┆ 17    ┆ 15    ┆ 16    │
│ Station 2  ┆ 11    ┆ 11    ┆ 15    │
│ Station 3  ┆ 8     ┆ 10    ┆ 24    │
│ Station 4  ┆ 22    ┆ 8     ┆ 24    │
│ Station 5  ┆ 9     ┆ 7     ┆ 8     │
│ Station 6  ┆ 21    ┆ 14    ┆ 23    │
│ Station 7  ┆ 20    ┆ 18    ┆ 19    │
│ Station 8  ┆ 8     ┆ 21    ┆ 23    │
│ Station 9  ┆ 8     ┆ 15    ┆ 16    │
│ Station 10 ┆ 17    ┆ 13    ┆ 10    │
└────────────┴───────┴───────┴───────┘


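Not sure what is calculated in this example. It looks like, for each station, each day's temperature is ranked among that station's days (highest temperature = rank 1) and the rank is then divided by the number of days.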

In [28]:
# Expression evaluated inside each list: the descending rank of every element
# (largest value gets rank 1) divided by the list's element count, rounded to
# two decimals. `pl.element()` is used on both sides so the expression refers
# only to the list elements and not to a wildcard over the eval context.
rank_pct = (pl.element().rank(descending=True) / pl.element().count()).round(2)

result = weather_by_day.with_columns(
    # Gather every column except "station" into one list column per row.
    pl.concat_list(pl.all().exclude("station")).alias("all_temps")
).select(
    # Keep the original columns and drop the intermediate list.
    pl.all().exclude("all_temps"),
    # Apply the rank expression to each row's list.
    pl.col("all_temps").list.eval(rank_pct, parallel=True).alias("temps_rank"),
)
result

station,day_1,day_2,day_3,temps_rank
str,i64,i64,i64,list[f64]
"""Station 1""",17,15,16,"[0.33, 1.0, 0.67]"
"""Station 2""",11,11,15,"[0.83, 0.83, 0.33]"
"""Station 3""",8,10,24,"[1.0, 0.67, 0.33]"
"""Station 4""",22,8,24,"[0.67, 1.0, 0.33]"
"""Station 5""",9,7,8,"[0.33, 1.0, 0.67]"
"""Station 6""",21,14,23,"[0.67, 1.0, 0.33]"
"""Station 7""",20,18,19,"[0.33, 1.0, 0.67]"
"""Station 8""",8,21,23,"[1.0, 0.67, 0.33]"
"""Station 9""",8,15,16,"[1.0, 0.67, 0.33]"
"""Station 10""",17,13,10,"[0.33, 0.67, 1.0]"


## Working with arrays

In [30]:
# Two fixed-width array columns: name pairs (width 2) and number triples
# (width 3), with the widths and element types set by the schema.
df = pl.DataFrame(
    data=dict(
        first_last=[
            ["Anne", "Adams"],
            ["Brandon", "Branson"],
            ["Camila", "Campbell"],
            ["Dennis", "Doyle"],
        ],
        fav_numbers=[
            [42, 0, 1],
            [2, 3, 5],
            [13, 21, 34],
            [73, 3, 7],
        ],
    ),
    schema=dict(
        first_last=pl.Array(pl.String, 2),
        fav_numbers=pl.Array(pl.Int32, 3),
    ),
)
df

first_last,fav_numbers
"array[str, 2]","array[i32, 3]"
"[""Anne"", ""Adams""]","[42, 0, 1]"
"[""Brandon"", ""Branson""]","[2, 3, 5]"
"[""Camila"", ""Campbell""]","[13, 21, 34]"
"[""Dennis"", ""Doyle""]","[73, 3, 7]"


In [34]:
# Array-namespace operations; the keyword names become the output columns.
df.select(
    # Join first and last name with a space.
    name=pl.col("first_last").arr.join(" "),
    # Sort each triple; the column keeps its original name.
    fav_numbers=pl.col("fav_numbers").arr.sort(),
    largest_fav=pl.col("fav_numbers").arr.max(),
    summed=pl.col("fav_numbers").arr.sum(),
    # Whether the triple includes the value 3.
    likes_3=pl.col("fav_numbers").arr.contains(3),
)

name,fav_numbers,largest_fav,summed,likes_3
str,"array[i32, 3]",i32,i32,bool
"""Anne Adams""","[0, 1, 42]",42,43,False
"""Brandon Branson""","[2, 3, 5]",5,10,True
"""Camila Campbell""","[13, 21, 34]",34,68,False
"""Dennis Doyle""","[3, 7, 73]",73,83,True
