# [Expressions: Structs](https://docs.pola.rs/user-guide/expressions/structs/)

In [1]:
import polars as pl

# Column-oriented sample data: one entry per (movie, theatre) rating record.
ratings_data = {
    "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "Cars"],
    "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "NE"],
    "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.5, 4.9, 4.7, 4.6],
    "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 28],
}
ratings = pl.DataFrame(ratings_data)
print(ratings)

shape: (10, 4)
┌───────┬─────────┬────────────┬───────┐
│ Movie ┆ Theatre ┆ Avg_Rating ┆ Count │
│ ---   ┆ ---     ┆ ---        ┆ ---   │
│ str   ┆ str     ┆ f64        ┆ i64   │
╞═══════╪═════════╪════════════╪═══════╡
│ Cars  ┆ NE      ┆ 4.5        ┆ 30    │
│ IT    ┆ ME      ┆ 4.4        ┆ 27    │
│ ET    ┆ IL      ┆ 4.6        ┆ 26    │
│ Cars  ┆ ND      ┆ 4.3        ┆ 29    │
│ Up    ┆ NE      ┆ 4.8        ┆ 31    │
│ IT    ┆ SD      ┆ 4.7        ┆ 28    │
│ Cars  ┆ NE      ┆ 4.5        ┆ 28    │
│ ET    ┆ IL      ┆ 4.9        ┆ 26    │
│ Up    ┆ IL      ┆ 4.7        ┆ 33    │
│ Cars  ┆ NE      ┆ 4.6        ┆ 28    │
└───────┴─────────┴────────────┴───────┘


## Encountering the data type Struct

In [3]:
# `value_counts` yields one struct per distinct theatre: the value and its count.
theatre_counts_expr = pl.col("Theatre").value_counts(sort=True)
ratings.select(theatre_counts_expr)

Theatre
struct[2]
"{""NE"",4}"
"{""IL"",3}"
"{""ME"",1}"
"{""ND"",1}"
"{""SD"",1}"


Use `unnest` to expand the struct column into one regular column per field, which gives a plain count table.

In [5]:
# Same struct column as before, then split into its individual fields.
theatre_counts_struct = ratings.select(pl.col("Theatre").value_counts(sort=True))
theatre_counts_struct.unnest("Theatre")

Theatre,count
str,u32
"""NE""",4
"""IL""",3
"""ME""",1
"""ND""",1
"""SD""",1


## Inferring the data type Struct from dictionaries

In [6]:
# A list of dictionaries sharing the same keys is inferred as a Struct series.
rating_series = pl.Series(
    name="ratings",
    values=[
        {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5},
        {"Movie": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9},
    ],
)
print(rating_series)

shape: (2,)
Series: 'ratings' [struct[3]]
[
	{"Cars","NE",4.5}
	{"Toy Story","ME",4.9}
]


Mismatched field names or value types across the dictionaries can result in `null` values or in errors.

In [7]:
# Deliberately inconsistent records, to show how mismatches surface in the result.
null_rating_series = pl.Series(
    name="ratings",
    values=[
        {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5},
        # Key "Mov" does not match "Movie" used by the other records.
        {"Mov": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9},
        # "Avg_Rating" is given as a string here instead of a float.
        {"Movie": "Snow White", "Theatre": "IL", "Avg_Rating": "4.7"},
    ],
    # Non-strict construction, so the resulting structs (with their `null` values) can be shown.
    strict=False,
)
print(null_rating_series)

shape: (3,)
Series: 'ratings' [struct[4]]
[
	{"Cars","NE","4.5",null}
	{null,"ME","4.9","Toy Story"}
	{"Snow White","IL","4.7",null}
]


## Extracting individual values of a Struct

In [8]:
# Pull a single field out of the struct series by name.
movie_field = rating_series.struct.field("Movie")
movie_field

Movie
str
"""Cars"""
"""Toy Story"""


## Renaming individual fields of a Struct

In [10]:
# Replacement names are given positionally, one per existing struct field.
new_field_names = ["Film", "State", "Value"]
result = rating_series.struct.rename_fields(new_field_names)

# Unnest so the renamed fields show up as column headers.
result.to_frame().unnest("ratings")

Film,State,Value
str,str,f64
"""Cars""","""NE""",4.5
"""Toy Story""","""ME""",4.9


## Practical use-cases of Struct columns

### Identifying duplicate rows

In [11]:
# A row counts as duplicated when its (Movie, Theatre) pair occurs more than once.
is_repeated_pair = pl.struct("Movie", "Theatre").is_duplicated()
ratings.filter(is_repeated_pair)

Movie,Theatre,Avg_Rating,Count
str,str,f64,i64
"""Cars""","""NE""",4.5,30
"""ET""","""IL""",4.6,26
"""Cars""","""NE""",4.5,28
"""ET""","""IL""",4.9,26
"""Cars""","""NE""",4.6,28


### Multi-column ranking

In [None]:
# Rank on the (Count, Avg_Rating) struct, highest first; in the displayed result
# rows with equal Count are ordered by Avg_Rating.
ranking_expr = (
    pl.struct("Count", "Avg_Rating")
    .rank(method="dense", descending=True)
    .over("Movie", "Theatre")  # Apply the ranking within each (Movie, Theatre) group.
    .alias("Ranking")
)
ratings.with_columns(ranking_expr)

Movie,Theatre,Avg_Rating,Count,Ranking
str,str,f64,i64,u32
"""Cars""","""NE""",4.5,30,1
"""IT""","""ME""",4.4,27,1
"""ET""","""IL""",4.6,26,2
"""Cars""","""ND""",4.3,29,1
"""Up""","""NE""",4.8,31,1
"""IT""","""SD""",4.7,28,1
"""Cars""","""NE""",4.5,28,3
"""ET""","""IL""",4.9,26,1
"""Up""","""IL""",4.7,33,1
"""Cars""","""NE""",4.6,28,2


## Using multiple columns in a single expression

In [None]:
def ack(m, n):
    """Return the Ackermann function value for ``m`` and ``n``.

    Recursive definition, as implemented here:
      * ``m`` falsy (zero)             -> ``n + 1``
      * ``m`` non-zero, ``n`` zero     -> ``ack(m - 1, 1)``
      * both non-zero                  -> ``ack(m - 1, ack(m, n - 1))``
    """
    if m and n:
        # General case: the inner call is resolved first, then fed to the outer one.
        return ack(m - 1, ack(m, n - 1))
    if m:
        # Only `n` is zero at this point.
        return ack(m - 1, 1)
    return n + 1

# Input pairs (m, n) to evaluate; kept small because `ack` grows very quickly.
values = pl.DataFrame(
    {
        "m": [0, 0, 0, 1, 1, 1, 2],
        "n": [2, 3, 4, 1, 2, 3, 1],
    }
)

# Bundle both columns into a struct so that one expression can read them together;
# each element arrives in the lambda as a mapping keyed by field name.
ack_expr = pl.struct("m", "n").map_elements(
    lambda row: ack(row["m"], row["n"]),
    return_dtype=pl.Int64,
)
values.with_columns(ack_expr.alias("ack"))


m,n,ack
i64,i64,i64
0,2,3
0,3,4
0,4,5
1,1,3
1,2,4
1,3,5
2,1,5
