# [Data types and structures](https://docs.pola.rs/user-guide/concepts/data-types-and-structures/)

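Polars represents missing data as `null` for every data type, including floats. `NaN` is a regular floating point value ("not a number"), not a marker for missing data.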

In [1]:
import polars as pl
from datetime import date
import random

## series

In [2]:
# The dtype is inferred from the values (Int64 for plain Python ints).
# To request a specific dtype instead, pass it explicitly:
# s = pl.Series("ints", [1, 2, 3, 4], dtype=pl.UInt64)
s = pl.Series(name="ints", values=[1, 2, 3, 4])
print(s)

shape: (4,)
Series: 'ints' [i64]
[
	1
	2
	3
	4
]


In [3]:
# Identical values, once with the inferred dtype and once with an explicit one.
s1 = pl.Series(name="ints", values=[1, 2, 3, 4])
s2 = pl.Series(name="ints", values=[1, 2, 3, 4], dtype=pl.UInt64)
print(s1.dtype, s2.dtype)

Int64 UInt64


## dataframe

In [4]:

# One entry per column; every list holds one value per row.
df = pl.DataFrame(
    data=dict(
        name=["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        birthdate=[date(1997, 1, 10), date(1985, 2, 15), date(1983, 3, 22), date(1981, 4, 30)],
        weight=[57.9, 72.5, 53.6, 83.1],  # kilograms
        height=[1.56, 1.77, 1.65, 1.75],  # metres
    )
)

print(df)

shape: (4, 4)
┌────────────────┬────────────┬────────┬────────┐
│ name           ┆ birthdate  ┆ weight ┆ height │
│ ---            ┆ ---        ┆ ---    ┆ ---    │
│ str            ┆ date       ┆ f64    ┆ f64    │
╞════════════════╪════════════╪════════╪════════╡
│ Alice Archer   ┆ 1997-01-10 ┆ 57.9   ┆ 1.56   │
│ Ben Brown      ┆ 1985-02-15 ┆ 72.5   ┆ 1.77   │
│ Chloe Cooper   ┆ 1983-03-22 ┆ 53.6   ┆ 1.65   │
│ Daniel Donovan ┆ 1981-04-30 ┆ 83.1   ┆ 1.75   │
└────────────────┴────────────┴────────┴────────┘


### inspecting a dataframe

### head

In [5]:
# Show only the first three rows.
df.head(n=3)

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1983-03-22,53.6,1.65


### glimpse

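`glimpse` is a different way to preview the dataframe: it prints one line per column, with the column's dtype and its first values.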

In [6]:
# Prints the row and column counts, then one line per column (name, dtype, values).
df.glimpse()

Rows: 4
Columns: 4
$ name       <str> 'Alice Archer', 'Ben Brown', 'Chloe Cooper', 'Daniel Donovan'
$ birthdate <date> 1997-01-10, 1985-02-15, 1983-03-22, 1981-04-30
$ weight     <f64> 57.9, 72.5, 53.6, 83.1
$ height     <f64> 1.56, 1.77, 1.65, 1.75



### tail

In [7]:
# Show only the last two rows.
df.tail(n=2)

name,birthdate,weight,height
str,date,f64,f64
"""Chloe Cooper""",1983-03-22,53.6,1.65
"""Daniel Donovan""",1981-04-30,83.1,1.75


### sample

Get a sample of random rows.

In [8]:
# Seeds Python's own RNG only (kept for any cell that uses `random` directly).
random.seed(42)
# Seed the sampling itself: per the polars docs, `sample` called without `seed`
# generates a new random seed for each sample operation, so the selected rows
# are only guaranteed to be reproducible when the seed is passed explicitly.
df.sample(n=2, seed=42)

name,birthdate,weight,height
str,date,f64,f64
"""Daniel Donovan""",1981-04-30,83.1,1.75
"""Chloe Cooper""",1983-03-22,53.6,1.65


### describe

In [16]:
# Summary statistics per column: count, null_count, mean, std, min, quartiles, max.
df.describe()

statistic,name,birthdate,weight,height
str,str,str,f64,f64
"""count""","""4""","""4""",4.0,4.0
"""null_count""","""0""","""0""",0.0,0.0
"""mean""",,"""1986-09-04 00:00:00""",66.775,1.6825
"""std""",,,13.560082,0.097082
"""min""","""Alice Archer""","""1981-04-30""",53.6,1.56
"""25%""",,"""1983-03-22""",57.9,1.65
"""50%""",,"""1985-02-15""",72.5,1.75
"""75%""",,"""1985-02-15""",72.5,1.75
"""max""","""Daniel Donovan""","""1997-01-10""",83.1,1.77


### schema

Get the data type of every column.

In [17]:
# Mapping of column name -> polars data type, in column order.
df.schema

Schema([('name', String),
        ('birthdate', Date),
        ('weight', Float64),
        ('height', Float64)])

You can override the inferred schema by passing `schema` to the constructor. If you only want to change the types of some columns, use `schema_overrides` instead.

In [18]:
# A `None` entry leaves that column's dtype to inference (`name` stays a
# string column); `age` is stored as UInt8 instead of the inferred Int64.
df = pl.DataFrame(
    data=dict(
        name=["Alice", "Ben", "Chloe", "Daniel"],
        age=[27, 39, 41, 43],
    ),
    schema=dict(name=None, age=pl.UInt8),
)

print(df)

shape: (4, 2)
┌────────┬─────┐
│ name   ┆ age │
│ ---    ┆ --- │
│ str    ┆ u8  │
╞════════╪═════╡
│ Alice  ┆ 27  │
│ Ben    ┆ 39  │
│ Chloe  ┆ 41  │
│ Daniel ┆ 43  │
└────────┴─────┘


### more data types

In [47]:
# Array, List, Object, Categorical, Enum, Struct
class Apple:
    """Plain Python object used to demonstrate a column of arbitrary objects."""

    def __init__(self, juicy: bool, color: str):
        # Store both constructor arguments unchanged as public attributes.
        self.juicy, self.color = juicy, color

    def __str__(self) -> str:
        # The `=` specifier makes the f-string emit "expression=repr(value)".
        return f"{self.juicy=}; {self.color=}"

apple1, apple2 = Apple(True, "green"), Apple(False, "red")
# Enum: a categorical type whose allowed values are fixed up front.
sizes = pl.Enum(["big", "small"])

df = pl.DataFrame(
    data={
        # Python dicts become a Struct column (basically a dictionary per row).
        "data": [{"jumps": 13, "wins": 3}, {"jumps": 8, "wins": 45}],
        # Enum column: values must come from the predefined `sizes`.
        "cups": pl.Series(["big", "small"], dtype=sizes),
        # Arbitrary Python instances end up in an Object column.
        "food": [apple1, apple2],
        "chickens": [1, 4],
        # Plain strings here; the override below turns them into a Categorical,
        # which figures out the categories on the fly.
        "categories": ["small", "big"],
        # Fixed-width rows, stored as Array(Boolean, 5) via the override below.
        "bit_flags": [
            [True, True, True, True, False],
            [False, True, True, True, True],
        ],
        # Two 3x3 boards, stored as Array(String, (3, 3)) via the override below.
        "tic_tac_toe": [
            [[" ", "x", "o"], [" ", "x", " "], ["o", "x", " "]],
            [["o", "x", "x"], [" ", "o", "x"], [" ", " ", "o"]],
        ],
    },
    # Only the listed columns get an explicit dtype; the rest are inferred.
    schema_overrides={
        "categories": pl.Categorical(),
        "bit_flags": pl.Array(pl.Boolean, 5),
        "tic_tac_toe": pl.Array(pl.String, (3, 3)),
    },
)
df

data,cups,food,chickens,categories,bit_flags,tic_tac_toe
struct[2],enum,object,i64,cat,"array[bool, 5]","array[str, (3, 3)]"
"{13,3}","""big""",self.juicy=True; self.color='green',1,"""small""","[true, true, … false]","[["" "", ""x"", ""o""], ["" "", ""x"", "" ""], [""o"", ""x"", "" ""]]"
"{8,45}","""small""",self.juicy=False; self.color='red',4,"""big""","[false, true, … true]","[[""o"", ""x"", ""x""], ["" "", ""o"", ""x""], ["" "", "" "", ""o""]]"
