# [Expressions: Missing data](https://docs.pola.rs/user-guide/expressions/missing-data/)

null = no value at all

NaN = a value: a regular floating-point value meaning "Not a Number", so it is not treated as missing

In [3]:
import polars as pl

# Two integer columns whose gaps are written as Python `None`;
# Polars stores every `None` as a null entry.
df = pl.DataFrame({
    "value": [1, None, 3],
    "beers": [None, 9, None],
})
print(df)

shape: (3, 2)
┌───────┬───────┐
│ value ┆ beers │
│ ---   ┆ ---   │
│ i64   ┆ i64   │
╞═══════╪═══════╡
│ 1     ┆ null  │
│ null  ┆ 9     │
│ 3     ┆ null  │
└───────┴───────┘


## Missing data metadata

In [4]:
# Per-column count of null entries, returned as a one-row DataFrame.
df.null_count()

value,beers
u32,u32
1,2


## Filling missing data

In [5]:
# Rebinds `df`: the filling examples below all operate on this frame.
# `col1` is complete (f64); `col2` is an integer column with two nulls.
df = pl.DataFrame({
    "col1": [0.5, 1, 1.5, 2, 2.5],
    "col2": [1, None, 3, None, 5],
})

### Fill with a specified literal value

In [6]:
# Every null in `col2` is replaced by the constant 3; non-null entries are kept.
df.with_columns(pl.col("col2").fill_null(3))

col1,col2
f64,i64
0.5,1
1.0,3
1.5,3
2.0,3
2.5,5


### Fill with an expression

In [7]:
# The replacement is computed row by row from `col1`. Doubling the f64 column
# gives floats, so the result is cast to Int64 to match `col2`.
df.with_columns(
    pl.col("col2").fill_null(
        (pl.col("col1") * 2).cast(pl.Int64)
    )
)

col1,col2
f64,i64
0.5,1
1.0,2
1.5,3
2.0,4
2.5,5


### Fill with a strategy based on neighbouring values

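The `forward` strategy carries the last non-null value forward into the nulls that follow it; `backward` does the opposite and fills each null with the next non-null value.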

In [10]:
# Add one column per strategy, named after the strategy itself, so both
# results can be compared side by side with the original `col2`.
df.with_columns(
    [
        pl.col("col2").fill_null(strategy=direction).alias(direction)
        for direction in ("forward", "backward")
    ]
)

col1,col2,forward,backward
f64,i64,i64,i64
0.5,1.0,1,1
1.0,,1,3
1.5,3.0,3,3
2.0,,3,5
2.5,5.0,5,5


### Fill with interpolation

In [14]:
# `with_columns` keeps every existing column, so `col2` does not have to be
# selected again to appear next to its interpolated version.
# `interpolate` fills each null from its neighbouring values; the new column is f64.
df.with_columns(
    pl.col("col2").interpolate().alias("interpolated"),
)

col1,col2,interpolated
f64,i64,f64
0.5,1.0,1.0
1.0,,2.0
1.5,3.0,3.0
2.0,,4.0
2.5,5.0,5.0


## Not a Number, or NaN values

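NaN values aren't counted as missing values, so they are only filled by `fill_nan`, not by `fill_null`. Aggregations also treat the two differently: when calculating a mean, nulls are skipped, whereas a NaN takes part in the calculation and makes the result NaN.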

In [15]:
import numpy as np

# NaN spelled two ways (NumPy constant and Python float); both end up as the
# same NaN value in an f64 column.
nan_df = pl.DataFrame({
    "value": [1.0, np.nan, float("nan"), 3.0],
})
print(nan_df)

shape: (4, 1)
┌───────┐
│ value │
│ ---   │
│ f64   │
╞═══════╡
│ 1.0   │
│ NaN   │
│ NaN   │
│ 3.0   │
└───────┘


In [16]:
# NOTE: this rebinds `df`, replacing the col1/col2 frame used by the filling
# examples earlier in the notebook.
# Dividing the integer columns gives an f64 column in which the 0 / 0 row
# is NaN rather than null.
df = pl.DataFrame({"dividend": [1, 0, -1], "divisor": [1, 0, -1]})
result = df.select(pl.col("dividend").truediv(pl.col("divisor")))
print(result)

shape: (3, 1)
┌──────────┐
│ dividend │
│ ---      │
│ f64      │
╞══════════╡
│ 1.0      │
│ NaN      │
│ 1.0      │
└──────────┘


In [17]:
# Compare aggregates over the raw column (still containing NaN) with the same
# column after its NaN entries have been turned into nulls.
(
    nan_df
    # `fill_nan(None)` swaps every NaN for a null.
    .with_columns(pl.col("value").fill_nan(None).alias("replaced"))
    # Mean and sum of every column, suffixed so the four results stay distinct.
    .select(
        pl.all().mean().name.suffix("_mean"),
        pl.all().sum().name.suffix("_sum"),
    )
)

value_mean,replaced_mean,value_sum,replaced_sum
f64,f64,f64,f64
,2.0,,4.0
