# Pandas CategoricalDtype with invalid value

This is a notebook for the medium article [A practical introduction to Keras Callbacks](https://medium.com/@bindiatwork/a-practical-introduction-to-keras-callbacks-in-tensorflow-2-705d0c584966)

Please check out the article for instructions.

**License**: [BSD 2-Clause](https://opensource.org/licenses/BSD-2-Clause)

In [1]:
import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

In [10]:
# Sample clothing table; the last row carries a size label ('other')
# that is not one of the standard sizes.
df = pd.DataFrame(
    data=dict(
        cloth_id=[1001, 1002, 1003, 1004, 1005, 1006, 1007],
        size=['S', 'XL', 'M', 'XS', 'L', 'S', 'other'],
    )
)

In [11]:
# Show the raw frame: 'size' still holds plain strings, including 'other'.
df

Unnamed: 0,cloth_id,size
0,1001,S
1,1002,XL
2,1003,M
3,1004,XS
4,1005,L
5,1006,S
6,1007,other


## The problem

In [12]:
# Ordered size scale: XS < S < M < L < XL.
size_order = CategoricalDtype(categories=['XS', 'S', 'M', 'L', 'XL'], ordered=True)

# Cast the column in place; labels missing from the scale are not rejected here.
df['size'] = df['size'].astype(size_order)

In [13]:
# 'other' is not one of the declared categories, so the cast has turned it into NaN.
df

Unnamed: 0,cloth_id,size
0,1001,S
1,1002,XL
2,1003,M
3,1004,XS
4,1005,L
5,1006,S
6,1007,


### Fix

In [14]:
# Handle invalid values: start again from the raw string data so that
# 'other' is present in the 'size' column.
df = pd.DataFrame.from_dict(
    {
        'cloth_id': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
        'size': ['S', 'XL', 'M', 'XS', 'L', 'S', 'other'],
    }
)

def as_categorical(source: pd.Series, categories: pd.Series) -> pd.Series:
    """Convert ``source`` to an ordered categorical, rejecting unknown values.

    A plain ``astype`` to a ``CategoricalDtype`` replaces every value that is
    not among the declared categories with NaN without complaint. This helper
    raises instead, so unexpected labels are noticed immediately.

    Parameters
    ----------
    source : pd.Series
        Values to convert. Entries that are already missing stay missing and
        are not treated as invalid.
    categories : list-like
        Allowed values, in ascending order (a plain list works as well as a
        Series).

    Returns
    -------
    pd.Series
        ``source`` cast to an ordered categorical built from ``categories``.

    Raises
    ------
    Exception
        If ``source`` contains a non-missing value that is not in
        ``categories``; the message lists the offending values joined by '|'.
    """
    cd = CategoricalDtype(categories, ordered=True)

    # Present in the input but absent from the allowed categories. Checking
    # before the cast means we never rely on astype() coercing them to NaN.
    invalid = source.notna() & ~source.isin(cd.categories)
    if invalid.any():
        # Join via str() so non-string values can be reported too.
        items = '|'.join(map(str, source[invalid]))
        raise Exception(f'invalid size value: "{items}"')

    # Cast with the dtype built from `categories`, so the argument is honoured.
    return source.astype(cd)

In [15]:
# Validate and convert the 'size' column against the allowed size scale;
# the 'other' entry is outside that scale, so this call raises.
df['size'] = as_categorical(source=df['size'], categories=['XS', 'S', 'M', 'L', 'XL'])

Exception: invalid size value: "other"