# 小编环境

In [1]:
import platform
import sys

import polars as pl

# Versions this notebook was originally run with: Python 3.11.9, polars 0.20.22.
# platform.python_version() yields just "major.minor.patch" on any build,
# whereas splitting sys.version on '|' only trims conda-style version strings
# and prints the full build banner on a stock CPython interpreter.
print('python 版本：', platform.python_version())
print("polars 版本：", pl.__version__)

python 版本： 3.11.9 
polars 版本： 0.20.22


# 分类数据 Categorical data

分类数据是指取值来自有限类别、可以进行编码的数据，比如：性别、年龄、国家、城市、职业等等。对这些数据进行编码后再存储，可以节省存储空间

Polars 支持两种不同的数据类型来处理分类数据：`Enum` 和 `Categorical`
- 当类别预先已知时使用 `Enum`，需要提前提供所有类别
- 当不知道类别或类别不固定时，可以使用 `Categorical`

In [2]:
# The same five values stored two ways: once with an Enum dtype whose
# categories are declared up front, once with the Categorical dtype.
enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])

enum_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=enum_dtype,
)
cat_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=pl.Categorical,
)

# Categorical

`Categorical`相对比较灵活，不用提前获取所有的类别，当有新类别时，会自动进行编码

当直接拼接两个分别创建的 Categorical 列时，以下这种方式会比较慢：polars 是根据字符串出现的先后顺序进行编码的，同一个字符串在不同的序列里编码可能不一样，直接合并时需要在全局重新进行一次编码，所以速度会比较慢

In [3]:
# Two Categorical series created independently of each other; the same
# strings appear in a different order in each.
cat_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=pl.Categorical,
)
cat2_series = pl.Series(
    values=["Panda", "Brown", "Brown", "Polar", "Polar"],
    dtype=pl.Categorical,
)

# Appending emits a CategoricalRemappingWarning: the local categoricals have
# different encodings, so an expensive re-encoding is done for the merge.
# The warning itself suggests a StringCache, or an Enum type when the
# categories are known in advance.
print(cat_series.append(cat2_series))

shape: (10,)
Series: '' [cat]
[
	"Polar"
	"Panda"
	"Brown"
	"Brown"
	"Polar"
	"Panda"
	"Brown"
	"Brown"
	"Polar"
	"Polar"
]


  print(cat_series.append(cat2_series))


可以使用 polars 提供的全局字符串缓存 `StringCache`，让不同序列共用同一套编码，从而避免重新编码、提升数据处理效率

In [4]:
# Both series are created (and appended) inside a single StringCache context;
# per the notebook text this is the faster way to combine Categorical columns.
with pl.StringCache():
    cat_series = pl.Series(
        values=["Polar", "Panda", "Brown", "Brown", "Polar"],
        dtype=pl.Categorical,
    )
    cat2_series = pl.Series(
        values=["Panda", "Brown", "Brown", "Polar", "Polar"],
        dtype=pl.Categorical,
    )
    print(cat_series.append(cat2_series))

shape: (10,)
Series: '' [cat]
[
	"Polar"
	"Panda"
	"Brown"
	"Brown"
	"Polar"
	"Panda"
	"Brown"
	"Brown"
	"Polar"
	"Polar"
]


# Enum

上面拼接两个 Categorical 列时需要重新编码、比较耗时的情况，在 `Enum` 中不会出现，因为全部类别已经提前给定，两个序列使用的是同一套编码

In [5]:
# Both series share one Enum dtype, so they can be appended directly.
# NOTE: the names cat_series / cat2_series are kept from the earlier cells,
# but here they hold Enum-typed series (see the "[enum]" tag in the output).
dtype = pl.Enum(["Polar", "Panda", "Brown"])
cat_series = pl.Series(
    values=["Polar", "Panda", "Brown", "Brown", "Polar"],
    dtype=dtype,
)
cat2_series = pl.Series(
    values=["Panda", "Brown", "Brown", "Polar", "Polar"],
    dtype=dtype,
)
print(cat_series.append(cat2_series))

shape: (10,)
Series: '' [enum]
[
	"Polar"
	"Panda"
	"Brown"
	"Brown"
	"Polar"
	"Panda"
	"Brown"
	"Brown"
	"Polar"
	"Polar"
]


如果待编码的字符串不在 `Enum` 预先给定的类别中，则会报错（如下方输出所示：`str` 到 `enum` 的转换失败）

In [6]:
# "Black" is not among the declared Enum categories, so building the series
# fails; the error is caught and its message printed as the demonstration.
dtype = pl.Enum(["Polar", "Panda", "Brown"])
try:
    cat_series = pl.Series(
        values=["Polar", "Panda", "Brown", "Black"],
        dtype=dtype,
    )
except Exception as err:
    print(err)

conversion from `str` to `enum` failed in column '' for 1 out of 4 values: ["Black"]

Ensure that all values in the input column are present in the categories of the enum datatype.


# 比较

- Categorical vs Categorical
- Categorical vs String
- Enum vs Enum
- Enum vs String(该字符串必须要在提前获取的Enum中)

## Categorical vs Categorical

In [7]:
# Element-wise equality between two Categorical series created under the
# same StringCache; the result is a boolean series.
with pl.StringCache():
    cat_series = pl.Series(
        values=["Brown", "Panda", "Polar"], dtype=pl.Categorical
    )
    cat_series2 = pl.Series(
        values=["Polar", "Panda", "Black"], dtype=pl.Categorical
    )
    print(cat_series == cat_series2)

shape: (3,)
Series: '' [bool]
[
	false
	true
	false
]


## Categorical vs String

In [8]:
# Compare every element of a Categorical series against one string literal.
cat_series = pl.Series(
    values=["Brown", "Panda", "Polar"],
    dtype=pl.Categorical,
)
print(cat_series <= "Cat")

shape: (3,)
Series: '' [bool]
[
	true
	false
	false
]


In [9]:
# Element-wise comparison of a Categorical series with a plain string series
# (no dtype given for the second one).
cat_series = pl.Series(
    values=["Brown", "Panda", "Polar"],
    dtype=pl.Categorical,
)
cat_series_utf = pl.Series(values=["Panda", "Panda", "A Polar"])
print(cat_series <= cat_series_utf)

shape: (3,)
Series: '' [bool]
[
	true
	true
	false
]


## Enum vs Enum

In [10]:
# Element-wise equality between two series that share the same Enum dtype.
dtype = pl.Enum(["Polar", "Panda", "Brown"])
cat_series = pl.Series(values=["Brown", "Panda", "Polar"], dtype=dtype)
cat_series2 = pl.Series(values=["Polar", "Panda", "Brown"], dtype=dtype)
print(cat_series == cat_series2)

shape: (3,)
Series: '' [bool]
[
	false
	true
	false
]


## Enum vs String(该字符串必须要在提前获取的Enum中)

In [11]:
# Set-up lives outside the try so that only the comparison being demonstrated
# can be caught; the Enum dtype is named first, as in the other Enum cells.
dtype = pl.Enum(["Low", "Medium", "High"])
cat_series = pl.Series(["Low", "Medium", "High"], dtype=dtype)
try:
    # "Excellent" is not one of the Enum categories, so the comparison raises;
    # the result is deliberately unused because only the error is of interest.
    cat_series <= "Excellent"
except Exception as e:
    print(e)

conversion from `str` to `enum` failed in column '' for 1 out of 1 values: ["Excellent"]

Ensure that all values in the input column are present in the categories of the enum datatype.


In [12]:
# Compare every element of an Enum series against a string that is one of the
# declared categories.
dtype = pl.Enum(["Low", "Medium", "High"])
cat_series = pl.Series(values=["Low", "Medium", "High"], dtype=dtype)
print(cat_series <= "Medium")

shape: (3,)
Series: '' [bool]
[
	true
	true
	false
]


In [13]:
# Element-wise comparison of an Enum series with a plain string series whose
# values are all members of the Enum's categories.
dtype = pl.Enum(["Low", "Medium", "High"])
cat_series = pl.Series(values=["Low", "Medium", "High"], dtype=dtype)
cat_series2 = pl.Series(values=["High", "High", "Low"])
print(cat_series <= cat_series2)

shape: (3,)
Series: '' [bool]
[
	true
	true
	false
]
