In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [18]:
# Load the dataset from the local input directory
car = pd.read_csv("./input/car.csv")
# Show the first five rows of the dataset
car.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [19]:
# Inspect the dataset size as (rows, columns)
car.shape

(1728, 7)

离散型变量，使用 `value_counts()` 统计各种取值的分布

In [20]:
# Count how many rows take each value of the acceptance column
car["acceptance"].value_counts()

acceptance
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

`car.shape[0]` 是 car 的第一维长度（即样本总数）；用各个取值的数量除以它，即可得到各个取值所占的比例。

In [21]:
# Fraction of rows per acceptance value: per-value counts divided by the row count
car["acceptance"].value_counts().div(len(car))

acceptance
unacc    0.700231
acc      0.222222
good     0.039931
vgood    0.037616
Name: count, dtype: float64

## 数字编码

In [22]:
# Encoding rule for the buying levels: low < med < high < vhigh -> 0..3
buying_dict = dict(zip(["low", "med", "high", "vhigh"], range(4)))
# Build the numerically encoded dataset as a copy of `car` whose buying
# column is replaced by its integer codes (the raw `car` frame is untouched)
car_num_encoded = car.assign(buying=car["buying"].map(buying_dict))
# Preview the first five encoded values to check the result
car_num_encoded["buying"].head(5)

0    3
1    3
2    3
3    3
4    3
Name: buying, dtype: int64

In [23]:
# Ordinal encoding of the maint, lug_boot and safety features.
# Each column is mapped from the untouched `car` frame rather than from
# `car_num_encoded` itself, so re-running this cell yields the same result
# instead of turning the already-encoded integers into NaN.
maint_dict = {"low": 0, "med": 1, "high": 2, "vhigh": 3}
lug_boot_dict = {"small": 0, "med": 1, "big": 2}
safety_dict = {"low": 0, "med": 1, "high": 2}

for _col, _mapping in (
    ("maint", maint_dict),
    ("lug_boot", lug_boot_dict),
    ("safety", safety_dict),
):
    _codes = car[_col].map(_mapping)
    # Series.map gives NaN for labels missing from the dict; fail loudly
    # here rather than carrying silent NaNs into later cells
    assert _codes.notna().all(), f"unmapped value(s) in column {_col!r}"
    car_num_encoded[_col] = _codes

In [24]:
# Show the first five rows after encoding
car_num_encoded.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,3,3,2,2,0,0,unacc
1,3,3,2,2,0,1,unacc
2,3,3,2,2,0,2,unacc
3,3,3,2,2,1,0,unacc
4,3,3,2,2,1,1,unacc


In [25]:
# Replace the open-ended labels with the numeric string "6":
# "5more" in the doors column and "more" in the persons column.
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on `df.col` is chained in-place assignment:
# pandas 2.2 warns about it, and under Copy-on-Write it no longer updates
# the DataFrame at all.
car_num_encoded["doors"] = car_num_encoded["doors"].replace("5more", "6")
car_num_encoded["persons"] = car_num_encoded["persons"].replace("more", "6")
# Check the value distribution of both columns after the replacement
print(car_num_encoded["doors"].value_counts())
print(car_num_encoded["persons"].value_counts())

doors
2    432
3    432
4    432
6    432
Name: count, dtype: int64
persons
2    576
4    576
6    576
Name: count, dtype: int64


In [26]:
# Convert the doors and persons columns to integer dtype, printing both
# dtypes before and after the conversion to confirm the change
print(car_num_encoded["doors"].dtype, car_num_encoded["persons"].dtype)
for _col in ("doors", "persons"):
    car_num_encoded[_col] = car_num_encoded[_col].astype("int")
print(car_num_encoded["doors"].dtype, car_num_encoded["persons"].dtype)

object object
int32 int32


In [27]:
# Show the data after this first round of encoding
car_num_encoded.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,3,3,2,2,0,0,unacc
1,3,3,2,2,0,1,unacc
2,3,3,2,2,0,2,unacc
3,3,3,2,2,1,0,unacc
4,3,3,2,2,1,1,unacc


In [28]:
# The same kind of encoding done with the library's LabelEncoder
# Create the encoder
buying_encoder = LabelEncoder()
# Fit the encoder on the raw buying column and encode it in one step
encoded_buying = buying_encoder.fit_transform(car["buying"])
# Show which original label each of the codes 0-3 stands for.
# NOTE: per the output, the codes follow sorted label order
# (high, low, med, vhigh), not the low < med < high < vhigh order of buying_dict.
buying_encoder.inverse_transform([0, 1, 2, 3])

array(['high', 'low', 'med', 'vhigh'], dtype=object)

## One-hot 编码

In [29]:
# One-hot encode the buying column (one indicator column per level, named
# with the "buying" prefix) and preview the first five rows
pd.get_dummies(car["buying"], prefix="buying").head(5)

Unnamed: 0,buying_high,buying_low,buying_med,buying_vhigh
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True


In [30]:
# One-hot encode every feature column of the raw `car` frame and append the
# target column at the end.
# The indicator frames are collected first and concatenated once, instead of
# growing a DataFrame with pd.concat inside the loop (which re-copies all
# previously added columns on every iteration).
onehot_features = ["buying", "maint", "doors", "persons", "lug_boot", "safety"]
# prefix_sep="__" keeps the existing column names (e.g. buying__vhigh)
onehot_parts = [
    pd.get_dummies(car[col], prefix=col, prefix_sep="__") for col in onehot_features
]
car_onehot_encoded = pd.concat(onehot_parts + [car["acceptance"]], axis=1)
# Preview the encoded dataset
car_onehot_encoded.head(5)

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh,maint__high,maint__low,maint__med,maint__vhigh,doors__2,doors__3,...,persons__2,persons__4,persons__more,lug_boot__big,lug_boot__med,lug_boot__small,safety__high,safety__low,safety__med,acceptance
0,False,False,False,True,False,False,False,True,True,False,...,True,False,False,False,False,True,False,True,False,unacc
1,False,False,False,True,False,False,False,True,True,False,...,True,False,False,False,False,True,False,False,True,unacc
2,False,False,False,True,False,False,False,True,True,False,...,True,False,False,False,False,True,True,False,False,unacc
3,False,False,False,True,False,False,False,True,True,False,...,True,False,False,False,True,False,False,True,False,unacc
4,False,False,False,True,False,False,False,True,True,False,...,True,False,False,False,True,False,False,False,True,unacc


In [31]:
# Save the one-hot encoded dataset as CSV (to_csv writes the row index as the
# first column by default).
# NOTE(review): assumes the ./output directory already exists — confirm.
car_onehot_encoded.to_csv("./output/one-hot.csv")