In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.metrics import classification_report

In [6]:
# Load the dataset from disk into a DataFrame.
car = pd.read_csv(filepath_or_buffer="D:/input/car.csv")
# Preview the first five rows (five is head()'s default).
car.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [7]:
# Show the size of the dataset as (rows, columns).
car.shape

(1728, 7)

In [8]:
# Count how many rows fall into each acceptance class.
acceptance_counts = car["acceptance"].value_counts()
print(acceptance_counts)

unacc    1210
acc       384
good       69
vgood      65
Name: acceptance, dtype: int64


In [9]:
# Show each acceptance class as a fraction of all rows.
# Dividing the integer counts by the row count already yields floats.
print(car["acceptance"].value_counts() / len(car))

unacc    0.700231
acc      0.222222
good     0.039931
vgood    0.037616
Name: acceptance, dtype: float64


In [10]:
# Encoding rule for `buying`: each level maps to an integer code.
buying_dict = dict(low=0, med=1, high=2, vhigh=3)
# Encode on a copy so the raw frame `car` keeps its original labels.
car_num_encoded = car.copy()
car_num_encoded["buying"] = car_num_encoded["buying"].map(buying_dict)

In [11]:
# Preview the first five values of the encoded column.
car_num_encoded["buying"].head()

0    3
1    3
2    3
3    3
4    3
Name: buying, dtype: int64

In [12]:
# Integer-encode the `maint` feature: low=0, med=1, high=2, vhigh=3.
maint_dict = dict(zip(("low", "med", "high", "vhigh"), range(4)))
car_num_encoded["maint"] = car_num_encoded["maint"].map(maint_dict)

In [13]:
# Integer-encode the `lug_boot` feature: small=0, med=1, big=2.
lug_boot_dict = {level: code for code, level in enumerate(("small", "med", "big"))}
car_num_encoded["lug_boot"] = car_num_encoded["lug_boot"].map(lug_boot_dict)

In [14]:
# Integer-encode the `safety` feature: low=0, med=1, high=2.
safety_dict = dict(low=0, med=1, high=2)
car_num_encoded["safety"] = car_num_encoded["safety"].map(safety_dict)

In [15]:
# Preview the first five rows of the encoded data.
car_num_encoded.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,3,3,2,2,0,0,unacc
1,3,3,2,2,0,1,unacc
2,3,3,2,2,0,2,unacc
3,3,3,2,2,1,0,unacc
4,3,3,2,2,1,1,unacc


In [16]:
# In the door-count column, replace the open-ended level "5more" with "6".
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the attribute-accessed Series: that form is
# chained assignment, which pandas warns about and which leaves the frame
# unchanged under Copy-on-Write.
car_num_encoded["doors"] = car_num_encoded["doors"].replace("5more", "6")
# In the passenger-capacity column, replace the open-ended level "more" with "6".
car_num_encoded["persons"] = car_num_encoded["persons"].replace("more", "6")

In [17]:
# Check which door-count values remain after the replacement.
door_counts = car_num_encoded["doors"].value_counts()
print(door_counts)

3    432
6    432
2    432
4    432
Name: doors, dtype: int64


In [18]:
# Check which passenger-capacity values remain after the replacement.
person_counts = car_num_encoded["persons"].value_counts()
print(person_counts)

6    576
2    576
4    576
Name: persons, dtype: int64


In [19]:
# Show the dtypes of the two count columns before casting.
print(car_num_encoded["doors"].dtype, car_num_encoded["persons"].dtype)

object object


In [20]:
# Cast the door-count and passenger-capacity columns to integers in one call.
car_num_encoded = car_num_encoded.astype({"doors": "int", "persons": "int"})

In [21]:
# Show the dtypes of the two count columns after casting.
print(car_num_encoded["doors"].dtype, car_num_encoded["persons"].dtype)

int32 int32


In [22]:
# Preview the first five rows of the fully integer-encoded frame.
car_num_encoded.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,3,3,2,2,0,0,unacc
1,3,3,2,2,0,1,unacc
2,3,3,2,2,0,2,unacc
3,3,3,2,2,1,0,unacc
4,3,3,2,2,1,1,unacc


In [23]:
# Create the encoder and let it learn the labels present in `buying`.
buying_encoder = LabelEncoder()
buying_encoder.fit(car["buying"])
# Convert the labels to the integer codes learned above.
encoded_buying = buying_encoder.transform(car["buying"])

In [24]:
# Decode the codes 0-3 back into labels to see which label each code stands for.
buying_encoder.inverse_transform(list(range(4)))

array(['high', 'low', 'med', 'vhigh'], dtype=object)

In [25]:
# Preview the one-hot encoding of `buying`: one indicator column per level.
pd.get_dummies(data=car["buying"], prefix="buying_").head()

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1


In [26]:
# Create an empty DataFrame under the name used for the one-hot encoded dataset.
car_onehot_encoded = pd.DataFrame()

In [27]:
# One-hot encode every feature column and place the indicator blocks side by side.
# All blocks are built first and concatenated once, so the result does not
# depend on what `car_onehot_encoded` held before: re-running this cell cannot
# append duplicate columns, and the frame is not re-copied once per feature.
# The prefix ends in "_" and get_dummies adds its own "_" separator, which
# gives column names such as `buying__high`.
onehot_blocks = [
    pd.get_dummies(car[col], prefix=col + "_")
    for col in ["buying", "maint", "doors", "persons", "lug_boot", "safety"]
]
car_onehot_encoded = pd.concat(onehot_blocks, axis=1)

In [28]:
# Append the target column so features and label live in one frame.
car_onehot_encoded = pd.concat(
    objs=[car_onehot_encoded, car["acceptance"]],
    axis="columns",
)

In [29]:
# Preview the first five rows of the one-hot encoded frame.
car_onehot_encoded.head()

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh,maint__high,maint__low,maint__med,maint__vhigh,doors__2,doors__3,...,persons__2,persons__4,persons__more,lug_boot__big,lug_boot__med,lug_boot__small,safety__high,safety__low,safety__med,acceptance
0,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,unacc
1,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,0,1,unacc
2,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,1,0,0,unacc
3,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,unacc
4,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,unacc


In [30]:
# Split the integer-encoded dataset into features and target.
# The features are selected by dropping the target by name rather than by
# slicing the first six columns by position, so the selection stays correct
# if columns are added or reordered.
x_num = car_num_encoded.drop(columns="acceptance")
y_num = car_num_encoded["acceptance"]
# Use 70% of the data for training and the rest for testing. Stratify on
# `acceptance` so both sets keep the same class distribution.
x_train_num, x_test_num, y_train_num, y_test_num = train_test_split(
    x_num, y_num, test_size=0.3, stratify=y_num, random_state=42
)

In [31]:
# Share of each class in the training set.
print(y_train_num.value_counts() / y_train_num.size)
# Share of each class in the test set.
print(y_test_num.value_counts() / y_test_num.size)

unacc    0.700579
acc      0.222498
good     0.039702
vgood    0.037221
Name: acceptance, dtype: float64
unacc    0.699422
acc      0.221580
good     0.040462
vgood    0.038536
Name: acceptance, dtype: float64


In [32]:
# Split the one-hot encoded dataset into features and target.
# The features are selected by dropping the target by name rather than by
# slicing a hard-coded 21 columns: a positional slice silently picks the wrong
# columns if the number of indicator columns ever changes.
x_onehot = car_onehot_encoded.drop(columns="acceptance")
y_onehot = car_onehot_encoded["acceptance"]
# Use 70% of the data for training and the rest for testing. Stratify on
# `acceptance` so both sets keep the same class distribution.
x_train_onehot, x_test_onehot, y_train_onehot, y_test_onehot = train_test_split(
    x_onehot, y_onehot, test_size=0.3, stratify=y_onehot, random_state=42
)

In [38]:
# Share of each class in the training set.
print(y_train_onehot.value_counts() / y_train_onehot.size)
# Share of each class in the test set.
print(y_test_onehot.value_counts() / y_test_onehot.size)

unacc    0.700579
acc      0.222498
good     0.039702
vgood    0.037221
Name: acceptance, dtype: float64
unacc    0.699422
acc      0.221580
good     0.040462
vgood    0.038536
Name: acceptance, dtype: float64


In [33]:
# Create two independent logistic-regression models with identical settings,
# one per encoding scheme.
num_model, onehot_model = (LogisticRegression(max_iter=5000) for _ in range(2))

In [34]:
# Train one model on the integer-encoded data and one on the one-hot data.
num_model.fit(X=x_train_num, y=y_train_num)
onehot_model.fit(X=x_train_onehot, y=y_train_onehot)

LogisticRegression(max_iter=5000)

In [35]:
# Test-set predictions from the model trained on integer-encoded data.
y_pred_num = num_model.predict(X=x_test_num)
# Test-set predictions from the model trained on one-hot encoded data.
y_pred_onehot = onehot_model.predict(X=x_test_onehot)

In [36]:
# Per-class precision, recall and F1 for the integer-encoded model.
print(classification_report(y_true=y_test_num, y_pred=y_pred_num))

              precision    recall  f1-score   support

         acc       0.65      0.50      0.57       115
        good       0.67      0.57      0.62        21
       unacc       0.87      0.94      0.90       363
       vgood       0.76      0.65      0.70        20

    accuracy                           0.82       519
   macro avg       0.74      0.67      0.70       519
weighted avg       0.81      0.82      0.81       519



In [37]:
# Per-class precision, recall and F1 for the one-hot encoded model.
print(classification_report(y_true=y_test_onehot, y_pred=y_pred_onehot))

              precision    recall  f1-score   support

         acc       0.79      0.84      0.82       115
        good       0.69      0.43      0.53        21
       unacc       0.96      0.97      0.97       363
       vgood       0.95      0.90      0.92        20

    accuracy                           0.92       519
   macro avg       0.85      0.78      0.81       519
weighted avg       0.91      0.92      0.91       519



In [39]:
# Save the one-hot encoded dataset to disk. With to_csv's defaults the row
# index is written as the first column.
car_onehot_encoded.to_csv(path_or_buf="D:/output/one-hot.csv")