In [3]:
# 导入相关库包
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the credit dataset into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
credit = pd.read_csv(filepath_or_buffer='credit.csv')

In [4]:
# Data preprocessing and construction of the training data set.

# Label -> integer code lookup for every categorical column, keyed by column name.
# The encoding loop below uses it to turn text labels into numbers so the
# decision tree can be fitted on them.
col_dicts = {
    'checking_balance': {
        '1 - 200 DM': 2,
        '< 0 DM': 1,
        '> 200 DM': 3,
        'unknown': 0,
    },
    'credit_history': {
        'critical': 0,
        'delayed': 2,
        'fully repaid': 3,
        'fully repaid this bank': 4,
        'repaid': 1,
    },
    'employment_length': {
        '0 - 1 yrs': 1,
        '1 - 4 yrs': 2,
        '4 - 7 yrs': 3,
        '> 7 yrs': 4,
        'unemployed': 0,
    },
    'foreign_worker': {
        'no': 1,
        'yes': 0,
    },
    'housing': {
        'for free': 1,
        'own': 0,
        'rent': 2,
    },
    'installment_plan': {
        'bank': 1,
        'none': 0,
        'stores': 2,
    },
    'job': {
        'management self-employed': 3,
        'skilled employee': 2,
        'unemployed non-resident': 0,
        'unskilled resident': 1,
    },
    'other_debtors': {
        'co-applicant': 2,
        'guarantor': 1,
        'none': 0,
    },
    'personal_status': {
        'divorced male': 2,
        'female': 1,
        'married male': 3,
        'single male': 0,
    },
    'property': {
        'building society savings': 1,
        'other': 3,
        'real estate': 0,
        'unknown/none': 2,
    },
    'purpose': {
        'business': 5,
        'car (new)': 3,
        'car (used)': 4,
        'domestic appliances': 6,
        'education': 1,
        'furniture': 2,
        'others': 8,
        'radio/tv': 0,
        'repairs': 7,
        'retraining': 9,
    },
    'savings_balance': {
        '101 - 500 DM': 2,
        '501 - 1000 DM': 3,
        '< 100 DM': 1,
        '> 1000 DM': 4,
        'unknown': 0,
    },
    'telephone': {
        'none': 1,
        'yes': 0,
    },
}

# Several columns hold text labels, which the decision tree cannot be fitted on
# as-is, so replace each label with its integer code from col_dicts before fitting.
for col, mapping in col_dicts.items():
    column = credit[col]

    # Re-running this cell would otherwise feed the already-encoded integers
    # back through the label -> code mapping and turn the whole column into NaN.
    if pd.api.types.is_numeric_dtype(column):
        continue

    encoded = column.map(mapping)

    # Series.map yields NaN for labels absent from the mapping; stop here
    # instead of silently passing corrupted features on to the model.
    # Values that were already missing in the raw data are left untouched.
    unmapped = column[encoded.isna() & column.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            "Column {!r} contains labels missing from col_dicts: {}".format(
                col, sorted(str(label) for label in unmapped)
            )
        )

    credit[col] = encoded

# Separate the label column from the predictors.
y = credit['default']                 # target variable
X = credit.drop(columns=['default'])  # feature matrix: every column except the target

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Build the decision tree classifier.
# criterion='entropy' selects splits by information gain. random_state is pinned
# because scikit-learn randomly permutes the candidate features at every split,
# so an unseeded tree (and every metric computed from it) can change between runs.
credit_model = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [5]:
# Fit the tree on the training split only.
credit_model.fit(X=X_train, y=y_train)

# Predict labels for the held-out test rows; y_pred is compared with y_test below.
y_pred = credit_model.predict(X=X_test)

In [6]:
# Evaluate the model on the test set.

# Accuracy score of the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
print("模型在测试集上的准确率: {:.2f}".format(accuracy))

# Classification report first, then the confusion matrix.
eval_report = classification_report(y_test, y_pred)
eval_matrix = confusion_matrix(y_test, y_pred)
print(eval_report)
print(eval_matrix)


模型在测试集上的准确率: 0.71
              precision    recall  f1-score   support

           1       0.77      0.86      0.81       214
           2       0.49      0.35      0.41        86

    accuracy                           0.71       300
   macro avg       0.63      0.60      0.61       300
weighted avg       0.69      0.71      0.69       300

[[183  31]
 [ 56  30]]
