In [1]:
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import random

## 一、数据预处理

In [2]:
columns = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 
           'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
           'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'income']
continues_columns = ['age', 'fnlwgt', 'educationNum', 'capitalGain', 'capitalLoss', 'hoursPerWeek']
discrete_columns = [_ for _ in columns if _ not in continues_columns]


train_data = pd.read_csv('data/adult.data', delimiter=', ', header=None, engine='python')
test_data = pd.read_csv('data/adult.test', delimiter=', ', header=None, skiprows=1, engine='python')
train_data.columns = columns
test_data.columns = columns


# 离散数据：按列编码为float类型
LE = LabelEncoder()
for i in range(len(columns)):
    if columns[i] in continues_columns: continue
    train_data[columns[i]] = LE.fit_transform(train_data[columns[i]]).astype(float)
    test_data[columns[i]] = LE.fit_transform(test_data[columns[i]]).astype(float)


X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]


# 清洗数据：删除有缺失值的样本
origin_train_shape = train_data.shape
origin_test_shape = test_data.shape
for column in columns:
    train_data = train_data[~train_data[column].str.contains('\?', regex=True)]
    test_data = test_data[~test_data[column].str.contains('\?', regex=True)]
after_train_shape = train_data.shape
after_test_shape = test_data.shape
print(f"Train data shape: {origin_train_shape} -> {after_train_shape} \
      delta: {(origin_train_shape[0] - after_train_shape[0]) / origin_train_shape[0] * 100:.2f}%")
print(f"Test data shape : {origin_test_shape} -> {after_test_shape} \
      delta: {(origin_test_shape[0] - after_test_shape[0]) / origin_test_shape[0] * 100:.2f}%")


# 连续型数据离散化
continues_columns = ['age', 'fnlwgt', 'educationNum', 'capitalGain', 'capitalLoss', 'hoursPerWeek']
for column in continues_columns:
    train_data[column] = pd.qcut(train_data[column].astype(float), 10, duplicates='drop')
    test_data[column] = pd.qcut(test_data[column].astype(float), 10, duplicates='drop')

Train data shape: (32561, 15) -> (30162, 15)       delta: 7.37%
Test data shape : (16281, 15) -> (15060, 15)       delta: 7.50%


## 二、模型训练

In [3]:
class TreeNode:
    def __init__(self, dataset, left, right, feature_name, feature_value, label):
        self.dataset = dataset
        self.left = left
        self.right = right
        self.feature_name = feature_name
        self.feature_value = feature_value
        self.label = label

In [4]:
def calc_gini(df):
    """
    计算数据集的基尼指数
    :param df: 数据集
    :return: 基尼指数
    """
    
    val_list = pd.unique(df)
    gini = 1
    df = np.array(df)
    for val in val_list:
        p = df[df == val].size / df.size
        gini -= p**2
    return gini


def split_dataset(df, index, value):
    """
    按照给定的列划分数据集
    :param df: 原始数据集
    :param index: 指定特征的列索引，即feature
    :param value: 指定特征的值
    :return: 切分后的数据集
    """

    # 划分数据集
    left = df[df[index] == value]
    right = df[df[index] != value]

    # 删除划分后的数据集中的特征列
    left = left.drop(index, axis=1)
    right = right.drop(index, axis=1)

    return left, right


def choose_best_feature_to_split(df, flags):
    """
    选择最好的特征进行分裂
    :param df: 数据集
    :return: best_value:(分裂特征的index, 特征的值), best_df:(分裂后的左右子树数据集), best_gain:(选择该属性分裂的最大信息增益), best_feature_value:(分裂特征的值)
    """

    best_feature = None
    best_split_value = None
    best_gain = 0
    best_feature_value = None

    base_gini_index = calc_gini(df["income"])

    for feature in df.columns:
        if flags[columns.index(feature)] == 1:
            continue

        for value in df[feature].unique():
            left_df, right_df = split_dataset(df, feature, value)
            gini_index = calc_gini(
                left_df[:]["income"]) + calc_gini(right_df[:]["income"])
            gain = gini_index - base_gini_index
            if gain > best_gain:
                best_feature = feature
                best_split_value = (left_df, right_df)
                best_gain = gain
                best_feature_value = value

    return best_feature, best_split_value, best_gain, best_feature_value


def build_decision_tree(df, columns, flags):
    """
    构建CART树
    :param df: 数据集
    :param columns: 特征列表
    :param flags: 区分特征是否被完全区分开,初始为全0, 若某个特征被区分开那么flags对应的下标为0
    :return: CART树
    """

    # 递归结束情况1: 若当前集合的所有样本标签相等,即样本已被分"纯",则可以返回该标签值作为一个叶子节点
    # 递归结束情况2: 若当前训练集的所有特征都被使用完毕,当前无可用特征但样本仍未分"纯"，则返回样本最多的标签作为结果
    flags[columns.index('income')] = 1
    if df.shape[0] == 0:
        return TreeNode(df, None, None, 'income', None, -1)
    if (len(df['income'].unique()) == 1) or (sum(flags) == len(columns)):
        return TreeNode(df, None, None, 'income', None, df['income'].value_counts().keys()[0])

    best_feature, best_split_value, best_gain, best_feature_value = choose_best_feature_to_split(
        df, flags)
    if best_gain > 0:
        flags[columns.index(best_feature)] = 1
        left = build_decision_tree(best_split_value[0], columns, flags)
        right = build_decision_tree(best_split_value[1], columns, flags)
        return TreeNode(df, left, right, best_feature, best_feature_value, None)
    else:
        return TreeNode(df, None, None, 'income', None, df['income'].value_counts().keys()[0])


def save_decision_tree(cart):
    """
    决策树的存储
    :param cart: 训练好的决策树
    :return: void
    """

    np.save('cart.npy', cart)


def load_decision_tree():
    """
    决策树的加载
    :return: 保存的决策树
    """

    cart = np.load('cart.npy', allow_pickle=True)
    return cart.item()

In [5]:
train_data_copy = train_data.copy()
flags = [0 for _ in range(len(columns))]
cart = build_decision_tree(train_data_copy, columns, flags)
save_decision_tree(cart)

### 2.x 使用 sk-learn 包

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn import tree
import pandas as pd


columns = ['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 
           'maritalStatus', 'occupation', 'relationship', 'race', 'sex',
           'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'income']
continues_columns = ['age', 'fnlwgt', 'educationNum', 'capitalGain', 'capitalLoss', 'hoursPerWeek']
discrete_columns = [_ for _ in columns if _ not in continues_columns]


train_data = pd.read_csv('data/adult.data', delimiter=', ', header=None, engine='python')
test_data = pd.read_csv('data/adult.test', delimiter=', ', header=None, skiprows=1, engine='python')
train_data.columns = columns
test_data.columns = columns


# 离散数据：按列编码为float类型
LE = LabelEncoder()
for i in range(len(columns)):
    if columns[i] in continues_columns: continue
    train_data[columns[i]] = LE.fit_transform(train_data[columns[i]]).astype(float)
    test_data[columns[i]] = LE.fit_transform(test_data[columns[i]]).astype(float)


X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]


clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'准确率：{accuracy:.2f}')

# tree.plot_tree(clf, filled=True)

准确率：0.81


## 三、模型测试

In [7]:
def classify(cart, df_row, columns):
    """
    用训练好的决策树进行分类
    :param cart:决策树模型
    :param df_row: 一条测试样本
    :param columns: 特征列表
    :return: 预测结果
    """

    if cart.label != None:
        return cart.label

    else:
        if df_row[columns.index(cart.feature_name)] == cart.feature_value:
            return classify(cart.left, df_row, columns)
        else:
            return classify(cart.right, df_row, columns)


def predict(cart, df, columns):
    """
    用训练好的决策树进行分类
    :param cart:决策树模型
    :param df: 所有测试集
    :param columns: 特征列表
    :return: 预测结果
    """

    pred_list = []
    for i in range(len(df)):
        pred_label = classify(cart, df.iloc[i, :], columns)
        if pred_label == -1:
            pred_label = random.randint(0, 1)  # 防止classify执行到返回-1,但一般不会执行到返回-1
        pred_list.append(pred_label)
    return pred_list


def calc_acc(pred_list, test_list):
    """
    返回预测准确率
    :param pred_list: 预测列表
    :param test_list: 测试列表
    :return: 准确率
    """

    pred = np.array(pred_list)
    test = np.array(test_list)
    acc = np.sum(pred == test) / len(test_list)
    return acc

In [8]:
cart = load_decision_tree()

pred_train_list = predict(cart, train_data, columns)
train_list = train_data['income'].to_numpy()
acc_train = calc_acc(pred_train_list, train_list)
print("Train accuracy: ", acc_train)

pred_test_list = predict(cart, test_data, columns)
test_list = [0 if i == '<=50K' else 1 for i in test_data['income'].to_numpy()]
acc_test = calc_acc(pred_test_list, test_list)
print("Test accuracy : ", acc_test)

Train accuracy:  0.7510775147536636
Test accuracy :  0.50199203187251
