# 导入数据并划分数据集

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [2]:
# 鸢尾花数据集
def iris_dataloader():
    dataset = pd.read_csv("iris.data", names=["萼片长度", "萼片宽度", "花瓣长度", "花瓣宽度","物种"])
    # 按照四比一划分训练集和测试集
    train_set, test_set = train_test_split(dataset, test_size=0.2)
    return train_set, test_set

In [3]:
# 成人数据集
def adult_dataloader():
    names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
    dataset = pd.read_csv("adult.data", names=names, nrows=2000)
    # 按照四比一划分训练集和测试集
    train_set, test_set = train_test_split(dataset, test_size=0.2)
    return train_set, test_set

# 计算基尼系数

In [4]:
# 计算基尼系数
def gini(counts):
    total = sum(counts)
    if total == 0:
        return 0
    return 1 - sum((count / total) ** 2 for count in counts)

In [5]:
# 计算加权基尼系数
def weighted_gini(counts_list):
    total = sum(sum(counts) for counts in counts_list)
    return sum((sum(counts) / total) * gini(counts) for counts in counts_list)

# 选取属性进行数据集划分

In [6]:
# 以某一个离散特征作为划分依据的话，计算这个离散特征下最优的划分方式
def lisan_feature(dataset, feature):
    choices = dataset[feature].unique()
    best_choice = None
    best_pass_subset = None
    best_refuse_subset = None
    min_gini = 1.0
    
    for choice in choices:
        # 根据“特征是否是这个值”划分为两个子集
        pass_subset = dataset[dataset[feature] == choice]
        refuse_subset = dataset[dataset[feature] != choice]

        # 计算加权平均基尼系数
        pass_counts = pass_subset.iloc[:, -1].value_counts()
        refuse_counts = refuse_subset.iloc[:, -1].value_counts()
        gini = weighted_gini([pass_counts, refuse_counts])

        # 如果基尼系数小于记录的数据则进行更新
        if gini < min_gini:
            best_choice = choice
            best_pass_subset = pass_subset
            best_refuse_subset = refuse_subset
            min_gini = gini

    return best_choice, best_pass_subset, best_refuse_subset, min_gini

In [7]:
# 以某一个连续值的特征作为划分依据的话，计算最优划分的方式
def lianxu_feature(dataset, feature):
    unique_values = dataset[feature].unique()
    thresholds = (unique_values[:-1] + unique_values[1:]) / 2
    best_threshold = None
    best_less_subset = None
    best_more_subset = None
    min_gini = 1.0
    
    for threshold in thresholds:
        # 根据“取值是否小于该阈值”划分为两个子集
        less_subset = dataset[dataset[feature] < threshold]
        more_subset = dataset[dataset[feature] >= threshold]

        # 计算加权平均基尼系数
        less_counts = less_subset.iloc[:, -1].value_counts()
        more_counts = more_subset.iloc[:, -1].value_counts()
        gini = weighted_gini([less_counts, more_counts])

        # 如果基尼系数小于记录的数据则进行更新
        if gini < min_gini:
            best_threshold = threshold
            best_less_subset = less_subset
            best_more_subset = more_subset
            min_gini = gini

    return best_threshold, best_less_subset, best_more_subset, min_gini

In [8]:
# 选取特征进行划分
def choose_feature(dataset):
    features = dataset.columns[:-1]
    best_feature = None
    best_choice = None
    best_threshold = None
    best_left_subset = None
    best_right_subset = None
    min_gini = 1.0
    
    for feature in features:
        # 如果一个特征只有一个值，你们就不能选择来作为划分依据，跳过
        if len(dataset[feature].unique()) == 1:
            continue

        # 离散
        if dataset[feature].dtype == object:
            choice, left_subset, right_subset, gini = lisan_feature(dataset, feature)
            threshold = None

        # 连续
        else:
            threshold, left_subset, right_subset, gini= lianxu_feature(dataset, feature)
            choice = None

        # 如果基尼系数小于记录的数据则进行更新
        if gini < min_gini:
            best_feature = feature
            best_choice = choice
            best_threshold = threshold
            best_left_subset = left_subset
            best_right_subset = right_subset
            min_gini = gini

    return best_feature, best_choice, best_threshold, best_left_subset, best_right_subset

# 建立决策树

In [9]:
def create_tree(dataset, depth):
    if len(dataset.iloc[:, -1].unique()) == 1:
        return dataset.iloc[:, -1].unique()[0]
    
    feature, choice, threshold, left_subset, right_subset = choose_feature(dataset)
    # 如果没有最佳特征，意味着所有特征都只有一类,投票决定该节点的类别。
    if feature is None:
        return dataset.iloc[:, -1].value_counts().index[0]
    
    # 递归地构建左右子树
    left_tree = create_tree(left_subset, depth + 1)
    right_tree = create_tree(right_subset, depth + 1)

    return {"feature": feature, "choice": choice, "threshold": threshold, "left": left_tree, "right": right_tree}

# 利用决策树进行测试

In [10]:
def predict_tree(data, decision_tree):
    if isinstance(decision_tree, str):
        return decision_tree

    if decision_tree["choice"] is not None:
        if data[decision_tree["feature"]] == decision_tree["choice"]:
            return predict_tree(data, decision_tree["left"])
        else:
            return predict_tree(data, decision_tree["right"])
    elif decision_tree["threshold"] is not None:
        if data[decision_tree["feature"]] < decision_tree["threshold"]:
            return predict_tree(data, decision_tree["left"])
        else:
            return predict_tree(data, decision_tree["right"])
    else:
        raise Exception("存在既不是离散又不是连续的情况，出错！！！")

In [11]:
def test_tree(test_dataset, decision_tree):
    predictions = test_dataset.apply(predict_tree, axis=1, args=(decision_tree,))
    
    # Assuming labels are categorical
    labels = test_dataset.iloc[:, -1].unique()
    
    # Initialize counters for each class
    true_positive = {label: 0 for label in labels}
    true_negative = {label: 0 for label in labels}
    false_positive = {label: 0 for label in labels}
    false_negative = {label: 0 for label in labels}
    
    # Calculate counts for each class
    for label in labels:
        true_positive[label] = sum((predictions == label) & (test_dataset.iloc[:, -1] == label))
        true_negative[label] = sum((predictions != label) & (test_dataset.iloc[:, -1] != label))
        false_positive[label] = sum((predictions == label) & (test_dataset.iloc[:, -1] != label))
        false_negative[label] = sum((predictions != label) & (test_dataset.iloc[:, -1] == label))
    
    # Calculate metrics for each class
    accuracy = sum(true_positive.values()) / test_dataset.shape[0]
    error_rate = (sum(false_negative.values()) + sum(false_positive.values())) / test_dataset.shape[0]
    recall = {label: true_positive[label] / (true_positive[label] + false_negative[label]) for label in labels}
    fpr = {label: false_positive[label] / (false_positive[label] + true_negative[label]) for label in labels}
    tpr = {label: true_positive[label] / (true_positive[label] + false_negative[label]) for label in labels}
    
    return accuracy, error_rate, recall, fpr, tpr

# 构建随机森林

In [12]:
def sample_dataset(dataset, num_sample):
    samples = []
    
    # 随机选取数据和部分属性，选取的特征数量等于总特征数的平方根
    num_feature = int(np.sqrt(dataset.shape[1]))

    for i in range(num_sample):
        # 对数据采样是有放回的，对于特征的采样是无放回的
        sample = dataset.sample(dataset.shape[0], replace=True)
        features = np.random.choice(sample.columns[:-1], num_feature, replace=False)
        # 把数据和特征按行的方向拼接起来
        sample = pd.concat([sample[features], sample.iloc[:, -1]], axis=1)

        samples.append(sample)

    return samples

In [13]:
def create_forest(dataset, num_tree=20):
    datas = sample_dataset(dataset, num_tree)
    decision_trees = [create_tree(data, 1) for data in datas]
    return decision_trees

In [14]:
def predict_forest(data, forest):
    y_lables = [predict_tree(data, tree) for tree in forest]
    return max(set(y_lables), key=y_lables.count)

In [15]:
def test_forest(test_dataset, forest):
    predictions = test_dataset.apply(predict_forest, axis=1, args=(random_forest,))
    
    # Assuming labels are categorical
    labels = test_dataset.iloc[:, -1].unique()
    
    # Initialize counters for each class
    true_positive = {label: 0 for label in labels}
    true_negative = {label: 0 for label in labels}
    false_positive = {label: 0 for label in labels}
    false_negative = {label: 0 for label in labels}
    
    # Calculate counts for each class
    for label in labels:
        true_positive[label] = sum((predictions == label) & (test_dataset.iloc[:, -1] == label))
        true_negative[label] = sum((predictions != label) & (test_dataset.iloc[:, -1] != label))
        false_positive[label] = sum((predictions == label) & (test_dataset.iloc[:, -1] != label))
        false_negative[label] = sum((predictions != label) & (test_dataset.iloc[:, -1] == label))
    
    # Calculate metrics for each class
    accuracy = sum(true_positive.values()) / test_dataset.shape[0]
    error_rate = (sum(false_negative.values()) + sum(false_positive.values())) / test_dataset.shape[0]
    recall = {label: true_positive[label] / (true_positive[label] + false_negative[label]) for label in labels}
    fpr = {label: false_positive[label] / (false_positive[label] + true_negative[label]) for label in labels}
    tpr = {label: true_positive[label] / (true_positive[label] + false_negative[label]) for label in labels}
    
    return accuracy, error_rate, recall, fpr, tpr

# 带入数据进行验证

In [16]:
print("鸢尾花数据集上：")
train_dataset, test_dataset = iris_dataloader()
decision_tree = create_tree(train_dataset, 1)

print('-'*40)
decision_tree_accuracy, decision_tree_error_rate, decision_tree_recall, decision_tree_fpr, decision_tree_tpr = test_tree(test_dataset, decision_tree)
print("决策树准确率：", decision_tree_accuracy)
print("决策树错误率：", decision_tree_error_rate)
print("决策树召回率：", decision_tree_recall)
print("决策树假正类率：", decision_tree_fpr)
print("决策树真正类率：", decision_tree_tpr)

print('-'*40)
random_forest = create_forest(train_dataset)
random_forest_accuracy, random_forest_error_rate, random_forest_recall, random_forest_fpr, random_forest_tpr = test_forest(test_dataset, random_forest)
print("随机森林准确率：", random_forest_accuracy)
print("随机森林错误率：", random_forest_error_rate)
print("随机森林召回率：", random_forest_recall)
print("随机森林假正类率：", random_forest_fpr)
print("随机森林真正类率：", random_forest_tpr)

鸢尾花数据集上：
----------------------------------------
决策树准确率： 0.9666666666666667
决策树错误率： 0.06666666666666667
决策树召回率： {'Iris-versicolor': 1.0, 'Iris-setosa': 1.0, 'Iris-virginica': 0.875}
决策树假正类率： {'Iris-versicolor': 0.0625, 'Iris-setosa': 0.0, 'Iris-virginica': 0.0}
决策树真正类率： {'Iris-versicolor': 1.0, 'Iris-setosa': 1.0, 'Iris-virginica': 0.875}
----------------------------------------
随机森林准确率： 1.0
随机森林错误率： 0.0
随机森林召回率： {'Iris-versicolor': 1.0, 'Iris-setosa': 1.0, 'Iris-virginica': 1.0}
随机森林假正类率： {'Iris-versicolor': 0.0, 'Iris-setosa': 0.0, 'Iris-virginica': 0.0}
随机森林真正类率： {'Iris-versicolor': 1.0, 'Iris-setosa': 1.0, 'Iris-virginica': 1.0}


In [17]:
print("成人数据集上：")
train_dataset, test_dataset = adult_dataloader()
decision_tree = create_tree(train_dataset, 1)
decision_tree_accuracy, decision_tree_error_rate, decision_tree_recall, decision_tree_fpr, decision_tree_tpr = test_tree(test_dataset, decision_tree)
print("决策树准确率：", decision_tree_accuracy)
print("决策树错误率：", decision_tree_error_rate)
print("决策树召回率：", decision_tree_recall)
print("决策树假正类率：", decision_tree_fpr)
print("决策树真正类率：", decision_tree_tpr)

print('-'*40)

random_forest = create_forest(train_dataset)
random_forest_accuracy, random_forest_error_rate, random_forest_recall, random_forest_fpr, random_forest_tpr = test_forest(test_dataset, random_forest)
print("随机森林准确率：", random_forest_accuracy)
print("随机森林错误率：", random_forest_error_rate)
print("随机森林召回率：", random_forest_recall)
print("随机森林假正类率：", random_forest_fpr)
print("随机森林真正类率：", random_forest_tpr)

成人数据集上：
决策树准确率： 0.7925
决策树错误率： 0.415
决策树召回率： {' <=50K': 0.853035143769968, ' >50K': 0.5747126436781609}
决策树假正类率： {' <=50K': 0.42528735632183906, ' >50K': 0.14696485623003194}
决策树真正类率： {' <=50K': 0.853035143769968, ' >50K': 0.5747126436781609}
----------------------------------------
随机森林准确率： 0.81
随机森林错误率： 0.38
随机森林召回率： {' <=50K': 0.9616613418530351, ' >50K': 0.26436781609195403}
随机森林假正类率： {' <=50K': 0.735632183908046, ' >50K': 0.038338658146964855}
随机森林真正类率： {' <=50K': 0.9616613418530351, ' >50K': 0.26436781609195403}
