In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

class Node:
    def __init__ (self, value, split=None):
        self.value = value
        self.split = split
        self.left = None
        self.right = None

class CART:
    def __init__ (self, epsilon=0.01):
        self.epsilon = epsilon
        self.tree = None
        self.features = None
    
    # 基尼不纯度
    def calc_gini (self, y):
        gini = 0
        for y_unique, y_cnt in zip(*np.unique(y, return_counts=True)):
            prob = y_cnt / len(y)
            gini += prob * (1 - prob)
        return gini

    # 特征条件下的基尼不纯度
    def calc_cond_gini (self, X, y):
        sorted_X = np.unique(np.sort(X, axis=0))
        split_pos = (sorted_X[: -1] + sorted_X[1:]) / 2
        best_gini = float("inf")
        best_split = None
        for pos in split_pos:
            lmask, rmask = X <= pos, X > pos
            cond_gini = (sum(lmask) * self.calc_gini(y[lmask]) + sum(rmask) * self.calc_gini(y[rmask])) / len(y)
            if (cond_gini < best_gini):
                best_gini = cond_gini
                best_split = pos
        return best_gini, best_split

    def build (self, X, y):
        y_unique = np.unique(y)
        if (len(y_unique) == 1):
            return Node(value=y_unique[0])
        if (X.shape[0] == 0):
            return Node(value=0.5)
        best_gini = float("inf")
        best_split = None
        best_feature = None
        for i in range(X.shape[1]):
            gini, split = self.calc_cond_gini(X.iloc[:, i], y)
            if (gini < best_gini):
                best_gini = gini
                best_split = split
                best_feature = i
        if (best_gini < self.epsilon):
            return Node(value=np.mean(y))
        tree = Node(value=self.features[best_feature], split=best_split)
        lmask, rmask = X.iloc[:, best_feature] <= best_split, X.iloc[:, best_feature] > best_split
        tree.left = self.build(X[lmask], y[lmask])
        tree.right = self.build(X[rmask], y[rmask])
        return tree

    def fit (self, X, y):
        self.features = list(X.columns)
        self.tree = self.build(X, y)

    def search (self, x):
        root = self.tree
        split = root.split
        while split is not None:
            index = self.features.index(root.value)
            if (x[index] <= root.split):
                root = root.left
            else:
                root = root.right
            split = root.split
        return root.value

    def predict (self, X):
        y = []
        for x in X:
            y.append(self.search(x))
        return y
    
    def getMean (self, tree):
        mean = 0
        if (tree.left is not None):
            mean += self.getMean(tree.left)
        if (tree.right is not None):
            mean += self.getMean(tree.right)
        return mean / 2

    # 剪枝
    def prune (self, tree, X, y):
        if (X.shape[0] == 0):
            tree.value = self.getMean(tree)
            tree.split = None
            tree.left = None
            tree.right = None
            return tree
        if (tree.split is None):
            return tree
        index = self.features.index(tree.value)
        lmask, rmask = X.iloc[:, index] <= tree.split, X.iloc[:, index] > tree.split
        tree.left = self.prune(tree.left, X[lmask], y[lmask])
        tree.right = self.prune(tree.right, X[rmask], y[rmask])
        if (tree.left.split is None and tree.right.split is None):
            error_no_merge = np.sum(np.power(y[lmask] - tree.left.value, 2)) + np.sum(np.power(y[rmask] - tree.right.value, 2))
            error_merge = np.sum(np.power(y - self.getMean(tree), 2))
            if (error_merge < error_no_merge):
                tree.value = self.getMean(tree)
                tree.split = None
                tree.left = None
                tree.right = None
        return tree

In [2]:
train_data = pd.read_csv("./TrainData.csv")

X = train_data.drop(columns=["h1n1_vaccine", "seasonal_vaccine"])
y = train_data[["h1n1_vaccine", "seasonal_vaccine"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
y1_train, y2_train = y_train.iloc[:, 0], y_train.iloc[:, 1]
y1_test, y2_test = y_test.iloc[:, 0], y_test.iloc[:, 1]

In [24]:
from math import log

# 经验熵
def calc_ent (datasets):
    data_length = len(datasets)
    label_count = {}
    for i in range(data_length):
        label = datasets[i][-1]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    ent = -sum([(p / data_length) * log(p / data_length, 2) for p in label_count.values()])
    return ent

# 经验条件熵
def cond_ent (datasets, axis = 0):
    data_length = len(datasets)
    feature_sets = {}
    for i in range(data_length):
        feature = datasets[i][axis]
        if feature not in feature_sets:
            feature_sets[feature] = []
        feature_sets[feature].append(datasets[i])
    cond_ent = sum([(len(p) / data_length) * calc_ent(p) for p in feature_sets.values()])
    return cond_ent

# 信息增益
def info_gain (ent, cond_ent):
    return ent - cond_ent
    
def info_gain_train (datasets):
    count = len(datasets[0]) - 1
    ent = calc_ent(datasets)
    best_feature = []
    for c in range(count):
        c_info_gain = info_gain(ent, cond_ent(datasets, axis = c))
        best_feature.append((c, c_info_gain))
    best_ = max(best_feature, key = lambda x: x[-1])
    return best_

data_set_1 = pd.concat([X_train, y1_train], axis=1)
data_set_2 = pd.concat([X_train, y2_train], axis=1)

columns_name = [column for column in X]
print("对于 h1n1_vaccine 标签，其信息增益最大的特征为：", columns_name[info_gain_train(np.array(data_set_1))[0]])
print("对于 seasonal_vaccine 标签，其信息增益最大的特征为：", columns_name[info_gain_train(np.array(data_set_2))[0]])

对于 h1n1_vaccine 标签，其信息增益最大的特征为： doctor_recc_h1n1
对于 seasonal_vaccine 标签，其信息增益最大的特征为： opinion_seas_vacc_effective_5.0


In [21]:
model_1 = CART()
model_1.fit(X_train, y1_train)
y1_pre = model_1.predict(np.array(X_test))
print("剪枝前 h1n1_vaccine:", roc_auc_score(y1_test, y1_pre))
model_1.prune(model_1.tree, X_test, y1_test)
y1_pre = model_1.predict(np.array(X_test))
print("剪枝后 h1n1_vaccine:", roc_auc_score(y1_test, y1_pre))

model_2 = CART()
model_2.fit(X_train, y2_train)
y2_pre = model_2.predict(np.array(X_test))
print("剪枝前 seasonal_vaccine:", roc_auc_score(y2_test, y2_pre))
model_2.prune(model_2.tree, X_test, y2_test)
y2_pre = model_2.predict(np.array(X_test))
print("剪枝后 h1n1_vaccine:", roc_auc_score(y2_test, y2_pre))

剪枝前 h1n1_vaccine: 0.6549834052439429
剪枝后 h1n1_vaccine: 0.7217561400597411
剪枝前 seasonal_vaccine: 0.7187384847347728
剪枝后 h1n1_vaccine: 0.7881303885806883


In [22]:
test_features = pd.read_csv("./TestFeatures.csv")
id = np.array(test_features["respondent_id"])
X_features = np.array(test_features.drop(columns=["respondent_id"]))

y1_label = model_1.predict(X_features)
y2_label = model_2.predict(X_features)

output = pd.DataFrame({"respondent_id": id, "h1n1_vaccine": y1_label, "seasonal_vaccine": y2_label})
output.to_csv("./submission.csv", index=False)