# 使用Python自编程（以ID3算法为例），实现课本样例数据的分类

In [1]:
import numpy as np
import pandas as pd
from math import log

In [2]:
def create_data(path=r"课本样例数据.xls"):
    """Load the sample table from an Excel workbook.

    Args:
        path: Location of the workbook. Defaults to the textbook sample file
            that is expected next to the notebook.

    Returns:
        A ``(datasets, labels)`` tuple: ``datasets`` is the 2-D array of cell
        values with the ``ID`` column removed, ``labels`` is the array of the
        remaining column names.

    Raises:
        KeyError: If the workbook has no ``ID`` column.
    """
    # The ID column is only a row counter, so it is removed before the
    # values and column names are extracted.
    df = pd.read_excel(path).drop(columns=['ID'])
    datasets = df.values
    labels = df.columns.values
    print(datasets)
    print(labels)
    return datasets, labels

In [3]:
# Load the raw values and column names, then rebuild a DataFrame from them
# (without the ID column) for the tree trainer to consume.
datasets, labels = create_data()
train_data = pd.DataFrame(data=datasets, columns=labels)
print(train_data)

[['青年' '否' '否' '一般' '否']
 ['青年' '否' '否' '好' '否']
 ['青年' '是' '否' '好' '是']
 ['青年' '是' '是' '一般' '是']
 ['青年' '否' '否' '一般' '否']
 ['中年' '否' '否' '一般' '否']
 ['中年' '否' '否' '好' '否']
 ['中年' '是' '是' '好' '是']
 ['中年' '否' '是' '非常好' '是']
 ['中年' '否' '是' '非常好' '是']
 ['老年' '否' '是' '非常好' '是']
 ['老年' '否' '是' '好' '是']
 ['老年' '是' '否' '好' '是']
 ['老年' '是' '否' '非常好' '是']
 ['老年' '否' '否' '一般' '否']]
['年龄' '有工作' '有自己的房子' '信贷情况' '类别']
    年龄 有工作 有自己的房子 信贷情况 类别
0   青年   否      否   一般  否
1   青年   否      否    好  否
2   青年   是      否    好  是
3   青年   是      是   一般  是
4   青年   否      否   一般  否
5   中年   否      否   一般  否
6   中年   否      否    好  否
7   中年   是      是    好  是
8   中年   否      是  非常好  是
9   中年   否      是  非常好  是
10  老年   否      是  非常好  是
11  老年   否      是    好  是
12  老年   是      否    好  是
13  老年   是      否  非常好  是
14  老年   否      否   一般  否


In [4]:
# Decision-tree node: one child per observed feature value (not limited to two)
class Node:
    """A single node of the decision tree.

    Despite its name, ``root=True`` marks a leaf: ``predict`` returns
    ``label`` immediately for such a node. A node created with ``root=False``
    is an internal node that looks up ``features[feature]`` and delegates to
    the matching child stored in ``tree``.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        # Children keyed by the feature value that leads to them.
        self.tree = {}
        # Snapshot used by __repr__; it shares the `tree` dict, so children
        # added later still show up. (The 'label:' key is spelled with a colon.)
        self.result = {'label:': label, 'feature': feature, 'tree': self.tree}

    def __repr__(self):
        return str(self.result)

    def add_node(self, val, node):
        """Attach ``node`` as the child reached when the feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the label predicted for one sample.

        ``features`` is indexed with ``self.feature``. If the sample's value
        has no matching child, a child is picked at random via
        ``np.random.randint`` and the prediction continues from there.
        """
        if self.root is True:
            return self.label

        value = features[self.feature]
        if value in self.tree:
            return self.tree[value].predict(features)

        # Value not seen for this node: fall back to a randomly chosen branch.
        known_values = list(self.tree.keys())
        pick = np.random.randint(0, len(known_values))
        return self.tree[known_values[pick]].predict(features)

In [5]:
class DTree:
    """ID3 decision tree over categorical features.

    ``epsilon`` is the information-gain threshold: when the best available
    split gains less than ``epsilon``, the node becomes a leaf labelled with
    the most frequent class.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        self._tree = {}

    # Empirical entropy
    def calc_ent(self, datasets):
        """Return the base-2 entropy of the class labels.

        The class label is taken from the last element of every row.
        """
        data_length = len(datasets)
        label_count = {}
        for row in datasets:
            label = row[-1]
            label_count[label] = label_count.get(label, 0) + 1
        return -sum([(c / data_length) * log(c / data_length, 2) for c in label_count.values()])

    # Empirical conditional entropy
    def cond_ent(self, datasets, axis=0):
        """Return the entropy of the labels conditioned on column ``axis``."""
        data_length = len(datasets)
        feature_sets = {}
        for row in datasets:
            feature_sets.setdefault(row[axis], []).append(row)
        return sum([(len(subset) / data_length) * self.calc_ent(subset)
                    for subset in feature_sets.values()])

    # Information gain
    def info_gain(self, ent, cond_ent):
        """Return ``ent - cond_ent``."""
        return ent - cond_ent

    # Pick the feature with the largest information gain
    def info_gain_train(self, datasets):
        """Return ``(column_index, gain)`` of the best feature column.

        Every column except the last (the label) is a candidate. On ties the
        lowest column index wins because ``max`` keeps the first maximum.
        """
        count = len(datasets[0]) - 1
        ent = self.calc_ent(datasets)
        gains = [(c, self.info_gain(ent, self.cond_ent(datasets, axis=c)))
                 for c in range(count)]
        return max(gains, key=lambda x: x[-1])

    def train(self, train_data, _feature_index=None):
        """Recursively build the tree for ``train_data``.

        Args:
            train_data: DataFrame whose last column is the class label and
                whose other columns are the candidate features.
            _feature_index: Internal. Maps each feature name to its position
                in the frame passed to the outermost call. Callers should
                leave it unset.

        Returns:
            The ``Node`` at the root of the (sub)tree.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]

        if _feature_index is None:
            # Outermost call: remember where every feature sits in the full
            # feature vector. Sub-frames drop the columns already used, so a
            # position measured inside a sub-frame would not match the sample
            # that Node.predict indexes later.
            _feature_index = {name: pos for pos, name in enumerate(features)}

        # 1. Every sample has the same class: single-node tree.
        if len(y_train.value_counts()) == 1:
            return Node(root=True,
                        label=y_train.iloc[0])

        # 2. No features left: label with the most frequent class.
        if len(features) == 0:
            return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])

        # 3. Find the feature with the largest information gain.
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]

        # 4. Gain below the threshold: stop and label with the most frequent class.
        if max_info_gain < self.epsilon:
            return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])

        # 5. Split on the chosen feature, storing its position in the
        #    original feature vector rather than in the current sub-frame.
        node_tree = Node(root=False,
                         feature_name=max_feature_name,
                         feature=_feature_index[max_feature_name])

        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop([max_feature_name], axis=1)

            # 6. Recurse on the subset with the used feature removed.
            sub_tree = self.train(sub_train_df, _feature_index)
            node_tree.add_node(f, sub_tree)

        return node_tree

    def fit(self, train_data):
        """Train on ``train_data``, keep the tree on the instance and return it."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        """Predict the class of one sample.

        ``X_test`` holds the sample's feature values in the same order as the
        feature columns of the frame given to ``fit``.
        """
        return self._tree.predict(X_test)

In [6]:
# Build the ID3 tree on the textbook data with the default threshold
# (epsilon=0.1) and print its nested dictionary representation.
dt = DTree()
tree = dt.fit(train_data)
print(tree)

{'label:': None, 'feature': 2, 'tree': {'否': {'label:': None, 'feature': 1, 'tree': {'否': {'label:': '否', 'feature': None, 'tree': {}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}


In [7]:
# Classify one applicant; values follow the training column order
# (年龄, 有工作, 有自己的房子, 信贷情况).
applicant = ['老年', '否', '否', '一般']
print(dt.predict(applicant))

否


# 调用Sklearn库分类器，实现课本样例数据的分类

环境准备：
请参考文件《决策树可视化——anaconda下安装pydotplus以及graphviz》，
并在运行下面的代码之前提前安装好 pydotplus 和 graphviz。

In [9]:
import pandas as pd
import pydotplus
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz



# Read the .xls sample file and show all column headers
df = pd.read_excel(r"课本样例数据.xls")
cols = df.columns.values
print(cols)


# Integer encodings for the categorical columns (scikit-learn trees need
# numeric input). For the yes/no columns 1 means 是 (yes) and 0 means 否 (no).
age = {'青年': 1, '中年': 2, '老年': 3}
job = {'是': 1, '否': 0}
housing = {'是': 1, '否': 0}
credit = {'一般': 1, '好': 2, '非常好': 3}
loan = {'是': 1, '否': 0}
df['年龄'] = df['年龄'].map(age)
df['有工作'] = df['有工作'].map(job)
df['有自己的房子'] = df['有自己的房子'].map(housing)
df['信贷情况'] = df['信贷情况'].map(credit)
df['类别'] = df['类别'].map(loan)
df = df.drop(['ID'], axis=1)

# Series.map yields NaN for any value missing from its dictionary; stop here
# with a clear message instead of training on silently corrupted data.
assert not df.isna().any().any(), "unexpected category value: encoding produced NaN"

datas = df.values
print(datas)


# Display names used when the tree is exported. They describe what the value 1
# means in each column (1 = has a job / owns a house), so a split such as
# "has job <= 0.5" reads correctly.
feature = ['age', 'has job', 'has housing', 'credit']
classname = ['no loan', 'loan']

# Split into feature matrix (first four columns) and target (last column)
X = datas[:, 0:4]
Y = datas[:, -1]


# random_state is fixed so that ties between equally good splits are broken
# the same way on every run.
tree_clf = DecisionTreeClassifier(max_depth=4, random_state=0)
tree_clf.fit(X, Y)
print(tree_clf.predict([[3, 0, 0, 1]]))

['ID' '年龄' '有工作' '有自己的房子' '信贷情况' '类别']
[[1 0 0 1 0]
 [1 0 0 2 0]
 [1 1 0 2 1]
 [1 1 1 1 1]
 [1 0 0 1 0]
 [2 0 0 1 0]
 [2 0 0 2 0]
 [2 1 1 2 1]
 [2 0 1 3 1]
 [2 0 1 3 1]
 [3 0 1 3 1]
 [3 0 1 2 1]
 [3 1 0 2 1]
 [3 1 0 3 1]
 [3 0 0 1 0]]
[0]


In [10]:
# Render the fitted classifier as Graphviz DOT text; with out_file=None the
# DOT source comes back as a string instead of being written to a file.
dot_data = export_graphviz(
    tree_clf,
    out_file=None,
    feature_names=feature,
    class_names=classname,
    filled=True,
    rounded=True,
    special_characters=True,
)
# Newlines are removed from the DOT source before parsing.
# NOTE(review): presumably a workaround for a pydotplus/Graphviz rendering
# quirk - confirm whether it is still needed.
dot_data = dot_data.replace('\n', '')
graph = pydotplus.graph_from_dot_data(dot_data)
# Writes loan.png to the working directory; the return value is the cell output.
graph.write_png(r"loan.png")

True