In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

In [2]:
# Toy loan-application table: one inner list per applicant. The first four
# entries are the feature values and the final entry is the class.
datasets = [
    ["青年", "否", "否", "一般", "否"],
    ["青年", "否", "否", "好", "否"],
    ["青年", "是", "否", "好", "是"],
    ["青年", "是", "是", "一般", "是"],
    ["青年", "否", "否", "一般", "否"],
    ["中年", "否", "否", "一般", "否"],
    ["中年", "否", "否", "好", "否"],
    ["中年", "是", "是", "好", "是"],
    ["中年", "否", "是", "非常好", "是"],
    ["中年", "否", "是", "非常好", "是"],
    ["老年", "否", "是", "非常好", "是"],
    ["老年", "否", "是", "好", "是"],
    ["老年", "是", "否", "好", "是"],
    ["老年", "是", "否", "非常好", "是"],
    ["老年", "否", "否", "一般", "否"],
]

# Column names, in the same order as the entries of each row above.
labels = ["年龄", "有工作", "有自己的房子", "信贷情况", "类别"]

# Tabular view of the same rows; the bare name on the last line displays it.
train_data = pd.DataFrame(data=datasets, columns=labels)
train_data

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


In [3]:
# Entropy
def calcEntropy(datasets):
    """Return the base-2 entropy of the class labels in ``datasets``.

    The class label of every sample is read from its last element.

    Args:
        datasets: Sequence of samples (rows); the final entry of each row
            is treated as that row's class label.

    Returns:
        The entropy, in bits, of the empirical distribution of the labels.
    """
    total = len(datasets)
    # Tally how many samples carry each class label.
    label_counts = Counter(row[-1] for row in datasets)
    # Relative frequency of each label.
    probabilities = [n / total for n in label_counts.values()]

    return -sum(p * np.log2(p) for p in probabilities)

def conditionEntropy(datasets, axis=0):
    """Return the entropy of the class labels conditioned on one feature.

    Samples are grouped by the value found at position ``axis``; the result
    is the sum of each group's entropy weighted by the group's share of the
    samples.

    Args:
        datasets: Sequence of samples (rows); the last entry of each row is
            its class label.
        axis: Index of the feature column to condition on.

    Returns:
        The weighted sum of the per-group entropies.
    """
    total = len(datasets)

    # Bucket the rows by the value they take on the chosen feature.
    groups = {}
    for row in datasets:
        groups.setdefault(row[axis], []).append(row)

    # Accumulate each bucket's entropy, weighted by its relative size.
    weighted_entropy = 0
    for subset in groups.values():
        weighted_entropy += (len(subset) / total) * calcEntropy(subset)

    return weighted_entropy

def infoGain(entropy, condition_entropy):
    """Return the information gain: entropy minus conditional entropy.

    Args:
        entropy: Entropy of the class labels over the whole data set.
        condition_entropy: Entropy of the class labels given a feature.

    Returns:
        The difference ``entropy - condition_entropy``.
    """
    gain = entropy - condition_entropy
    return gain

def infoGainTrain(datasets, feature_names=None):
    """Print every feature's information gain and report the best feature.

    Args:
        datasets: Indexable collection of samples (rows). Every entry of a
            row except the last is a feature value; the last entry is the
            class label.
        feature_names: Optional sequence of display names indexed by feature
            column. When omitted, the notebook-level ``labels`` list is used,
            so existing calls keep working unchanged.

    Returns:
        A message (str) naming the feature with the largest information
        gain. On ties the feature with the lowest index is chosen.
    """
    # Fall back to the notebook-global names only when none are supplied, so
    # the function can also be used on tables other than the one in this file.
    names = labels if feature_names is None else feature_names

    cnt_features = len(datasets[0]) - 1  # last column is the class label
    entropy = calcEntropy(datasets)

    best_features = []
    for i in range(cnt_features):
        i_info_gain = infoGain(entropy, conditionEntropy(datasets, axis=i))
        best_features.append((i, i_info_gain))
        print('Feature({}) info_gain = {:.3f}'.format(names[i], i_info_gain))

    # max() returns the first maximal element, so ties resolve to the
    # lowest feature index.
    best_index, _ = max(best_features, key=lambda pair: pair[-1])

    return '特征({})的信息增益最大，选择为根节点特征'.format(names[best_index])

In [4]:
# Rank the four features of the loan table by information gain: the call
# prints one line per feature and its returned message is the cell output.
infoGainTrain(np.array(datasets))

Feature(年龄) info_gain = 0.083
Feature(有工作) info_gain = 0.324
Feature(有自己的房子) info_gain = 0.420
Feature(信贷情况) info_gain = 0.363


'特征(有自己的房子)的信息增益最大，选择为根节点特征'

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


# Load the iris data into a DataFrame and append the target as "label".
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df["label"] = iris.target
# Replace the loader's column names with shorter ones.
df.columns = [
    "sepal length", "sepal width", "petal length", "petal width", "label"
]

# Keep the first 100 rows and only the first two feature columns plus the
# label. NOTE(review): presumably the first 100 rows cover just two of the
# classes, which would make this a binary problem; confirm.
data = np.array(df.iloc[:100, [0, 1, -1]])
X = data[:, :-1]
y = data[:, -1]

# Fix the seed so the 70/30 split, and every score computed from it, is the
# same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [6]:
import graphviz
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Fit a decision tree on the training split and report its accuracy on the
# held-out split. The seed is fixed because scikit-learn permutes the
# features at every split, so an unseeded tree can differ between runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

In [8]:
# Export the fitted tree `clf`; the result is kept in `dot_data` and handed
# to graphviz below rather than being written to a file by the exporter.
dot_data = export_graphviz(
    clf,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True
)

# Wrap the exported description in a graphviz Source and render it as a PDF
# named "decision_tree"; the cell output shows the resulting file name.
graph = graphviz.Source(dot_data)
graph.render("decision_tree", format="pdf", cleanup=True)

'decision_tree.pdf'