In [141]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
%matplotlib inline

In [142]:
def create_data():
    """Build a two-feature, two-class subset of the iris dataset.

    Keeps the first 100 rows returned by ``load_iris`` and, of those, the
    first two feature columns plus the target column.  The slice relies on
    the dataset's row ordering to contain exactly two classes (targets 0
    and 1) -- NOTE(review): confirm against the installed dataset.

    Target 0 is relabelled to -1 so the two classes stay distinct.  The
    earlier version relabelled it to 1, which merged it into the existing
    class 1 and left every sample with the same label.

    Returns:
        tuple: ``(X, y)`` -- ``X`` is the 2-column feature array and ``y``
        the 1-D label array with values -1 or 1.
    """
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    data = np.array(df.iloc[:100, [0, 1, -1]])
    features, labels = data[:, :2], data[:, -1]
    # Vectorised relabel: 0 -> -1, every other label is left untouched.
    labels = np.where(labels == 0, -1, labels)
    return features, labels

In [144]:
# Build the iris subset and hold out 20% of it for testing.
# random_state pins the shuffle so that "Restart Kernel -> Run All"
# reproduces the same train/test partition on every run.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## sklearn

In [145]:
from sklearn.tree import DecisionTreeClassifier

In [146]:
# Decision tree with scikit-learn's default hyper-parameters, trained on the
# iris training split.  The fit call is the cell's last expression, so the
# notebook displays the fitted estimator.
clf = DecisionTreeClassifier()
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [148]:
"""对树结构进行展示"""
import graphviz
from sklearn.tree import export_graphviz

In [151]:
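# NOTE: disabled example.  export_graphviz writes Graphviz DOT text to
# out_file, so "mytree.pdf" would hold DOT source, not a rendered PDF.
# tree_pic = export_graphviz(clf, out_file="mytree.pdf")
# with open('mytree.pdf') as f:
#     dot_graph = f.read()
# graphviz.Source(dot_graph)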

## 手动例子

In [158]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
import numpy as np
import pandas as pd
import graphviz

In [None]:
# Column names of the loan-application table, listed in column order.
features = [
    "年龄",          # age
    "有工作",        # has a job
    "有自己的房子",  # owns a house
    "信贷情况",      # credit standing
]

In [154]:
# Loan-application samples, one row per applicant.
# Columns (positional): age group, has a job, owns a house, credit standing.
X_train = pd.DataFrame(data=[
    # age group: young
    ("青年", "否", "否", "一般"),
    ("青年", "否", "否", "好"),
    ("青年", "是", "否", "好"),
    ("青年", "是", "是", "一般"),
    ("青年", "否", "否", "一般"),
    # age group: middle-aged
    ("中年", "否", "否", "一般"),
    ("中年", "否", "否", "好"),
    ("中年", "是", "是", "好"),
    ("中年", "否", "是", "非常好"),
    ("中年", "否", "是", "非常好"),
    # age group: senior
    ("老年", "否", "是", "非常好"),
    ("老年", "否", "是", "好"),
    ("老年", "是", "否", "好"),
    ("老年", "是", "否", "非常好"),
    ("老年", "否", "否", "一般"),
])

In [155]:
# Target label per applicant ("是" = yes, "否" = no), in the same row order
# as the feature table: five young, five middle-aged, five senior applicants.
y_train = pd.DataFrame(
    data=["否", "否", "是", "是", "否"]
    + ["否", "否", "是", "是", "是"]
    + ["是", "是", "是", "是", "否"]
)

In [156]:
"""数据预处理"""
# Encode the categorical strings as integer codes.
# One encoder is fitted on every distinct value found anywhere in the feature
# table, so all feature columns share a single code book; it is then applied
# column by column.  X_train is overwritten with the encoded frame.
label_x = LabelEncoder().fit(np.unique(X_train))
X_train = X_train.apply(label_x.transform)

# The target gets its own encoder, fitted on its distinct values.
# y_train is likewise overwritten with its encoded version.
label_y = LabelEncoder().fit(np.unique(y_train))
y_train = y_train.apply(label_y.transform)

In [157]:
# Fit a decision tree on the encoded loan data with default hyper-parameters.
#
# Parameter notes:
#   max_features: number of features considered when searching for the best
#       split; only max_features features are examined each time.
#   min_impurity_decrease: a node is split only if the split reduces the
#       impurity by more than this amount.
#   min_impurity_split: a node is split only if its impurity is above this
#       threshold.
#   min_samples_leaf: minimum number of samples in a leaf node.
#   presort: presort the samples before splitting; suited to small datasets.
#   min_weight_fraction_leaf: minimum weight of the samples in a leaf node.
#   splitter: strategy used to choose the split point.
#
# NOTE(review): min_impurity_split and presort appear to have been removed in
# newer scikit-learn releases -- confirm against the installed version.
tree = DecisionTreeClassifier()
# The notes above used to be a bare string literal placed after this call,
# which made that string the cell's value.  Keeping fit() as the final
# expression lets the notebook display the fitted estimator instead.
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [162]:
"""数据可视化"""
# Export the fitted tree as Graphviz DOT text.  With out_file=None the
# function returns the DOT source as a string rather than writing a file.
# Class names are the string forms of the distinct encoded target values.
dot_data = export_graphviz(
    tree,
    out_file=None,
    feature_names=features,
    class_names=list(map(str, np.unique(y_train))),
    special_characters=True,
    rounded=True,
    filled=True,
)

In [163]:
# Wrap the DOT text in a graphviz Source object.  The cell only assigns it,
# so nothing is displayed here.
graph = graphviz.Source(source=dot_data)

In [174]:
"""参数调优"""
from sklearn.model_selection import GridSearchCV

In [178]:
# Exhaustive search over the split criterion and depth limits 2..14, using
# GridSearchCV's default cross-validation and scoring.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(2, 15),
)
grid_search = GridSearchCV(estimator=tree, param_grid=param_grid)
grid_search.fit(X_train, y_train)

# Full parameter dictionary of the best estimator found by the search
# (not just the two searched keys); read by the cell that rebuilds the tree.
best_parameters = grid_search.best_estimator_.get_params()

# Report the winning parameter combination and its score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'criterion': 'gini', 'max_depth': 4}
0.9333333333333333


In [179]:
# Rebuild the tree from the two tuned settings only (every other
# hyper-parameter stays at its default) and fit it on the training data.
# fit() is the last expression so the notebook shows the fitted estimator.
tree = DecisionTreeClassifier(
    **{name: best_parameters[name] for name in ('criterion', 'max_depth')}
)
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')