In [2]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import log
from collections import Counter

In [3]:
# Training rows. Every row holds four categorical feature values followed by
# the class label, in the column order given by `labels` below.
datasets = [
    ['青年', '否', '否', '一般', '否'],
    ['青年', '否', '否', '好', '否'],
    ['青年', '是', '否', '好', '是'],
    ['青年', '是', '是', '一般', '是'],
    ['青年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '一般', '否'],
    ['中年', '否', '否', '好', '否'],
    ['中年', '是', '是', '好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['中年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '非常好', '是'],
    ['老年', '否', '是', '好', '是'],
    ['老年', '是', '否', '好', '是'],
    ['老年', '是', '否', '非常好', '是'],
    ['老年', '否', '否', '一般', '否'],
]
# Column names of the dataset, in order:
# age, has a job, owns a house, credit rating, class.
labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']

In [4]:
# Wrap the raw rows in a DataFrame; `labels` supplies the column names.
# The last column is the class label that DTree treats as the target.
train_data = pd.DataFrame(datasets,columns=labels)
# Preview the first rows as the cell output.
train_data.head()

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否


In [6]:
class Node:
    """One node of the decision tree.

    Parameters
    ----------
    root : bool
        Despite the name, ``True`` marks a terminal node: ``predict`` returns
        ``label`` immediately when ``root is True``.
    label : object, optional
        Class label returned by a terminal node.
    feature_name : object, optional
        Name of the column this node splits on (stored, not used by predict).
    feature : int, optional
        Position in the feature sequence passed to ``predict`` whose value
        selects the child to descend into.
    """

    def __init__(self,root=True,label=None,feature_name=None,feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        # Children, keyed by the value of the split feature.
        self.tree = {}
        # Snapshot used by __repr__. `label` and `feature` are captured as of
        # construction time; `tree` is the same dict object as self.tree, so
        # children added later do show up.
        self.result = dict(label=self.label, feature=self.feature, tree=self.tree)

    def __repr__(self):
        """Return the text form of the ``result`` dict (children nest recursively)."""
        return str(self.result)

    def add_node(self,val,node):
        """Attach ``node`` as the child followed when the split feature equals ``val``."""
        self.tree[val] = node

    def predict(self,features):
        """Return the label reached by walking the tree with ``features``.

        ``features`` is indexed with ``self.feature``; a value with no
        matching child raises ``KeyError``.
        """
        if self.root is not True:
            branch = self.tree[features[self.feature]]
            return branch.predict(features)
        return self.label

In [15]:
class DTree:
    """Decision tree for categorical features, split by Gini impurity reduction.

    At every node the feature with the largest reduction in Gini impurity is
    chosen and the data is split into one branch per distinct value of that
    feature. The last column of the training frame is the class label.

    The ``*_jini`` method names (sic, "Gini") are kept for compatibility.

    Parameters
    ----------
    epsilon : float
        Minimum impurity reduction required to split; below it the node
        becomes a leaf labelled with the majority class.
    """

    def __init__(self,epsilon=0.1):
        self.epsilon = epsilon
        self._tree = {}

    @staticmethod
    def calc_jini(dataframe,c=-1):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of column ``c``.

        ``c`` is a positional column index; the default is the last column.
        """
        target = dataframe.iloc[:,c]
        data_length = len(target)
        class_counts = Counter(target)
        return 1-sum([np.power((n/data_length),2) for n in class_counts.values()])

    def cond_jini(self,dataframe,index=0):
        """Return the Gini impurity of the label column after splitting on column ``index``.

        The frame is grouped by the values of the column at position ``index``
        and the per-group impurities are averaged, weighted by group size.
        """
        data_length = len(dataframe)
        grouped = dataframe.groupby(dataframe.iloc[:,index])
        return sum([(len(subset)/data_length)*self.calc_jini(subset,-1) for _, subset in grouped])

    @staticmethod
    def info_jini(jini,cond_jini):
        """Return the impurity reduction ``jini - cond_jini``."""
        return jini - cond_jini

    @staticmethod
    def _majority_label(y_train):
        """Return the most frequent value in the label Series ``y_train``."""
        return y_train.value_counts().idxmax()

    def info_gain_jini(self,dataframe):
        """Return ``(position, gain)`` for the feature with the largest gain.

        ``position`` is the column position inside ``dataframe``. Every column
        except the last is a candidate; ties go to the lowest position.
        """
        feature_count = dataframe.shape[1] - 1
        # The parent impurity does not depend on the candidate feature.
        base_jini = self.calc_jini(dataframe,-1)
        gains = [(c, self.info_jini(base_jini, self.cond_jini(dataframe,index=c)))
                 for c in range(feature_count)]
        return max(gains,key = lambda x: x[-1])

    def train(self,train_data,feature_index=None):
        """Recursively build the tree for ``train_data`` and return its root Node.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Feature columns followed by the label column.
        feature_index : dict, optional
            Maps a column name to its position in the ORIGINAL training frame.
            Callers leave it as ``None``; it is built on the first call and
            passed down the recursion. It is needed because the chosen column
            is dropped before recursing, so positions inside a sub-frame no
            longer match the positions in the feature vector given to
            ``predict``.
        """
        y_train,features = train_data.iloc[:,-1],list(train_data.columns)[:-1]
        if feature_index is None:
            feature_index = {name: pos for pos, name in enumerate(features)}

        # Only one class left: pure leaf.
        if len(y_train.value_counts()) == 1:
            return Node(root=True,label=y_train.iloc[0])

        # No feature left to split on: majority-vote leaf.
        if len(features) == 0:
            return Node(root=True,label=self._majority_label(y_train))

        max_feature,max_info_gain = self.info_gain_jini(train_data)
        max_feature_name = features[max_feature]
        # Best split is not worth it: majority-vote leaf.
        if max_info_gain < self.epsilon:
            return Node(root=True,label=self._majority_label(y_train))

        # Store the position in the original frame, not in this sub-frame.
        node_tree = Node(root=False,feature_name=max_feature_name,
                         feature=feature_index[max_feature_name])
        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name]==f].drop(max_feature_name,axis=1)
            sub_tree = self.train(sub_train_df,feature_index)
            node_tree.add_node(f,sub_tree)
        return node_tree

    def fit(self,train_data):
        """Build the tree from ``train_data``, store it, and return the root Node."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self,X_test):
        """Return the predicted label for one sample.

        ``X_test`` is a sequence of feature values in the same order as the
        feature columns of the frame passed to ``fit``.
        """
        return self._tree.predict(X_test)
        

In [16]:
# Build a tree with the default epsilon and fit it on the training frame;
# fit() returns the root Node.
dt = DTree()
tree = dt.fit(train_data)

# Printing the root goes through Node.__repr__, which renders the nested result dict.
print(tree)

{'label': None, 'feature': 2, 'tree': {'否': {'label': None, 'feature': 1, 'tree': {'否': {'label': '否', 'feature': None, 'tree': {}}, '是': {'label': '是', 'feature': None, 'tree': {}}}}, '是': {'label': '是', 'feature': None, 'tree': {}}}}


In [17]:
# Classify one sample given as a positional list of feature values,
# in the same order as the feature columns of train_data.
dt.predict(['老年','否','否','一般'])

'否'