## 决策树
决策树的算法实现。根据不同的划分方法，生成不同的树。

1. 信息增益划分 ID3.
2. 信息增益率划分 C4.5
3. Gini 系数划分 CART


In [124]:
import copy
from math import log
from math import inf

def findBestSplitFeature(data, labels):
    '''
        Find the best feature to split data. This function is a stub function.
        Always return the first feature as the best split feature.
    '''
    return 0


def splitDataSet(data, labels, feature, value):
    '''
        Get sub dataset and labels by feature and value
    '''
    subData = []
    subLabels = []
    for i, item in enumerate(data):
        if (item[feature] == value):
            citem = item.copy()
            del citem[feature]
            subData.append(citem)
            subLabels.append(labels[i])
    return subData, subLabels

def calcEntropy(labels):
    '''
        Calc Shannon Entropy
    '''
    m = len(labels)
    keys = set(labels)
    ent = 0.0
    for k in keys:
        count = labels.count(k)
        ent += - (count/m) * (log(count/m) / log(2))
    return ent
    

def splitByInfoGain(data, labels):
    '''
        Find the best feature to split data by Information gain.
    '''
    m = len(data)
    features = len(data[0])
    Edata = calcEntropy(labels)

    maxGain = 0
    splitFeature = -1
    
    for feature in range(features):
        values = set([item[feature] for item in data])
        subEnt = 0
        for value in values:
            subData, subLabels = splitDataSet(data, labels, feature, value)
            subEnt += (len(subData) / m) * calcEntropy(subLabels)
        gain = Edata - subEnt
        
        if (gain >= maxGain):
            maxGain = gain
            splitFeature = feature
    return splitFeature

def splitByInfoGainRatio(data, labels):
    '''
        Find the best feature to split data by Information gain Ratio.
    '''
    m = len(data)
    features = len(data[0])
    Edata = calcEntropy(labels)

    maxRatio = 0
    splitFeature = -1
    
    for feature in range(features):
        values = set([item[feature] for item in data])
        subEnt = 0
        intrinsicVal = 0.0
        for value in values:
            subData, subLabels = splitDataSet(data, labels, feature, value)
            freq = len(subData) / m
            subEnt += freq * calcEntropy(subLabels)
            intrinsicVal += -(freq * log(freq) / log(2))
        ratio = subEnt / intrinsicVal
        
        if (ratio >= maxRatio):
            maxRatio = ratio
            splitFeature = feature
    return splitFeature

def calcGiniIndex(labels):
    '''
        calculte gini index
    '''
    m = len(labels)
    keys = set(labels)
    gini = 0.0
    for k in keys:
        frac = labels.count(k) / m
        gini += 1 - frac * frac
    return gini       

def splitByGiniIndex(data, labels):
    '''
        Find the best feature to split data by Min Gini Index.
    '''
    m = len(data)
    features = len(data[0])
    
    minGini = inf
    splitFeature = -1
    
    for feature in range(features):
        values = set([item[feature] for item in data])
        gini = 0
        for value in values:
            subData, subLabels = splitDataSet(data, labels, feature, value)
            freq = len(subData) / m
            gini += freq * calcGiniIndex(subLabels)
        
        if (gini <= minGini):
            minGini = gini
            splitFeature = feature
    return splitFeature  

class DecisionTree(object):
    '''
        Create A Decision Tree.
    '''
    def __init__(self, splitFunc = findBestSplitFeature):
        '''
            Init a decision tree with a split function.
        '''
        self.splitFunc = splitFunc
        return
    
    def createNode(self):
        return {'isLeaf': False, 'feature': '', 'label': ''}
    
    def classifyByVotes(self, labels):
        '''
            Return class of the labels by voting.
        '''
        keys = set(labels)
        maxCount = 0
        label = ''
        for k in keys:
            count = labels.count(k)
            if (count >= maxCount):
                maxCount = count
                label = k    
        return label
            
    def generateTree(self, data, labels, featureNames, indent = 0, output = True):
        '''
            To recursively construct a decision tree.
        '''
        node = self.createNode()
        if (output):
            print(' ' * (4 * indent), data, labels)
        # case 0, all examples has the same label
        if (len(labels) == labels.count(labels[0])):
            node['isLeaf'] = True
            node['label'] = labels[0]
            return node
        
        # case 1, all examples has the same feature values.
        featureStrs = [','.join(map(str, item)) for item in data]
        if (len(set(featureStrs)) == 1):
            node['isLeaf'] = True
            node['label'] = self.classifyByVotes(labels)
            return node
        
        feature = self.splitFunc(data, labels)
        
        node['feature'] = featureNames[feature]
        cfeatureNames = featureNames.copy()
        del cfeatureNames[feature]
        
        values = set([item[feature] for item in data])        
        for value in values:
            subData, subLabels,  = splitDataSet(data, labels, feature, value)
            # case 3
            if (len(subData) == len(data)):
                node['isLeaf'] = True
                node['label'] = self.classifyByVotes(subLabels)
                return node
            
            node[value] = self.generateTree(subData, subLabels, cfeatureNames, indent + 1, output)
            
        return node
    
    def model(self, data, labels, featuresNames = None, output = True):
        '''
            Model the decisiton tree
        '''
        return self.generateTree(data, labels, featuresNames, output=output)
        

In [125]:
# 创建数据集
def loadDataSet():
    data = [
        [1, 1],
        [1, 1],
        [1, 0],
        [0, 1],
        [0, 1]
    ]
    labels = ['yes', 'yes', 'no', 'no', 'no']
    features = ['no surfacing', 'flippers']
    return data, labels, features

#### 1. 始终使用第一个特征进行划分

In [126]:
data, labels, features = loadDataSet()
tree = DecisionTree()
tree.model(data, labels, features)

 [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]] ['yes', 'yes', 'no', 'no', 'no']
     [[1], [1]] ['no', 'no']
     [[1], [1], [0]] ['yes', 'yes', 'no']
         [[]] ['no']
         [[], []] ['yes', 'yes']


{'isLeaf': False,
 'feature': 'no surfacing',
 'label': '',
 0: {'isLeaf': True, 'feature': '', 'label': 'no'},
 1: {'isLeaf': False,
  'feature': 'flippers',
  'label': '',
  0: {'isLeaf': True, 'feature': '', 'label': 'no'},
  1: {'isLeaf': True, 'feature': '', 'label': 'yes'}}}

#### 2. 使用信息增益进行划分

In [127]:
data, labels, features = loadDataSet()
tree = DecisionTree(splitByInfoGain)
tree.model(data, labels, features)

 [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]] ['yes', 'yes', 'no', 'no', 'no']
     [[1], [1]] ['no', 'no']
     [[1], [1], [0]] ['yes', 'yes', 'no']
         [[]] ['no']
         [[], []] ['yes', 'yes']


{'isLeaf': False,
 'feature': 'no surfacing',
 'label': '',
 0: {'isLeaf': True, 'feature': '', 'label': 'no'},
 1: {'isLeaf': False,
  'feature': 'flippers',
  'label': '',
  0: {'isLeaf': True, 'feature': '', 'label': 'no'},
  1: {'isLeaf': True, 'feature': '', 'label': 'yes'}}}

#### 3. 使用增益率进行划分


In [128]:
data, labels, features = loadDataSet()
tree = DecisionTree(splitByInfoGainRatio)
tree.model(data, labels, features)

 [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]] ['yes', 'yes', 'no', 'no', 'no']
     [[1]] ['no']
     [[1], [1], [0], [0]] ['yes', 'yes', 'no', 'no']
         [[], []] ['no', 'no']
         [[], []] ['yes', 'yes']


{'isLeaf': False,
 'feature': 'flippers',
 'label': '',
 0: {'isLeaf': True, 'feature': '', 'label': 'no'},
 1: {'isLeaf': False,
  'feature': 'no surfacing',
  'label': '',
  0: {'isLeaf': True, 'feature': '', 'label': 'no'},
  1: {'isLeaf': True, 'feature': '', 'label': 'yes'}}}

#### 4. 使用 Gini 系数来划分

In [129]:
data, labels, features = loadDataSet()
tree = DecisionTree(splitByGiniIndex)
tree.model(data, labels, features)

 [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]] ['yes', 'yes', 'no', 'no', 'no']
     [[1], [1]] ['no', 'no']
     [[1], [1], [0]] ['yes', 'yes', 'no']
         [[]] ['no']
         [[], []] ['yes', 'yes']


{'isLeaf': False,
 'feature': 'no surfacing',
 'label': '',
 0: {'isLeaf': True, 'feature': '', 'label': 'no'},
 1: {'isLeaf': False,
  'feature': 'flippers',
  'label': '',
  0: {'isLeaf': True, 'feature': '', 'label': 'no'},
  1: {'isLeaf': True, 'feature': '', 'label': 'yes'}}}

### 使用隐形眼镜数据集来测试

In [130]:
def loadLensData():
    fr = open('lenses.txt')
    data = []
    labels = []
    for line in fr.readlines():
        lineData = line.strip().split('\t')
        labels.append(lineData[-1])
        del lineData[-1]
        data.append(lineData)
    features = ['Age', 'Prescript', 'Astigmatic', 'tearRate']
    fr.close()
    return data, labels, features

In [132]:
data, labels, features = loadLensData() 
tree = DecisionTree(splitByInfoGain)
tree.model(data, labels, features, output = False)

{'isLeaf': False,
 'feature': 'tearRate',
 'label': '',
 'reduced': {'isLeaf': True, 'feature': '', 'label': 'no lenses'},
 'normal': {'isLeaf': False,
  'feature': 'Astigmatic',
  'label': '',
  'yes': {'isLeaf': False,
   'feature': 'Prescript',
   'label': '',
   'hyper': {'isLeaf': False,
    'feature': 'Age',
    'label': '',
    'pre': {'isLeaf': True, 'feature': '', 'label': 'no lenses'},
    'young': {'isLeaf': True, 'feature': '', 'label': 'hard'},
    'presbyopic': {'isLeaf': True, 'feature': '', 'label': 'no lenses'}},
   'myope': {'isLeaf': True, 'feature': '', 'label': 'hard'}},
  'no': {'isLeaf': False,
   'feature': 'Age',
   'label': '',
   'pre': {'isLeaf': True, 'feature': '', 'label': 'soft'},
   'young': {'isLeaf': True, 'feature': '', 'label': 'soft'},
   'presbyopic': {'isLeaf': False,
    'feature': 'Prescript',
    'label': '',
    'hyper': {'isLeaf': True, 'feature': '', 'label': 'soft'},
    'myope': {'isLeaf': True, 'feature': '', 'label': 'no lenses'}}}}}

上面使用信息增益生成一个隐形眼镜的 ID3 树，有时间可以把树画出来。