In [1]:
import math 
def createDataSet():
    """Return the hard-coded loan-approval sample set and its feature names.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of 15 rows, each
        a list of four integer-coded feature values followed by the class
        label ``'yes'`` or ``'no'``. ``labels`` names the four feature
        columns in order (age, has a job, owns a house, credit status).
        Both objects are created fresh on every call.
    """
    # Feature names, one per feature column.
    labels = ['年龄', '有工作', '有自己的房子', '信贷情况']
    # Samples laid out as (feature 0, feature 1, feature 2, feature 3, class).
    samples = (
        (0, 0, 0, 0, 'no'),
        (0, 0, 0, 1, 'no'),
        (0, 1, 0, 1, 'yes'),
        (0, 1, 1, 0, 'yes'),
        (0, 0, 0, 0, 'no'),
        (1, 0, 0, 0, 'no'),
        (1, 0, 0, 1, 'no'),
        (1, 1, 1, 1, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (1, 0, 1, 2, 'yes'),
        (2, 0, 1, 2, 'yes'),
        (2, 0, 1, 1, 'yes'),
        (2, 1, 0, 1, 'yes'),
        (2, 1, 0, 2, 'yes'),
        (2, 0, 0, 0, 'no'),
    )
    # Callers get mutable list rows, exactly as a list-of-lists literal would give.
    dataSet = [list(sample) for sample in samples]
    return dataSet, labels

In [2]:
def h(lista):
    """Return the base-2 Shannon entropy of a collection of counts.

    Each count is turned into a probability ``count / total`` and the
    function returns ``-sum(p * log2(p))``.

    Args:
        lista: iterable of non-negative counts (list, dict view, generator...).

    Returns:
        The entropy in bits. Zero counts contribute nothing (the usual
        ``0 * log(0) = 0`` convention), so an empty or all-zero input
        yields 0.
    """
    # Materialise once: the input is traversed twice (sum + loop), which
    # would silently exhaust a generator.
    counts = list(lista)
    # Hoisted out of the loop; recomputing the sum per element was quadratic.
    total = sum(counts)
    sums = 0.0
    for count in counts:
        if count == 0:
            # math.log(0) raises ValueError; an absent class adds no entropy.
            continue
        x = count / total
        # math.log(x, 2) is kept (rather than math.log2) so results stay
        # bit-identical to previously recorded outputs.
        sums += x * math.log(x, 2)
    return -sums
    

In [3]:
def calcShannonEnt(dataSet):
    """Return the Shannon entropy of the class labels in ``dataSet``.

    Args:
        dataSet: iterable of rows; the last element of each row is taken as
            its class label.

    Returns:
        Whatever ``h`` returns for the per-label occurrence counts.
    """
    labelCounts = {}  # class label -> number of rows carrying it
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    return h(labelCounts.values())

# Demo: build the sample data set, print it, then print its Shannon entropy.
# NOTE: `dataSet` stays bound at module level and is used again by a later cell.
if __name__ == '__main__':
    dataSet, features = createDataSet()
    print(dataSet)
    print(calcShannonEnt(dataSet))

[[0, 0, 0, 0, 'no'], [0, 0, 0, 1, 'no'], [0, 1, 0, 1, 'yes'], [0, 1, 1, 0, 'yes'], [0, 0, 0, 0, 'no'], [1, 0, 0, 0, 'no'], [1, 0, 0, 1, 'no'], [1, 1, 1, 1, 'yes'], [1, 0, 1, 2, 'yes'], [1, 0, 1, 2, 'yes'], [2, 0, 1, 2, 'yes'], [2, 0, 1, 1, 'yes'], [2, 1, 0, 1, 'yes'], [2, 1, 0, 2, 'yes'], [2, 0, 0, 0, 'no']]
0.9709505944546686


In [4]:
def splitDataSet(dataSet, axis, value):
    """Select the rows whose ``axis`` column equals ``value`` and drop that column.

    Args:
        dataSet: iterable of list rows.
        axis: index of the feature column to filter on and remove.
        value: the feature value a row must have at ``axis`` to be kept.

    Returns:
        A new list of new row lists (the input rows are not modified), each
        holding the elements before ``axis`` followed by the elements after it.
    """
    matching = (row for row in dataSet if row[axis] == value)
    subset = []
    for row in matching:
        trimmed = row[:axis]            # everything left of the split column
        trimmed.extend(row[axis + 1:])  # ...plus everything right of it
        subset.append(trimmed)
    return subset

In [5]:
# Show the rows whose feature 0 equals 0, with that column removed.
# Relies on the module-level `dataSet` bound by an earlier cell.
splitDataSet(dataSet,0,0)

[[0, 0, 0, 'no'],
 [0, 0, 1, 'no'],
 [1, 0, 1, 'yes'],
 [1, 1, 0, 'yes'],
 [0, 0, 0, 'no']]

In [9]:
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature column with the largest information gain.

    For every feature column the data set is partitioned by that column's
    distinct values; the gain is the entropy of the whole set minus the
    size-weighted entropy of the partitions. Each gain is printed (rounded
    to 3 decimals) as a side effect.

    Args:
        dataSet: non-empty list of rows; the last element of each row is the
            class label, every other element is a feature value.

    Returns:
        Index of the best feature; on ties the lowest index wins.

    Raises:
        ValueError: if the rows contain no feature columns.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    if numFeatures < 1:
        raise ValueError("dataSet rows contain no feature columns to split on")

    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    numRows = len(dataSet)
    infoGains = []
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        # Conditional entropy: partition entropies weighted by partition size.
        condEntropy = 0
        for featValue in uniqueVals:
            subset = splitDataSet(dataSet, i, featValue)
            condEntropy += calcShannonEnt(subset) * (len(subset) / numRows)
        infoGain = baseEntropy - condEntropy
        print(f"第{i}个特征的增益为{round(infoGain,3)}")
        infoGains.append(infoGain)
    # list.index returns the first maximum, i.e. the lowest index on ties.
    return infoGains.index(max(infoGains))
 
# Demo: print the per-feature information gains and the index of the best feature.
if __name__ == '__main__':
    dataSet, features = createDataSet()
    print("最优特征索引值:" + str(chooseBestFeatureToSplit(dataSet)))

第0个特征的增益为0.083
第1个特征的增益为0.324
第2个特征的增益为0.42
第3个特征的增益为0.363
最优特征索引值:2


In [7]:
def createTree(dataSet, labels, featLabels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: list of rows; the last element of each row is the class
            label, the others are feature values.
        labels: feature names, one per feature column of ``dataSet``.
            This list is not modified.
        featLabels: output list. The name of every feature chosen for a
            split is appended to it (once), in the order the splits are made.

    Returns:
        A class label when the node is a leaf, otherwise
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = set(example[-1] for example in dataSet)

    # Leaf: no features left to split on, or every row has the same class.
    if len(labels) == 0 or len(classList) == 1:
        return majorityClass(dataSet)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    # Build a new list instead of `del labels[bestFeat]` so the caller's
    # list is left intact.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    # Sibling branches may split on the same feature; record each name once
    # so featLabels stays a duplicate-free lookup table.
    if bestFeatLabel not in featLabels:
        featLabels.append(bestFeatLabel)

    featValues = set(example[bestFeat] for example in dataSet)
    branches = {}
    for value in featValues:
        # subLabels is never mutated, so it can be shared by all branches.
        branches[value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, featLabels
        )

    return {bestFeatLabel: branches}

def majorityClass(dataSet):
    """Return the class label (last element of a row) that occurs most often.

    When several labels share the highest count, the one that appears first
    in ``dataSet`` is returned. An empty ``dataSet`` raises ``ValueError``.
    """
    tally = {}
    for row in dataSet:
        label = row[-1]
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1
    # max() keeps the first maximal key, and dicts iterate in insertion order.
    return max(tally, key=lambda label: tally[label])

# Demo: build the decision tree for the sample data and print it as nested dicts.
if __name__ == '__main__':
    data = {}  # NOTE(review): not referenced in the visible cells - confirm before removing
    dataSet, labels = createDataSet()
    # The throwaway [] receives the names of the split features and is discarded.
    myTree = createTree(dataSet, labels, [])
    
    print(myTree)

{'有自己的房子': {0: {'有工作': {0: 'no', 1: 'yes'}}, 1: 'yes'}}


In [8]:
def classify(inputTree, featLabels, testVec):
    """Classify one sample by walking a decision tree built from nested dicts.

    Args:
        inputTree: ``{feature_name: {feature_value: subtree_or_label}}``.
        featLabels: feature names; the position of a name in this list is
            the position of that feature's value in ``testVec``.
        testVec: feature values of the sample, ordered like ``featLabels``.

    Returns:
        The class label stored at the leaf the sample reaches.

    Raises:
        ValueError: if a node's feature name is missing from ``featLabels``
            (raised by ``list.index``), or if the sample's value for a node's
            feature has no branch in the tree.
    """
    firstStr = next(iter(inputTree))        # feature tested at this node
    secondDict = inputTree[firstStr]        # feature value -> subtree / label
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    for key, child in secondDict.items():
        if featValue == key:
            if isinstance(child, dict):
                # Internal node: keep descending.
                return classify(child, featLabels, testVec)
            return child
    # Falling through the loop used to surface as an UnboundLocalError on
    # `classLabel`; report the real cause instead.
    raise ValueError(
        f"no branch for value {featValue!r} of feature {firstStr!r} in the tree"
    )

# Demo: rebuild the tree, classify one sample and print the loan decision.
if __name__ == '__main__':
    dataSet, labels = createDataSet()
    featLabels = []  # createTree appends the names of the features it splits on
    myTree = createTree(dataSet, labels, featLabels)
    # classify() finds each node's feature via featLabels.index(...), so testVec
    # holds the sample's values in the same order as featLabels.
    testVec =[0,1] 
    result = classify(myTree, featLabels, testVec)
    
    if result == 'yes':
        print('放贷')  # "grant the loan"
    if result == 'no':
        print('不放贷')  # "do not grant the loan"


放贷
