# CART假设决策树是二叉树，既可以用于分类也可以用于回归

# CART生成算法（分类树）
输入：训练数据集D，停止计算的条件；

输出：CART决策树

根据训练数据集，从根结点开始，递归地对每个结点进行以下操作，构建二叉树：

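    （1）设结点的训练数据集为D，计算现有特征对该数据集的基尼指数。此时，对每一个特征A，对其可能取的每个值a，根据样本点对A=a的测试为“是”或“否”将D分割成D1和D2两部分，利用式（1）计算A=a时的基尼指数：$\mathrm{Gini}(D,A)=\frac{|D_1|}{|D|}\mathrm{Gini}(D_1)+\frac{|D_2|}{|D|}\mathrm{Gini}(D_2)$ （1），其中 $\mathrm{Gini}(D)=1-\sum_{k=1}^{K}\left(\frac{|C_k|}{|D|}\right)^2$，$C_k$ 为D中属于第k类的样本子集。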
    
    （2）在所有可能的特征A以及它们所有可能的切分点a中，选择基尼指数最小的特征及其对应的切分点作为最优特征与最优切分点。依最优特征与最优切分点，从现结点生成两个子结点，将训练数据集依特征分配到两个子结点中去。
    
    （3）对两个子结点递归地调用（1），（2），直到满足停止条件。
    
    （4）生成CART决策树。
    
算法停止条件是结点中的样本个数小于预定阈值或样本集的基尼指数小于预定阈值，或者没有更多特征。

In [1]:
import math
import operator
from collections import Counter

class Node:
    """A single node of the decision tree.

    Attributes are plain fields filled in by the tree builder:

    * ``label``         -- class label returned when the node is a leaf.
    * ``leaf``          -- True when the node is a leaf.
    * ``featureIndex``  -- column index of the feature tested at this node.
    * ``feature_name``  -- name of the feature tested at this node.
    * ``parentNode``    -- the node this one hangs under (None by default).
    * ``childNodeList`` -- dict mapping a feature value to the child node
      reached for that value.
    """

    def __init__(self, label=None, featureIndex=None,feature_name=None, parentNode=None, leaf=False):
        self.parentNode = parentNode
        self.feature_name = feature_name
        self.featureIndex = featureIndex
        self.leaf = leaf  # whether this node is a leaf
        self.label = label
        # Children keyed by feature value, e.g. for a "height" feature the
        # keys could be "tall" / "short"; each value is the matching Node.
        self.childNodeList = {}

    def predict(self,node,Xtest):
        """Return the label of the leaf reached by ``Xtest`` starting at ``node``.

        At every internal node the value ``Xtest[featureIndex]`` selects the
        child stored under that value; when no child is stored under it, the
        child stored under the key ``u'其他'`` is followed instead.
        """
        current = node
        while not current.leaf:
            branches = current.childNodeList
            value = Xtest[current.featureIndex]
            # Fall back to the catch-all branch only when the value is unknown.
            current = branches[value] if value in branches else branches[u'其他']
        return current.label




class dTree:
    """Binary classification tree that chooses splits by minimum Gini index.

    Every sample is a sequence whose last element is the class label and
    whose other elements are feature values. Every internal node tests
    ``feature == value``: matching samples go to the child keyed by that
    value, all other samples go to the child keyed by ``u'其他'``.
    """

    def __init__(self, epsilon=0.1):
        """Create an untrained tree.

        Args:
            epsilon: stored on the instance; no method of this class reads it.
        """
        self.epsilon = epsilon
        self.myTree = None  # root Node, assigned by treeGenerate
        self.completeDataSets = None  # shallow copy of the training samples
        self.completeAttrs = None  # full label list; maps feature names to column indices

    def treeGenerate(self,dataSets,labels,pattern='classification'):
        """Build the tree from ``dataSets`` and store its root in ``self.myTree``.

        Args:
            dataSets: non-empty sequence of samples (class label last).
            labels: list of column names, the last one naming the class column.
            pattern: forwarded to ``recursive``; not otherwise used.

        Raises:
            ValueError: if ``dataSets`` is empty.
        """
        if len(dataSets) == 0:
            raise ValueError("dataSets must contain at least one sample")
        self.myTree = Node()
        self.completeDataSets = dataSets.copy()
        self.completeAttrs = labels.copy()
        self.recursive(dataSets, labels, pattern, self.myTree)
        print("generate tree success!")

    def recursive(self, dataSets, labels,pattern, node=None):
        """Grow the subtree rooted at ``node`` for the samples in ``dataSets``.

        Args:
            dataSets: non-empty sequence of samples reaching this node.
            labels: names of the features still available, followed by the
                class column name.
            pattern: passed through unchanged to the recursive calls.
            node: node to populate; a fresh ``Node`` is created when omitted.

        Returns:
            The populated node.
        """
        # A default of ``Node()`` would be evaluated once at definition time
        # and shared by every call, so the node is created here instead.
        if node is None:
            node = Node()

        if self.isSameClass(dataSets):
            node.label = dataSets[0][-1]
            node.leaf = True
            return node
        if len(labels) <= 1:  # only the class column is left
            node.label = self.majorityClass(dataSets)
            node.leaf = True
            return node

        bestFeatIndex, bestFeatVal = self.chooseBestFeatureToSplit(dataSets, labels)
        node.featureIndex = bestFeatIndex
        node.feature_name = self.completeAttrs[bestFeatIndex]
        equalSubDataSets, unequalSubDataSets = self.splitData(dataSets, bestFeatIndex, bestFeatVal)

        # The chosen feature is dropped for BOTH children, so a feature is
        # tested at most once on any root-to-leaf path.
        subLabels = labels.copy()
        subLabels.remove(node.feature_name)

        branches = ((bestFeatVal, equalSubDataSets), (u'其他', unequalSubDataSets))
        for key, subset in branches:
            subNode = Node()
            node.childNodeList[key] = subNode
            subNode.parentNode = node
            if len(subset) == 0:
                # No sample falls into this branch: answer with the parent's majority class.
                subNode.leaf = True
                subNode.label = self.majorityClass(dataSets)
            else:
                self.recursive(subset, subLabels, pattern, subNode)
        return node

    def chooseBestFeatureToSplit(self, dataSets, labels):
        """Return ``(featureIndex, featureValue)`` of the split with the lowest Gini index.

        Candidates are scanned feature by feature (in ``labels`` order) and,
        within a feature, value by value in order of first appearance in
        ``dataSets``. On a tie the candidate scanned last wins. Scanning in
        first-appearance order rather than ``set`` order keeps the result
        reproducible from run to run.

        ``featureIndex`` is the column index in the full label list stored in
        ``self.completeAttrs``. ``(-1, None)`` is returned when ``labels``
        holds no feature names.
        """
        m = len(dataSets)
        bestGiniIndex = math.inf
        bestFeatIndex = -1
        bestFeatVal = None
        for featName in labels[:-1]:  # every remaining feature
            j = self.completeAttrs.index(featName)
            # dict.fromkeys de-duplicates while preserving first-seen order.
            for feat in dict.fromkeys(data[j] for data in dataSets):
                # equalSubDataSets: samples whose column j equals feat;
                # unequalSubDataSets: all remaining samples.
                equalSubDataSets, unequalSubDataSets = self.splitData(dataSets, j, feat)
                equal_prob = len(equalSubDataSets) / m
                unequal_prob = len(unequalSubDataSets) / m
                currentGiniIndex = (equal_prob * self.calcGini(equalSubDataSets)
                                    + unequal_prob * self.calcGini(unequalSubDataSets))
                if currentGiniIndex <= bestGiniIndex:
                    bestGiniIndex = currentGiniIndex
                    bestFeatIndex = j
                    bestFeatVal = feat
        return bestFeatIndex, bestFeatVal

    def splitData(self, dataSets, featIndex, featVals):
        """Partition ``dataSets`` on ``data[featIndex] == featVals``.

        Returns:
            ``(equalSubDataSets, unequalSubDataSets)``, each keeping the
            original sample order.
        """
        equalSubDataSets = []
        unequalSubDataSets = []
        for data in dataSets:
            if data[featIndex] == featVals:
                equalSubDataSets.append(data)
            else:
                unequalSubDataSets.append(data)
        return equalSubDataSets, unequalSubDataSets

    def calcGini(self, dataSets):
        """Return the Gini index ``1 - sum(p_k ** 2)`` of the class labels in ``dataSets``.

        An empty ``dataSets`` yields 1; callers weight that value by a zero
        proportion, so it never contributes to a split score.
        """
        giniIndex = 1
        m = len(dataSets)
        # Subtract term by term (rather than summing first) so the floating
        # point result, and therefore tie-breaking between splits, is stable.
        for count in self.classCount(dataSets).values():
            giniIndex -= (count / m) ** 2
        return giniIndex

    def isSameClass(self, dataSets):
        """Return True when every sample carries the class label of the first one.

        Raises:
            IndexError: if ``dataSets`` is empty.
        """
        C = dataSets[0][-1]  # class of the first sample
        return not any(C != data[-1] for data in dataSets)

    def majorityClass(self, dataSets):
        """Return the most frequent class label in ``dataSets``.

        Among equally frequent labels the one seen first wins.

        Raises:
            IndexError: if ``dataSets`` is empty.
        """
        return self.classCount(dataSets).most_common(1)[0][0]

    def classCount(self,dataSets):
        """Return a mapping from class label to its number of occurrences in ``dataSets``."""
        return Counter(data[-1] for data in dataSets)

    def predict(self,Xtest):
        """Return the predicted class for one feature vector (without class label)."""
        return self.myTree.predict(self.myTree, Xtest)

    def score(self,testDatas):
        """Print and return the accuracy, in percent, on labelled samples.

        Args:
            testDatas: non-empty sequence of samples with the true class label
                in the last position.

        Returns:
            Percentage of samples whose prediction equals the true label.

        Raises:
            ValueError: if ``testDatas`` is empty.
        """
        m = len(testDatas)
        if m == 0:
            raise ValueError("testDatas must contain at least one sample")
        y = [data[-1] for data in testDatas]
        predictY = [self.predict(data[:-1]) for data in testDatas]
        count = sum(1 for actual, predicted in zip(y, predictY) if actual == predicted)
        accuracy = float(count / m) * 100
        print(y)
        print(predictY)
        print("accuracy is " + str(accuracy) + "%")
        return accuracy


In [2]:
# First toy data set: 15 rows, four categorical feature columns followed by
# the class label in the last column.
firstDataSets = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否'],]

# Column names for the rows above; the final entry names the class column.
firstLabels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']

# Build a tree from the full data set, then score it on those same rows.
firstMyTree=dTree()
firstMyTree.treeGenerate(firstDataSets,firstLabels)
firstMyTree.score(firstDataSets)

generate tree success!
['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']
['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']
accuracy is 100.0%


# 第一组数据生成的决策树

![image.png](./CART_tree_pics/Figure_1.png)

In [3]:
# Second toy data set: 17 rows, six categorical feature columns followed by
# an integer class label (1 or 0) in the last column.
secondDataSets=[
['青绿','蜷缩','浊响','清晰','凹陷','硬滑',1],
['乌黑','蜷缩','沉闷','清晰','凹陷','硬滑',1],
['乌黑','蜷缩','浊响','清晰','凹陷','硬滑',1],
['青绿','蜷缩','沉闷','清晰','凹陷','硬滑',1],
['浅白','蜷缩','浊响','清晰','凹陷','硬滑',1],
['青绿','稍蜷','浊响','清晰','稍凹','软粘',1],
['乌黑','稍蜷','浊响','稍糊','稍凹','软粘',1],
['乌黑','稍蜷','浊响','清晰','稍凹','硬滑',1],
['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑',0],
['青绿','硬挺','清脆','清晰','平坦','软粘',0],
['浅白','硬挺','清脆','模糊','平坦','硬滑',0],
['浅白','蜷缩','浊响','模糊','平坦','软粘',0],
['青绿','稍蜷','浊响','稍糊','凹陷','硬滑',0],
['浅白','稍蜷','沉闷','稍糊','凹陷','硬滑',0],
['乌黑','稍蜷','浊响','清晰','稍凹','软粘',0],
['浅白','蜷缩','浊响','模糊','平坦','硬滑',0],
['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑',0]]

# Column names for the rows above; the final entry names the class column.
secondLabels=[u'色泽', u'根蒂', u'敲声', u'纹理', u'脐部', u'触感',u'类别']


# Build a tree from the full data set, then score it on those same rows.
secondMyTree=dTree()
secondMyTree.treeGenerate(secondDataSets,secondLabels)
secondMyTree.score(secondDataSets)

generate tree success!
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
accuracy is 100.0%


# 第二组数据生成的决策树

![image.png](./CART_tree_pics/Figure_2.png)

In [4]:
# Third experiment, training part: 10 rows with six categorical feature
# columns followed by an integer class label (1 or 0).
thirdDataSets=[
['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 1],
['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 1],
['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 1],
['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 1],
['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 1],
['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0],
['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0],
['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0],
['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0],
['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0]]

# Column names for the rows above; the final entry names the class column.
thirdLabels=[u'色泽', u'根蒂', u'敲声', u'纹理', u'脐部', u'触感',u'类别']

# Separate evaluation rows (7 samples) in the same column layout; they are
# not passed to treeGenerate below.
thirdTestDataSets=[
['青绿','蜷缩','沉闷','清晰','凹陷','硬滑',1],
['浅白','蜷缩','浊响','清晰','凹陷','硬滑',1],
['乌黑','稍蜷','浊响','清晰','稍凹','硬滑',1],
['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑',0],
['浅白','硬挺','清脆','模糊','平坦','硬滑',0],
['浅白','蜷缩','浊响','模糊','平坦','软粘',0],
['青绿','稍蜷','浊响','稍糊','凹陷','硬滑',0]]

# Build a tree from the training rows, then score it on those same rows.
thirdMyTree=dTree()
thirdMyTree.treeGenerate(thirdDataSets,thirdLabels)
thirdMyTree.score(thirdDataSets)

generate tree success!
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 0, 0, 1, 0, 0]
accuracy is 90.0%


# 第三组数据生成的决策树

![image.png](./CART_tree_pics/Figure_3.png)

In [5]:
# Score the tree built from thirdDataSets on the separate thirdTestDataSets rows.
thirdMyTree.score(thirdTestDataSets)

[1, 1, 1, 0, 0, 0, 0]
[1, 0, 1, 0, 0, 0, 0]
accuracy is 85.71428571428571%


# CART回归树与剪枝比较复杂，因时间原因暂不实现，以后补上

In [3]:
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd

In [8]:
def create_data():
    """Load a data set through scikit-learn's ``load_boston``.

    Returns:
        A ``(data, target, feature_names)`` tuple read from the attributes
        of the object returned by ``load_boston()``.
    """
    # NOTE(review): ``load_boston`` was removed in scikit-learn 1.2 -- confirm
    # that the installed version still provides it.
    dataset = load_boston()
    features, targets, names = dataset.data, dataset.target, dataset.feature_names
    return features, targets, names


# Materialise the feature matrix, targets and feature names at module level.
X, y, attrs = create_data()