In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

import math
import gevent

import pickle
import pydotplus

def openCsv(filename):
    df = pd.read_csv(filename)
    data = np.array(df)
    return data[:, 1:-1], data[:, -1]

all_xs, all_ys = openCsv('./creditcard.csv')
train_xs, test_xs, train_ys, test_ys = train_test_split(all_xs, all_ys, test_size=0.8, shuffle=True)
train_ys = train_ys.astype(np.int)
test_ys = test_ys.astype(np.int)
col_name = ["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount"]

print(f'x: {train_xs.shape}, y: {train_ys.shape}')

x: (56961, 29), y: (56961,)


In [2]:
class Node:
    def __init__(self, attr='', split=0, pos=0, neg=0):
        self.attr = attr
        self.split = split
        self.pos = pos
        self.neg = neg
        self.classLabel = '+' if pos >= neg else '-'
        
    def __str__(self):
        if self.attr == '':
            return '[{}, {}]: {}'.format(self.pos, self.neg, self.classLabel)
        return '{}<={}\n[{}, {}]: {}'.format(self.attr, self.split, self.pos, self.neg, self.classLabel)

class Tree:
    def __init__(self, curr='', left=None, right=None):
        self.curr = curr
        self.left = left
        self.right = right
        
    def __str__(self):
        if self.curr == None:
            return ''
        return '{} | {} | {}'.format(str(self.curr).replace('\n', ' '), self.left, self.right)

In [3]:
def calcTwoClassEntropy(labels: []) -> float:
    '''
    对 0, 1 列表计算熵
    '''
    pr = (np.sum(labels) / len(labels)) if len(labels) != 0 else 0 # 1 计数
    if pr == 0 or pr == 1:
        return 0
    return - pr * math.log(pr) - (1 - pr) * math.log(1 - pr)

def getMaxGainRatioCol(xs: [[]], ys: []) -> (int, float):
    '''
    计算矩阵内的最优特征与分割值
    '''
    def calcAttrSplit(attrs: [float], ys: [int]) -> (float, float, float):
        '''
        在 attrs 连续性属性列表内找出分割的属性中位数值
        @return split, gain, leftPr
        '''
        entropy = calcTwoClassEntropy(ys) # 整个数据集的Ent
        sortedAttrs = np.unique(attrs) # 去重并排序
        medians = ((np.r_[0, sortedAttrs] + np.r_[sortedAttrs, 0]) / 2) [1:-1]
        
        maxGain = -1 # 信息增益率 >= 0
        goalMedian, goalPr = 0, 0 # 目标最大增益的 中位数拆分 比率
        for median in medians:
            idxLeft = attrs <= median
            ysLeft = ys[idxLeft]
            ysRight = ys[~idxLeft]
            
            entropyLeft = calcTwoClassEntropy(ysLeft) # 第一个区间的Ent
            entropyRight = calcTwoClassEntropy(ysRight) # 第二个区间的Ent
            
            pr = len(ysLeft) / len(ys) # 第一个区间占整个数据的比例
            gain = entropy - (pr * entropyLeft + (1 - pr) * entropyRight) # 用 median 中位数拆分的信息增益

            if maxGain < gain:
                maxGain = gain
                goalMedian, goalPr = median, pr
                
        return goalMedian, maxGain, goalPr
        
    def calcGainRatio(col: [float], ys: [int]) -> (float, float):
        '''
        计算一列属性的信息增益率
        @return gainRatio, splitData
        '''
        entropy = calcTwoClassEntropy(ys) # 整个数据集的交叉熵
        split, gain, pr = calcAttrSplit(col, ys) # 每一列的最佳连续拆分 增益 比率
        ha = - pr * np.log(pr) - (1 - pr) * np.log(1 - pr) # 固有值，可能为 -1log1 = -0
        return np.divide(gain, ha), split # 当只有一个子数据集时：gain / -0 = -inf 最小
        
    coroutines = []
    for idx in range(xs.shape[1]): # 所有属性放协程
        coroutines.append(gevent.spawn(calcGainRatio, xs[:, idx], ys))
        
    maxGainRatio = -1 # 最大增益率
    goalColIdx, goalSplit = 0, 0 # 最大增益率的 列编号 分割中位数
    for idx, coroutine in enumerate(coroutines):
        coroutine.join()
        gainRatio, split = coroutine.value
        if maxGainRatio < gainRatio:
            maxGainRatio = gainRatio
            goalColIdx, goalSplit = idx, split
    
    return goalColIdx, goalSplit

In [6]:
from collections import Counter
maxDepth = math.inf

def splitTree(data: [[]], ys: [], decisionTree, depth=0):
    colIdx, split = getMaxGainRatioCol(data, ys) # 目标列与分割点
    print('Greatest col: \"{}\", split: {}'.format(col_name[colIdx], split))
    
    leftIdxs = data[:, colIdx] <= split
    rightIdxs = ~leftIdxs
    
    left, leftYs = data[leftIdxs], ys[leftIdxs] # 第一棵树的数据与标签
    right, rightYs = data[rightIdxs], ys[rightIdxs] # 第二棵树的数据与标签
    
    allCounter = Counter(ys)
    print('all: ', allCounter)
    decisionTree.curr = Node(col_name[colIdx], round(split, 3), pos=allCounter[1], neg=allCounter[0]) # 当前节点：属性名，分裂值，正负类数
    
    leftCounter = Counter(leftYs)
    rightCounter = Counter(rightYs)
    if len(leftCounter) == 0 or len(rightCounter) == 0: # 没有分类到
        return
    
    if depth < maxDepth: # 在层数内 
        print('left: ', leftCounter)
        if len(leftCounter) == 1: # 只有一类
            decisionTree.left = Tree(Node(pos=leftCounter[1], neg=leftCounter[0])) # 左子树：输出正负
        else:
            decisionTree.left = Tree()
            splitTree(left, leftYs, decisionTree.left, depth + 1) # 分割左子树

        print('right: ', rightCounter)
        if len(rightCounter) == 1: # 只有一类
            decisionTree.right = Tree(Node(pos=rightCounter[1], neg=rightCounter[0])) # 右子树：输出正负
        elif depth < maxDepth:
            decisionTree.right = Tree()
            splitTree(right, rightYs, decisionTree.right, depth + 1) # 分割右子树
        
import datetime
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

decisionTree = Tree('data')
splitTree(train_xs, train_ys, decisionTree)

print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

2019-11-14 17:35:36
Greatest col: "V17", split: -2.656930606537035
all:  Counter({0: 56864, 1: 97})
left:  Counter({1: 74, 0: 24})
Greatest col: "V9", split: 1.7731751396248712
all:  Counter({1: 74, 0: 24})
left:  Counter({1: 74, 0: 15})
Greatest col: "V3", split: -0.7314551273840515
all:  Counter({1: 74, 0: 15})
left:  Counter({1: 74, 0: 13})
Greatest col: "V25", split: 2.27577503511848
all:  Counter({1: 74, 0: 13})
left:  Counter({1: 74, 0: 12})
Greatest col: "V14", split: -1.1270435692362302
all:  Counter({1: 74, 0: 12})
left:  Counter({1: 74, 0: 10})
Greatest col: "V15", split: 0.8382216338599235
all:  Counter({1: 74, 0: 10})
left:  Counter({1: 64, 0: 4})
Greatest col: "V5", split: -0.0023874318095675993
all:  Counter({1: 64, 0: 4})
left:  Counter({1: 63, 0: 3})
Greatest col: "V4", split: -0.16839759613814048
all:  Counter({1: 63, 0: 3})
left:  Counter({1: 1, 0: 1})
Greatest col: "V1", split: -1.974145166043705
all:  Counter({1: 1, 0: 1})
left:  Counter({1: 1})
right:  Counter({0: 

In [8]:
def saveTree(root, filename):
    with open(filename, 'wb') as f:
        f.write(pickle.dumps(root))
        
def readTree(filename):
    obj = Tree()
    with open(filename, 'rb') as f:
        obj = pickle.loads(f.read())
    return obj

decisionTree = readTree('./decisionTree.pkl')

In [7]:
print(str(decisionTree))

def toDot(root):
    def genNode(idx, label):
        return f'{idx} [label="{label}" fontsize=8] ;\n'
    def genLink(idx1, idx2, label, isLeft=True):
        return f'{idx1} -> {idx2} [labeldistance=2.5, labelangle={45 if isLeft else -45}, headlabel="{label}"] ;\n'
    def genHead(shape='box'): # ellipse
        return f'digraph Tree {{ \nnode [shape={shape}] ;\n'
    def genTail():
        return '}\n'
    
    def genBody(dot, idx, node, isRoot=False):
        dot += genNode(idx, node.curr)
        newIdx = idx
        if node.left != None:
            dot += genLink(idx, idx + 1, 'True' if isRoot else '', isLeft=True)
            dot, newIdx = genBody(dot, idx + 1, node.left)
        if node.right != None:
            dot += genLink(idx, newIdx + 1, 'False' if isRoot else '', isLeft=False)
            dot, newIdx = genBody(dot, newIdx + 1, node.right)
        return dot, newIdx
    
    dot = genHead()
    dot += genBody('', 0, root, isRoot=True)[0]
    dot += genTail()
    return dot

saveTree(decisionTree, 'decisionTree.pkl')
dot = toDot(decisionTree)
graph = pydotplus.graph_from_dot_data(dot)
graph.write_pdf("decisionTree.pdf")
# graph.write_png("decisionTree.png")

V17<=-2.657 [97, 56864]: - | V9<=1.773 [74, 24]: + | V3<=-0.731 [74, 15]: + | V25<=2.276 [74, 13]: + | V14<=-1.127 [74, 12]: + | V15<=0.838 [74, 10]: + | V5<=-0.002 [64, 4]: + | V4<=-0.168 [63, 3]: + | V1<=-1.974 [1, 1]: + | [1, 0]: + | None | None | [0, 1]: - | None | None | V11<=7.799 [62, 2]: + | [57, 0]: + | None | None | V4<=6.547 [5, 2]: + | [0, 2]: - | None | None | [5, 0]: + | None | None | V1<=-4.688 [1, 1]: + | [1, 0]: + | None | None | [0, 1]: - | None | None | V13<=-0.704 [10, 6]: + | V27<=-0.247 [2, 6]: - | [2, 0]: + | None | None | [0, 6]: - | None | None | [8, 0]: + | None | None | [0, 2]: - | None | None | [0, 1]: - | None | None | [0, 2]: - | None | None | [0, 9]: - | None | None | V12<=-4.747 [23, 56840]: - | V3<=-0.845 [4, 1]: + | [4, 0]: + | None | None | [0, 1]: - | None | None | V14<=-4.449 [19, 56839]: - | V19<=-3.446 [7, 50]: - | [2, 0]: + | None | None | V7<=0.409 [5, 50]: - | V1<=0.01 [5, 3]: + | [0, 2]: - | None | None | V1<=1.525 [5, 1]: + | [5, 0]: + | None

True

In [5]:
def searchInTree(tree, x) -> int:
    if tree.curr != None and (tree.left is None or tree.right is None): # 叶子节点
        return tree.curr.classLabel == '+' # + -> 1, - -> 0
    idx = col_name.index(tree.curr.attr)
    if x[idx] <= tree.curr.split:
        return searchInTree(tree.left, x)
    else:
        return searchInTree(tree.right, x)

train_pos = len(list(filter(lambda y: y == 1, train_ys)))
train_neg = len(train_ys) - train_pos
test_pos = len(list(filter(lambda y: y == 1, test_ys)))
test_neg = len(test_ys) - test_pos

pred = np.zeros(len(train_ys))
print('train pos: {}, neg: {}, acc(all neg): {:.6f}, f1(all neg): {:.6f}'.format(train_pos, train_neg, accuracy_score(train_ys, pred), f1_score(train_ys, pred)))
pred = np.zeros(len(test_ys))
print('test  pos: {}, neg: {}, acc(all neg): {:.6f}, f1(all neg): {:.6f}'.format(test_pos, test_neg, accuracy_score(test_ys, pred), f1_score(test_ys, pred)))

pred = [searchInTree(decisionTree, x) for x in train_xs]
print('train acc: {:.6f}, f1: {:.6f}'.format(accuracy_score(train_ys, pred), f1_score(train_ys, pred)))
pred = [searchInTree(decisionTree, x) for x in test_xs]
print('test  acc: {:.6f}, f1: {:.6f}'.format(accuracy_score(test_ys, pred), f1_score(test_ys, pred)))

  'precision', 'predicted', average, warn_for)


train pos: 85, neg: 56876, acc(all neg): 0.998508, f1(all neg): 0.000000
test  pos: 407, neg: 227439, acc(all neg): 0.998214, f1(all neg): 0.000000
train acc: 0.999280, f1: 0.760234
test  acc: 0.999232, f1: 0.768212


In [15]:
_decisionTree = decisionTree

In [9]:
def postPruning(root, node, xs, ys):
    """
    对决策树进行后剪枝，从叶子节点往上遍历
    """
    if node.curr == None:
        return
    if not (node.left.left is None or node.left.right is None or node.right.left is None or node.right.right is None): # 子节点非叶子
        postPruning(root, node.left, xs, ys) # 子节点遍历，后序
        postPruning(root, node.right, xs, ys)
    else: # 子节点为叶子 
        # 未剪枝
        pred = [searchInTree(root, x) for x in xs]
        acc_not_pruning = accuracy_score(ys, pred)
    
        # 剪枝
        saveLeft, saveRight = node.left, node.right
        node.left, node.right = None, None
        pred = [searchInTree(root, x) for x in xs]
        acc_pruning = accuracy_score(ys, pred)
        
        if acc_pruning < acc_not_pruning: # 精确率降低
            node.left, node.right = saveLeft, saveRight
            print(f'Not Pruning: {node.curr.attr} <= {node.curr.split}, not: {acc_not_pruning} -> prun: {acc_pruning}')
        else:
            print(f'Pruning: {node.curr.attr} <= {node.curr.split}, not: {acc_not_pruning} -> prun: {acc_pruning}')
    
postPruning(decisionTree, decisionTree, train_xs, train_ys)

Not Pruning: V9 <= 1.773, not: 0.999280209265989 -> prun: 0.9991397622934991
Pruning: V3 <= -0.845, not: 0.999280209265989 -> prun: 0.999280209265989
Not Pruning: V19 <= -3.446, not: 0.999280209265989 -> prun: 0.9992099857797441
Pruning: V15 <= -1.458, not: 0.999280209265989 -> prun: 0.999280209265989
Pruning: V13 <= -2.432, not: 0.999280209265989 -> prun: 0.9992977651375503
Pruning: V17 <= -1.275, not: 0.9992977651375503 -> prun: 0.9992977651375503
Pruning: V10 <= -1.481, not: 0.9992977651375503 -> prun: 0.9992977651375503
Pruning: V19 <= 1.029, not: 0.9992977651375503 -> prun: 0.9992977651375503
Not Pruning: V4 <= 2.882, not: 0.9992977651375503 -> prun: 0.999280209265989
Pruning: V21 <= 1.251, not: 0.9992977651375503 -> prun: 0.9992977651375503
Pruning: V16 <= 1.843, not: 0.9992977651375503 -> prun: 0.999420656238479
Pruning: V15 <= 2.207, not: 0.999420656238479 -> prun: 0.9994557679816014
Pruning: V27 <= 1.775, not: 0.9994557679816014 -> prun: 0.999490879724724


In [12]:
saveTree(decisionTree, 'decisionTree_postPruning.pkl')
dot = toDot(decisionTree)
graph = pydotplus.graph_from_dot_data(dot)
graph.write_pdf("decisionTree_postPruning.pdf")

pred = [searchInTree(decisionTree, x) for x in train_xs]
print('train acc: {:.6f}, f1: {:.6f}'.format(accuracy_score(train_ys, pred), f1_score(train_ys, pred)))
pred = [searchInTree(decisionTree, x) for x in test_xs]
print('test  acc: {:.6f}, f1: {:.6f}'.format(accuracy_score(test_ys, pred), f1_score(test_ys, pred)))

train acc: 0.999491, f1: 0.815287
test  acc: 0.999368, f1: 0.798319
