In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split

In [64]:
def loadData(filePath, threshold=64):
    """Load a CSV of numeric features and binarize every feature value.

    Each feature value is mapped to 1 if it is strictly greater than
    ``threshold`` and to 0 otherwise, which shrinks the number of distinct
    values per feature to two.

    Parameters
    ----------
    filePath : str or path-like
        CSV file readable by ``pandas.read_csv``.
    threshold : number, default 64
        Cut-off used for binarization.

    Returns
    -------
    (X, y) if the file has a ``label`` column, otherwise just ``X``.
        ``X`` is a 2-D array of 0/1 values with the same dtype as the raw
        feature data; ``y`` is the 1-D array taken from the ``label`` column.
    """
    data = pd.read_csv(filePath)
    hasLabel = 'label' in data.columns
    # Select the label by name rather than by position so the result is
    # correct wherever the column sits in the file.
    features = data.drop(columns='label') if hasLabel else data
    raw = np.array(features)
    # Single-pass binarization: a comparison followed by a cast cannot be
    # affected by the order of two separate in-place assignments.
    # NOTE(review): NaN entries compare False and therefore become 0 here.
    X = (raw > threshold).astype(raw.dtype)
    if hasLabel:
        y = np.array(data['label'])
        return X, y
    return X

In [118]:
class TreeNode:
    """A single node of the decision tree.

    All three attributes start out as ``None`` and are filled in by the code
    that builds the tree:

    * ``feature`` - identifier of the feature this node splits on.
    * ``ch``      - children of this node.
    * ``label``   - class label stored at this node.
    """

    def __init__(self):
        # Chained assignment binds the targets left to right, so the
        # attributes are created in the order feature, ch, label.
        self.feature = self.ch = self.label = None
    
class Decision_Tree:
    """ID3-style decision tree classifier for discrete-valued features.

    Each split uses the feature with the largest information gain. A node
    becomes a leaf when its labels are pure, when no unused feature remains,
    or when the best gain is below ``threshold``.
    """

    def __init__(self):
        # Stop splitting when the best information gain is below this value.
        self.threshold = 0.3

    def entPartFunc(self, x):
        """Return the entropy term ``-x * log2(x)`` (scalar or elementwise)."""
        return -x*np.log2(x)

    def maxCountLabel(self, arr):
        """Return the most frequent element of ``arr``.

        Ties are resolved in favour of the element encountered first.
        Raises ``ValueError`` if ``arr`` is empty.
        """
        counts = dict()
        for item in arr:
            counts[item] = counts.get(item, 0) + 1
        if not counts:
            raise ValueError('maxCountLabel() requires at least one element')
        # max() returns the first maximal key in insertion order.
        return max(counts, key=counts.get)

    def calculateHD(self, arr):
        """Return the empirical entropy (in bits) of the labels in ``arr``."""
        _, counts = np.unique(arr, return_counts=True)
        return self.entPartFunc(counts / arr.shape[0]).sum()

    def calculateHDA(self, x, y):
        """Return the empirical conditional entropy of ``y`` given feature ``x``.

        ``x`` and ``y`` are 1-D arrays of equal length.
        """
        HDA, nums = 0, x.shape[0]
        for xi in np.unique(x):
            mask = (x == xi)
            HDA += mask.sum() / nums * self.calculateHD(y[mask])
        return HDA

    def buildTree(self, X, y, featureID):
        """Recursively build the subtree for the samples ``(X, y)``.

        Parameters
        ----------
        X : 2-D array, one row per sample, one column per still-unused feature.
        y : 1-D array of labels aligned with the rows of ``X``.
        featureID : 1-D array mapping each column of ``X`` to its column index
            in the original training matrix.

        Returns
        -------
        TreeNode
            Every node stores the majority label of its samples in ``label``.
            Leaves have ``ch`` equal to ``None``; internal nodes store the
            split feature in ``feature`` and a ``{value: child}`` dict in ``ch``.
        """
        curNode = TreeNode()
        # Stored on every node so prediction can fall back to it when a
        # sample carries a feature value that was never seen at this node.
        curNode.label = self.maxCountLabel(y)
        # Pure node: nothing left to separate.
        if len(set(y)) == 1:
            return curNode
        # Every feature has already been used on this path: majority leaf.
        if X.shape[1] == 0:
            return curNode
        # Information gain obtained by splitting on each remaining feature.
        infoGain = np.zeros(featureID.shape[0])
        HD = self.calculateHD(y)
        for i in range(featureID.shape[0]):
            infoGain[i] = HD - self.calculateHDA(X[:,i], y)
        # Best gain does not reach the threshold: majority leaf.
        if infoGain.max() < self.threshold:
            return curNode
        maxGainIndex = infoGain.argmax()
        curNode.feature = featureID[maxGainIndex]
        curNode.ch = {}
        column = X[:,maxGainIndex]
        # The chosen feature is removed for the whole subtree.
        nextfeatureID = np.concatenate((featureID[:maxGainIndex], featureID[maxGainIndex+1:]))
        for term in np.unique(column):
            index = (column == term)
            nextX = np.concatenate((X[index,:maxGainIndex], X[index, maxGainIndex+1:]), axis = 1)
            curNode.ch[term] = self.buildTree(nextX, y[index], nextfeatureID)
        return curNode

    def fit(self, X, y, threshold = 0.3):
        """Train the tree on samples ``X`` (2-D) with labels ``y`` (1-D).

        ``threshold`` is the minimum information gain required to split a
        node. The fitted tree is stored in ``self.tree``.

        Raises ``ValueError`` if ``X`` is not 2-D, if ``X`` and ``y`` have
        different lengths, or if there are no samples.
        """
        X, y = np.asarray(X), np.asarray(y)
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array')
        if y.ndim != 1 or X.shape[0] != y.shape[0]:
            raise ValueError('y must be 1-D with one label per row of X')
        if X.shape[0] == 0:
            raise ValueError('cannot fit a tree on an empty training set')
        self.threshold = threshold
        featureID = np.arange(X.shape[1])
        self.tree = self.buildTree(X, y, featureID)

    def pred(self, xi, curNode):
        """Return the predicted label for one sample ``xi``, starting at ``curNode``.

        Walks down the tree iteratively. If the sample's value for a split
        feature has no matching child, the majority label stored on the
        current node is returned instead of raising ``KeyError``.
        """
        while curNode.ch is not None:
            child = curNode.ch.get(xi[curNode.feature])
            if child is None:
                break
            curNode = child
        return curNode.label

    def predict(self, X):
        """Return predicted labels for every row of ``X`` as a float array."""
        X = np.asarray(X)
        y_pred = np.zeros(X.shape[0])
        for i, xi in enumerate(X):
            y_pred[i] = self.pred(xi, self.tree)
        return y_pred

In [138]:
# Load the training data and hold out 20% of it, stratified by label, for validation.
trainFilePath = '../mnist/train.csv'
X, y = loadData(trainFilePath)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments.
start = time.perf_counter()
model = Decision_Tree()
print('training start.')
model.fit(X_train, y_train, threshold = 0.05)
end = time.perf_counter()
print('training finish.')
print('time : %fs' % (end - start))

training start.
training finish.
time : 206.075328s


In [139]:
# Accuracy on the held-out validation split: the fraction of predictions
# that equal the true label.
y_pred = model.predict(X_valid)
acc = (y_pred == y_valid).mean()
print('acc = %f.' % (acc))

acc = 0.860357.


In [140]:
# Predict the test set and write the result as a two-column CSV
# (1-based ImageId, integer Label).
testFilePath = '../mnist/test.csv'
X_test = loadData(testFilePath)
y_pred = model.predict(X_test)
ans = pd.DataFrame(
    {
        'ImageId': np.arange(1, y_pred.shape[0] + 1),
        'Label': y_pred.astype(np.int32),
    }
)
ans.to_csv('./result.csv', index = False)

# threshold=0.3: training time 54.27s, Kaggle accuracy 70.846%
# threshold=0.1: training time 108.83s, Kaggle accuracy 85.685%