In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random

In [2]:
class Node():
    """One node of a binary decision tree (either an internal split or a leaf).

    A freshly constructed node carries only defaults; the tree builder fills
    in the fields as it grows the tree.
    """

    def __init__(self):
        # Per-class probability estimate for the samples that reached this
        # node; stays 0 until the tree builder overwrites it.
        self.prob = 0
        # Depth of the node; a new node starts at depth 1.
        self.depth = 1
        # Child nodes; both stay None until the node is split.
        self.left = None
        self.right = None
        # Column index used by the split test at this node.
        self.feature = None
        # Split threshold. The misspelling ("threshlod") is intentional here:
        # the tree code reads and writes this exact attribute name.
        self.threshlod = None
        # Set to True when the node is a leaf; None (falsy) otherwise.
        self.is_terminal = None
        # Number of training rows that reached this node. Declared here so
        # every node has the attribute, not only the children the tree
        # builder happens to annotate.
        self.rownum = None

In [3]:
class DecisionTreeClassifier():
    """Binary classification tree grown by minimising weighted Gini impurity.

    At every node a random subset of ``int(sqrt(n_features + 1))`` columns is
    drawn without replacement from NumPy's global RNG, and the best
    ``feature <= threshold`` test among those columns is used to split.

    Parameters
    ----------
    max_depth : int
        A node whose ``depth`` exceeds this value is not split (root depth is 1).
    min_sample_leaf : int
        Minimum number of rows each child of a split must receive.
    min_sample_split : int
        Minimum number of rows a node must hold to be considered for a split.

    Attributes
    ----------
    classes : ndarray or None
        Sorted unique labels seen by ``fit``; ``prob`` vectors follow this order.
    Tree : Node or None
        Root node of the fitted tree.
    """

    def __init__(self,max_depth = 200,min_sample_leaf = 1,min_sample_split = 2):
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf
        self.min_sample_split = min_sample_split
        self.classes = None
        self.Tree = None

    def prob(self,y):
        """Return the relative frequency of each class of ``self.classes`` in ``y``.

        An empty ``y`` yields an all-zero vector instead of dividing by zero.
        """
        y = np.asarray(y)
        total = y.shape[0]
        if total == 0:
            return np.zeros(len(self.classes))
        return np.asarray([np.sum(y == class_i) / total for class_i in self.classes])

    def Gini( self,y ):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``.

        An empty ``y`` is reported as impurity 0.
        """
        y = np.asarray(y)
        if y.shape[0] == 0:
            return 0.0
        return 1 - np.sum(self.prob(y) ** 2)

    def BestFeatureSplit( self,X,y,feat_index_list):
        """Find the lowest-impurity ``column <= threshold`` split.

        Parameters
        ----------
        X : ndarray, 2-D
            Rows that reached the node.
        y : ndarray, 1-D
            Labels aligned with the rows of ``X``.
        feat_index_list : sequence of int
            Column indices that may be used for the split.

        Returns
        -------
        tuple
            ``(feature, threshold, X_left, y_left, X_right, y_right)``.
            If no admissible split exists, the first candidate column and its
            maximum value are returned, so every row lands on the left and the
            right partition is empty.
        """
        best_feature = None
        best_threshold = None
        min_gini = 1
        n_samples = y.shape[0]
        # A child must never be empty, whatever min_sample_leaf says.
        min_leaf = max(1, self.min_sample_leaf)

        for i in feat_index_list:
            column = X[ :,i ]
            # Each distinct value is tried once; the largest one is skipped
            # because nothing can be strictly greater than it.
            for threshold in np.unique(column)[:-1]:
                y_left = y[ column <= threshold ]
                y_right = y[ column > threshold ]
                # Only splits that respect min_sample_leaf are candidates, so
                # a node is not turned into a leaf just because the globally
                # best split happened to be too lopsided.
                if y_left.shape[ 0 ] < min_leaf or y_right.shape[ 0 ] < min_leaf:
                    continue

                weighted_gini = (y_left.shape[ 0 ] * self.Gini( y_left )
                                 + y_right.shape[ 0 ] * self.Gini( y_right )) / n_samples

                if weighted_gini < min_gini:
                    min_gini = weighted_gini
                    best_feature = i
                    best_threshold = threshold

        if best_feature is None:
            # No admissible split: report a degenerate one (empty right side)
            # so the caller can recognise it and stop growing this branch.
            best_feature = feat_index_list[ 0 ]
            best_threshold = X[ :,best_feature ].max()

        X_col = X[ :,best_feature ]
        left_mask = X_col <= best_threshold
        right_mask = X_col > best_threshold

        return (best_feature,best_threshold,
                X[ left_mask,: ],y[ left_mask ],
                X[ right_mask,: ],y[ right_mask ])

    def CreateTree(self,X,y,node):
        """Recursively grow the subtree rooted at ``node`` from rows ``X``/``y``.

        The node becomes a leaf when it is deeper than ``max_depth``, holds
        fewer than ``min_sample_split`` rows, contains at most one class, or
        no split gives both children at least ``min_sample_leaf`` rows.
        """
        if (node.depth > self.max_depth or
            X.shape[ 0 ] < self.min_sample_split or
            np.unique(y).shape[ 0 ] <= 1):
            node.is_terminal = True
            return

        # Random feature subset for this node, drawn from the global NumPy RNG.
        feat_index_list = np.random.choice(X.shape[ 1 ],int(np.sqrt(X.shape[ 1 ]+1)),replace = False)

        splitCol,thresh,X_left,y_left,X_right,y_right = self.BestFeatureSplit( X,y,feat_index_list )
        min_leaf = max(1, self.min_sample_leaf)
        if X_left.shape[ 0 ] < min_leaf or X_right.shape[ 0 ] < min_leaf:
            node.is_terminal = True
            return

        node.feature = splitCol
        node.threshlod = thresh

        node.left = Node()
        node.left.depth = node.depth + 1
        node.left.rownum = y_left.shape[ 0 ]
        node.left.prob = self.prob( y_left )

        node.right = Node()
        node.right.depth = node.depth + 1
        node.right.rownum = y_right.shape[ 0 ]
        node.right.prob = self.prob( y_right )

        self.CreateTree( X_left,y_left,node.left )
        self.CreateTree( X_right,y_right,node.right )

    def fit(self,X,y):
        """Fit the tree on feature matrix ``X`` (2-D) and labels ``y`` (1-D).

        Array-likes such as lists or DataFrames are converted with
        ``np.asarray`` before use.
        """
        X = np.asarray( X )
        y = np.asarray( y )
        self.classes = np.unique( y )
        self.Tree = Node()
        self.Tree.rownum = y.shape[ 0 ]
        self.Tree.prob = self.prob( y )
        self.CreateTree( X,y,self.Tree )

    def predictSample(self,x,node):
        """Return the class-probability vector of the leaf that ``x`` reaches.

        Starting at ``node``, go right when ``x[feature] > threshold`` and
        left otherwise, until a terminal node is found.
        """
        while not node.is_terminal:
            if x[node.feature] > node.threshlod:
                node = node.right
            else:
                node = node.left
        return node.prob

    def predict( self,X ):
        """Return the predicted class label for every row of ``X``.

        The arg-max position of each leaf's probability vector is mapped back
        through ``self.classes``, so the result contains the original labels
        rather than positions in the class list.
        """
        X = np.asarray( X )
        positions = [ np.argmax(self.predictSample(x,self.Tree)) for x in X ]
        return self.classes[ np.asarray(positions, dtype=int) ]

In [4]:
class RandomForestClassifier():
    """Bagged ensemble of ``DecisionTreeClassifier`` trees with majority voting.

    Each tree is fitted on a bootstrap resample (same size as the training
    set, drawn with replacement from NumPy's global RNG).

    Parameters
    ----------
    n_trees : int
        Number of trees to fit.
    max_depth, min_sample_leaf, min_sample_split : int
        Passed unchanged to every ``DecisionTreeClassifier``.

    Attributes
    ----------
    Trees : list
        Fitted trees; empty until ``fit`` is called.
    """

    def __init__(self,n_trees=15,max_depth = 200,min_sample_leaf = 1,min_sample_split = 2):
        self.n_trees = n_trees
        self.Trees = [ ]
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf
        self.min_sample_split = min_sample_split

    def BootStrap(self,X,y):
        """Return a bootstrap resample ``(X_sample, y_sample)`` of the data.

        ``X`` (2-D) and ``y`` (1-D) may be any array-like; the same randomly
        drawn row indices are applied to both so rows and labels stay aligned.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        sample_num = X.shape[ 0 ]
        index = np.random.choice(sample_num,sample_num,replace=True)
        return X[index,:],y[index]

    def max_common_label(self,y):
        """Return the most frequent value in ``y``."""
        return Counter(y).most_common(1)[0][0]

    def fit(self,X,y):
        """Fit ``n_trees`` trees, each on its own bootstrap resample of ``X``/``y``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[ 0 ] != y.shape[ 0 ]:
            raise ValueError(
                "X and y must have the same number of rows, got %d and %d"
                % (X.shape[ 0 ], y.shape[ 0 ]))

        # Start from scratch so that calling fit twice does not stack forests.
        self.Trees = [ ]
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(max_depth = self.max_depth,
                                          min_sample_leaf = self.min_sample_leaf,
                                          min_sample_split = self.min_sample_split)
            X_bootstrap,y_bootstrap = self.BootStrap(X,y)
            tree.fit(X_bootstrap,y_bootstrap)
            self.Trees.append(tree)

    def predict(self,X):
        """Return the majority vote of all trees for every row of ``X``.

        Raises
        ------
        ValueError
            If the forest holds no trees (``fit`` has not been called).
        """
        if not self.Trees:
            raise ValueError("RandomForestClassifier has no fitted trees; call fit() before predict().")
        # One row of votes per tree; transposing gives one row per sample.
        tree_preds = np.array([tree.predict(X) for tree in self.Trees])
        sample_votes = np.swapaxes(tree_preds,0,1)
        return np.asarray([self.max_common_label(votes) for votes in sample_votes])

if __name__ == "__main__":
    # The forest draws its bootstrap samples and per-node feature subsets from
    # NumPy's global RNG; seeding it makes the reported accuracy repeatable.
    np.random.seed(0)

    # NOTE(review): absolute, machine-specific path - point this at a
    # configurable data directory before sharing the notebook.
    data = pd.read_csv("/Users/zzk/Desktop/M_Learning/DATA/线性分类数据/Binary-classification-dataset-master/data0/data.csv", names = ['label','x1', 'x2'])

    # First column is the label, the remaining columns are the features.
    X = data.values[:,1:]
    y = data.values[:,0]
    # Binarise the target: label 1 stays 1, every other label becomes 0.
    y_val = np.where(y==1,1,0)

    X_train, X_test, y_train, y_test = train_test_split(X, y_val, test_size=0.2, random_state=0)

    model = RandomForestClassifier(n_trees=15,max_depth = 30, min_sample_leaf = 1,min_sample_split = 2)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('模型的准确率是:', accuracy_score(y_test, y_pred))

模型的准确率是: 0.95
