# AdaBoost算法的实现

In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
class AdaBoost:
    """Binary AdaBoost ensemble built from depth-3 decision trees.

    Labels are expected to be encoded as 0/1: ``predict`` counts a weak
    prediction equal to 1 as a positive vote and any other value as a
    negative vote, and it returns 0/1 labels.
    """

    # Keeps the weighted error strictly inside (0, 1). Without this bound a
    # weak learner that is perfect (error 0) or always wrong (error 1) yields
    # an infinite alpha, which turns the sample weights into NaN.
    _ERROR_EPS = 1e-10

    @staticmethod
    def _classifier_weight(weighted_error):
        """Return alpha = 0.5 * ln((1 - e) / e) for a clamped weighted error e."""
        eps = AdaBoost._ERROR_EPS
        clipped = float(np.clip(weighted_error, eps, 1.0 - eps))
        return 0.5 * np.log((1.0 - clipped) / clipped)

    def fit(self, train_x, train_y, clf_num):
        """Train ``clf_num`` weak classifiers on re-weighted samples.

        Args:
            train_x: Training features, one row per sample.
            train_y: Training labels, one per sample.
            clf_num: Number of boosting rounds (weak classifiers) to train.

        The fitted trees are stored in ``self.weak_clfs`` and their voting
        weights in ``self.clf_alphas``.
        """
        self.weak_clfs = []
        self.clf_alphas = []
        train_y = np.asarray(train_y)
        n_train = len(train_x)
        w = np.ones(n_train) / n_train  # start from uniform sample weights
        for _ in range(clf_num):
            # Train this round's weak classifier on the current weights.
            clf = DecisionTreeClassifier(max_depth=3)
            clf.fit(train_x, train_y, sample_weight=w)
            self.weak_clfs.append(clf)

            # Weighted training error of this round's classifier.
            missed = np.asarray(clf.predict(train_x) != train_y, dtype=bool)
            err_weighted = np.dot(w, missed.astype(float))

            alpha = self._classifier_weight(err_weighted)
            self.clf_alphas.append(alpha)

            # Raise the weight of misclassified samples, lower the rest,
            # then renormalise so the weights sum to one again.
            w = w * np.exp(alpha * np.where(missed, 1.0, -1.0))
            w = w / w.sum()

    def predict(self, test_x):
        """Return 0/1 predictions from the alpha-weighted vote of all trees.

        Args:
            test_x: Feature rows to classify.

        Returns:
            Integer array with one entry per row: 1 where the weighted vote
            is strictly positive, otherwise 0.
        """
        score = np.zeros(len(test_x))
        for clf, alpha in zip(self.weak_clfs, self.clf_alphas):
            # Map each weak prediction to a +1 / -1 vote.
            votes = np.where(np.asarray(clf.predict(test_x)) == 1, 1, -1)
            score = score + alpha * votes
        return (score > 0) * 1


# 实践案例一：基于AdaBoost的信用卡精准营销模型

In [4]:
# Load the dataset: every column except the last is a feature, the last
# column is the label.
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("Credit_Card_Sale.csv", encoding='utf-8').values
data_x = data[:, :-1]  # sample features
data_y = data[:, -1]  # sample labels

# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)

# Train the hand-written AdaBoost with 20 weak classifiers and score it.
ada_boost = AdaBoost()
ada_boost.fit(X_train, y_train, clf_num=20)
test_y_ada_boost = ada_boost.predict(X_test)

# Accuracy = number of matching predictions / number of test samples.
acc = float(sum(1 for truth, guess in zip(y_test, test_y_ada_boost) if truth == guess))
print("AdaBoost model accuracy:%.3f" % (acc / len(y_test)))

# Baseline: scikit-learn's AdaBoost with the same base tree depth and the
# same number of estimators, trained on the same split.
from sklearn.ensemble import AdaBoostClassifier

ada_boost_sklearn = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=20)
ada_boost_sklearn.fit(X_train, y_train)
y_pred = ada_boost_sklearn.predict(X_test)

# Accuracy = number of matching predictions / number of test samples.
acc = float(sum(1 for truth, guess in zip(y_test, y_pred) if truth == guess))
print("AdaBoost Sklearn model accuracy:%.3f" % (acc / len(y_test)))
# Baseline: a single depth-3 decision tree trained on the same split.
tree_model = DecisionTreeClassifier(max_depth=3)
tree_model.fit(X_train, y_train)
tree_model_test_y = tree_model.predict(X_test)

# Accuracy = number of matching predictions / number of test samples.
acc = float(sum(1 for truth, guess in zip(y_test, tree_model_test_y) if truth == guess))
print("single tree model accuracy:%.3f" % (acc / len(y_test)))

AdaBoost model accuracy:0.837
AdaBoost Sklearn model accuracy:0.810
single tree model accuracy:0.833


# 随机森林算法的实现

In [5]:
from random import randrange
from random import randint
from sklearn.tree import DecisionTreeClassifier
import numpy as np
class RandomForest:
    """Random forest of decision trees combined by majority vote.

    Each tree is trained on a bootstrap sample of the rows and on a random
    subset of ``k`` feature columns; the column subset used by each tree is
    remembered so the same columns are selected at prediction time.
    """

    def boosttrap_sampling(self, data_length):
        """Draw ``data_length`` row indices uniformly with replacement.

        The method name keeps its original spelling so existing callers
        continue to work.

        Args:
            data_length: Number of rows available (and number of draws).

        Returns:
            List of ``data_length`` indices, each in ``[0, data_length)``.
        """
        # randrange(n) already excludes n, so every row (including the last
        # one) can be drawn.
        return [randrange(data_length) for _ in range(data_length)]

    def random_select_k_features(self, feature_length, k):
        """Pick ``k`` distinct feature indices out of ``feature_length``.

        Args:
            feature_length: Number of available feature columns.
            k: Number of distinct columns to select.

        Returns:
            List of ``k`` distinct indices in ``[0, feature_length)``; an
            empty list when ``k`` is not positive.

        Raises:
            ValueError: If ``k`` exceeds ``feature_length`` (no such
                selection exists).
        """
        from random import sample

        if k > feature_length:
            raise ValueError(
                "cannot select %d distinct features out of %d" % (k, feature_length)
            )
        return sample(range(feature_length), max(k, 0))

    def get_sampled_data(self, data_x, data_y, k):
        """Build one tree's training set: bootstrap rows and ``k`` random columns.

        Args:
            data_x: 2-D feature array (indexed by row list, then column list).
            data_y: Label array indexable by a list of row indices.
            k: Number of feature columns to keep.

        Returns:
            Tuple ``(sample_x, sample_y, feature_index)`` where
            ``feature_index`` lists the selected columns.
        """
        row_count, feature_count = data_x.shape[0], data_x.shape[1]
        row_index = self.boosttrap_sampling(row_count)
        feature_index = self.random_select_k_features(feature_count, k)
        sample_x = data_x[row_index][:, feature_index]
        sample_y = data_y[row_index]
        return sample_x, sample_y, feature_index

    def fit(self, train_x, train_y, tree_num, k, tree_depth):
        """Train ``tree_num`` trees, each on its own sampled rows and columns.

        Args:
            train_x: 2-D training feature array.
            train_y: Training labels.
            tree_num: Number of trees to train.
            k: Number of feature columns given to each tree.
            tree_depth: ``max_depth`` passed to each decision tree.
        """
        self.feature_list = []
        self.trees = []
        for _ in range(tree_num):
            sample_x, sample_y, feature_index = self.get_sampled_data(train_x, train_y, k)
            self.feature_list.append(feature_index)
            clf = DecisionTreeClassifier(criterion='gini', max_depth=tree_depth)
            clf.fit(sample_x, sample_y)
            self.trees.append(clf)

    def predict(self, test_x):
        """Predict by majority vote over all trained trees.

        Args:
            test_x: 2-D feature array with the same columns as the training data.

        Returns:
            Tuple ``(pred_result, labels)``: ``pred_result`` is an integer
            array of shape (rows, trees) holding every tree's prediction,
            and ``labels`` is a list with the majority label for each row.
        """
        pred_result = np.zeros((len(test_x), len(self.trees)), dtype=int)
        for col, (tree, feature_index) in enumerate(zip(self.trees, self.feature_list)):
            # Each tree only sees the columns it was trained on.
            pred_result[:, col] = tree.predict(test_x[:, feature_index])
        labels = [self.majorityCount(row) for row in pred_result]
        return pred_result, labels

    def majorityCount(self, votes):
        """Return the most frequent value in ``votes``.

        Ties are resolved in favour of the value that appears first in
        ``votes``.

        Raises:
            IndexError: If ``votes`` is empty.
        """
        from collections import Counter

        return Counter(votes).most_common(1)[0][0]

# 实践案例二：基于随机森林的信用卡精准营销模型

In [7]:
# Load the dataset: every column except the last is a feature, the last
# column is the label.
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("Credit_Card_Sale.csv", encoding='utf-8').values
data_x = data[:, :-1]  # sample features
data_y = data[:, -1]  # sample labels

# Split into training and test sets, holding out 30% of the rows.
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)

# Train the hand-written forest: 50 trees, 4 features per tree, depth 3.
rf_model = RandomForest()
rf_model.fit(X_train, y_train, tree_num=50, k=4, tree_depth=3)
_, test_y_rf = rf_model.predict(X_test)

# Accuracy = number of matching predictions / number of test samples.
acc = float(sum(1 for truth, guess in zip(y_test, test_y_rf) if truth == guess))
print("RF model accuracy:%.3f" % (acc / len(y_test)))

# Baseline: scikit-learn's random forest with the same depth and number of
# trees, trained on the same split.
from sklearn.ensemble import RandomForestClassifier

rf_sklearn = RandomForestClassifier(n_estimators=50, max_depth=3)
rf_sklearn.fit(X_train, y_train)
y_pred = rf_sklearn.predict(X_test)

# Accuracy = number of matching predictions / number of test samples.
acc = float(sum(1 for truth, guess in zip(y_test, y_pred) if truth == guess))
print("RF Sklearn model accuracy:%.3f" % (acc / len(y_test)))

# Baseline: a single depth-3 decision tree trained on the same split.
tree_model = DecisionTreeClassifier(max_depth=3)
tree_model.fit(X_train, y_train)
tree_model_test_y = tree_model.predict(X_test)

# Accuracy = number of matching predictions / number of test samples.
acc = float(sum(1 for truth, guess in zip(y_test, tree_model_test_y) if truth == guess))
print("single tree model accuracy:%.3f" % (acc / len(y_test)))


RF model accuracy:0.810
RF Sklearn model accuracy:0.810
single tree model accuracy:0.810
