In [1]:
import numpy as np
import copy
import random


class Tree(object):
    def __init__(self):
        self.split_feature = None
        self.split_value = None
        self.split_gain = None
        self.internal_value = None
        self.node_index = None
        self.leaf_value = None
        self.tree_left = None
        self.tree_right = None

    def calc_predict_value(self, dataset):
        if self.leaf_value is not None:
            return self.leaf_value
        elif dataset[self.split_feature] <= self.split_value:
            return self.tree_left.calc_predict_value(dataset)
        else:
            return self.tree_right.calc_predict_value(dataset)

    # print tree structure by JSON format
    def describe_tree(self):
        if not self.tree_left and not self.tree_right:
            leaf_info = "{leaf_value:" + str(self.leaf_value) + "}"
            return leaf_info
        left_info = self.tree_left.describe_tree()
        right_info = self.tree_right.describe_tree()
        tree_structure = "{split_feature:" + str(self.split_feature) + \
                         ",split_value:" + str(self.split_value) + \
                         ",split_gain:" + str(self.split_gain) + \
                         ",internal_value:" + str(self.internal_value) + \
                         ",node_index:" + str(self.node_index) + \
                         ",left_tree:" + left_info + \
                         ",right_tree:" + right_info + "}"
        return tree_structure

    # count all leaf nodes & parent nodes which have two leaf nodes
    def state_tree(self, leaves_state, node_state):
        if not self.tree_left and not self.tree_right:
            leaves_state.append(1)
            return
        if not self.tree_left.split_gain and not self.tree_right.split_gain:
            node_state.append([self.node_index, self.split_gain])
        self.tree_left.state_tree(leaves_state, node_state)
        self.tree_right.state_tree(leaves_state, node_state)
        return leaves_state, node_state

    # prune tree with given node_index
    def prune_tree(self, prune_node_index):
        if not self.tree_left and not self.tree_right:
            return
        if self.tree_left.node_index == prune_node_index:
            leaf_value = self.tree_left.internal_value
            self.tree_left = Tree()
            self.tree_left.node_index = prune_node_index
            self.tree_left.leaf_value = leaf_value
            return
        elif self.tree_right.node_index == prune_node_index:
            leaf_value = self.tree_right.internal_value
            self.tree_right = Tree()
            self.tree_right.node_index = prune_node_index
            self.tree_right.leaf_value = leaf_value
            return
        self.tree_left.prune_tree(prune_node_index)
        self.tree_right.prune_tree(prune_node_index)
        return


class BaseDecisionTree(object):
    def __init__(self, max_depth, num_leaves, min_samples_split, min_samples_leaf, subsample,
                 colsample_bytree, max_bin, min_child_weight, reg_gamma, reg_lambda, random_state):
        self.max_depth = max_depth
        self.num_leaves = num_leaves
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_bin = max_bin
        self.min_child_weight = min_child_weight
        self.reg_gamma = reg_gamma
        self.reg_lambda = reg_lambda
        self.random_state = random_state
        self.tree = Tree()
        self.pred = None
        self.node_index = 0
        self.feature_importances_ = dict()

    def fit(self, dataset, targets):
        dataset_copy = copy.deepcopy(dataset).reset_index(drop=True)
        targets_copy = copy.deepcopy(targets).reset_index(drop=True)

        if self.random_state:
            random.seed(self.random_state)
        if self.subsample < 1.0:
            subset_index = random.sample(range(len(targets)), int(self.subsample*len(targets)))
            dataset_copy = dataset_copy.iloc[subset_index, :].reset_index(drop=True)
            targets_copy = targets_copy.iloc[subset_index, :].reset_index(drop=True)
        if self.colsample_bytree < 1.0:
            subcol_index = random.sample(list(dataset_copy.columns), int(self.colsample_bytree*len(dataset_copy.columns)))
            dataset_copy = dataset_copy[subcol_index]

        self.tree = self._fit(dataset_copy, targets_copy, depth=0)
        self.pred = dataset.apply(lambda x: self.predict(x), axis=1)

        leaves_state, node_state = self.tree.state_tree(leaves_state=[], node_state=[])
        while sum(leaves_state) > self.num_leaves:
            node_state = sorted(node_state, key=lambda x: x[1])
            self.tree.prune_tree(node_state[0][0])
            leaves_state, node_state = self.tree.state_tree(leaves_state=[], node_state=[])
        return self

    def _fit(self, dataset, targets, depth):
        if dataset.__len__() <= self.min_samples_split or targets['hess'].sum() <= self.min_child_weight:
            tree = Tree()
            tree.leaf_value = self.calc_leaf_value(targets)
            return tree

        if depth < self.max_depth:
            best_split_feature, best_split_value, best_split_gain, best_internal_value = \
                self.choose_best_feature(dataset, targets)
            left_dataset, right_dataset, left_targets, right_targets = \
                self.split_dataset(dataset, targets, best_split_feature, best_split_value)

            tree = Tree()
            if left_dataset.__len__() <= self.min_samples_leaf or \
                    right_dataset.__len__() <= self.min_samples_leaf:
                tree.leaf_value = self.calc_leaf_value(targets)
                return tree
            else:
                self.feature_importances_[best_split_feature] = \
                    self.feature_importances_.get(best_split_feature, 0) + 1

                tree.split_feature = best_split_feature
                tree.split_value = best_split_value
                tree.split_gain = best_split_gain
                tree.internal_value = best_internal_value
                tree.node_index = self.node_index
                self.node_index += 1
                tree.tree_left = self._fit(left_dataset, left_targets, depth+1)
                tree.tree_right = self._fit(right_dataset, right_targets, depth+1)
                return tree
        else:
            tree = Tree()
            tree.leaf_value = self.calc_leaf_value(targets)
            return tree

    def choose_best_feature(self, dataset, targets):
        best_split_gain = float('-inf')
        best_split_feature = None
        best_split_value = None

        for feature in dataset.columns:
            if dataset[feature].unique().__len__() <= 100:
                unique_values = dataset[feature].unique()
            else:
                unique_values = np.unique([np.percentile(dataset[feature], x)
                                           for x in np.linspace(0, 100, self.max_bin)])

            for split_value in unique_values:
                left_targets = targets[dataset[feature] <= split_value]
                right_targets = targets[dataset[feature] > split_value]
                split_gain = self.calc_split_gain(left_targets, right_targets)

                if split_gain > best_split_gain:
                    best_split_feature = feature
                    best_split_value = split_value
                    best_split_gain = split_gain
        best_internal_value = self.calc_leaf_value(targets)
        return best_split_feature, best_split_value, best_split_gain, best_internal_value

    def calc_leaf_value(self, targets):
        leaf_value = - targets['grad'].sum() / (targets['hess'].sum() + self.reg_lambda)
        return leaf_value

    def calc_split_gain(self, left_targets, right_targets):
        left_grad = left_targets['grad'].sum()
        left_hess = left_targets['hess'].sum()
        right_grad = right_targets['grad'].sum()
        right_hess = right_targets['hess'].sum()
        split_gain = 0.5 * (left_grad ** 2 / (left_hess + self.reg_lambda) +
                            right_grad ** 2 / (right_hess + self.reg_lambda) -
                            (left_grad + right_grad) ** 2 / (left_hess + right_hess + self.reg_lambda)) - self.reg_gamma
        return split_gain

    @staticmethod
    def split_dataset(dataset, targets, split_feature, split_value):
        left_dataset = dataset[dataset[split_feature] <= split_value]
        left_targets = targets[dataset[split_feature] <= split_value]
        right_dataset = dataset[dataset[split_feature] > split_value]
        right_targets = targets[dataset[split_feature] > split_value]
        return left_dataset, right_dataset, left_targets, right_targets

    def predict(self, dataset):
        return self.tree.calc_predict_value(dataset)

    def print_tree(self):
        return self.tree.describe_tree()

In [2]:
from __future__ import division
import pandas as pd
import numpy as np
from math import exp, log
import random
import warnings
warnings.filterwarnings('ignore')
pd.set_option('precision', 4)
pd.set_option('display.max_rows', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('expand_frame_repr', False)


class BaseLoss(object):
    def __init__(self):
        pass

    def grad(self, targets):
        pass

    def hess(self, targets):
        pass


class SquareLoss(BaseLoss):
    """
    L = 0.5*(pred - label)**2
    """
    def grad(self, targets):
        grad = targets['pred'] - targets['label']
        return grad

    def hess(self, targets):
        hess = 1
        return hess


class LogisticLoss(BaseLoss):
    """
    L = log(1 + exp(-label*pred))
    """
    def grad(self, targets):
        pred = 1.0 / (1.0 + exp(- targets['pred']))
        grad = - targets['label'] / (1 + exp(targets['label'] * pred))
        return grad

    def hess(self, targets):
        pred = 1.0 / (1.0 + exp(- targets['pred']))
        hess = exp(targets['label'] * pred) / (1 + exp(targets['label'] * pred))**2
        return hess


class XGBClassifier(object):
    def __init__(self, n_estimators=100, max_depth=-1, num_leaves=-1, learning_rate=0.1, min_samples_split=2,
                 min_samples_leaf=1, subsample=1., colsample_bytree=1., max_bin=225, min_child_weight=1.,
                 reg_gamma=0., reg_lambda=0., loss="squareloss", random_state=None):
        """Construct a xgboost model
        Parameters
        ----------
        n_estimators : int, optional (default=100)
            Number of boosted trees to fit.
        max_depth : int, optional (default=-1)
            Maximum tree depth for base learners, -1 means no limit.
        num_leaves : int, optional (default=-1)
            Maximum tree leaves for base learners, -1 means no limit.
        learning_rate : float, optional (default=0.1)
            Boosting learning rate.
        min_samples_split : int, optional (default=2)
            The minimum number of samples required to split an internal node.
        min_samples_leaf : int, optional (default=1)
            The minimum number of samples required to be at a leaf node.
        subsample : float, optional (default=1.)
            Subsample ratio of the training instance.
        colsample_bytree : float, optional (default=1.)
            Subsample ratio of columns when constructing each tree.
        max_bin: int or None, optional (default=225))
            Max number of discrete bins for features.
        min_child_weight : float, optional (default=1.)
            Minimum sum of instance weight(hessian) needed in a child(leaf).
        reg_gamma : float, optional (default=0.)
            L1 regularization term on weights.
        reg_lambda : float, optional (default=0.)
            L2 regularization term on weights.
        loss: loss object, (default="logistic")
            logisticloss, squareloss
        random_state : int or None, optional (default=None)
            Random number seed.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth if max_depth != -1 else float('inf')
        self.num_leaves = num_leaves if num_leaves != -1 else float('inf')
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_bin = max_bin
        self.min_child_weight = min_child_weight
        self.reg_gamma = reg_gamma
        self.reg_lambda = reg_lambda
        self.loss = loss
        self.random_state = random_state
        self.pred_0 = None
        self.trees = dict()
        self.feature_importances_ = dict()

    def fit(self, dataset, targets):
        if self.loss == "logistic":
            self.loss = LogisticLoss()
        elif self.loss == "squareloss":
            self.loss = SquareLoss()
        else:
            raise ValueError("The loss function must be 'logistic' or 'squareloss'!")

        targets = targets.to_frame(name='label')
#         if targets['label'].unique().__len__() != 2:
#             raise ValueError("There must be two class for targets!")
#         if len([x for x in dataset.columns if dataset[x].dtype in ['int32', 'float32', 'int64', 'float64']]) \
#                 != len(dataset.columns):
#             raise ValueError("The features dtype must be int or float!")

        if self.random_state:
            random.seed(self.random_state)
        random_state_stages = random.sample(range(max(self.n_estimators, len(targets))), self.n_estimators)

        # the first base function
        mean = 1.0 * sum(targets['label']) / len(targets['label'])
#         self.pred_0 = 0.5 * log((1 + mean) / (1 - mean))
        self.pred_0 = 0.5
        targets['pred'] = self.pred_0
        targets['grad'] = targets.apply(self.loss.grad, axis=1)
        targets['hess'] = targets.apply(self.loss.hess, axis=1)

        for stage in range(self.n_estimators):
            print(("iter: "+str(stage+1)).center(80, '='))
            tree = BaseDecisionTree(self.max_depth, self.num_leaves, self.min_samples_split, self.min_samples_leaf,
                                    self.subsample, self.colsample_bytree, self.max_bin, self.min_child_weight,
                                    self.reg_gamma, self.reg_lambda, random_state_stages[stage])
            tree.fit(dataset, targets)
            self.trees[stage] = tree
            targets['pred'] = targets['pred'] + self.learning_rate * tree.pred
            targets['grad'] = targets.apply(self.loss.grad, axis=1)
            targets['hess'] = targets.apply(self.loss.hess, axis=1)

            for key, value in tree.feature_importances_.items():
                self.feature_importances_[key] = self.feature_importances_.get(key, 0) + 1

    def predict_proba(self, dataset):
        res = []
        for index, row in dataset.iterrows():
            f_value = self.pred_0
            for stage, tree in self.trees.items():
                f_value += self.learning_rate * tree.predict(row)
            p_0 = 1.0 / (1 + exp(2 * f_value))
            res.append([p_0, 1 - p_0])
        return np.array(res)

    def predict(self, dataset):
        res = []
        for p in self.predict_proba(dataset):
            label = 0 if p[0] >= p[1] else 1
            res.append(label)
        return np.array(res)


if __name__ == '__main__':
#     df = pd.read_csv("test.csv")
    df = pd.read_csv('facies_vectors.csv')
    df = df.fillna(df['PE'].mean())
    feature_names = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
    well = 'KIMZEY A'
    test = df[df['Well Name'] == well]
    train = df[df['Well Name'] != well]
    X_train = train[feature_names].values 
    y_train = train['Facies'].values 
    X_test = test[feature_names].values 
    y_test = test['Facies'].values 

    xgb = XGBClassifier(n_estimators=5,
                        max_depth=6,
                        num_leaves=30,
                        learning_rate=0.1,
                        min_samples_split=40,
                        min_samples_leaf=10,
                        subsample=0.6,
                        colsample_bytree=0.8,
                        max_bin=150,
                        min_child_weight=1,
                        reg_gamma=0.1,
                        reg_lambda=0.3,
                        loss='logistic',
                        random_state=66)
    train_count = int(0.7 * len(df))
    xgb.fit(train[feature_names], train['Facies'])
#     xgb.fit(df.ix[:train_count, :-1], df.ix[:train_count, 'Class'])

    y_res = xgb.predict_proba(test[feature_names])
    print(y_res)
    print(y_test)
    acc = 0
    for index in range (len(y_res)):
        if (y_res[index] == y_test[index]):
            acc += 1
    print("Accuracy is ", acc/len(y_res))       

#     from sklearn import metrics
#     print (metrics.roc_auc_score(df.ix[:train_count, 'Class'], xgb.predict_proba(df.ix[:train_count, :-1])[:, 1]))
#     print (metrics.roc_auc_score(df.ix[train_count:, 'Class'], xgb.predict_proba(df.ix[train_count:, :-1])[:, 1]))

[[0.02528146 0.97471854]
 [0.02427736 0.97572264]
 [0.02522805 0.97477195]
 [0.02657948 0.97342052]
 [0.02842016 0.97157984]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02952414 0.97047586]
 [0.02952414 0.97047586]
 [0.03522683 0.96477317]
 [0.03423381 0.96576619]
 [0.03423381 0.96576619]
 [0.03186597 0.96813403]
 [0.02514387 0.97485613]
 [0.02657948 0.97342052]
 [0.02848012 0.97151988]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02663567 0.97336433]
 [0.02663567 0.97336433]
 [0.02663567 0.97336433]
 [0.02663567 0.97336433]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02958636 0.97041364]
 [0.02952414 0.97047586]
 [0.02952414 0.97047586]
 [0.02663567 0.97336433]
 [0.02663567 0.97336433]
 [0.02528146 0.97471854]
 [0.02528146 0.97471854]
 [0.02528146 0.97471854]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()