In [1]:
import numpy as np
from collections import Counter

In [2]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        #Constructor
        self.feature = feature
        print("--- "+str(feature))
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain=info_gain
        self.value = value
        #verifier si le noad est un feuille
    def is_leaf_node(self):
        return self.value is not None

In [38]:
class DecisionTree:
    
    #Constructor
    def __init__(self, min_samples_split=1, max_depth=100, n_features=None):
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth
        self.n_features=n_features
        self.root=None
        self.features_names=None
        
    def fit(self, X, y,features_names):
        self.features_names=features_names
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1],self.n_features)
        self.root = self._build_tree(X, y,features_names)
        return self

        #construire l'arbre
    def _build_tree(self, X, y,features_names,depth=0):
        n_samples=X.shape[0]
        n_feats = X.shape[1]
        n_labels = len(np.unique(y))

        # check the stopping criteria
        if (depth>=self.max_depth or n_labels==1 or n_samples<self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(feature=-1,value=leaf_value)

        feat_idxs = np.random.choice(n_feats, n_feats, replace=False)

        # find the best 
        best_feature, best_thresh , best_gain= self._best_split(X, y, feat_idxs)
        # create child nodes
        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        X=np.delete(X, best_feature, 1)
        best_features_name=features_names[best_feature]
        features_names=np.delete(features_names, best_feature)

        left = self._build_tree(X[left_idxs, :].copy(), y[left_idxs], features_names.copy(),depth+1)
        right = self._build_tree(X[right_idxs, :].copy(), y[right_idxs],features_names.copy(), depth+1)
        
        return Node(best_features_name, best_thresh, left, right, best_gain)


    #choisir le bon attribut et le seuil pour diviser la population
    def _best_split(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_threshold = None, None

        #boucler sur chaque attributs
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)

            #boucler sur chaque valeur possible d'un attribut
            for thr in thresholds :
                # calculate the information gain
                gain = self._information_gain(y, X_column, thr)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold, best_gain


    #calcule le gain d'information de chaque split selon le feature et la valeur de seuil
    def _information_gain(self, y, X_column, threshold):
        # parent entropy
        parent_entropy = self._entropy(y)

        # create children
        left_idxs, right_idxs = self._split(X_column, threshold)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        
        # calculate the weighted avg. entropy of children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l/n) * e_l + (n_r/n) * e_r

        # calculate the IG
        information_gain = parent_entropy - child_entropy
        return information_gain

    #diviser l'arbre en deux parties
    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    #calcule l'enropy
    def _entropy(self, y):
        hist = np.bincount(y.reshape(-1))
        ps = hist / len(y)
        return -np.sum([p * np.log(p) for p in ps if p>0])


    def _most_common_label(self, y):
        counter = Counter(y.reshape(-1))
        value = counter.most_common(1)[0][0]
        return value

    #prediction
    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value
        
        index=np.where(self.features_names == node.feature)[0]
        if x[index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
    
    
    def print_tree(self, tree=None, indent="--"):
        
        if not tree:
            tree = self.root
            
        if tree.value is not None:
            print(tree.value)

        else:
            print("feature_"+str(tree.feature), "| threshoed <= ", tree.threshold, " | gain info :", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

In [39]:
import pandas as pd

col_names = ['months_as_customer', 'age', 'capital-gains', 'total_claim_amount', 'fraud_reported']
data = pd.read_csv("InsuranceFraudDetection.csv")[col_names].loc[0:25]
data.head(15)

Unnamed: 0,months_as_customer,age,capital-gains,total_claim_amount,fraud_reported
0,328,48,53300,71610,0
1,228,42,0,5070,1
2,134,29,35100,34650,0
3,256,41,48900,63400,0
4,228,44,66000,6500,1
5,256,39,0,64100,0
6,137,34,0,78650,0
7,165,37,0,51590,1
8,27,33,0,27700,0
9,212,42,0,42300,1


In [40]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)

In [41]:
X

array([[   328,     48,  53300,  71610],
       [   228,     42,      0,   5070],
       [   134,     29,  35100,  34650],
       [   256,     41,  48900,  63400],
       [   228,     44,  66000,   6500],
       [   256,     39,      0,  64100],
       [   137,     34,      0,  78650],
       [   165,     37,      0,  51590],
       [    27,     33,      0,  27700],
       [   212,     42,      0,  42300],
       [   235,     42,  38400,  87010],
       [   447,     61,      0, 114920],
       [    60,     23,      0,  56520],
       [   121,     34,  52800,   7280],
       [   180,     38,  41300,  46200],
       [   473,     58,  55700,  63120],
       [    70,     26,  63600,  52110],
       [   140,     31,  53500,  77880],
       [   160,     37,  45500,  72930],
       [   196,     39,  57000,  60400],
       [   460,     62,      0,  47160],
       [   217,     41,  46700,  37840],
       [   370,     55,  72700,  71520],
       [   413,     55,      0,  98160],
       [   237, 

In [42]:
Y

array([[0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5, random_state=1)

In [48]:
clf = DecisionTree(max_depth=1000)
features_names= ['months_as_customer', 'age', 'capital-gains', 'total_claim_amount']
clf.fit(X_train, Y_train,features_names)
predictions = clf.predict(X_test)
print(predictions)
Y_test
# 0 :No fraud , 1:fraud

--- -1
--- -1
--- -1
--- capital-gains
--- months_as_customer
--- -1
--- age
--- -1
--- total_claim_amount
[0 0 0 0 0 0 0 0 0 0 0 0 0]


  if x[index] <= node.threshold:


array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [49]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.9230769230769231

In [50]:
clf.print_tree()

feature_total_claim_amount | threshoed <=  56520  | gain info : 0.3234659997481203
--left:feature_age | threshoed <=  33  | gain info : 0.3182570841474064
----left:feature_months_as_customer | threshoed <=  27  | gain info : 0.17441604792151594
--------left:0
--------right:feature_capital-gains | threshoed <=  0  | gain info : 0.6931471805599453
----------------left:1
----------------right:0
----right:1
--right:0


In [12]:
T=[[ 1.28000e+02,  4.0000e+01,  3.30691e+03,  1.55000e+04]]
T

[[128.0, 40.0, 3306.91, 15500.0]]

In [13]:
clf.predict(T)

array([0], dtype=int64)