In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from itertools import chain,combinations

In [2]:
def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1,len(s)))

In [3]:
class Node:
    def __init__(self):
        self.is_leaf = None
        self.splitting_feature = None
        self.threshold = None
        self.left_set = None
        self.right_set = None
        self.left = None
        self.right = None
        self.gini = None
        self.prediction = None

In [106]:
class CART:
    def __init__(self,data,features,target):
        self.classes_ = np.unique(data[target])
        self.total_data_points = data.shape[0]
        self.StrAttrValues_ = {}
        for f in features:
            if data[f].dtype == np.object:
                self.StrAttrValues_[f] = np.unique(data[f])
        self.overallgini = 0
    def Gini(self,target_values):
        if len(target_values) == 0:
            return 0
        g_a = 0
        for c in self.classes_:
            prop = np.sum(target_values==c)/len(target_values)
            g_a+=prop*prop
        return 1-g_a
    def create_leaf(self,target_values):
        leaf = Node() 
        leaf.is_leaf = True
        leaf.gini = self.Gini(target_values)
        self.overallgini += (len(target_values)*leaf.gini)/self.total_data_points
        leaf.prediction = stats.mode(target_values)[0][0]               
        return leaf 
    def find_best_split_subset(self,data,feature,StrAttrValues,target_values):
        best_gini = 100
        left_set,right_set = None,None
        subsets = list(powerset(StrAttrValues[feature]))
        max_len = len(StrAttrValues[feature])//2
        for s in subsets:
            set1 = list(s)
            if len(s) <= max_len:
                set2 = []
                for a in StrAttrValues[feature]:
                    if a not in set1:
                        set2.append(a)
                target_values1,target_values2 = target_values[data[feature].isin(set1)],target_values[data[feature].isin(set2)]
                if not len(target_values1):
                    gini = self.Gini(target_values2)
                elif not len(target_values2):
                    gini = self.Gini(target_values1)
                else:
                    gini = (len(target_values1)*self.Gini(target_values1)+len(target_values2)*self.Gini(target_values2))/len(target_values)
                if gini < best_gini:
                    best_gini = gini
                    left_set,right_set = set1,set2
            else:
                break
        return (best_gini,left_set,right_set)
    def best_threshold(self, data, feature, target_values):
        best_gini = 100
        threshold = None
        num_parent = {}
        for c in self.classes_:
            num_parent[c] = np.sum(target_values==c)
        points, classes = zip(*sorted(zip(data[feature],target_values)))
        num_left = {}
        for c in self.classes_:
            num_left[c] = 0
        num_right = num_parent.copy()
        m = len(target_values)
        for i in range(1,m):
            c = classes[i - 1]
            num_left[c] += 1
            num_right[c] -= 1
            gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in self.classes_)
            gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in self.classes_)
            gini = (i * gini_left + (m - i) * gini_right) / m
            if points[i] == points[i - 1]:
                continue
            if gini < best_gini:
                best_gini = gini
                threshold = (points[i] + points[i - 1]) / 2
        return (best_gini,threshold)
    def best_splitting_feature(self,data, features, target, StrAttrValues):
        best_feature = None 
        best_gini = 100    
        num_data_points = len(data)
        threshold = None
        left_set,right_set = None,None
        for feature in features:
            if data[feature].dtype == np.object:
                gini,l_set,r_set = self.find_best_split_subset(data,feature,StrAttrValues,data[target])
            else:
                gini,t = self.best_threshold(data,feature,data[target])
            if gini < best_gini:
                best_gini = gini
                best_feature = feature
                if data[best_feature].dtype == np.object:
                    left_set,right_set = l_set,r_set
                else:
                    threshold = t
        if data[best_feature].dtype == np.object:
            return {'gini':best_gini,'feature':best_feature,'left_set':left_set,'right_set':right_set}
        return {'gini':best_gini,'feature':best_feature,'threshold':threshold}
    def decision_tree_create(self, data, features, target, current_depth = 0, max_depth = 10,min_data_in_node=0,min_impurity_reduction=-1):
        remaining_features = features[:] # Make a copy of the features.
        target_values = data[target]
        StrAttrValues = self.StrAttrValues_
        print("--------------------------------------------------------------------")
        print("Subtree, depth = %s (%s data points)." % (current_depth, len(target_values)))
        if self.Gini(target_values) == 0: 
            print("Stopping condition 1 reached(Pure Node).")
            return self.create_leaf(target_values)
        if remaining_features == [] :  
            print("Stopping condition 2 reached(No feature to split on).")
            return self.create_leaf(target_values)    
        if max_depth!= None and current_depth >= max_depth:  
            print("Early Stopping condition 1(Reached max depth).")
            return self.create_leaf(target_values)
        if len(target_values) <= min_data_in_node:
            print("Early Stopping condition 2(Min Data in Node)).")
            return self.create_leaf(target_values)
        split = self.best_splitting_feature(data,remaining_features,target,self.StrAttrValues_)
        if split['feature'] == None:
            print("Creating a leaf(Cannot split further)")
            return self.create_leaf(target_values)
        gini_before_split = self.Gini(target_values)
        gini_after_split = split['gini']
        if gini_before_split-gini_after_split <= min_impurity_reduction:
            print("Early Stopping condition 3(Min Reduction in impurity)).")
            return self.create_leaf(target_values)
        try:
            left_split = data[data[split['feature']].isin(split['left_set'])]
            right_split = data[data[split['feature']].isin(split['right_set'])]
            remaining_features.remove(split['feature'])
        except:
            left_split = data[data[split['feature']] < split['threshold']]
            right_split = data[data[split['feature']] >= split['threshold']]
            if len(np.unique(data[split['feature']])) == 2:
                remaining_features.remove(split['feature'])
        print(f'Split on feature {split["feature"]}. ({len(left_split)},{len(right_split)})')
        if len(left_split) == len(data):
            print("Creating leaf node.")
            return self.create_leaf(left_split[target])
        if len(right_split) == len(data):
            print("Creating leaf node.")
            return self.create_leaf(right_split[target])
        left_tree = self.decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth,min_data_in_node,min_impurity_reduction)    
        right_tree = self.decision_tree_create(right_split, remaining_features, target, current_depth+1, max_depth,min_data_in_node,min_impurity_reduction)
        temp = Node()
        temp.is_leaf = False
        temp.splitting_feature = split['feature']
        temp.gini = split['gini']
        temp.left,temp.right = left_tree,right_tree
        if data[split['feature']].dtype == np.object:
            temp.left_set,temp.right_set = split['left_set'],split['right_set']
        else:
            temp.threshold = split['threshold']
        return temp
    def classify(self,root, x, annotate = False):   
        if root.is_leaf:
            if annotate: 
                print("At leaf, predicting %s" % root.prediction)
            return root.prediction
        else:
            split_feature_value = x[root.splitting_feature]
            if annotate: 
                print("Split on %s = %s" % (root.splitting_feature, split_feature_value))
            if type(split_feature_value) == str:
                if split_feature_value in root.left_set:
                    return self.classify(root.left,x,annotate)
                else:
                    return self.classify(root.right,x,annotate)
            else:
                if split_feature_value < root.threshold:
                    return self.classify(root.left,x,annotate)
                else:
                    return self.classify(root.right,x,annotate)
    def viewTree(self,root):
        print(self.overallgini)
        queue = []
        queue.append((root,0))
        while len(queue):
            temp,depth = queue.pop(0)
            i = 1
            print(f'Depth: {depth} Attribute: {temp.splitting_feature} Gini: {temp.gini}')
            if not temp.left.is_leaf:
                print(f'[{i}: {temp.left_set if temp.left_set!=None else temp.threshold}: Attribute: {temp.left.splitting_feature}]',end="  ")
                queue.append((temp.left,depth+1))
            else:
                print(f'[{i}: {temp.left_set if temp.left_set!=None else temp.threshold}: Class label: {temp.left.prediction}]',end="  ")
            if not temp.right.is_leaf:
                print(f'[{i}: {temp.right_set if temp.right_set!=None else temp.threshold}: Attribute: {temp.right.splitting_feature}]',end="  ")
                queue.append((temp.right,depth+1))
            else:
                print(f'[{i}: {temp.right_set if temp.right_set!=None else temp.threshold}: Class label: {temp.right.prediction}]',end="  ")
            i+=1
            print('\n')
    

In [90]:
bank = pd.read_csv('bank-additional/bank-additional-full.csv',sep=";")

In [91]:
bank

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [92]:
from sklearn.model_selection import train_test_split

In [93]:
train,test = train_test_split(bank,test_size=0.2,random_state=0)

In [94]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for c in train.columns[:-1]:
    if train[c].dtype == np.object:
        le.fit(train[c])
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[c] = le.transform(train[c])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[c] = le.transform(test[c])


In [95]:
features = bank.columns.to_list()[:-1]
target = 'y'

In [107]:
tree = CART(train,features,target)

In [108]:
root = tree.decision_tree_create(train,features,target,0,7,40,0.01)

--------------------------------------------------------------------
Subtree, depth = 0 (32950 data points).
Split on feature nr.employed. (3997,28953)
--------------------------------------------------------------------
Subtree, depth = 1 (3997 data points).
Split on feature duration. (1392,2605)
--------------------------------------------------------------------
Subtree, depth = 2 (1392 data points).
Split on feature pdays. (216,1176)
--------------------------------------------------------------------
Subtree, depth = 3 (216 data points).
Split on feature day_of_week. (93,123)
--------------------------------------------------------------------
Subtree, depth = 4 (93 data points).
Split on feature cons.price.idx. (63,30)
--------------------------------------------------------------------
Subtree, depth = 5 (63 data points).
Split on feature campaign. (52,11)
--------------------------------------------------------------------
Subtree, depth = 6 (52 data points).
Split on feature d

In [109]:
tree.viewTree(root)

0.12472025700740622
Depth: 0 Attribute: nr.employed Gini: 0.17030160025393262
[1: 5087.65: Attribute: duration]  [1: 5087.65: Attribute: duration]  

Depth: 1 Attribute: duration Gini: 0.4028072959489401
[1: 158.5: Attribute: pdays]  [1: 158.5: Attribute: pdays]  

Depth: 1 Attribute: duration Gini: 0.09813637262062751
[1: 524.5: Class label: no]  [1: 524.5: Attribute: duration]  

Depth: 2 Attribute: pdays Gini: 0.24247386534434967
[1: 14.5: Attribute: day_of_week]  [1: 14.5: Class label: no]  

Depth: 2 Attribute: pdays Gini: 0.44077016108694106
[1: 16.5: Class label: yes]  [1: 16.5: Attribute: duration]  

Depth: 2 Attribute: duration Gini: 0.45307370223461285
[1: 835.5: Attribute: euribor3m]  [1: 835.5: Class label: yes]  

Depth: 3 Attribute: day_of_week Gini: 0.41288574176064335
[1: 1.5: Attribute: cons.price.idx]  [1: 1.5: Attribute: euribor3m]  

Depth: 3 Attribute: duration Gini: 0.4807174456127758
[1: 250.5: Class label: no]  [1: 250.5: Class label: yes]  

Depth: 3 Attribute

In [103]:
def accuracy(true_y,pred_y):
    return sum(true_y == pred_y)/len(pred_y)*100

In [104]:
pred_y = []
test_x = test.iloc[:,:-1]
for i in test_x.index:
    pred_y.append(tree.classify(root,test_x.loc[i]))

In [105]:
accuracy(test.iloc[:,-1].values,pred_y)

92.0004855547463