In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from itertools import chain,combinations

In [4]:
def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1,len(s)))
# p = list(powerset(np.array(['a','b','c'])))
# list(p[0])

In [5]:
class Node:
    def __init__(self):
        self.attr = None
        self.label = None
        self.next = None

In [12]:
class cart:
    def __init__(self,target):
        self.classes_ = np.unique(target)
    def gini(self,y):
        g_a = 0
        for c in self.classes_:
            prop = np.sum(y==c)/len(y)
            g_a+=prop*prop
        return 1-g_a
    def findBestMidPoint(self,X,y):
        num_parent = {}
        for c in self.classes_:
            num_parent[c] = np.sum(y==c)
        g_a = np.inf
        best_split_point = None
        points, classes = zip(*sorted(zip(X,y)))
        num_left = {}
        for c in self.classes_:
            num_left[c] = 0
        num_right = num_parent.copy()
        m = len(y)
        for i in range(1,m):
             c = classes[i - 1]
             num_left[c] += 1
             num_right[c] -= 1
             gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in self.classes_)
             gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in self.classes_)
             gini = (i * gini_left + (m - i) * gini_right) / m
             if points[i] == points[i - 1]:
                    continue
             if gini < g_a:
                    g_a = gini
                    best_split_point = (points[i] + points[i - 1]) / 2
        return (g_a,best_split_point)
    def selectBestAttr(self,X,y,features,attr_values):
        # print(len(y))
        # print(G)
        G = self.gini(y)
        # print(G)
        G_a = []
        split_points = []
        for i in range(len(features)):
            if attr_values[features[i]].dtype == 'O':
                g_a = np.inf
                subsets = list(powerset(attr_values[features[i]]))
                # print(subsets)
                max_len = len(attr_values[features[i]])//2
                for s in subsets:
                    set1 = list(s)
                    if len(s) <= max_len:
                        set2 = []
                        for a in attr_values[features[i]]:
                            if a not in set1:
                                set2.append(a)
                        # print(set1,set2)
                        y_v1,y_v2 = y[X[features[i]].isin(set1)],y[X[features[i]].isin(set2)]
                        if not len(y_v1):
                            temp = self.gini(y_v2)
                        elif not len(y_v2):
                            temp = self.gini(y_v1)
                        else:
                            temp = (len(y_v1)*self.gini(y_v1)+len(y_v2)*self.gini(y_v2))/len(y)
                        if temp < g_a:
                            g_a = temp
                            sp = (set1,set2)
                    else:
                        break
            else:
                (g_a,sp) = self.findBestMidPoint(X[features[i]],y)
            G_a.append(g_a)
            split_points.append(sp)
        # print(G_a,split_points)
        print(G_a)
        G_a = np.array(G_a)
        # print(G_a)
        best_index = (G-G_a).argmax()
        # print(features[best_index],split_points[best_index])
        return (best_index,split_points[best_index])
    def cart(self,X,y,features,attr_values):
        root = Node()
        labels = np.unique(y)
        if len(labels)==1:
            # print(X,y)
            root.label = labels[0]
            return root
        if not len(features):
            root.label = stats.mode(y)[0][0]
            return root
        attr_index,sp = self.selectBestAttr(X,y,features,attr_values)
        if sp == None:
            root.label = stats.mode(y)[0][0]
            return root
        root.attr = features[attr_index]
        root.next = []
        if X[root.attr].dtype == 'O':
            set1,set2 = sp
            # print(sp)
            X_v1,X_v2 = X[X[root.attr].isin(set1)],X[X[root.attr].isin(set2)]
            y_v1,y_v2 = y[X[root.attr].isin(set1)],y[X[root.attr].isin(set2)]
            if not len(X_v1):
                temp = Node()
                temp.label = stats.mode(y)[0][0]
                root.next.append((set1,temp))
            else:
                temp = self.cart(X_v1,y_v1,features[0:attr_index]+features[attr_index+1:],attr_values)
                root.next.append((set1,temp))
            if not len(X_v2):
                temp = Node()
                temp.label = stats.mode(y)[0][0]
                root.next.append((set2,temp))
            else:
                temp = self.cart(X_v2,y_v2,features[0:attr_index]+features[attr_index+1:],attr_values)
                root.next.append((set2,temp))
        else:
            X_v1,X_v2 = X[X[root.attr] <= sp],X[X[root.attr] > sp]
            y_v1,y_v2 = y[X[root.attr] <= sp],y[X[root.attr] > sp]
            if not len(X_v1):
                temp = Node()
                temp.label = stats.mode(y)[0][0]
                root.next.append(((sp,0),temp))
            else:
                temp = self.cart(X_v1,y_v1,features[0:attr_index]+features[attr_index+1:],attr_values)
                root.next.append(((sp,0),temp))
            if not len(X_v2):
                temp = Node()
                temp.label = stats.mode(y)[0][0]
                root.next.append(((sp,1),temp))
            else:
                temp = self.cart(X_v2,y_v2,features[0:attr_index]+features[attr_index+1:],attr_values)
                root.next.append(((sp,1),temp))
        return root
    def fit(self,data,features,target):
        X = data[features]
        y = data[target]
        attr_values = {}
        for a in features:
            attr_values[a] = np.unique(X[a])
        # print(type(attr_values['Outlook']))
        root = self.cart(X,y,features,attr_values)
        return root
    def findPath(self,x,root):
        # print(x)
        # print(root.attr)
        # print(root.label)
        # print(root.next)
        if root.label != None:
            return root.label
        for child in root.next:
            if type(x[root.attr]) == str:
                if x[root.attr] in child[0]:
                    ans = self.findPath(x,child[1])
                    break
            else:
                if not child[0][1] and x[root.attr] <= child[0][0]:
                    ans = self.findPath(x,child[1])
                    break
                if child[0][1] and x[root.attr] > child[0][0]:
                    ans = self.findPath(x,child[1])
                    break
        return ans
    def predict(self,root,test_X):
        y = []
        for x in test_X.index:
            for child in root.next:
                if test_X[root.attr].dtype == 'O':
                    if test_X.loc[x][root.attr] in child[0]:
                        y.append(self.findPath(test_X.loc[x],child[1]))
                        break
                else:
                    if not child[0][1] and test_X.loc[x][root.attr] <= child[0][0]:
                        y.append(self.findPath(test_X.loc[x],child[1]))
                        break
                    if child[0][1] and test_X.loc[x][root.attr] > child[0][0]:
                        y.append(self.findPath(test_X.loc[x],child[1]))
                        break
        y = np.array(y)
        return y
    def viewTree(self,root):
        queue = []
        queue.append((root,0))
        while len(queue):
            temp,depth = queue.pop(0)
            i = 1
            print(f'Depth: {depth} Attribute: {temp.attr}')
            for t in temp.next:
                if t[1].label == None:
                    print(f'[{i}: {t[0]}: Attribute: {t[1].attr}]',end="  ")
                    queue.append((t[1],depth+1))
                else:
                    print(f'[{i}: {t[0]}: Class label: {t[1].label}]',end="  ")
                i+=1
            print('\n')
    def accuracy(self,true_y,pred_y):
        acc = np.sum(true_y==pred_y)/len(true_y)
        return acc*100

In [23]:
data = pd.read_csv('playtennis.csv')

In [24]:
data['PlayTennis'].dtype == np.int64

False

In [25]:
tree = cart(data.PlayTennis)

In [26]:
features = ['Outlook','Temperature','Humidity','Wind']
target = 'PlayTennis'
root = tree.fit(data,features,target)

In [27]:
tree.viewTree(root)

Depth: 0 Attribute: Outlook
[1: ['Overcast']: Class label: Yes]  [2: ['Rain', 'Sunny']: Attribute: Humidity]  

Depth: 1 Attribute: Humidity
[1: ['High']: Attribute: Temperature]  [2: ['Normal']: Attribute: Wind]  

Depth: 2 Attribute: Temperature
[1: ['Hot']: Class label: No]  [2: ['Cool', 'Mild']: Attribute: Wind]  

Depth: 2 Attribute: Wind
[1: ['Strong']: Attribute: Temperature]  [2: ['Weak']: Class label: Yes]  

Depth: 3 Attribute: Wind
[1: ['Strong']: Class label: No]  [2: ['Weak']: Class label: No]  

Depth: 3 Attribute: Temperature
[1: ['Cool']: Class label: No]  [2: ['Hot', 'Mild']: Class label: Yes]  



In [30]:
cars = pd.read_csv('car2.csv')

In [31]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(cars,test_size=0.25,random_state=0)

In [32]:
tree2 = cart(train.target)
features = train.columns[:-1].to_list()

In [33]:
root = tree2.fit(train,features,'target')

In [34]:
tree2.viewTree(root)

Depth: 0 Attribute: safety
[1: ['low']: Class label: unacc]  [2: ['high', 'med']: Attribute: persons]  

Depth: 1 Attribute: persons
[1: ['2']: Class label: unacc]  [2: ['4', 'more']: Attribute: buying]  

Depth: 2 Attribute: buying
[1: ['high', 'vhigh']: Attribute: maint]  [2: ['low', 'med']: Attribute: maint]  

Depth: 3 Attribute: maint
[1: ['high', 'vhigh']: Attribute: lug_boot]  [2: ['low', 'med']: Attribute: lug_boot]  

Depth: 3 Attribute: maint
[1: ['high', 'vhigh']: Attribute: lug_boot]  [2: ['low', 'med']: Attribute: lug_boot]  

Depth: 4 Attribute: lug_boot
[1: ['small']: Attribute: doors]  [2: ['big', 'med']: Attribute: doors]  

Depth: 4 Attribute: lug_boot
[1: ['small']: Attribute: doors]  [2: ['big', 'med']: Attribute: doors]  

Depth: 4 Attribute: lug_boot
[1: ['small']: Attribute: doors]  [2: ['big', 'med']: Attribute: doors]  

Depth: 4 Attribute: lug_boot
[1: ['small']: Attribute: doors]  [2: ['big', 'med']: Attribute: doors]  

Depth: 5 Attribute: doors
[1: ['4']: C

In [35]:
test_X = test.iloc[:,:-1]

In [36]:
pred_y = tree2.predict(test_X=test_X,root=root)

In [37]:
tree2.accuracy(test.iloc[:,-1].values,pred_y)

83.79629629629629

In [38]:
t_pred_y = tree2.predict(root,train.iloc[:,:-1])

In [39]:
tree2.accuracy(train.iloc[:,-1].values,t_pred_y)

88.19444444444444

In [13]:
data = pd.read_csv('dtr3_2.csv')

In [14]:
tree4 = cart(data.Target)

In [15]:
features = ['Outlook','Temperature','Humidity','Wind']
target = 'Target'

In [16]:
root = tree4.fit(data,features,target)

[0.35714285714285715, 0.39560439560439564, 0.3673469387755103, 0.42857142857142855]
[0.375, 0.31999999999999984, 0.41666666666666663]
[0.0, 0.2]
[0.0, 0.26666666666666666]


In [11]:
tree4.viewTree(root)

Depth: 0 Attribute: Outlook
[1: ['Overcast']: Class label: Yes]  [2: ['Rain', 'Sunny']: Attribute: Humidity]  

Depth: 1 Attribute: Humidity
[1: (82.5, 0): Attribute: Temperature]  [2: (82.5, 1): Attribute: Temperature]  

Depth: 2 Attribute: Temperature
[1: (66.5, 0): Class label: No]  [2: (66.5, 1): Class label: Yes]  

Depth: 2 Attribute: Temperature
[1: (70.5, 0): Class label: Yes]  [2: (70.5, 1): Class label: No]  



In [45]:
iris_names = ['sepal length','sepal width','petal length','petal width','class']
iris = pd.read_csv('iris.csv',names=iris_names)

In [46]:
iris

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [61]:
train_iris,test_iris = train_test_split(iris,test_size=0.4,random_state=1)

In [62]:
train_iris

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
11,4.8,3.4,1.6,0.2,Iris-setosa
113,5.7,2.5,5.0,2.0,Iris-virginica
123,6.3,2.7,4.9,1.8,Iris-virginica
12,4.8,3.0,1.4,0.1,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
...,...,...,...,...,...
133,6.3,2.8,5.1,1.5,Iris-virginica
137,6.4,3.1,5.5,1.8,Iris-virginica
72,6.3,2.5,4.9,1.5,Iris-versicolor
140,6.7,3.1,5.6,2.4,Iris-virginica


In [63]:
i_tree = cart(train_iris['class'])

In [64]:
root = i_tree.fit(train_iris,features=['sepal length','sepal width','petal length','petal width'],target='class')

In [66]:
preds = i_tree.predict(root,test_iris.iloc[:,:-1])

In [67]:
i_tree.accuracy(test_iris.iloc[:,-1].values,preds)

96.66666666666667

In [68]:
i_tree.viewTree(root)

Depth: 0 Attribute: petal length
[1: (2.5999999999999996, 0): Class label: Iris-setosa]  [2: (2.5999999999999996, 1): Attribute: petal width]  

Depth: 1 Attribute: petal width
[1: (1.65, 0): Attribute: sepal length]  [2: (1.65, 1): Attribute: sepal length]  

Depth: 2 Attribute: sepal length
[1: (7.1, 0): Attribute: sepal width]  [2: (7.1, 1): Class label: Iris-virginica]  

Depth: 2 Attribute: sepal length
[1: (5.95, 0): Attribute: sepal width]  [2: (5.95, 1): Class label: Iris-virginica]  

Depth: 3 Attribute: sepal width
[1: (2.8499999999999996, 0): Class label: Iris-versicolor]  [2: (2.8499999999999996, 1): Class label: Iris-versicolor]  

Depth: 3 Attribute: sepal width
[1: (3.1, 0): Class label: Iris-virginica]  [2: (3.1, 1): Class label: Iris-versicolor]  



In [69]:
bank = pd.read_csv('bank-additional/bank-additional-full.csv',sep=";")

In [70]:
bank

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [71]:
for c in bank.columns[bank.dtypes == 'object']:
    print(c)
    print(bank[c].value_counts())

job
admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64
marital
married     24928
single      11568
divorced     4612
unknown        80
Name: marital, dtype: int64
education
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64
default
no         32588
unknown     8597
yes            3
Name: default, dtype: int64
housing
yes        21576
no         18622
unknown      990
Name: housing, dtype: int64
loan
no         33950
yes         6248
unknown      990
Name: loan, dtype: int64
contact
cellular     26144
telephone    15044
Name: contact, 

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
train,test = train_test_split(bank,test_size=0.2,random_state=0)

In [74]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 29321 to 2732
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32950 non-null  int64  
 1   job             32950 non-null  object 
 2   marital         32950 non-null  object 
 3   education       32950 non-null  object 
 4   default         32950 non-null  object 
 5   housing         32950 non-null  object 
 6   loan            32950 non-null  object 
 7   contact         32950 non-null  object 
 8   month           32950 non-null  object 
 9   day_of_week     32950 non-null  object 
 10  duration        32950 non-null  int64  
 11  campaign        32950 non-null  int64  
 12  pdays           32950 non-null  int64  
 13  previous        32950 non-null  int64  
 14  poutcome        32950 non-null  object 
 15  emp.var.rate    32950 non-null  float64
 16  cons.price.idx  32950 non-null  float64
 17  cons.conf.idx   32950 non-nu

In [75]:
train.poutcome.value_counts().index != test.poutcome.value_counts().index

array([False, False, False])

In [76]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for c in train.columns[:-1]:
    if train[c].dtype == np.object:
        le.fit(train[c])
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[c] = le.transform(train[c])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[c] = le.transform(test[c])


In [77]:
features = bank.columns.to_list()[:-1]

In [78]:
tree = cart(train.y)
root = tree.fit(train,features,'y')

In [79]:
preds = tree.predict(root,test.iloc[:,:-1])

In [80]:
tree.accuracy(test.iloc[:,-1].values,preds)

89.71837824714737

In [83]:
chess = pd.read_csv('krkopt.csv')

In [84]:
chess

Unnamed: 0,a,1,b,3,c,2,draw
0,a,1,c,1,c,2,draw
1,a,1,c,1,d,1,draw
2,a,1,c,1,d,2,draw
3,a,1,c,2,c,1,draw
4,a,1,c,2,c,3,draw
...,...,...,...,...,...,...,...
28050,b,1,g,7,e,5,sixteen
28051,b,1,g,7,e,6,sixteen
28052,b,1,g,7,e,7,sixteen
28053,b,1,g,7,f,5,sixteen


In [136]:
chess['1'] = chess['1'].astype(np.object)
chess['2'] = chess['2'].astype(np.object)
chess['3'] = chess['3'].astype(np.object)

In [138]:
train_c,test_c = train_test_split(chess,test_size=0.2,random_state=0)

In [139]:
tree_chess = cart(train_c.draw)

In [140]:
features = train_c.columns[:-1].to_list()

In [141]:
root_chess = tree_chess.fit(train_c,features,'draw')

KeyboardInterrupt: 

In [108]:
tree_chess.viewTree(root_chess)

Depth: 0 Attribute: 1
[1: (1.5, 0): Attribute: 2]  [2: (1.5, 1): Attribute: 2]  

Depth: 1 Attribute: 2
[1: (2.5, 0): Attribute: 3]  [2: (2.5, 1): Attribute: c]  

Depth: 1 Attribute: 2
[1: (2.5, 0): Attribute: a]  [2: (2.5, 1): Attribute: a]  

Depth: 2 Attribute: 3
[1: (3.5, 0): Attribute: b]  [2: (3.5, 1): Attribute: a]  

Depth: 2 Attribute: c
[1: (1.5, 0): Attribute: b]  [2: (1.5, 1): Attribute: a]  

Depth: 2 Attribute: a
[1: (2.5, 0): Attribute: c]  [2: (2.5, 1): Attribute: c]  

Depth: 2 Attribute: a
[1: (2.5, 0): Attribute: c]  [2: (2.5, 1): Attribute: c]  

Depth: 3 Attribute: b
[1: (2.5, 0): Attribute: c]  [2: (2.5, 1): Attribute: c]  

Depth: 3 Attribute: a
[1: (1.5, 0): Attribute: c]  [2: (1.5, 1): Attribute: c]  

Depth: 3 Attribute: b
[1: (2.5, 0): Attribute: 3]  [2: (2.5, 1): Attribute: 3]  

Depth: 3 Attribute: a
[1: (0.5, 0): Attribute: b]  [2: (0.5, 1): Attribute: b]  

Depth: 3 Attribute: c
[1: (2.5, 0): Attribute: b]  [2: (2.5, 1): Attribute: 3]  

Depth: 3 Attribu

In [117]:
pred_y = tree_chess.predict(root_chess,train_c.iloc[:,:-1])

In [118]:
tree_chess.accuracy(train_c.iloc[:,-1].values,pred_y)

33.08679379789699

In [119]:
pred_y

array(['thirteen', 'twelve', 'draw', ..., 'twelve', 'nine', 'twelve'],
      dtype='<U8')

In [120]:
train_c.iloc[:,-1]

573          draw
10162         ten
13302      twelve
12446      eleven
11158      eleven
           ...   
13123      eleven
19648    thirteen
9845          ten
10799      eleven
2732         draw
Name: draw, Length: 22444, dtype: object