In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
class Node:
    """One vertex of the decision tree.

    attr  -- name of the attribute tested here (stays None on a leaf)
    label -- class label stored on a leaf (stays None on an internal node)
    next  -- list of (branch key, child Node) pairs (stays None on a leaf)
    """

    def __init__(self):
        # Every field starts empty; the tree builder fills in what applies.
        self.attr = self.label = self.next = None

In [94]:
class ID3:
    """ID3-style decision tree classifier for pandas data.

    Categorical attributes (any non-numeric dtype) get one branch per distinct
    value seen during ``fit``; numeric attributes get a single binary split at
    the threshold with the lowest weighted entropy.  An attribute is removed
    from the candidate list once it has been used on a path (this also applies
    to numeric attributes).

    Tree layout (``Node`` objects):

    * leaf -- ``label`` holds the predicted class; ``attr``/``next`` are None.
    * internal node -- ``attr`` names the tested column and ``next`` is a list
      of ``(key, child)`` pairs.  ``key`` is the category value, or for a
      numeric split ``(threshold, side)`` where side 0 means ``<= threshold``
      and side 1 means ``> threshold``.

    Every node built by :meth:`id3` also carries a ``majority`` attribute (the
    most frequent training label that reached the node).  Prediction falls
    back to it when a sample matches none of a node's branches, e.g. for a
    category value that never occurred in the training data.
    """

    @staticmethod
    def _is_categorical(values):
        """Return True when ``values`` are non-numeric and must be split by equality."""
        return np.asarray(values).dtype.kind not in 'iuf'

    @staticmethod
    def _is_cut_key(key):
        """Return True when a branch key has the numeric ``(threshold, side)`` form.

        Caveat: a categorical value that is itself a 2-tuple ending in 0 or 1
        would be mistaken for a numeric split key.
        """
        return isinstance(key, tuple) and len(key) == 2 and key[1] in (0, 1)

    @staticmethod
    def _majority(y):
        """Return the most frequent value in ``y`` (smallest value on ties).

        Returns None for empty input.  Implemented with ``np.unique`` because
        ``scipy.stats.mode`` no longer accepts non-numeric labels and changed
        the shape of its result in recent SciPy releases.
        """
        values, counts = np.unique(np.asarray(y), return_counts=True)
        if not len(values):
            return None
        return values[counts.argmax()]

    def _matches(self, key, value):
        """Return True when ``value`` belongs to the branch identified by ``key``."""
        try:
            if self._is_cut_key(key):
                threshold, upper = key
                if upper:
                    return bool(value > threshold)
                return bool(value <= threshold)
            return bool(key == value)
        except TypeError:
            # Incomparable types (e.g. str vs. number) simply do not match.
            return False

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``.

        Empty input yields 0.
        """
        y = np.asarray(y)
        E = 0
        if len(y) == 0:
            return E
        _, counts = np.unique(y, return_counts=True)
        # Terms are accumulated one by one (not with a vectorised sum) so that
        # results stay bit-identical and tie-breaking between attributes is
        # reproducible.
        for prop in counts / len(y):
            E += (-prop * np.log2(prop))
        return E

    def findBestCutPoint(self, X, y):
        """Find the best binary threshold for one numeric attribute.

        Parameters
        ----------
        X : values of a single numeric attribute (Series or array).
        y : labels aligned position-by-position with ``X``.

        Returns
        -------
        (score, cut) where ``score`` is the weighted entropy of the partition
        ``X <= cut`` / ``X > cut`` and ``cut`` is the chosen threshold.
        ``(np.inf, None)`` is returned when no split is possible: fewer than
        two distinct values, or no candidate boundary between them.
        """
        x_arr = np.asarray(X)
        y_arr = np.asarray(y)
        values = np.unique(x_arr)
        if len(values) <= 1:
            return (np.inf, None)

        # Distinct labels observed at each distinct attribute value.
        labels_at = [np.unique(y_arr[x_arr == v]) for v in values]

        # A boundary between two neighbouring values is a candidate unless
        # both sides are pure and carry the same label.
        cut_points = []
        for i in range(len(values) - 1):
            left, right = labels_at[i], labels_at[i + 1]
            same_pure = len(left) == 1 and len(right) == 1 and left[0] == right[0]
            if not same_pure:
                cut_points.append(values[i])
        if not cut_points:
            # Previously this fell through to min() on an empty dict.
            return (np.inf, None)

        total = len(y_arr)
        scores = {}
        for c in cut_points:
            below = y_arr[x_arr <= c]
            above = y_arr[x_arr > c]
            scores[c] = ((len(below) / total) * self.entropy(below)
                         + (len(above) / total) * self.entropy(above))
        best = min(scores, key=scores.get)  # first minimum wins on ties
        # Hand back a plain Python scalar so the threshold prints cleanly.
        cut = best.item() if hasattr(best, 'item') else best
        return (scores[best], cut)

    def selectBestAttr(self, X, y, features, attr_values):
        """Pick the attribute with the highest information gain.

        Parameters
        ----------
        X : DataFrame holding the candidate attribute columns.
        y : Series of labels sharing ``X``'s index.
        features : list of candidate column names.
        attr_values : dict mapping column name -> array of its distinct values.

        Returns
        -------
        (index, cut) where ``index`` is the position of the winner inside
        ``features`` and ``cut`` is its threshold (None for a categorical
        attribute or a numeric one that cannot be split).
        """
        base = self.entropy(y)
        remainders = []
        cuts = {}
        for i, name in enumerate(features):
            if self._is_categorical(attr_values[name]):
                column = X[name]
                remainder = 0
                for v in attr_values[name]:
                    y_v = y[column == v]
                    remainder += (len(y_v) / len(y)) * self.entropy(y_v)
            else:
                remainder, cuts[i] = self.findBestCutPoint(X[name], y)
            remainders.append(remainder)
        best_index = int((base - np.array(remainders)).argmax())
        return (best_index, cuts.get(best_index))

    def id3(self, X, y, features, attr_values):
        """Recursively build the (sub)tree for samples ``X`` with labels ``y``.

        Parameters
        ----------
        X : DataFrame of the samples reaching this node.
        y : Series of their labels (same index as ``X``).
        features : attribute names still available for splitting.
        attr_values : dict mapping every attribute to its distinct training
            values, so branches are created even for values absent from this
            subset.

        Returns
        -------
        Node : root of the built subtree.
        """
        features = list(features)  # tolerate a pandas Index / tuple
        majority = self._majority(y)
        root = Node()
        root.majority = majority

        labels = np.unique(y)
        if len(labels) == 1:
            root.label = labels[0]
            return root
        if not features:
            root.label = majority
            return root

        attr_index, cp = self.selectBestAttr(X, y, features, attr_values)
        name = features[attr_index]
        if cp is None and not self._is_categorical(attr_values[name]):
            # Best candidate is a numeric attribute with no usable threshold.
            root.label = majority
            return root

        root.attr = name
        root.next = []
        remaining = features[:attr_index] + features[attr_index + 1:]
        column = X[name]
        if cp is None:
            branches = [(v, column == v) for v in attr_values[name]]
        else:
            branches = [((cp, 0), column <= cp), ((cp, 1), column > cp)]

        for key, mask in branches:
            y_sub = y[mask]
            if len(y_sub):
                child = self.id3(X[mask], y_sub, remaining, attr_values)
            else:
                # No training sample took this branch: predict the parent's majority.
                child = Node()
                child.label = majority
                child.majority = majority
            root.next.append((key, child))
        return root

    def fit(self, data, features, target):
        """Train a tree on ``data`` and return its root ``Node``.

        Parameters
        ----------
        data : DataFrame containing the feature columns and the target column.
        features : iterable of feature column names.
        target : name of the label column.
        """
        features = list(features)
        X = data[features]
        y = data[target]
        attr_values = {a: np.unique(X[a]) for a in features}
        return self.id3(X, y, features, attr_values)

    def findPath(self, x, root):
        """Classify a single sample by walking the tree from ``root``.

        Parameters
        ----------
        x : mapping from attribute name to value (pandas Series or dict).
        root : node to start from.

        Returns
        -------
        The label of the leaf reached.  When no branch of a node matches the
        sample, that node's ``majority`` label is returned (None if the node
        carries none) instead of failing on an unbound variable.
        """
        node = root
        while node.label is None:
            value = x[node.attr]
            chosen = None
            for key, child in (node.next or []):
                if self._matches(key, value):
                    chosen = child
                    break
            if chosen is None:
                return getattr(node, 'majority', None)
            node = chosen
        return node.label

    def predict(self, root, test_X):
        """Predict a label for every row of ``test_X``.

        Always returns exactly one prediction per row, in row order, so the
        result lines up with the true labels.  Works for a leaf root too.
        """
        rows = test_X.to_dict('records')
        return np.array([self.findPath(row, root) for row in rows])

    def viewTree(self, root):
        """Print the tree level by level (breadth-first)."""
        if root.label is not None:
            # Degenerate tree consisting of a single leaf.
            print(f'Depth: 0 Class label: {root.label}')
            return
        queue = [(root, 0)]
        pos = 0  # read cursor; avoids the O(n) cost of list.pop(0)
        while pos < len(queue):
            node, depth = queue[pos]
            pos += 1
            print(f'Depth: {depth} Attribute: {node.attr}')
            for i, (key, child) in enumerate(node.next or [], start=1):
                if child.label is None:
                    print(f'[{i}: {key}: Attribute: {child.attr}]', end="  ")
                    queue.append((child, depth + 1))
                else:
                    print(f'[{i}: {key}: Class label: {child.label}]', end="  ")
            print('\n')

    def accuracy(self, true_y, pred_y):
        """Return the percentage (0-100) of positions where the two label arrays agree.

        Raises ValueError when the arrays differ in shape; returns NaN for
        empty input.
        """
        true_y = np.asarray(true_y)
        pred_y = np.asarray(pred_y)
        if true_y.shape != pred_y.shape:
            raise ValueError(
                f'true_y and pred_y must have the same shape, '
                f'got {true_y.shape} and {pred_y.shape}')
        if true_y.size == 0:
            return float('nan')
        acc = np.sum(true_y == pred_y) / len(true_y)
        return acc * 100

In [95]:
# Load the play-tennis example data from the working directory.
data = pd.read_csv('playtennis.csv')

In [96]:
# Learner instance used for the play-tennis data.
tree = ID3()

In [97]:
# Display the loaded frame.
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [98]:
# Attribute columns to split on and the label column to predict.
features = ['Outlook','Temperature','Humidity','Wind']
target = 'PlayTennis'
# fit() returns the root Node of the learned tree.
root = tree.fit(data,features,target)

In [99]:
# Print the learned tree one level at a time.
tree.viewTree(root)

Depth: 0 Attribute: Outlook
[1: Overcast: Class label: Yes]  [2: Rain: Attribute: Wind]  [3: Sunny: Attribute: Humidity]  

Depth: 1 Attribute: Wind
[1: Strong: Class label: No]  [2: Weak: Class label: Yes]  

Depth: 1 Attribute: Humidity
[1: High: Class label: No]  [2: Normal: Class label: Yes]  



In [100]:
# Load the car-evaluation data from the working directory.
cars = pd.read_csv('car2.csv')

In [101]:
# Display the loaded frame (pandas truncates long frames when rendering).
cars

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [102]:
# NOTE(review): this import belongs with the other imports in the first cell.
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state=0 makes the split repeatable.
train,test = train_test_split(cars,test_size=0.25,random_state=0)

In [103]:
# Learner instance used for the car data.
tree2 = ID3()

In [104]:
# Attribute columns and label column of the car data.
feat = ['buying','maint','doors','persons','lug_boot','safety']
tar = 'target'
# NOTE(review): `root` is rebound for every dataset in this notebook, so running
# cells out of order can pair a tree with the wrong data.
root = tree2.fit(train,feat,tar)

In [105]:
# Keep every column except the last one (the label column).
test_X = test.iloc[:,:-1]

In [106]:
# Predict labels for the held-out rows.
pred_y = tree2.predict(test_X=test_X,root=root)

In [107]:
# Percentage of held-out rows whose prediction equals the last column.
tree2.accuracy(test.iloc[:,-1].values,pred_y)

93.98148148148148

In [108]:
# Predict on the training rows themselves (all columns but the last).
t_pred_y = tree2.predict(root,train.iloc[:,:-1])

In [109]:
# Training-set accuracy in percent.
tree2.accuracy(train.iloc[:,-1].values,t_pred_y)

100.0

In [110]:
# Load the small a1..a4 / Class example from the working directory.
df = pd.read_csv('dtr3.csv')

In [111]:
# Display the loaded frame.
df

Unnamed: 0,a1,a2,a3,a4,Class
0,x,u,n,e,1
1,x,u,p,f,1
2,x,u,n,g,1
3,y,u,n,e,1
4,y,v,n,f,0
5,x,v,n,e,1
6,x,u,p,e,0
7,y,v,m,f,1
8,x,u,n,f,1
9,x,w,p,f,1


In [112]:
# Learner instance used for the dtr3 data.
tree3 = ID3()

In [113]:
# Raw NumPy views: all columns but the last, and the last column.
# NOTE(review): neither X nor y is referenced by any later cell visible here;
# fit() below takes the DataFrame directly -- confirm these can be dropped.
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [114]:
# Attribute columns and label column of the dtr3 data.
features = ['a1','a2','a3','a4']
target = 'Class'
# Rebinds `features`, `target` and `root` from the earlier play-tennis cells.
root = tree3.fit(df,features,target)

In [115]:
# Print the learned tree one level at a time.
tree3.viewTree(root) 

Depth: 0 Attribute: a1
[1: x: Attribute: a3]  [2: y: Attribute: a2]  

Depth: 1 Attribute: a3
[1: m: Class label: 1]  [2: n: Class label: 1]  [3: p: Attribute: a4]  

Depth: 1 Attribute: a2
[1: u: Class label: 1]  [2: v: Attribute: a3]  [3: w: Class label: 0]  

Depth: 2 Attribute: a4
[1: e: Class label: 0]  [2: f: Class label: 1]  [3: g: Class label: 1]  

Depth: 2 Attribute: a3
[1: m: Class label: 1]  [2: n: Class label: 0]  [3: p: Class label: 0]  



In [116]:
# Load the play-tennis variant with numeric Temperature/Humidity columns.
# NOTE(review): this rebinds `data`, which earlier held the playtennis.csv frame.
data = pd.read_csv('dtr3_2.csv')

In [117]:
# Learner instance used for the mixed categorical/numeric data.
tree4 = ID3()

In [118]:
# Display the loaded frame.
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Target
0,Sunny,85,85,Weak,No
1,Sunny,80,90,Strong,No
2,Overcast,83,86,Weak,Yes
3,Rain,70,96,Weak,Yes
4,Rain,68,80,Weak,Yes
5,Rain,65,70,Strong,No
6,Overcast,64,65,Strong,Yes
7,Sunny,72,95,Weak,No
8,Sunny,69,70,Weak,Yes
9,Rain,75,80,Weak,Yes


In [119]:
# Attribute columns and label column of the mixed-type data.
features = ['Outlook','Temperature','Humidity','Wind']
target = 'Target'

In [120]:
# Train on the full frame; non-object columns are split on a threshold by ID3.
root = tree4.fit(data,features,target)

In [121]:
# Print the learned tree; numeric branches are shown as (threshold, side) keys.
tree4.viewTree(root)

Depth: 0 Attribute: Outlook
[1: Overcast: Class label: Yes]  [2: Rain: Attribute: Wind]  [3: Sunny: Attribute: Humidity]  

Depth: 1 Attribute: Wind
[1: Strong: Class label: No]  [2: Weak: Class label: Yes]  

Depth: 1 Attribute: Humidity
[1: (70, 0): Class label: Yes]  [2: (70, 1): Class label: No]  



In [122]:
# Predict on the same rows that were used for fitting (all columns but the last).
pred_y = tree4.predict(root=root,test_X=data.iloc[:,:-1])

In [123]:
# Training-set accuracy in percent.
tree4.accuracy(true_y=data.iloc[:,-1].values,pred_y=pred_y)

100.0

In [124]:
# Load the chess data from the working directory.
chess = pd.read_csv('krkopt.csv')

In [125]:
# Cast the digit-named columns to object dtype so ID3 branches on each distinct
# value instead of searching for a numeric threshold.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
chess['1'] = chess['1'].astype(object)
chess['2'] = chess['2'].astype(object)
chess['3'] = chess['3'].astype(object)

In [126]:
# Turn every entry of the digit-named columns into its string form.
chess['1'] = chess['1'].map(str)
chess['2'] = chess['2'].map(str)
chess['3'] = chess['3'].map(str)

In [127]:
# Hold out 20% of the chess rows for testing; random_state=0 makes the split repeatable.
train_c,test_c = train_test_split(chess,test_size=0.2,random_state=0)

In [128]:
# Learner instance used for the chess data.
tree_chess = ID3()

In [129]:
# Use every column except the last one as an attribute (plain Python list).
features = train_c.columns[:-1].to_list()

In [130]:
# Train on the chess training split with 'draw' as the label column.
root_chess = tree_chess.fit(train_c,features,'draw')

In [131]:
# Print the learned tree one level at a time (the output is long for this data).
tree_chess.viewTree(root_chess)

Depth: 0 Attribute: 2
[1: 1: Attribute: a]  [2: 2: Attribute: a]  [3: 3: Attribute: c]  [4: 4: Attribute: 1]  [5: 5: Attribute: 1]  [6: 6: Attribute: 1]  [7: 7: Attribute: 1]  [8: 8: Attribute: 3]  

Depth: 1 Attribute: a
[1: a: Attribute: 3]  [2: b: Attribute: 3]  [3: c: Attribute: c]  [4: d: Attribute: c]  

Depth: 1 Attribute: a
[1: a: Attribute: 3]  [2: b: Attribute: 3]  [3: c: Attribute: 3]  [4: d: Attribute: c]  

Depth: 1 Attribute: c
[1: a: Attribute: b]  [2: b: Attribute: b]  [3: c: Attribute: b]  [4: d: Attribute: 3]  [5: e: Attribute: 3]  [6: f: Attribute: 1]  [7: g: Attribute: 1]  [8: h: Attribute: a]  

Depth: 1 Attribute: 1
[1: 1: Attribute: c]  [2: 2: Attribute: c]  [3: 3: Attribute: c]  [4: 4: Attribute: 3]  

Depth: 1 Attribute: 1
[1: 1: Attribute: c]  [2: 2: Attribute: c]  [3: 3: Attribute: c]  [4: 4: Attribute: c]  

Depth: 1 Attribute: 1
[1: 1: Attribute: c]  [2: 2: Attribute: 3]  [3: 3: Attribute: c]  [4: 4: Attribute: c]  

Depth: 1 Attribute: 1
[1: 1: Attribute: 

In [132]:
# Predict labels for the held-out chess rows (all columns but the last).
preds = tree_chess.predict(root_chess,test_c.iloc[:,:-1])

In [133]:
# Held-out accuracy in percent.
tree_chess.accuracy(test_c.iloc[:,-1].values,preds)

57.36945286045268