In [2]:
import pandas as pd
import numpy as np
import math
from copy import deepcopy
from scipy.stats import mode
from sklearn.model_selection import StratifiedKFold

def entropy_calc(data):
    """Return the Shannon entropy, in bits, of the class labels in ``data``.

    Parameters
    ----------
    data : 2-D numpy array
        One sample per row; column 0 holds the class label.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        fraction of rows carrying that label. An array with no rows has no
        labels to sum over and therefore yields 0.
    """
    labels = data[:, 0]
    n_rows = data.shape[0]
    weighted_logs = 0
    for label in np.unique(labels):
        # Fraction of the rows that belong to this class.
        share = data[labels == label].shape[0] / n_rows
        # Base-2 logarithm, so the result is expressed in bits.
        weighted_logs += share * math.log2(share)
    return -weighted_logs

def information_gain(data, feature):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    Parameters
    ----------
    data : 2-D numpy array
        One sample per row; column 0 holds the class label.
    feature : int
        Index of the column whose distinct values define the partition.

    Returns
    -------
    number
        Entropy of ``data`` minus the size-weighted average entropy of the
        subsets produced by grouping rows on each distinct value of the
        chosen column.
    """
    parent_entropy = entropy_calc(data)
    n_rows = data.shape[0]
    column = data[:, feature]

    # Expected entropy that remains once the feature value is known.
    remainder = 0
    for value in np.unique(column):
        subset = data[column == value]
        weight = subset.shape[0] / n_rows
        remainder = remainder + (weight * entropy_calc(subset))

    return parent_entropy - remainder
        
    
    
class Node:
    """A single node of the ID3 decision tree.

    Parameters
    ----------
    data : 2-D numpy array
        Training rows that reached this node. Column 0 holds the class label;
        the remaining columns are the features still available for splitting.
    parent : Node, optional
        The node this one was split from (``None`` for the root).
    f_index : int, optional
        Column index (within ``data``) this node splits on; filled in by the
        classifier once the best feature has been chosen.
    f_value : optional
        Value of the parent's split feature shared by every row in ``data``.
    """

    def __init__(self, data, parent=None, f_index = None, f_value = None):
        self.data = data
        self.parent = parent
        self.children = []
        self.f_index = f_index
        self.f_value = f_value
        self.category = None  # predicted class; only set for leaf nodes
        self.leaf = self.isleaf()

    def isleaf(self):
        """Decide whether this node is a leaf, setting ``category`` if it is.

        A node is a leaf when all its rows share one class, or when only the
        label column is left (no feature to split on); in the latter case the
        majority class is used, ties going to the smallest label. A node with
        no rows is treated as a leaf whose ``category`` stays ``None``.

        Returns
        -------
        bool
            ``True`` for a leaf, ``False`` for a node that can still be split.
        """
        # Always inspect this node's own rows, never a module-level array.
        classes, counts = np.unique(self.data[:, 0], return_counts=True)

        if classes.size <= 1:  # pure node (or no rows at all)
            self.category = classes[0] if classes.size else None
            return True

        if self.data.shape[1] == 1:  # only the label column remains
            self.category = classes[np.argmax(counts)]
            return True

        return False

    def add_children(self, node):
        """Append ``node`` to this node's list of children."""
        self.children.append(node)

    def get_best_feature_to_split(self):
        """Return the index of the feature column with the highest information gain.

        Only feature columns (index >= 1) are candidates. The search starts
        below any attainable gain so that a feature is returned even when
        every gain is zero; falling back to index 0 would split on the label
        column itself. Ties keep the lowest column index.

        Raises
        ------
        ValueError
            If the node has no feature columns left.
        """
        n_columns = self.data.shape[1]
        if n_columns < 2:
            raise ValueError("node has no feature columns left to split on")

        best_f_index = 1
        best_ig = -np.inf
        for f in range(1, n_columns):
            ig = information_gain(self.data, f)
            if ig > best_ig:
                best_ig = ig
                best_f_index = f
        return best_f_index

    def visited(self):
        """Return ``True`` once the node has at least one child."""
        return bool(self.children)
    
class ID3Classifier:
    """ID3 decision tree grown breadth-first from a labelled array.

    Parameters
    ----------
    data : 2-D numpy array
        Training rows; column 0 holds the class label and the remaining
        columns hold categorical features.

    Notes
    -----
    Split indices are stored relative to each node's own array, which keeps
    the label in column 0. Rows handed to ``predict`` must therefore use the
    same layout as the training rows, i.e. carry a column 0 in front of the
    features.
    """

    def __init__(self, data):
        self.root = Node(data)
        self.unvisited_nodes = [self.root]
        self.visited_nodes = []

    def fit(self):
        """Grow the tree by splitting queued nodes until none are left."""
        while self.unvisited_nodes:  # queue of nodes still to be expanded
            n = self.unvisited_nodes.pop(0)
            self.visited_nodes.append(n)
            if n.leaf:
                # Only the root can be queued as a leaf; there is nothing to split.
                continue
            best_feature = n.get_best_feature_to_split()
            n.f_index = best_feature
            self.split_and_make_children(n, best_feature)

    def split_and_make_children(self, node, f_index):
        """Create one child of ``node`` per distinct value in column ``f_index``.

        Each child receives the matching rows with the split column removed.
        Leaf children are recorded as visited; the others are queued for
        further splitting.
        """
        data = node.data
        for v in np.unique(data[:, f_index]):
            new_data = data[data[:, f_index] == v]
            # A feature is used at most once along any root-to-leaf path.
            new_data = np.delete(new_data, f_index, axis=1)
            n = Node(new_data, parent=node, f_index=None, f_value=v)
            node.add_children(n)

            if n.leaf:
                self.visited_nodes.append(n)
            else:
                self.unvisited_nodes.append(n)

    def predict(self, X):
        """Return a list with the predicted class of every row in ``X``.

        Rows must follow the training layout (column 0 in front of the
        features); see the class notes.
        """
        return [self._predict(x) for x in X]

    @staticmethod
    def _majority_class(node):
        """Return the most frequent label among ``node``'s training rows."""
        classes, counts = np.unique(node.data[:, 0], return_counts=True)
        return classes[np.argmax(counts)]

    def _predict(self, x):
        """Walk the tree for one row and return the predicted class.

        If a node has no child for the row's feature value (a value never
        seen during training at that node), the node's majority class is
        returned instead of searching forever.
        """
        current_node = self.root
        while not current_node.leaf:
            for ch in current_node.children:
                if ch.f_value == x[current_node.f_index]:
                    # Drop the consumed column so indices match the child's array.
                    x = np.delete(x, current_node.f_index)
                    current_node = ch
                    break
            else:
                # No branch matches this value: fall back to the majority class.
                return self._majority_class(current_node)
        return current_node.category
        

In [5]:


def replace_nan(data):
    """Replace every ``'?'`` placeholder with its column's most frequent value.

    The most frequent value is computed over the non-missing entries only, so
    the placeholder can never be chosen as its own replacement. Ties go to the
    smallest value. A column made up entirely of ``'?'`` is left unchanged
    because there is nothing to impute from.

    Parameters
    ----------
    data : 2-D numpy array
        Modified in place.

    Returns
    -------
    The same array object, for convenient chaining.
    """
    for i in range(data.shape[1]):
        column = data[:, i]  # a view: assignments below write into ``data``
        missing = np.asarray(column == '?', dtype=bool)
        if not missing.any():
            continue
        values, counts = np.unique(column[~missing], return_counts=True)
        if values.size == 0:
            continue
        column[missing] = values[np.argmax(counts)]
    return data
# Fixed seed so the shuffle below, and therefore the cross-validation folds
# derived from the row order, are identical on every run.
SEED = 42

# The file has no header row, so the 23 columns are simply named 0..22.
raw_df = pd.read_csv('1.txt', names=list(range(23)))

# Shuffle the rows once, then switch to a plain array for the tree code.
shuffled_df = raw_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Impute the '?' placeholders column by column.
data = replace_nan(shuffled_df.to_numpy())


In [6]:
from sklearn.metrics import accuracy_score, classification_report

# Column 0 is the class label; the remaining columns are the features.
# X and y are only used to drive the stratified splitter.
y = data[:, 0]
X = data[:, 1:]
skf = StratifiedKFold(n_splits=10)
print('Number of splits = ', skf.get_n_splits(X, y))

results = []
for train_index, test_index in skf.split(X, y):
    # The classifier is given full rows (label in column 0) both when it is
    # constructed and when it predicts.
    train_rows = data[train_index]
    test_rows = data[test_index]

    id3 = ID3Classifier(train_rows)
    id3.fit()
    preds = id3.predict(test_rows)

    r = accuracy_score(test_rows[:, 0], preds)
    results.append(r)

print('Accuracy mean is {0}, and STD is {1}'.format(np.mean(results), np.std(results)))

Number of splits =  10
Accuracy mean is 1.0, and STD is 0.0
