In [1]:
# Importing necessary packages and reading data
import pandas as pd
import tensorflow as tf
import copy
# List the GPU devices that TensorFlow can see.
print(tf.config.list_physical_devices('GPU'))
# Load the wine data from a CSV file in the working directory; the cells below read the global `dataset`.
dataset = pd.read_csv('wine-dataset.csv')
# Last expression of the cell: display per-column summary statistics.
dataset.describe()

2021-09-23 22:20:02.422680: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2021-09-23 22:20:03.733944: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-09-23 22:20:03.791840: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-23 22:20:03.792569: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.515GHz coreCount: 14 deviceMemorySize: 3.82GiB deviceMemoryBandwidth: 178.84GiB/s
2021-09-23 22:20:03.792617: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-09-23 22:20:03.804060: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-09-23 22:20:03.804142: I tensorflow/stream_executor/platform/d

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,0.216415
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.411842
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,0.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,0.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,0.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,0.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,1.0


In [2]:
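# The tree is stored as an implicit binary heap in a flat list.
# Each attribute is used at most once on any root-to-leaf path, so with 11 attributes
# the tree has at most 12 levels: up to 11 levels of decision nodes plus a last level
# that consists only of yes/no (leaf) nodes.
# So at most 2**12 - 1 = 4095 nodes can be present.
# Index 0 of the list is not used: the root is tree[1], and the children of tree[i]
# are tree[2*i] (value above the threshold) and tree[2*i+1] (otherwise).
# Every node is represented as a tuple with two elements.
# The first element is the index of the attribute tested at the node; it ranges from 0 to 10.
# The second element is the threshold used for that attribute at that node.
# Leaves are encoded as (-2,0) for class 1 and (0,-2) for class 0.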

class DecisionTree():
    """Binary decision tree over 11 numeric attributes, split by entropy.

    ``self.tree`` is a flat list of 4096 two-element tuples used as an implicit
    heap. Index 0 is unused, the root is ``tree[1]`` and the children of
    ``tree[i]`` are ``tree[2*i]`` (samples whose attribute value is strictly
    above the threshold) and ``tree[2*i+1]`` (all other samples).

    Node encodings:
        (-1, -1)                slot never written by ``learn``
        (attribute, threshold)  decision node, attribute index in 0..10
        (-2, 0)                 leaf predicting class 1
        (0, -2)                 leaf predicting class 0

    NOTE(review): the class-0 leaf (0, -2) is indistinguishable from a decision
    node on attribute 0 with threshold -2; this is harmless only as long as
    attribute 0 never yields that threshold.
    """

    def __init__(self):
        # Tuples are immutable, so sharing one default tuple across slots is safe.
        self.tree = [(-1,-1)] * 4096

    def learn(self, data, index_of_node = 1, attributes_used = None):
        """Recursively grow the subtree rooted at ``index_of_node``.

        Args:
            data: 2-D tensor with one row per sample; columns 0..10 are the
                attributes and column 11 is the 0/1 label. The computations
                below mix it with float64 tensors, so it is presumably
                float64 - TODO confirm against the caller.
            index_of_node: heap index of the node to fill in.
            attributes_used: list of 11 flags (0/1) marking attributes already
                used on the path from the root. ``None`` means none used yet.
        """
        if attributes_used is None:
            # A fresh list per call instead of a shared mutable default.
            attributes_used = [0] * 11

        data_size = len(data)
        if data_size == 0:
            # An empty partition (the chosen attribute was constant, so nothing
            # passed the threshold) carries no evidence. Previously this fell
            # through to the split search with 0/0 and NaN thresholds and built
            # a whole subtree that always ended in a class-0 leaf; emit that
            # leaf directly instead.
            self.tree[index_of_node] = (0,-2)
            return

        # Fraction of positive samples reaching this node.
        p = tf.math.count_nonzero(data[:,11])/data_size

        if min(attributes_used) == 1:
            # Every attribute is already used on this path: majority vote.
            if p > 0.5:
                self.tree[index_of_node] = (-2,0)
            else:
                self.tree[index_of_node] = (0,-2)
        elif p == 1:
            self.tree[index_of_node] = (-2,0)
        elif p == 0:
            self.tree[index_of_node] = (0,-2)
        else:
            positive = tf.math.count_nonzero(data[:,11],dtype='float64')
            # Upper bound for the weighted entropy (natural log, so at most ln 2 < 1).
            Impurity_after_split = 1
            for i in range(11):

                # Ensuring we don't use any attribute again
                if attributes_used[i] == 1:
                    continue

                # Candidate thresholds: evenly spaced, strictly between min and max
                no_of_thresholds = 100
                minimum = tf.math.reduce_min(data[:,i])
                maximum = tf.math.reduce_max(data[:,i])
                difference = (maximum - minimum)/(no_of_thresholds+1)
                thresholds = minimum + tf.range(1,no_of_thresholds+1,dtype='float64')*difference

                # (samples x thresholds) matrix of value - threshold
                distance = tf.tensordot(data[:,i],tf.ones(no_of_thresholds,dtype='float64'),0)-thresholds

                # 1.0 where the sample is strictly above the threshold, else 0.0
                classification = tf.cast(tf.math.equal(tf.math.sign(distance),1),tf.float64)
                size_pass = tf.math.count_nonzero(classification,0,dtype='float64')
                size_fail = data_size-size_pass

                # Weighted entropy after the split, one value per threshold
                number_of_positive_that_passed = tf.tensordot(tf.transpose(classification),data[:,11],1)
                number_of_positive_that_failed = positive - number_of_positive_that_passed
                p1 = tf.divide(number_of_positive_that_passed,size_pass)
                p2 = tf.divide(number_of_positive_that_failed,size_fail)

                Impurity_left = -(p1*tf.math.log(p1)+(1-p1)*tf.math.log(1-p1))*(size_pass/data_size)
                Impurity_right = -(p2*tf.math.log(p2)+(1-p2)*tf.math.log(1-p2))*(size_fail/data_size)

                # 0*log(0) and 0/0 produce NaN; a pure or empty side contributes zero impurity
                Impurity_left = tf.where(tf.math.is_nan(Impurity_left),tf.zeros_like(Impurity_left),Impurity_left)
                Impurity_right = tf.where(tf.math.is_nan(Impurity_right),tf.zeros_like(Impurity_right),Impurity_right)

                Impurity = Impurity_left + Impurity_right

                # Keep the best (attribute, threshold) so far; ties go to the later attribute
                best = tf.math.argmin(Impurity)
                if Impurity[best] <= Impurity_after_split:
                    self.tree[index_of_node] = (i,thresholds[best])
                    Impurity_after_split = Impurity[best]

            # Partition the samples with the chosen split
            chosen_attribute, chosen_threshold = self.tree[index_of_node]
            distance = data[:,chosen_attribute] - chosen_threshold
            data_passed = tf.gather(data,tf.where(tf.math.equal(tf.math.sign(distance),1))[:,0])
            data_not_passed = tf.gather(data,tf.where(tf.math.not_equal(tf.math.sign(distance),1))[:,0])

            # Children see the chosen attribute as used; the caller's list is left untouched
            attributes = list(attributes_used)
            attributes[chosen_attribute] = 1

            # Recursively learning the tree
            self.learn(data_passed,2*index_of_node,attributes)
            self.learn(data_not_passed,2*index_of_node+1,attributes)

    def classify(self, test_point, index_of_node=1):
        """Return the predicted class (1 or 0) for one sample.

        Args:
            test_point: indexable sequence of attribute values.
            index_of_node: heap index of the node to start from.
        """
        if self.tree[index_of_node] == (-2,0):
            return 1
        elif self.tree[index_of_node] == (0,-2):
            return 0
        else:
            distance = test_point[self.tree[index_of_node][0]] - self.tree[index_of_node][1]
            # Same rule as in learn(): strictly above the threshold goes to child 2*i
            if distance > 0:
                return self.classify(test_point,2*index_of_node)
            else:
                return self.classify(test_point,2*index_of_node+1)

def run_decision_tree(n_folds=10, random_state=None):
    """Run stratified k-fold cross-validation of ``DecisionTree`` on ``dataset``.

    Reads the module-level ``dataset`` DataFrame, whose ``quality`` column is
    the label (0 = bad wine) and must be the last column. Prints the accuracy
    of every fold and the average, and writes the same numbers to
    ``Dishank-Jain-result.txt`` in the working directory.

    Args:
        n_folds: number of cross-validation folds (default 10).
        random_state: seed passed to ``DataFrame.sample`` for the shuffles;
            ``None`` keeps the shuffling non-deterministic.

    Raises:
        ValueError: if ``n_folds`` is smaller than 2.
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2")

    myname = "Dishank-Jain"

    # --- Data preparation for stratified sampling ---
    # Split into the two strata and shuffle each one independently
    is_bad = dataset['quality'] == 0
    bad_wines = dataset[is_bad].sample(frac=1, random_state=random_state)
    good_wines = dataset[~is_bad].sample(frac=1, random_state=random_state)

    # Converting pandas dataframes to tensors
    bad_wines = tf.convert_to_tensor(bad_wines)
    good_wines = tf.convert_to_tensor(good_wines)

    # --- Stratified folds ---
    # Every fold gets the same number of rows from each stratum; rows left over
    # after the integer division are not used.
    bad_per_fold = bad_wines.shape[0] // n_folds
    good_per_fold = good_wines.shape[0] // n_folds
    folds = []
    for i in range(n_folds):
        fold_bad = bad_wines[i*bad_per_fold:(i+1)*bad_per_fold,:]
        fold_good = good_wines[i*good_per_fold:(i+1)*good_per_fold,:]
        folds.append(tf.concat([fold_bad,fold_good],0))

    # --- Cross validation and printing accuracy ---
    total_accuracy = 0.0

    # The context manager closes the result file even if a fold raises
    with open(myname+"-result.txt",'w') as f:
        for i in range(n_folds):

            # Train on every fold except the i-th
            training_set = tf.concat([fold for j, fold in enumerate(folds) if j != i],0)
            tree = DecisionTree()
            tree.learn(training_set)

            # Evaluate on the held-out fold; the last column is the label
            test_set = folds[i]
            correct = 0
            for instance in test_set:
                prediction = tree.classify( instance[:-1] )
                if bool(prediction == instance[-1]):
                    correct += 1

            # Calculating and printing accuracy for the fold
            accuracy = float(correct)/float(len(test_set))
            f.write("accuracy: %.4f \n" % accuracy)
            print("accuracy for fold",i+1,"=", accuracy)

            total_accuracy += accuracy

        # Printing average accuracy
        print("average accuracy = ", total_accuracy/n_folds)
        f.write("average accuracy: %.4f" % (total_accuracy/n_folds))
    
# Run the cross-validation only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    run_decision_tree()

2021-09-23 22:20:03.905368: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-23 22:20:03.906836: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-23 22:20:03.907275: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.515GHz coreCount: 14 deviceMemorySize: 3.82GiB deviceMemoryBandwidth: 178.84GiB/s
2021-09-23 22:20:03.907522: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but th

accuracy for fold 1 = 0.8077709611451943
accuracy for fold 2 = 0.8261758691206544
accuracy for fold 3 = 0.803680981595092
accuracy for fold 4 = 0.787321063394683
accuracy for fold 5 = 0.8139059304703476
accuracy for fold 6 = 0.8241308793456033
accuracy for fold 7 = 0.8077709611451943
accuracy for fold 8 = 0.7934560327198364
accuracy for fold 9 = 0.8159509202453987
accuracy for fold 10 = 0.8016359918200409
average accuracy =  0.8081799591002046


In [4]:
# Using Gini index and pruning

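# The tree is stored as an implicit binary heap in a flat list.
# Each attribute is used at most once on any root-to-leaf path, so with 11 attributes
# the tree has at most 12 levels: up to 11 levels of decision nodes plus a last level
# that consists only of yes/no (leaf) nodes.
# So at most 2**12 - 1 = 4095 nodes can be present.
# Index 0 of the list is not used: the root is tree[1], and the children of tree[i]
# are tree[2*i] (value above the threshold) and tree[2*i+1] (otherwise).
# Every node is represented as a tuple with two elements.
# The first element is the index of the attribute tested at the node; it ranges from 0 to 10.
# The second element is the threshold used for that attribute at that node.
# Leaves are encoded as (-2,0) for class 1 and (0,-2) for class 0.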

class DecisionTree():
    """Binary decision tree over 11 numeric attributes, split by Gini index, with pruning.

    ``self.tree`` is a flat list of 4096 two-element tuples used as an implicit
    heap. Index 0 is unused, the root is ``tree[1]`` and the children of
    ``tree[i]`` are ``tree[2*i]`` (samples whose attribute value is strictly
    above the threshold) and ``tree[2*i+1]`` (all other samples).

    Node encodings:
        (-1, -1)                slot never written by ``learn``
        (attribute, threshold)  decision node, attribute index in 0..10
        (-2, 0)                 leaf predicting class 1
        (0, -2)                 leaf predicting class 0

    NOTE(review): the class-0 leaf (0, -2) is indistinguishable from a decision
    node on attribute 0 with threshold -2; this is harmless only as long as
    attribute 0 never yields that threshold.
    """

    def __init__(self):
        # Tuples are immutable, so sharing one default tuple across slots is safe.
        self.tree = [(-1,-1)] * 4096

    def learn(self, data, index_of_node = 1, attributes_used = None):
        """Recursively grow the subtree rooted at ``index_of_node``.

        Args:
            data: 2-D tensor with one row per sample; columns 0..10 are the
                attributes and column 11 is the 0/1 label. The computations
                below mix it with float64 tensors, so it is presumably
                float64 - TODO confirm against the caller.
            index_of_node: heap index of the node to fill in.
            attributes_used: list of 11 flags (0/1) marking attributes already
                used on the path from the root. ``None`` means none used yet.
        """
        if attributes_used is None:
            # A fresh list per call instead of a shared mutable default.
            attributes_used = [0] * 11

        data_size = len(data)
        if data_size == 0:
            # An empty partition (the chosen attribute was constant, so nothing
            # passed the threshold) carries no evidence. Previously this fell
            # through to the split search with 0/0 and NaN thresholds and built
            # a whole subtree that always ended in a class-0 leaf; emit that
            # leaf directly instead.
            self.tree[index_of_node] = (0,-2)
            return

        # Fraction of positive samples reaching this node.
        p = tf.math.count_nonzero(data[:,11])/data_size

        if sum(attributes_used) == 11:
            # Every attribute is already used on this path: majority vote.
            if p > 0.5:
                self.tree[index_of_node] = (-2,0)
            else:
                self.tree[index_of_node] = (0,-2)
        # Pruning: stop splitting once a node is at least 90% pure
        elif p > 0.90:
            self.tree[index_of_node] = (-2,0)
        elif p < 0.10:
            self.tree[index_of_node] = (0,-2)
        else:
            positive = tf.math.count_nonzero(data[:,11],dtype='float64')
            # Upper bound for the weighted Gini index (it never exceeds 0.5).
            Gini_after_split = 1
            for i in range(11):

                # Ensuring we don't use any attribute again
                if attributes_used[i] == 1:
                    continue

                # Candidate thresholds: evenly spaced, strictly between min and max
                no_of_thresholds = 10
                minimum = tf.math.reduce_min(data[:,i])
                maximum = tf.math.reduce_max(data[:,i])
                difference = (maximum - minimum)/(no_of_thresholds+1)
                thresholds = minimum + tf.range(1,no_of_thresholds+1,dtype='float64')*difference

                # (samples x thresholds) matrix of value - threshold
                distance = tf.tensordot(data[:,i],tf.ones(no_of_thresholds,dtype='float64'),0)-thresholds

                # 1.0 where the sample is strictly above the threshold, else 0.0
                classification = tf.cast(tf.math.equal(tf.math.sign(distance),1),tf.float64)
                size_pass = tf.math.count_nonzero(classification,0,dtype='float64')
                size_fail = data_size-size_pass

                # Weighted Gini index after the split, one value per threshold
                number_of_positive_that_passed = tf.tensordot(tf.transpose(classification),data[:,11],1)
                number_of_positive_that_failed = positive - number_of_positive_that_passed
                p1 = tf.divide(number_of_positive_that_passed,size_pass)
                p2 = tf.divide(number_of_positive_that_failed,size_fail)

                Gini_left = (1 - p1**2 - (1 - p1)**2)*(size_pass/data_size)
                Gini_right = (1 - p2**2 - (1 - p2)**2)*(size_fail/data_size)

                # 0/0 on an empty side produces NaN; such a side contributes zero impurity
                Gini_left = tf.where(tf.math.is_nan(Gini_left),tf.zeros_like(Gini_left),Gini_left)
                Gini_right = tf.where(tf.math.is_nan(Gini_right),tf.zeros_like(Gini_right),Gini_right)

                Gini_index = Gini_left + Gini_right

                # Keep the best (attribute, threshold) so far; ties go to the later attribute
                best = tf.math.argmin(tf.math.abs(Gini_index))
                if Gini_index[best] <= Gini_after_split:
                    self.tree[index_of_node] = (i,thresholds[best])
                    Gini_after_split = Gini_index[best]

            # Partition the samples with the chosen split. Errors are allowed to
            # propagate: swallowing them here only led to a NameError on
            # `distance` further down and hid the real cause.
            chosen_attribute, chosen_threshold = self.tree[index_of_node]
            distance = data[:,chosen_attribute] - chosen_threshold
            data_passed = tf.gather(data,tf.where(tf.math.equal(tf.math.sign(distance),1))[:,0])
            data_not_passed = tf.gather(data,tf.where(tf.math.not_equal(tf.math.sign(distance),1))[:,0])

            # Children see the chosen attribute as used; the caller's list is left untouched
            attributes = list(attributes_used)
            attributes[chosen_attribute] = 1

            # Recursively learning the tree
            self.learn(data_passed,2*index_of_node,attributes)
            self.learn(data_not_passed,2*index_of_node+1,attributes)

    def classify(self, test_point, index_of_node=1):
        """Return the predicted class (1 or 0) for one sample.

        Args:
            test_point: indexable sequence of attribute values.
            index_of_node: heap index of the node to start from.
        """
        if self.tree[index_of_node] == (-2,0):
            return 1
        elif self.tree[index_of_node] == (0,-2):
            return 0
        else:
            distance = test_point[self.tree[index_of_node][0]] - self.tree[index_of_node][1]
            # Same rule as in learn(): strictly above the threshold goes to child 2*i
            if distance > 0:
                return self.classify(test_point,2*index_of_node)
            else:
                return self.classify(test_point,2*index_of_node+1)

def run_decision_tree(n_folds=10, random_state=None):
    """Run stratified k-fold cross-validation of ``DecisionTree`` on ``dataset``.

    Reads the module-level ``dataset`` DataFrame, whose ``quality`` column is
    the label (0 = bad wine) and must be the last column. Prints the accuracy
    of every fold and the average, and writes the same numbers to
    ``Dishank-Jain-improved-result.txt`` in the working directory.

    Args:
        n_folds: number of cross-validation folds (default 10).
        random_state: seed passed to ``DataFrame.sample`` for the shuffles;
            ``None`` keeps the shuffling non-deterministic.

    Raises:
        ValueError: if ``n_folds`` is smaller than 2.
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2")

    myname = "Dishank-Jain"

    # --- Data preparation for stratified sampling ---
    # Split into the two strata and shuffle each one independently
    is_bad = dataset['quality'] == 0
    bad_wines = dataset[is_bad].sample(frac=1, random_state=random_state)
    good_wines = dataset[~is_bad].sample(frac=1, random_state=random_state)

    # Converting pandas dataframes to tensors
    bad_wines = tf.convert_to_tensor(bad_wines)
    good_wines = tf.convert_to_tensor(good_wines)

    # --- Stratified folds ---
    # Every fold gets the same number of rows from each stratum; rows left over
    # after the integer division are not used.
    bad_per_fold = bad_wines.shape[0] // n_folds
    good_per_fold = good_wines.shape[0] // n_folds
    folds = []
    for i in range(n_folds):
        fold_bad = bad_wines[i*bad_per_fold:(i+1)*bad_per_fold,:]
        fold_good = good_wines[i*good_per_fold:(i+1)*good_per_fold,:]
        folds.append(tf.concat([fold_bad,fold_good],0))

    # --- Cross validation and printing accuracy ---
    total_accuracy = 0.0

    # The context manager closes the result file even if a fold raises
    with open(myname+"-improved-result.txt",'w') as f:
        for i in range(n_folds):

            # Train on every fold except the i-th
            training_set = tf.concat([fold for j, fold in enumerate(folds) if j != i],0)
            tree = DecisionTree()
            tree.learn(training_set)

            # Evaluate on the held-out fold; the last column is the label
            test_set = folds[i]
            correct = 0
            for instance in test_set:
                prediction = tree.classify( instance[:-1] )
                if bool(prediction == instance[-1]):
                    correct += 1

            # Calculating and printing accuracy for the fold
            accuracy = float(correct)/float(len(test_set))
            f.write("accuracy: %.4f \n" % accuracy)
            print("accuracy for fold",i+1,"=", accuracy)

            total_accuracy += accuracy

        # Printing average accuracy
        print("average accuracy = ", total_accuracy/n_folds)
        f.write("average accuracy: %.4f" % (total_accuracy/n_folds))
    
# Run the cross-validation only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    run_decision_tree()

accuracy for fold 1 = 0.8486707566462167
accuracy for fold 2 = 0.8220858895705522
accuracy for fold 3 = 0.8384458077709611
accuracy for fold 4 = 0.8057259713701431
accuracy for fold 5 = 0.820040899795501
accuracy for fold 6 = 0.8282208588957055
accuracy for fold 7 = 0.8016359918200409
accuracy for fold 8 = 0.7934560327198364
accuracy for fold 9 = 0.8241308793456033
accuracy for fold 10 = 0.8077709611451943
average accuracy =  0.8190184049079756
