In [179]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd
import numpy as np
import math
import sys

In [180]:
import sklearn
print(sklearn.__version__)

1.3.1


In [181]:
label_encoder = None 

In [182]:
def get_np_array(file_name):
    global label_encoder
    data = pd.read_csv(file_name)
    
    need_label_encoding = ['team','host','opp','month', 'day_match']
    if(label_encoder is None):
        label_encoder = OrdinalEncoder()
        label_encoder.fit(data[need_label_encoding])
    data_1 = pd.DataFrame(label_encoder.transform(data[need_label_encoding]), columns = label_encoder.get_feature_names_out())
    
    #merge the two dataframes
    dont_need_label_encoding =  ["year","toss","bat_first","format" ,"fow","score" ,"rpo" ,"result"]
    data_2 = data[dont_need_label_encoding]
    final_data = pd.concat([data_1, data_2], axis=1)
    
    X = final_data.iloc[:,:-1]
    y = final_data.iloc[:,-1:]
    return X.to_numpy(), y.to_numpy()


In [183]:
X_train,y_train = get_np_array('./Dataset/train.csv')
X_test, y_test = get_np_array("./Dataset/test.csv")

types = ['cat','cat','cat',"cat","cat","cont","cat","cat","cat" ,"cont","cont" ,"cont" ]


In [184]:
class DTNode:

    def __init__(self, depth, is_leaf = False, value = 0, column = None, category = None, types = None):

        self.depth = depth
        self.children = None
        self.is_leaf = is_leaf
        self.value = value
        self.column = column
        
        self.category = category
        self.types = types
    
    def __call__(self,X):
        
        if self.is_leaf:
            return self.category

        if self.types[self.column] == 'cat':
            for child in self.children:
                if child.value == X[self.column]:
                    return child(X)
        else:
            if X[self.column] <= self.value:
                return self.children[0](X)
            else:
                return self.children[1](X)    
        
        


    def get_children(self, X, y, types, max_depth):
        '''
        Args:
            X: A single example np array [num_features]
        Returns:
            None
        '''
        
        # print("depth", self.depth, "len", len(X))
        
        if self.is_leaf:
            majority = np.sum(y)/len(y)
            if majority > 0.5:
                self.category = 1
            else:
                self.category = 0
                
            return
      
        
            
            
        
        max_gain = -9999
        max_child_id = None
        
        ids = np.arange(len(types))
        ids = np.random.permutation(ids)
        
        for id in ids:
            type = types[id]
            
            if(type == 'cat'):
                #print("cat")
                categories = np.unique(X[:,id])
                current_gain = 0
                
                for category in categories:
                    category_indices = np.where(X[:,id] == category)
                    category_y = y[category_indices]
                    
                    positive_count = np.sum(category_y)
                    negative_count = len(category_y) - positive_count
                    total_count = len(category_y)
                    
                    if positive_count == 0 or negative_count == 0:
                        this_category_gain = 0
                    else:
                        this_category_gain = (positive_count/total_count)*np.log(positive_count/total_count) + (negative_count/total_count)*np.log(negative_count/total_count)
                    
                        
                    current_gain += (total_count/len(y))*this_category_gain
                    
                    if math.isnan(current_gain):
                        print("nan")
                        print(np.ravel(y))
                        print(np.ravel(category_y))
                        print(positive_count, negative_count, total_count)
                        sys.exit()
                    
                    
                
                if(current_gain > max_gain):
                    max_gain = current_gain
                    max_child_id = id
                
                
                
                print(f"For id {id} gain is {current_gain}, it is of type {type}")
                
            else:
                current_gain = 0

                split_accross = np.median(X[:,id])
                split_accross_indices_more = np.where(X[:,id] <= split_accross)
                
                split_accross_indices_more_y = y[split_accross_indices_more]
                
                positive_count = np.sum(split_accross_indices_more_y)
                negative_count = len(split_accross_indices_more_y) - positive_count
                total_count = len(split_accross_indices_more_y)
                
                if positive_count == 0 or negative_count == 0:
                    this_category_gain = 0
                else:
                    this_category_gain = (positive_count/total_count)*np.log(positive_count/total_count) + (negative_count/total_count)*np.log(negative_count/total_count)
                
                
                current_gain += (total_count/len(y))*this_category_gain
                
                
                split_accross_indices_less = np.where(X[:,id] > split_accross)
                split_accross_indices_less_y = y[split_accross_indices_less]
                
                positive_count = np.sum(split_accross_indices_less_y)
                negative_count = len(split_accross_indices_less_y) - positive_count
                total_count = len(split_accross_indices_less_y)
                
                if positive_count == 0 or negative_count == 0:
                    this_category_gain = 0
                else:
                    this_category_gain = (positive_count/total_count)*np.log(positive_count/total_count) + (negative_count/total_count)*np.log(negative_count/total_count)
                
                current_gain += (total_count/len(y))*this_category_gain
                
                if(current_gain > max_gain):
                    max_gain = current_gain
                    max_child_id = id
                
                print(f"For id {id} gain is {current_gain}, it is of type {type}")
                                
                if math.isnan(current_gain):
                    print("nan")
                    sys.exit()
                    
                
        
        
        print(len(X), max_child_id, max_gain)
        
        self.column = max_child_id
        self.types = types
        
        if types[max_child_id] == 'cat':
            categories = np.unique(X[:,max_child_id])
            children = []
            for category in categories:
                category_indices = np.where(X[:,max_child_id] == category)                
                child = DTNode(self.depth+1,
                               is_leaf = (self.depth >= max_depth),
                               value = category,
                               column=max_child_id,
                               types=types
                               )
                
                child.get_children(X[category_indices], y[category_indices], types, max_depth)

                    
                children.append(child)
            self.children = children
            

            
        else:
            split_accross = np.median(X[:,max_child_id])
            children = [] 
            child = DTNode(self.depth+1,
                           is_leaf = (self.depth >= max_depth),
                           value = split_accross,
                           column=max_child_id,
                           types=types
                           )
            indices = np.where(X[:,max_child_id] <= split_accross)
            child.get_children(X[indices], y[indices], types, max_depth)

            child2 = DTNode(self.depth+1,
                           is_leaf = (self.depth >= max_depth),
                           value = split_accross,
                           column=max_child_id,
                            types=types
                           )
            indices = np.where(X[:,max_child_id] >= split_accross)
            child2.get_children(X[indices], y[indices], types, max_depth)

            
            children.append(child)
            children.append(child2)
            self.children = children
        


In [185]:
class DTTree:

    def __init__(self):
        #Tree root should be DTNode
        self.root = DTNode(0)       

    def fit(self, X, y, types, max_depth = 10):
        '''
        Makes decision tree
        Args:
            X: numpy array of data [num_samples, num_features]
            y: numpy array of classes [num_samples, 1]
            types: list of [num_features] with types as: cat, cont
                eg: if num_features = 4, and last 2 features are continious then
                    types = ['cat','cat','cont','cont']
            max_depth: maximum                 this_category_gain = 0
                if positive_count != 0:
                    this_category_gain += (positive_count/total_count)*np.log(positive_count/total_count) 
                elif negative_count != 0:
                    this_category_gain += (negative_count/total_count)*np.log(negative_count/total_count)
                depth of tree
        Returns:
            None
        '''
        self.root.get_children(X, y, types, max_depth)

    def __call__(self, X):
        '''
        Predicted classes for X
        Args:
            X: numpy array of data [num_samples, num_features]
        Returns:
            y:  predicted classes
        '''
        return self.root(X)
    
    def post_prune(self, X_val, y_val):
        pass
        #TODO

In [186]:
max_depth = 5
tree = DTTree()
tree.fit(X_train,y_train,types, max_depth = max_depth)

For id 2 gain is -0.6742394055680165, it is of type cat
For id 11 gain is -0.6931240245499048, it is of type cont
For id 1 gain is -0.6929342839558993, it is of type cat
For id 3 gain is -0.6930590302744363, it is of type cat
For id 8 gain is -0.6931241379415193, it is of type cat
For id 5 gain is -0.6931140589636653, it is of type cont
For id 10 gain is -0.6931235393008489, it is of type cont
For id 0 gain is -0.6724631276081752, it is of type cat
For id 9 gain is -0.6931223068818197, it is of type cont
For id 6 gain is -0.6931190612533622, it is of type cat
For id 7 gain is -0.6929756870761786, it is of type cat
For id 4 gain is -0.6930776246992146, it is of type cat
7827 0 -0.6724631276081752
For id 1 gain is -0.5523327351939029, it is of type cat
For id 3 gain is -0.6028903300335143, it is of type cat
For id 4 gain is -0.6260307097332392, it is of type cat
For id 7 gain is -0.6407806254632169, it is of type cat
For id 5 gain is -0.6410314443027405, it is of type cont
For id 9 gain 

In [187]:
val_X, val_Y = get_np_array('./Dataset/val.csv')

In [188]:
predictions = []
for i,row in enumerate(val_X):
    print(tree(row), val_Y[i])
    predictions.append(tree(row) == val_Y[i])

    print("Accuracy: ", np.sum(predictions)/len(predictions))

1 [1]
Accuracy:  1.0
0 [1]
Accuracy:  0.5
0 [0]
Accuracy:  0.6666666666666666
1 [1]
Accuracy:  0.75
1 [1]
Accuracy:  0.8
0 [0]
Accuracy:  0.8333333333333334
1 [0]
Accuracy:  0.7142857142857143
1 [1]
Accuracy:  0.75
0 [1]
Accuracy:  0.6666666666666666
1 [0]
Accuracy:  0.6
1 [0]
Accuracy:  0.5454545454545454
0 [0]
Accuracy:  0.5833333333333334
1 [0]
Accuracy:  0.5384615384615384
0 [0]
Accuracy:  0.5714285714285714
None [0]
Accuracy:  0.5333333333333333
1 [1]
Accuracy:  0.5625
0 [0]
Accuracy:  0.5882352941176471
0 [0]
Accuracy:  0.6111111111111112
0 [0]
Accuracy:  0.631578947368421
0 [1]
Accuracy:  0.6
0 [0]
Accuracy:  0.6190476190476191
1 [1]
Accuracy:  0.6363636363636364
1 [1]
Accuracy:  0.6521739130434783
1 [1]
Accuracy:  0.6666666666666666
1 [1]
Accuracy:  0.68
0 [0]
Accuracy:  0.6923076923076923
1 [1]
Accuracy:  0.7037037037037037
1 [1]
Accuracy:  0.7142857142857143
1 [1]
Accuracy:  0.7241379310344828
0 [1]
Accuracy:  0.7
1 [1]
Accuracy:  0.7096774193548387
1 [1]
Accuracy:  0.71875
1