In [9]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import pandas as pd
import numpy as np
import math 

In [7]:
# Shared OrdinalEncoder: fitted by the first get_np_array call, reused afterwards.
label_encoder = None

def get_np_array(file_name):
    """Load a CSV file and return it as a (features, target) pair of arrays.

    Five string-valued columns are ordinal-encoded with the module-level
    ``label_encoder``. The encoder is fitted only if it is still ``None``,
    i.e. on the first file passed to this function; later calls reuse that
    fitted encoder.

    Args:
        file_name: path of the CSV file to read.

    Returns:
        Tuple ``(X, y)``: ``X`` holds every column except the last one and
        ``y`` holds the last column ("result") as a 2-D single-column array.
    """
    global label_encoder
    frame = pd.read_csv(file_name)

    categorical_cols = ['team','host','opp','month', 'day_match']
    # Columns copied through unchanged; "result" is last so it becomes the target.
    passthrough_cols = ["year","toss","bat_first","format" ,"fow","score" ,"rpo" ,"result"]

    if label_encoder is None:
        label_encoder = OrdinalEncoder()
        label_encoder.fit(frame[categorical_cols])

    encoded = pd.DataFrame(
        label_encoder.transform(frame[categorical_cols]),
        columns=label_encoder.get_feature_names_out(),
    )

    # Encoded columns first, untouched columns after them.
    combined = pd.concat([encoded, frame[passthrough_cols]], axis=1)

    feature_matrix = combined.iloc[:, :-1].to_numpy()
    target_column = combined.iloc[:, -1:].to_numpy()
    return (feature_matrix, target_column)


In [8]:
# Order matters: get_np_array fits the shared label_encoder on the first file it
# sees, so the training file is loaded first and the test file reuses that encoder.
X_train, Y_train = get_np_array("../data/train.csv") 
X_test, Y_test = get_np_array("../data/test.csv") 

In [None]:
class DTNode:
    """A single node of a decision tree.

    Every attribute is always defined, so a node can be switched between
    internal and leaf after construction (e.g. ``node.is_leaf = True``)
    without later reads of ``value`` or ``column`` raising AttributeError.
    """

    def __init__(self, depth, features, labels, is_leaf = False, value = 0, column = None):
        """
        Args:
            depth: depth of this node in the tree.
            features: feature rows associated with this node.
            labels: labels associated with this node.
            is_leaf: True when the node is a leaf.
            value: value stored on the node (the prediction when it is a leaf).
            column: index of the column this node splits on; ignored for leaves.
        """
        self.depth = depth
        self.features = features
        self.labels = labels

        # Children are attached afterwards by whoever builds the tree.
        self.children = None

        self.is_leaf = is_leaf

        # Stored unconditionally: previously it only existed on leaves.
        self.value = value

        # A leaf does not split, so its column is always None.
        self.column = None if is_leaf else column

In [None]:
class DTTree:
    """Decision tree classifier grown greedily by information gain.

    Categorical columns ("cat") get one branch per distinct value; every other
    column is split in two at the lower median of its values.
    """

    def __init__(self):
        #Tree root should be DTNode
        self.root = None

    def fit(self, X, y, types, max_depth = 10):
        """Store the training data and grow the tree from it.

        Args:
            X: numpy array of features [num_samples, num_features].
            y: numpy array of labels [num_samples, 1].
            types: per-column type; "cat" marks a categorical column, any
                other value is treated as continuous.
            max_depth: maximum depth of the tree (the root is at depth 0).
        """
        self.x = np.asarray(X)
        self.y = np.asarray(y)
        self.type = types
        self.max_depth = max_depth
        self.root = self.grow_tree(self.x, self.y, 0)

    @staticmethod
    def _split_threshold(column):
        """Return the lower median of a non-empty 1-D array."""
        ordered = np.sort(column)
        return ordered[(ordered.shape[0] - 1) // 2]

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label (0 when there are no labels)."""
        if labels.shape[0] == 0:
            return 0
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def _branch_masks(self, features, attribute):
        """Return (threshold, {branch_key: row_mask}) for splitting on attribute.

        Categorical: threshold is None and keys are the distinct values that
        actually occur in the column. Continuous: key 0 selects rows with
        value <= threshold and key 1 selects the remaining rows.
        """
        column = features[:, attribute]
        if self.type[attribute] == "cat":
            return None, {value: column == value for value in np.unique(column)}
        threshold = self._split_threshold(column)
        at_or_below = column <= threshold
        return threshold, {0: at_or_below, 1: ~at_or_below}

    def find_entropy(self, features, labels): # finds H[labels]
        """Return the entropy (in bits) of labels; 0.0 for an empty set.

        ``features`` is unused and kept only for a uniform signature.
        """
        if labels.shape[0] == 0:
            return 0.0
        counts = np.unique(labels, return_counts=True)[1]
        probs = counts / counts.sum()
        # np.log2 is elementwise (math.log2 rejects arrays); log2(1/p) keeps
        # the result a non-negative zero for a pure set.
        return float(np.sum(probs * np.log2(1.0 / probs)))

    def find_conditional_entropy(self, features, labels, attribute): # finds H[labels | attribute]
        """Return the entropy of labels after splitting on ``attribute``.

        Each branch contributes its entropy weighted by the fraction of rows
        it receives. Returns 0.0 when there are no rows.
        """
        total_size = features.shape[0]
        if total_size == 0:
            return 0.0

        entropy = 0.0
        _, masks = self._branch_masks(features, attribute)
        for mask in masks.values():
            size = int(np.count_nonzero(mask))
            if size == 0:
                continue  # an empty branch carries no weight
            entropy += (size / total_size) * self.find_entropy(features[mask], labels[mask])
        return entropy

    def find_split_attribute(self, features = None, labels = None):
        """Return the column index with the highest information gain.

        Args:
            features: rows to evaluate; defaults to the data given to fit.
            labels: labels of those rows; defaults to the labels given to fit.

        Returns:
            Index of the best column (the first one on ties), or None when
            there are no columns.
        """
        if features is None:
            features = self.x
        if labels is None:
            labels = self.y

        max_gain = -1
        split_attribute = None
        init_entropy = self.find_entropy(features, labels)
        for attribute in range(features.shape[1]):
            gain = init_entropy - self.find_conditional_entropy(features, labels, attribute)
            if gain > max_gain:
                max_gain = gain
                split_attribute = attribute

        return split_attribute

    def grow_tree(self, features, labels, depth): # returns a DTNode
        """Recursively build the subtree for the given rows and return its root.

        A leaf predicting the majority label is returned when the depth limit
        is reached, when the labels are pure (or empty), or when the chosen
        split would not separate the rows into at least two groups.
        """
        majority = self._majority_label(labels)

        def make_leaf():
            return DTNode(depth, features, labels, is_leaf=True, value=majority)

        if depth >= self.max_depth or np.unique(labels).shape[0] <= 1:
            return make_leaf()

        attribute = self.find_split_attribute(features, labels)
        if attribute is None:
            return make_leaf()

        threshold, masks = self._branch_masks(features, attribute)
        branches = {key: mask for key, mask in masks.items() if mask.any()}
        if len(branches) < 2:
            # Splitting would only reproduce this node one level deeper.
            return make_leaf()

        node = DTNode(depth, features, labels, column=attribute)
        # Kept on internal nodes too: used as the fallback prediction for
        # category values that were not seen while growing this node.
        node.value = majority
        # Median used for a continuous split; None for a categorical split.
        node.threshold = threshold
        node.children = {
            key: self.grow_tree(features[mask], labels[mask], depth + 1)
            for key, mask in branches.items()
        }
        return node

    def _predict_row(self, row):
        """Walk one sample down the tree and return the predicted label."""
        node = self.root
        while not node.is_leaf:
            if self.type[node.column] == "cat":
                key = row[node.column]
            else:
                key = 0 if row[node.column] <= node.threshold else 1
            child = node.children.get(key)
            if child is None:
                break  # unseen category: use this node's majority label
            node = child
        return node.value

    def __call__(self, X):
        '''
        Predicted classes for X
        Args:
            X: numpy array of data [num_samples, num_features]
        Returns:
            y: [num_samples, 1] predicted classes
        Raises:
            ValueError: if the tree has not been built with fit yet.
        '''
        if self.root is None:
            raise ValueError("DTTree must be fitted before it can predict")
        predictions = [self._predict_row(row) for row in np.asarray(X)]
        return np.array(predictions).reshape(-1, 1)

    def post_prune(self, X_val, y_val):
        return
        #TODO