In [737]:
# importing packages
import math
import numpy as np
import pandas as pd

In [738]:
# helper function for getting probability from frequency table
# this function is used in get_entropy()
def get_probability(event_info):
  """Convert a frequency table into a list of probabilities.

  Args:
    event_info: iterable of non-negative counts, one per class.

  Returns:
    A new list in which each count is divided by the total of all counts.
    The input is left untouched. When the total is zero (empty input or
    all-zero counts) every probability is reported as 0.0 instead of
    dividing by zero.
  """
  counts = list(event_info)  # copy so the caller's sequence is never mutated
  total = sum(counts)
  if total == 0:
    return [0.0 for _ in counts]
  return [count / total for count in counts]


# this function gets entropy from frequency table
def get_entropy(event_info):
  """Return the base-2 entropy of a frequency table.

  The counts are first normalised with get_probability(); zero
  probabilities contribute nothing to the result.
  """
  log_of_two = math.log(2)
  total_entropy = 0.0
  for prob in get_probability(event_info):
    if prob == 0:
      continue
    # p * log2(1 / p), written via natural logarithms
    total_entropy += prob * math.log(1 / prob) / log_of_two
  return total_entropy

# this function gets gini impurity from frequency table
def get_gini_impurity(event_info):
  """Return the Gini impurity of a frequency table.

  The counts are first normalised with get_probability(); the impurity is
  the sum of p * (1 - p) over every non-zero class probability.
  """
  impurity = 0.0
  for prob in get_probability(event_info):
    if prob == 0:
      continue
    impurity += prob * (1.0 - prob)
  return impurity


In [739]:
# this function sorts the table by the given column
def sort_table_by_column(table, col):
    """Return a copy of `table` sorted ascending by `col` with a fresh 0..n-1 index."""
    ordered = table.sort_values(by=[col])
    return ordered.reset_index(drop=True)

In [740]:
# # for importing the dataset as a numpy array
# dataset1 = pd.read_csv(r"covid_dataset.csv")
# dataset1 = dataset1[dataset1['location'] == 'India']
# # dataset1 = dataset1.to_numpy()
# dataset2 = pd.read_csv(r"changes-visitors-covid.csv")
# dataset2 = dataset2[dataset2['Entity'] == 'India']
# # dataset2 = dataset2.to_numpy()
# dataset3 = pd.read_csv(r'continuous_attribute_table.csv')
# dataset4 = pd.read_csv(r'pure_category.csv')

In [741]:
# dataset = pd.merge(dataset1, dataset2, on = 'date', how = 'inner')

In [742]:
# example_dataset = dataset1 = pd.read_csv(r"example_data.csv")

In [743]:
# print(example_dataset)

In [744]:
def split_dataset_wrt_column(dataset, column_name):
  """Partition `dataset` into one sub-table per distinct value of `column_name`.

  The sub-tables are returned as a list, ordered by the first appearance of
  each value in the column.
  """
  column = dataset[column_name]
  return [dataset[column == item] for item in column.unique()]

In [745]:
def get_count(table, target_column, class_name):
  """Count the rows of `table` whose `target_column` equals `class_name`."""
  matches = table[target_column] == class_name
  return matches.sum()

In [746]:
def get_entropy_from_table(table, target_column):
  """Return the entropy of the class distribution in `target_column`.

  One count is collected per distinct class (in order of first appearance)
  and the resulting frequency table is handed to get_entropy().
  """
  class_counts = [
    get_count(table, target_column, label)
    for label in table[target_column].unique()
  ]
  # swap in get_gini_impurity here to use Gini impurity instead of entropy
  return get_entropy(class_counts)
  

In [747]:
# this function returns the information gain of the column "column" when the target column is "target_column" of the table dataset
# only for categorical column or attribute
def get_information_gain(dataset, column, target_column):
  """Information gain obtained by splitting `dataset` on a categorical `column`.

  The gain is the entropy of `target_column` over the whole dataset minus the
  size-weighted average entropy of the sub-tables produced by
  split_dataset_wrt_column().
  """
  total_rows = dataset.shape[0]
  partitions = split_dataset_wrt_column(dataset, column)
  partition_sizes = [part.shape[0] for part in partitions]
  partition_entropies = [
    get_entropy_from_table(part, target_column) for part in partitions
  ]
  # entropy before any split
  entropy_before = get_entropy_from_table(dataset, target_column)
  # weighted entropy after the split
  entropy_after = sum([
    (size / total_rows) * part_entropy
    for size, part_entropy in zip(partition_sizes, partition_entropies)
  ])
  return entropy_before - entropy_after


In [748]:
# get_information_gain(example_dataset, 'Sailboat', 'target')

In [749]:
def max_index(arr):
    """Return the position of the largest element of `arr`.

    Ties resolve to the earliest position. `arr` must be non-empty; indexing
    element 0 of an empty sequence raises.
    """
    best_position = 0
    best_value = arr[0]
    # position 0 is already the running maximum, so start comparing at 1
    for position in range(1, len(arr)):
        if best_value < arr[position]:
            best_position, best_value = position, arr[position]
    return best_position


In [750]:
# get_information_gain(example_dataset, 'Sailboat', 'target')

In [751]:
def get_value_with_min_entropy_wrt_continuous_column(table, column, target_column):
    """Find the threshold on a continuous column that yields the highest information gain.

    The table is sorted by `column` and a candidate threshold is placed halfway
    between every pair of adjacent rows whose values DIFFER. Adjacent equal
    values are skipped: a threshold equal to that value cannot separate them
    with a `<=` / `>` test, so the gain measured for such a positional cut
    would not describe the split that is actually applied later (and could
    leave one side of the real split empty).

    Args:
        table: DataFrame holding the data.
        column: name of the continuous feature column.
        target_column: name of the class-label column.

    Returns:
        (threshold, information_gain). When the column offers no usable split
        (a single row, or all values equal) the first value of the column is
        returned with a gain of 0.0.

    Raises:
        ValueError: if `table` has no rows.
    """
    # step 1: sort the table (this also resets the index to 0..n-1)
    new_table = sort_table_by_column(table, column)
    row_count = len(new_table)
    if row_count == 0:
        raise ValueError("cannot search for a split threshold in an empty table")

    parent_entropy = get_entropy_from_table(new_table, target_column)

    # fallback when no two adjacent values differ: nothing can be separated
    best_threshold = new_table.at[0, column]
    best_gain = 0.0
    found_split = False

    # step 2: evaluate the cut between row i and row i + 1
    for i in range(row_count - 1):
        low = new_table.at[i, column]
        high = new_table.at[i + 1, column]
        if low == high:
            continue
        threshold = (low + high) / 2
        if not (threshold < high):
            # midpoint rounded up to `high`; `low` gives the same partition under `<=`
            threshold = low

        lower_part = new_table.iloc[:i + 1, :]
        upper_part = new_table.iloc[i + 1:, :]
        lower_entropy = get_entropy_from_table(lower_part, target_column)
        upper_entropy = get_entropy_from_table(upper_part, target_column)
        weighted_entropy = ((len(lower_part) / row_count) * lower_entropy
                            + (len(upper_part) / row_count) * upper_entropy)
        gain = parent_entropy - weighted_entropy

        # step 3: keep the first candidate with the highest gain
        if not found_split or best_gain < gain:
            best_threshold = threshold
            best_gain = gain
            found_split = True

    return best_threshold, best_gain  # split wrt value, IG according to that value
    

In [752]:
# get_value_with_min_entropy_wrt_continuous_column(dataset3, 'Weight', 'Heart_disease')

In [753]:
# dataset3

In [754]:
#This function returns the best column for the split
def get_best_column(table, target_column, is_categorical):
    """Pick the feature column with the highest information gain.

    Columns are visited in table order and scanning stops at `target_column`,
    so only the columns that come before it are considered.

    Args:
        table: DataFrame holding features and the target.
        target_column: name of the class-label column.
        is_categorical: per-column flags indexed by column position; an entry
            equal to 1 marks a categorical column, anything else a continuous one.

    Returns:
        (column_position, column_name, split_value, information_gain), where
        split_value is None when the chosen column's flag is truthy.
    """
    split_values = []
    gains = []
    for position, name in enumerate(table):
        if name == target_column:
            break
        if is_categorical[position] == 1:
            threshold = None
            gain = get_information_gain(table, name, target_column)
        else:
            # threshold is where the column would be cut; gain is the matching information gain
            threshold, gain = get_value_with_min_entropy_wrt_continuous_column(
                table, name, target_column)
        split_values.append(threshold)
        gains.append(gain)

    best = max_index(gains)
    chosen_value = None if is_categorical[best] else split_values[best]
    return best, table.columns[best], chosen_value, gains[best]



In [755]:
# get_best_column(example_dataset, 'target', [1,0,1,1])

In [756]:
def split_table_wrt_value(table, value, column):
    """Split `table` into rows with `column` <= `value` and rows with `column` > `value`.

    Returns the pair (at_or_below, above).
    """
    column_values = table[column]
    at_or_below = table[column_values <= value]
    above = table[column_values > value]
    return at_or_below, above

In [757]:
# table1, table2 = split_table_wrt_value(example_dataset,56.0,'Value')
# print(table1)
# print()
# print(table2)

In [758]:
class Node:
    """Internal decision-tree node.

    Stores the split condition, the child subtrees, and the per-child
    decision values exactly as passed in.
    """

    def __init__(self, condition, children, decisions):
        # what the node tests
        self.condition = condition
        # subtrees and, in the same order, the decision value attached to each
        self.children, self.decisions = children, decisions

In [759]:
class Leaf:
    """Terminal decision-tree node carrying the value to report."""

    def __init__(self, leaf_value):
        # fixed marker string so a leaf can be recognised via `.condition`
        self.condition = "THIS IS A LEAF NODE!!"
        self.leaf_value = leaf_value
        

In [760]:
# this cell contains all the constants please beware
# build_tree() turns a node into a leaf when the best information gain is <= this value
THRESHOLD_VALUE = 0.0
# build_tree() turns a node into a leaf when the table has <= this many rows
THRESHOLD_SIZE = 10
# one flag per feature column, by column position: 1 = categorical, otherwise continuous
# NOTE(review): six entries, presumably matching the six feature columns of the dataset being fit -- confirm
IS_CATEGORICAL = [
    0,
    0,
    0,
    0,
    0,
    0,
]
# name of the class-label column used by build_tree()
TARGET_COLUMN = "new_cases_classes"

In [761]:
# returns the most frequent class in the table together with its probability (relative frequency)
def get_value_for_leaf(table, target_column):
    """Return the majority class of `target_column` and its relative frequency.

    Classes are counted with vectorised comparisons on the target column
    rather than row-by-row iteration, so the labels keep their original dtype.

    Args:
        table: DataFrame holding the rows that ended up in the leaf.
        target_column: name of the class-label column.

    Returns:
        (value, probability). `value` is the most frequent class; ties go to
        the class that appears first in the column. `probability` is that
        class's count divided by the total count. For a table with no
        countable rows the result is (None, 0.0).
    """
    labels = table[target_column]
    best_label = None
    best_count = 0
    total = 0
    for label in labels.unique():
        count = int((labels == label).sum())
        total += count
        if best_count < count:
            best_count = count
            best_label = label
    if total == 0:
        # nothing to vote on; avoid dividing by zero
        return None, 0.0
    return best_label, best_count / float(total)

In [762]:
def build_tree(table):
    """Recursively build a decision tree for `table`.

    Uses the module-level constants TARGET_COLUMN, IS_CATEGORICAL,
    THRESHOLD_SIZE and THRESHOLD_VALUE.

    A Leaf (holding the result of get_value_for_leaf) is returned when:
      * the table has at most THRESHOLD_SIZE rows,
      * the best information gain is at most THRESHOLD_VALUE, or
      * the chosen split makes no progress (a child is empty or contains every
        row of the parent), which would otherwise recurse forever.

    Otherwise a Node is returned. For a categorical split `condition` is the
    column name and `decisions` holds one column value per child; for a
    continuous split `condition` is `[value, column_name]`, `decisions` is
    `[None, None]`, and the children are the `<= value` and `> value` halves.
    """
    if len(table) <= THRESHOLD_SIZE:
        return Leaf(get_value_for_leaf(table, TARGET_COLUMN))

    # step 1: find the best split; stop before doing any splitting work if it is not worth it
    _, column_name, value, best_IG = get_best_column(table, TARGET_COLUMN, IS_CATEGORICAL)
    if best_IG <= THRESHOLD_VALUE:
        return Leaf(get_value_for_leaf(table, TARGET_COLUMN))

    # step 2: partition the rows
    if value is None:
        # categorical column: one child per distinct value
        sub_tables = split_dataset_wrt_column(table, column_name)
        decisions = [sub_table.iloc[0][column_name] for sub_table in sub_tables]
        condition = column_name
    else:
        # continuous column: binary split around the threshold
        lower_table, upper_table = split_table_wrt_value(table, value, column_name)
        sub_tables = [lower_table, upper_table]
        decisions = [None, None]
        condition = [value, column_name]

    # a split that leaves a child empty or identical in size to the parent cannot make progress
    parent_size = len(table)
    if any(len(sub_table) == 0 or len(sub_table) == parent_size for sub_table in sub_tables):
        return Leaf(get_value_for_leaf(table, TARGET_COLUMN))

    # step 3: build a subtree for every partition and link them under this node
    children = [build_tree(sub_table) for sub_table in sub_tables]
    return Node(condition, children, decisions)

In [763]:
# root = build_tree(example_dataset) # before running this cell change the IS_CATEGORICAL array

In [764]:
# print(root.condition)
# for child in root.children:
#     print(child.condition)
#     if (child.condition == "THIS IS A LEAF NODE!!"):
#         print(child.leaf_value)

In [765]:
# root = build_tree(dataset4)

In [766]:
# print(root.condition)
# for child in root.children:
#     print(child.condition)
#     if (child.condition == "THIS IS A LEAF NODE!!"):
#         print(child.leaf_value)

In [767]:

def print_tree(root, spacing=""):
    """Print the tree rooted at `root`, one node per line.

    Leaves print their `leaf_value`; other nodes print their `condition` and
    `decisions`. Each level of depth adds one "--> " to the line prefix.
    """
    if not isinstance(root, Leaf):
        print(spacing, root.condition, root.decisions)
        deeper = spacing + "--> "
        for subtree in root.children:
            print_tree(subtree, deeper)
    else:
        print(spacing, root.leaf_value)

    
    

In [768]:
# print_tree(root)

In [769]:
def find_index(arr, x):
    """Return the first position in `arr` whose element equals `x`, or None if there is none."""
    return next((position for position in range(len(arr)) if arr[position] == x), None)

In [770]:
def predict(row, root):
    """Walk the tree from `root` and return the leaf value reached for `row`.

    Args:
        row: DataFrame whose row labelled 0 holds the sample's feature values.
        root: a Node or Leaf as produced by build_tree().

    Returns:
        The `leaf_value` of the leaf that is reached.

    Raises:
        ValueError: if a categorical node has no branch for the sample's value.
    """
    # base case
    if isinstance(root, Leaf):
        return root.leaf_value

    condition = root.condition
    if isinstance(condition, list):
        # continuous node: condition is [threshold, column_name];
        # child 0 holds the `<= threshold` rows, child 1 the `> threshold` rows
        threshold, column_name = condition
        branch = 0 if row.at[0, column_name] <= threshold else 1
        return predict(row, root.children[branch])

    # categorical node: follow the child whose decision value matches
    value_to_check = row.at[0, condition]
    child_index = find_index(root.decisions, value_to_check)
    if child_index is None:
        raise ValueError(
            "no branch for value %r of column %r" % (value_to_check, condition))
    return predict(row, root.children[child_index])

        


In [771]:
# test_data = pd.DataFrame({
#     'Outlook': ['rainy'],
#     'Company': ['big'],
#     'Sailboat': ['small']
# })

In [772]:
# print(predict(test_data, root))

In [773]:
# Load the preprocessed dataset from CSV.
# NOTE(review): the absolute /content/... path looks environment-specific (presumably Colab) -- confirm or adjust before running elsewhere.
dataset5 = pd.read_csv(r'/content/sample_data/processed_covid_data.csv')
# Show the dtype of every column.
print(dataset5.dtypes)

retail_and_recreation    float64
grocery_and_pharmacy     float64
residential              float64
transit_stations         float64
parks                    float64
workplaces               float64
new_cases_classes          int64
dtype: object


In [774]:
# Display the loaded table as the cell output.
dataset5

Unnamed: 0,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,new_cases_classes
0,0.667,1.667,0.000,2.000,3.000,3.000,0
1,0.500,1.750,0.000,2.000,3.250,3.000,0
2,0.400,1.800,0.200,1.800,2.800,3.200,0
3,0.500,2.000,0.000,2.333,3.167,3.333,0
4,-0.143,1.714,0.714,1.429,3.571,0.143,0
...,...,...,...,...,...,...,...
466,-61.714,-25.000,24.143,-49.143,-41.000,-45.429,200
467,-61.286,-24.429,23.714,-48.714,-40.000,-44.571,200
468,-61.143,-24.714,23.714,-49.000,-39.143,-44.286,201
469,-60.143,-23.429,23.286,-48.286,-38.000,-43.429,206


In [775]:
# Fit the decision tree on dataset5; build_tree reads TARGET_COLUMN, IS_CATEGORICAL and the two THRESHOLD_* constants.
root = build_tree(dataset5)

In [776]:
# Print the fitted tree; each level of depth adds a "--> " prefix.
print_tree(root)

 [4.9285, 'grocery_and_pharmacy'] [None, None]
-->  [-47.6425, 'parks'] [None, None]
--> -->  [-69.5715, 'retail_and_recreation'] [None, None]
--> --> -->  [-45.0715, 'grocery_and_pharmacy'] [None, None]
--> --> --> -->  (0, 1.0)
--> --> --> -->  [-77.643, 'retail_and_recreation'] [None, None]
--> --> --> --> -->  (1, 1.0)
--> --> --> --> -->  [-73.786, 'retail_and_recreation'] [None, None]
--> --> --> --> --> -->  (2, 1.0)
--> --> --> --> --> -->  (3, 1.0)
--> --> -->  [-52.929, 'retail_and_recreation'] [None, None]
--> --> --> -->  [-39.2145, 'transit_stations'] [None, None]
--> --> --> --> -->  [-62.4285, 'retail_and_recreation'] [None, None]
--> --> --> --> --> -->  (4, 1.0)
--> --> --> --> --> -->  [-57.357, 'retail_and_recreation'] [None, None]
--> --> --> --> --> --> -->  [-9.785499999999999, 'grocery_and_pharmacy'] [None, None]
--> --> --> --> --> --> --> -->  [-60.714, 'retail_and_recreation'] [None, None]
--> --> --> --> --> --> --> --> -->  (6, 0.75)
--> --> --> --> --> --> 