In [1]:
# importing packages
import math
import numpy as np
import pandas as pd

In [2]:
# helper function for getting probability from frequency table
# this function is used in get_entropy()
def get_probability(event_info):
  SUM = sum(event_info)
  for i in range(len(event_info)):
        event_info[i] /= SUM
  return event_info


# this function gets entropy from frequency table
def get_entropy(event_info):
  probabilities = get_probability(event_info)
  # print('probabilities', probabilities)
  entropy = 0.0
  for p in probabilities:
    if p != 0:
      entropy += p * math.log(1 / p) / math.log(2)
  return entropy

# this function gets gini impurity from frequency table
def get_gini_impurity(event_info):
  probabilities = get_probability(event_info)
  # print('probabilities', probabilities)
  entropy = 0.0
  for p in probabilities:
    if p != 0:
      entropy += p * (1.0 - p)
  return entropy


In [3]:
#this funtion sort table by column
def sort_table_by_column(table, col):
    return table.sort_values(by = [col]).reset_index(drop=True)

In [4]:
# for importing the dataset as a numpy array
dataset1 = pd.read_csv(r"covid_dataset.csv")
dataset1 = dataset1[dataset1['location'] == 'India']
# dataset1 = dataset1.to_numpy()
dataset2 = pd.read_csv(r"changes-visitors-covid.csv")
dataset2 = dataset2[dataset2['Entity'] == 'India']
# dataset2 = dataset2.to_numpy()
dataset3 = pd.read_csv(r'continuous_attribute_table.csv')

In [5]:
dataset = pd.merge(dataset1, dataset2, on = 'date', how = 'inner')

In [6]:
example_dataset = dataset1 = pd.read_csv(r"example_data.csv")

In [7]:
print(example_dataset)

  Outlook Company Sailboat target
0   sunny     big    small    yes
1   sunny     med    small    yes
2   sunny     med      big    yes
3   sunny      no    small    yes
4   sunny     big      big    yes
5   rainy      no    small     no
6   rainy     med    small    yes
7   rainy     big      big    yes
8   rainy      no      big     no
9   rainy     med      big     no


In [8]:
def split_dataset_wrt_column(dataset, column_name):
  unique_items = dataset[column_name].unique()
  for item in unique_items:
    yield dataset[dataset[column_name] == item]

In [9]:
class node:
  def __init__(self):
    self.table = None
    self.spliting_feature = None
    self.childs = []
  def add_child(self, child):
    self.childs.append(child)

In [10]:
def get_count(table, target_column, class_name):
  # print(class_name)
  # ans =  (table[target_column] == class_name).shape[0]
  ans = (table[target_column] == class_name).sum()
  # print (ans)
  return ans

In [11]:
def get_entropy_from_table(table, target_column):
  unique_classes = table[target_column].unique()
  # print(unique_classes)
  counts = []
  for class_name in unique_classes:
    counts.append(get_count(table,target_column, class_name))
  # print("Count is ", counts)
  return get_entropy(counts)
  

In [12]:
# this function returns the information gain of the column "column" when the target column is "target_column" of the table dataset
# only for categorical column or attribute
def get_information_gain(dataset, column, target_column):
  tables = []
  size_table = []
  overall_size = dataset.shape[0]
  for table in split_dataset_wrt_column(dataset, column):
    tables.append(table)
    size_table.append(table.shape[0])
  entropies = []
  for table in tables:
    print(table)
    entropies.append(get_entropy_from_table(table, target_column))
  # print("entropies=", entropies)
  # entropies = [get_entropy_from_table(table, target_column) for table in tables]
  # print(entropies)
  entropy_initial = get_entropy_from_table(dataset, target_column)    # entropy without splitting
  # print("entropy_intial=",entropy_initial)
  entropy = sum([(size / overall_size) * entropyi for size, entropyi in zip(size_table, entropies)])  # entropy after splitting
  return (entropy_initial - entropy)


In [13]:
get_information_gain(example_dataset, 'Sailboat', 'target')

  Outlook Company Sailboat target
0   sunny     big    small    yes
1   sunny     med    small    yes
3   sunny      no    small    yes
5   rainy      no    small     no
6   rainy     med    small    yes
  Outlook Company Sailboat target
2   sunny     med      big    yes
4   sunny     big      big    yes
7   rainy     big      big    yes
8   rainy      no      big     no
9   rainy     med      big     no


0.034851554559677034

In [14]:
get_information_gain(example_dataset, 'Sailboat', 'target')

  Outlook Company Sailboat target
0   sunny     big    small    yes
1   sunny     med    small    yes
3   sunny      no    small    yes
5   rainy      no    small     no
6   rainy     med    small    yes
  Outlook Company Sailboat target
2   sunny     med      big    yes
4   sunny     big      big    yes
7   rainy     big      big    yes
8   rainy      no      big     no
9   rainy     med      big     no


0.034851554559677034

In [15]:
def get_value_with_min_entropy_wrt_continuous_column(table, column, target_column):
    # step 1: sort the table
    new_table = sort_table_by_column(table, column)
    # print(new_table)
    # step 2: get various averages
    avg_array = []
    length_new_table = len(new_table)
    for i in range(length_new_table - 1):
        avg_array.append((new_table.at[i,column] + new_table.at[i + 1, column]) / 2)
    
    # print(avg_array)
    # step 3: count before and after averages
    IGs = []
    for i in range(length_new_table - 1):
        table1 = new_table.iloc[:i + 1,:]
        table2 = new_table.iloc[i + 1:, :]
        # print('Table 1')
        # print(table1)
        # print('Table 2')
        # print(table2)
        E1 = get_entropy_from_table(table1, target_column)
        E2 = get_entropy_from_table(table2, target_column)
        # print('E1=',E1, 'E2=', E2)
        E = (len(table1) / len(table)) * E1 + (len(table2) / len(table)) * E2
        IG = get_entropy_from_table(new_table, target_column) - E
        IGs.append(IG)
    print(IGs)
    # step 4: calculate the entropy wrt each average
    # step 5: determine the best split with most information gain
    

In [16]:
get_value_with_min_entropy_wrt_continuous_column(dataset3, 'Weight', 'Heart_disease')

[0.3219280948873625, 0.019973094021975113, 0.41997309402197514, 0.17095059445466876]


In [17]:
dataset3

Unnamed: 0,Weight,Heart_disease
0,155,No
1,190,No
2,220,Yes
3,225,Yes
4,180,Yes
