<a href="https://colab.research.google.com/github/Benjamindavid03/MachineLearning/blob/main/rootnode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Root Node Attribute Selection using Information Gain

In [None]:
import pandas as pd
import numpy as np
# Location of the CSV that holds the vegetation dataset used throughout the notebook.
DATA_URL = "https://raw.githubusercontent.com/akmand/datasets/master/FMLPDA_Table4_3.csv"

# Read the remote CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(DATA_URL)
df

Unnamed: 0,stream,slope,elevation,vegetation
0,False,steep,high,chapparal
1,True,moderate,low,riparian
2,True,steep,medium,riparian
3,False,steep,medium,chapparal
4,False,flat,high,conifer
5,True,steep,highest,conifer
6,True,steep,high,chapparal


In [None]:
def compute_impurity(feature, impurity_criterion, decimals=3):
    """Measure how mixed the values of ``feature`` are.

    Parameters
    ----------
    feature : pandas.Series
        Column whose value distribution is measured.
    impurity_criterion : str
        ``'entropy'`` for Shannon entropy (base 2) or ``'gini'`` for the
        Gini index ``1 - sum(p ** 2)``.
    decimals : int or None, default 3
        Number of decimals the result is rounded to. Pass ``None`` to get
        the unrounded value.

    Returns
    -------
    float
        The impurity of ``feature``; ``0.0`` for an empty or pure feature.

    Raises
    ------
    ValueError
        If ``impurity_criterion`` is not one of the supported names.
    """
    probs = feature.value_counts(normalize=True)
    # Categorical columns report unused categories with probability 0;
    # log2(0) * 0 would turn the whole sum into NaN, so drop them first.
    probs = probs[probs > 0]

    if impurity_criterion == 'entropy':
        impurity = -1 * np.sum(np.log2(probs) * probs)
    elif impurity_criterion == 'gini':
        impurity = 1 - np.sum(np.square(probs))
    else:
        raise ValueError('Unknown impurity criterion')

    if decimals is not None:
        impurity = round(impurity, decimals)
    # Adding 0.0 turns a negative zero (pure feature) into a plain 0.0.
    return impurity + 0.0

In [None]:
# Entropy of the 'vegetation' column over the whole dataset.
target_entropy = compute_impurity(
    feature=df['vegetation'],
    impurity_criterion='entropy',
)
target_entropy

1.557

In [None]:
def comp_feature_information_gain(df, target, descriptive_feature, split_criterion, verbose=True):
    """Compute the information gain of splitting ``df`` on one feature.

    The gain is the impurity of ``target`` over all rows minus the
    weighted impurity that remains after partitioning the rows by each
    distinct value of ``descriptive_feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both ``target`` and ``descriptive_feature``.
    target : str
        Name of the target column.
    descriptive_feature : str
        Name of the column to split on.
    split_criterion : str
        Impurity criterion name, passed through to ``compute_impurity``.
    verbose : bool, default True
        When true, print the inputs, the remaining impurity and the gain.

    Returns
    -------
    float
        Impurity of the target minus the remaining impurity after the split.
    """
    if verbose:
        print('target feature:', target)
        print('descriptive_feature:', descriptive_feature)
        print('split criterion:', split_criterion)

    target_entropy = compute_impurity(df[target], split_criterion)
    n_rows = len(df)

    feature_remaining_impurity = 0.0
    for level in df[descriptive_feature].unique():
        df_feature_level = df[df[descriptive_feature] == level]
        entropy_level = compute_impurity(df_feature_level[target], split_criterion)
        # Keep the partition weight exact: rounding it (e.g. 1/7 -> 0.143)
        # makes the weights sum to something other than 1 and biases the gain.
        weight_level = len(df_feature_level) / n_rows
        feature_remaining_impurity += entropy_level * weight_level

    information_gain = target_entropy - feature_remaining_impurity

    if verbose:
        print('remaining impurity:', feature_remaining_impurity)
        print('information gain:', information_gain)
        print('====================')
    return information_gain

In [None]:
split_criterion = 'entropy'
target = 'vegetation'

# Keep every feature's gain instead of overwriting a single variable, so the
# candidates can be compared once the loop has finished.
information_gains = {}
for feature in df.drop(columns=target).columns:
    feature_info_gain = comp_feature_information_gain(df, target, feature, split_criterion)
    information_gains[feature] = feature_info_gain

# The root node is the feature with the largest information gain
# (None when the frame has no descriptive features at all).
root_feature = max(information_gains, key=information_gains.get, default=None)
print('root node attribute:', root_feature)

target feature: vegetation
descriptive_feature: stream
split criterion: entropy
remaining impurity: 1.250322
information gain: 0.306678
target feature: vegetation
descriptive_feature: slope
split criterion: entropy
remaining impurity: 0.9788939999999999
information gain: 0.578106
target feature: vegetation
descriptive_feature: elevation
split criterion: entropy
remaining impurity: 0.6798219999999999
information gain: 0.877178


# References
1. https://towardsdatascience.com/decision-tree-part-2-34b31b1dc328
2. https://towardsdatascience.com/decision-tree-overview-with-no-maths-66b256281e2b
3. https://raw.githubusercontent.com/akmand/datasets/master/FMLPDA_Table4_3.csv
4. https://en.wikipedia.org/wiki/Riparian_zone
5. https://en.wikipedia.org/wiki/Chaparral
6. https://en.wikipedia.org/wiki/Conifer
