In [12]:
import numpy as np
import pandas as pd
from pprint import pprint

# defining entropy


def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i] /
                     np.sum(counts)) for i in range(len(elements))])
    return entropy


def InfoGain(data, split_attribute_name, target_name="class_type"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    # calculating the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name] == vals[i]).
                                                                  dropna()[target_name])for i in range(len(vals))])

    # Calculating infomation gain
    Information_Gain = total_entropy-Weighted_Entropy
    return Information_Gain


def ID3(data, originaldata, features, target_attribute_name="class_type", parent_node_class=None):
    # If all target_values have the same value, return this value
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]

    # if the dataset is empty
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]

    # If the feature space is empty
    elif len(features) == 0:
        return parent_node_class

    # If none of the above condition holds true grow the tree

    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(
            np.unique(data[target_attribute_name], return_counts=True)[1])]

    # Select the feature which best splits the dataset
    item_values = [InfoGain(data, feature, target_attribute_name)
                   for feature in features]  # Return the infogain values
    best_feature_index = np.argmax(item_values)
    best_feature = features[best_feature_index]

    # Create the tree structure
    tree = {best_feature: {}}

    # Remove the feature with the best info gain
    features = [i for i in features if i != best_feature]

    # Grow the tree branch under the root node

    for value in np.unique(data[best_feature]):
        value = value
        sub_data = data.where(data[best_feature] == value).dropna()
        # Recursive call of ID3 algotirthm
        subtree = ID3(sub_data, dataset, features,
                      target_attribute_name, parent_node_class)
        # adding subtree to the main tree
        tree[best_feature][value] = subtree
    return(tree)


def predict(query, tree, default=1):
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result = tree[key][query[key]]
            except:
                return default

            result = tree[key][query[key]]
            if isinstance(result, dict):
                return predict(query, result)
            else:
                return result


def train_test_split(dataset):
    training_data = dataset.iloc[:80].reset_index(drop=True)
    testing_data = dataset.iloc[80:].reset_index(drop=True)
    return training_data, testing_data


def test(data, tree):
    queries = data.iloc[:, :-1].to_dict(orient="records")
    predicted = pd.DataFrame(columns=["predicted"])

    # Accuracy calculation

    for i in range(len(data)):
        predicted.loc[i, "predicted"] = predict(queries[i], tree, 1.0)
    print("Accuracy Score:", (np.sum(
        predicted["predicted"] == data["class_type"])/len(data))*100, '%')


dataset = pd.read_csv("./datasets/zoo.csv")
dataset = dataset.drop('animal_name',axis=1)


training_data = train_test_split(dataset)[0]
testing_data = train_test_split(dataset)[1]

tree = ID3(training_data,training_data,training_data.columns[:-1])
pprint(tree)
test(testing_data,tree)


{'legs': {0: {'fins': {0.0: {'toothed': {0.0: 7.0, 1.0: 3.0}},
                       1.0: {'eggs': {0.0: 1.0, 1.0: 4.0}}}},
          2: {'hair': {0.0: 2.0, 1.0: 1.0}},
          4: {'hair': {0.0: {'toothed': {0.0: 7.0, 1.0: 5.0}}, 1.0: 1.0}},
          6: {'aquatic': {0.0: 6.0, 1.0: 7.0}},
          8: 7.0}}
Accuracy Score: 85.71428571428571 %


In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

dataframe = pd.read_csv("./datasets/zoo.csv")
X_train, X_test, y_train, y_test = train_test_split(dataframe.iloc[:, 1:17], dataframe.iloc[:, 17], random_state=0, test_size=0.2)
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

1.0
