# Imports

In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read data

In [47]:
dataset = pd.read_csv('dataset.csv')
print(dataset)

            date  precipitation  temp_max  temp_min  wind  weather
0     2012-01-01            0.0      12.8       5.0   4.7  drizzle
1     2012-01-02           10.9      10.6       2.8   4.5     rain
2     2012-01-03            0.8      11.7       7.2   2.3     rain
3     2012-01-04           20.3      12.2       5.6   4.7     rain
4     2012-01-05            1.3       8.9       2.8   6.1     rain
...          ...            ...       ...       ...   ...      ...
1456  2015-12-27            8.6       4.4       1.7   2.9     rain
1457  2015-12-28            1.5       5.0       1.7   1.3     rain
1458  2015-12-29            0.0       7.2       0.6   2.6      fog
1459  2015-12-30            0.0       5.6      -1.0   3.4      sun
1460  2015-12-31            0.0       5.6      -2.1   3.5      sun

[1461 rows x 6 columns]


# Split in train and test

In [48]:
tot_input = dataset.iloc[:, 1:-1]
tot_output = dataset.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(tot_input, tot_output, test_size = 0.2, random_state = 42)

In [49]:
print(tot_output.unique())

['drizzle' 'rain' 'sun' 'snow' 'fog']


# Entropy Decision Tree

In [50]:
class EntropyDT:
    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.root_node = None

    class Node:
        def __init__(self, data, target, depth):
            self.data = data
            self.target = target
            self.depth = depth
            self.children = {}
            self.split_attribute = None
            self.is_leaf = False
            self.label = None

    def entropy(self, target):
        values, counts = np.unique(target, return_counts=True)
        probabilities = counts / len(target)
        entropy_value = -np.sum(probabilities * np.log2(probabilities + 1e-10))
        return entropy_value

    def information_gain(self, data, target, attribute):
        unique_values = data[attribute].unique()
        entropy_before = self.entropy(target)
        entropy_after = 0

        for value in unique_values:
            subset_indices = data[attribute] == value
            subset_target = target[subset_indices]
            entropy_after += len(subset_target) / len(target) * self.entropy(subset_target)

        gain = entropy_before - entropy_after
        return gain

    def id3_algorithm(self, node):
        unique_labels = node.target.unique()

        if len(unique_labels) == 1:
            node.is_leaf = True
            node.label = unique_labels[0]
            return

        if node.depth >= self.max_depth:
            node.is_leaf = True
            node.label = unique_labels[0]  # Choose the most common label at this leaf
            return

        best_gain = 0
        best_attribute = None

        for attribute in node.data.columns:
            gain = self.information_gain(node.data, node.target, attribute)
            if gain > best_gain:
                best_gain = gain
                best_attribute = attribute

        if best_attribute is None:
            node.is_leaf = True
            node.label = unique_labels[0]  # Choose the most common label at this leaf
            return

        node.split_attribute = best_attribute
        unique_values = node.data[best_attribute].unique()

        for value in unique_values:
            subset_indices = node.data[best_attribute] == value
            subset_data = node.data.loc[subset_indices].drop(columns=[best_attribute])
            subset_target = node.target[subset_indices]

            child_node = self.Node(subset_data, subset_target, node.depth + 1)
            node.children[value] = child_node

            self.id3_algorithm(child_node)

    def fit(self, data, target):
        self.root_node = self.Node(data, target, 0)
        self.id3_algorithm(self.root_node)

    def predict_instance(self, node, instance):
        if node.is_leaf:
            return node.label
        else:
            value = instance[node.split_attribute]
            if value in node.children:
                return self.predict_instance(node.children[value], instance)
            else:
                return node.target.mode().iloc[0]

    def predict(self, instances_df):
        predictions = []
    
        for index, row in instances_df.iterrows():
            instance_dict = row.to_dict()
            prediction = self.predict_instance(self.root_node, instance_dict)
            predictions.append(prediction)
    
        return predictions

# Gini Index Decision Tree

In [54]:
class GiniDT:
    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.root_node = None

    class Node:
        def __init__(self, data, target, depth):
            self.data = data
            self.target = target
            self.depth = depth
            self.children = {}
            self.split_attribute = None
            self.is_leaf = False
            self.label = None

    def gini_index(self, target):
        values, counts = np.unique(target, return_counts=True)
        probabilities = counts / len(target)
        gini = 1 - np.sum(probabilities**2)
        return gini

    def gini_index_split(self, data, target, attribute):
        unique_values = data[attribute].unique()
        gini_before = self.gini_index(target)
        gini_after = 0

        for value in unique_values:
            subset_indices = data[attribute] == value
            subset_target = target[subset_indices]
            gini_after += len(subset_target) / len(target) * self.gini_index(subset_target)

        return gini_before - gini_after

    def decision_tree_algorithm(self, node):
        unique_labels = node.target.unique()

        if len(unique_labels) == 1:
            node.is_leaf = True
            node.label = unique_labels[0]
            return

        if node.depth >= self.max_depth:
            node.is_leaf = True
            node.label = unique_labels[0]  # Choose the most common label at this leaf
            return

        best_gini_gain = 0
        best_attribute = None

        for attribute in node.data.columns:
            gini_gain = self.gini_index_split(node.data, node.target, attribute)
            if gini_gain > best_gini_gain:
                best_gini_gain = gini_gain
                best_attribute = attribute

        if best_attribute is None:
            node.is_leaf = True
            node.label = unique_labels[0]  # Choose the most common label at this leaf
            return

        node.split_attribute = best_attribute
        unique_values = node.data[best_attribute].unique()

        for value in unique_values:
            subset_indices = node.data[best_attribute] == value
            subset_data = node.data.loc[subset_indices].drop(columns=[best_attribute])
            subset_target = node.target[subset_indices]

            child_node = self.Node(subset_data, subset_target, node.depth + 1)
            node.children[value] = child_node

            self.decision_tree_algorithm(child_node)

    def fit(self, data, target):
        self.root_node = self.Node(data, target, 0)
        self.decision_tree_algorithm(self.root_node)

    def predict_instance(self, node, instance):
        if node.is_leaf:
            return node.label
        else:
            value = instance[node.split_attribute]
            if value in node.children:
                return self.predict_instance(node.children[value], instance)
            else:
                return node.target.mode().iloc[0]

    def predict(self, instances_df):
        predictions = []

        for index, row in instances_df.iterrows():
            instance_dict = row.to_dict()
            prediction = self.predict_instance(self.root_node, instance_dict)
            predictions.append(prediction)

        return predictions

# Train Entropy Tree

In [58]:
max_depth = 3
tree = EntropyDT(max_depth)
# tree = GiniDT(max_depth)
tree.fit(x_train, y_train)

# Tester

In [59]:
# Example prediction
predictions = tree.predict(x_test)
total = len(predictions)
correct = 0
for i in range(total):
    if(predictions[i] == y_test.iloc[i]): correct+=1

print("Accuracy: ", correct*100/total, "%");

Accuracy:  77.47440273037543 %
