In [1]:
import pandas as pd
import math

In [2]:
class Node:
    """One vertex of the decision tree, together with the edge that leads into it.

    A node stores its own payload (`value`), the attribute it tests when it is a
    decision node (`attribute`), its outgoing branches (`children`), and the
    label of the incoming edge from its parent (`operator` + `split_value`).
    """

    def __init__(self, value=None):
        # Payload supplied by the caller (defaults to None).
        self.value = value
        # Attribute tested at this node; stays None until a caller assigns it.
        self.attribute = None
        # Incoming edge description: comparison operator and the value compared against.
        self.operator = None
        self.split_value = None
        # Sub-trees reachable from this node, in insertion order.
        self.children = []

    def append_child(self, node):
        """Attach `node` as the last child of this node."""
        self.children += [node]

    def set_prev_edge(self, value, operator):
        """Record the incoming edge (`operator`, `value`) and return this node for chaining."""
        self.split_value, self.operator = value, operator
        return self


In [3]:
def get_entropy(data_partition: pd.DataFrame, predicting_attribute: str):
    """Return the Shannon entropy (in bits) of one column of `data_partition`.

    Args:
        data_partition: Rows whose label distribution is measured.
        predicting_attribute: Name of the column holding the class labels.

    Returns:
        The negated sum of p * log2(p) over the relative frequency p of every
        distinct label in the column.
    """
    class_shares = data_partition[predicting_attribute].value_counts(normalize=True)
    # Zero shares are skipped because log2(0) is undefined.
    weighted_logs = [share * math.log2(share) for share in class_shares if share > 0]
    return -sum(weighted_logs)

In [4]:
def select_attribute_with_highest_gain(data_partition: pd.DataFrame, attribute_list: list, predicting_attribute: str):
    """Choose the attribute (and threshold, for numeric attributes) with the highest information gain.

    Numeric, non-boolean columns are treated as continuous: every midpoint
    between two consecutive distinct values is tried as a binary `<=` / `>`
    threshold. All other columns are treated as discrete and split into one
    subset per distinct value.

    Args:
        data_partition: Rows to be split.
        attribute_list: Names of the candidate columns.
        predicting_attribute: Name of the column holding the class labels.

    Returns:
        A tuple `(best_attribute, best_split_point)`. `best_split_point` is
        None for a discrete attribute. `best_attribute` is None when no
        candidate split exists (e.g. `attribute_list` is empty, or every
        candidate is a numeric column with fewer than two distinct values).
    """
    total_rows = len(data_partition)
    base_entropy = get_entropy(data_partition, predicting_attribute)
    best_gain = -1
    best_attribute = None
    best_split_point = None

    for attribute in attribute_list:
        column = data_partition[attribute]
        # Any numeric dtype (int32, float32, nullable ints, ...) is continuous,
        # not only int64/float64. Booleans stay discrete.
        is_continuous = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )

        if is_continuous:
            # Missing values are dropped so that no NaN threshold is generated: a NaN
            # threshold puts every row in neither partition and would look like a
            # perfect split. Working in float64 avoids overflow when averaging
            # narrow integer dtypes.
            candidates = column.dropna().astype('float64').sort_values().unique()
            for lower, upper in zip(candidates[:-1], candidates[1:]):
                split_point = (lower + upper) / 2
                left_partition = data_partition[column <= split_point]
                right_partition = data_partition[column > split_point]

                weighted_entropy = (
                    (len(left_partition) / total_rows) * get_entropy(left_partition, predicting_attribute) +
                    (len(right_partition) / total_rows) * get_entropy(right_partition, predicting_attribute)
                )
                gain = base_entropy - weighted_entropy

                # Strict comparison: on ties the earliest candidate wins.
                if gain > best_gain:
                    best_gain = gain
                    best_attribute = attribute
                    best_split_point = split_point
        else:
            weighted_entropy = 0
            for value in column.unique():
                subset = data_partition[column == value]
                weighted_entropy += (len(subset) / total_rows) * get_entropy(subset, predicting_attribute)
            gain = base_entropy - weighted_entropy

            if gain > best_gain:
                best_gain = gain
                best_attribute = attribute
                best_split_point = None

    return best_attribute, best_split_point

In [5]:
def generate_decision_tree(data_partition: pd.DataFrame, attribute_list: list, predicting_attribute: str):
    """Recursively build a decision tree for `predicting_attribute`.

    Args:
        data_partition: Training rows for the current sub-tree.
        attribute_list: Names of the attributes still available for splitting.
            The caller's list is not modified.
        predicting_attribute: Name of the column holding the class labels.

    Returns:
        The root `Node` of the (sub-)tree. Leaves carry a class label in
        `value`; decision nodes carry the tested attribute name in both
        `value` and `attribute`, and each child records its incoming edge via
        `set_prev_edge` (`'='` for discrete values, `'<='` / `'>'` for a
        numeric threshold).
    """
    labels = data_partition[predicting_attribute]

    # All rows agree on the label: nothing left to decide.
    if labels.nunique() == 1:
        return Node(labels.iloc[0])  # Leaf

    # No attribute left to split on: fall back to the most frequent label.
    if len(attribute_list) == 0:
        return Node(labels.mode()[0])

    best_attribute, split_point = select_attribute_with_highest_gain(data_partition, attribute_list, predicting_attribute)

    # The selector reports no attribute when none of the candidates offers a usable
    # split (e.g. every remaining numeric attribute is constant in this partition).
    # Splitting is impossible then, so emit a majority leaf instead of indexing
    # the frame with None.
    if best_attribute is None:
        return Node(labels.mode()[0])

    result_node = Node(value=best_attribute)
    result_node.attribute = best_attribute
    # Each attribute, continuous ones included, is used at most once per root-to-leaf path.
    remaining_attributes = [attr for attr in attribute_list if attr != best_attribute]

    if split_point is None:  # Discrete attribute: one branch per distinct value
        for distinct_value in data_partition[best_attribute].unique():
            subset = data_partition[data_partition[best_attribute] == distinct_value]
            if subset.empty:
                # No row matched the value (e.g. a NaN category, which never compares
                # equal): label the branch with the partition's majority class.
                majority_value = labels.mode()[0]
                result_node.append_child(Node(majority_value).set_prev_edge(distinct_value, '='))
            else:
                child = generate_decision_tree(subset, remaining_attributes, predicting_attribute)
                result_node.append_child(child.set_prev_edge(distinct_value, '='))
    else:  # Continuous attribute: binary split around the threshold
        left_partition = data_partition[data_partition[best_attribute] <= split_point]
        right_partition = data_partition[data_partition[best_attribute] > split_point]

        left_child = generate_decision_tree(left_partition, remaining_attributes, predicting_attribute)
        right_child = generate_decision_tree(right_partition, remaining_attributes, predicting_attribute)

        # Child order matters: index 0 is the '<=' branch, index 1 the '>' branch.
        result_node.append_child(left_child.set_prev_edge(split_point, '<='))
        result_node.append_child(right_child.set_prev_edge(split_point, '>'))

    return result_node

In [6]:
# Load the weather dataset; its 'Rain' column is the class label used for training below.
df = pd.read_csv('./datasets/weather_forecast_data.csv')
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure,Rain
0,23.720338,89.592641,7.335604,50.501694,1032.378759,rain
1,27.879734,46.489704,5.952484,4.990053,992.61419,no rain
2,25.069084,83.072843,1.371992,14.855784,1007.23162,no rain
3,23.62208,74.367758,7.050551,67.255282,982.632013,rain
4,20.59137,96.858822,4.643921,47.676444,980.825142,no rain


In [7]:
# Every column except the 'Rain' label is a candidate split attribute.
attribute_list = [col for col in df.columns if col != 'Rain']
# Train on the first 2000 rows (80 % of the data according to the original note); the
# remaining rows are held out and used further down to evaluate the model's predictions.
root = generate_decision_tree(df[:2000], attribute_list, 'Rain')

In [8]:
def predict(tree: "Node", input_data: dict, default_value=None):
    """Classify one sample by walking the decision tree from `tree` to a leaf.

    Args:
        tree: Root node of the decision tree.
        input_data: Mapping-like object (dict, pandas Series, ...) indexed by
            attribute name.
        default_value: Returned when a discrete decision node has no branch
            whose value equals the sample's value for that attribute.

    Returns:
        The `value` of the node where the walk stops, or `default_value` as
        described above.

    Raises:
        ValueError: If a decision node's first edge has an unknown operator,
            or a threshold edge has no split value.
        KeyError: If `input_data` lacks an attribute tested on the path
            (raised by the lookup itself).
    """
    current_node = tree

    # Stop at leaves (no children) and at nodes that do not test any attribute.
    while current_node.children and current_node.attribute is not None:
        first_edge = current_node.children[0]

        if first_edge.operator == '=':
            # Discrete split: follow the branch labelled with the sample's value.
            observed = input_data[current_node.attribute]
            for child in current_node.children:
                if child.split_value == observed:
                    current_node = child
                    break
            else:
                # Value never seen at this node while training.
                return default_value
        elif first_edge.split_value is None:
            raise ValueError(f"Unexpected node structure for {current_node.attribute}")
        elif first_edge.operator == '<=':
            # Binary threshold split stored as ['<=' child, '>' child].
            observed = input_data[current_node.attribute]
            took_first = observed <= first_edge.split_value
            current_node = current_node.children[0 if took_first else 1]
        elif first_edge.operator == '>':
            # Binary threshold split stored in reverse order: ['>' child, '<=' child].
            observed = input_data[current_node.attribute]
            took_first = observed > first_edge.split_value
            current_node = current_node.children[0 if took_first else 1]
        else:
            # Unknown operator: fail loudly instead of looping forever.
            raise ValueError(f"Unexpected node structure for {current_node.attribute}")

    return current_node.value

In [9]:
print('Predicting testing data:')
# Compare the tree's prediction with the true label for the 500 rows labelled 2000..2499,
# i.e. the rows that follow the training slice. Each row is fetched once and reused.
# The f-string is double-quoted so the inner ['Rain'] lookup also parses on Python < 3.12.
# NOTE(review): 'Predited' is a typo for 'Predicted'; kept as-is so the output matches the recorded cell.
[
    f"Predited: {predict(root, test_row)}, Expected: {test_row['Rain']}"
    for test_row in (df.loc[2000 + i] for i in range(0, 500))
]

Predicting testing data:


['Predited: no rain, Expected: no rain',
 'Predited: rain, Expected: rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: rain, Expected: rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: rain, Expected: rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Predited: rain, Expected: rain',
 'Predited: rain, Expected: rain',
 'Predited: no rain, Expected: no rain',
 'Predited: no rain, Expected: no rain',
 'Pre