In [11]:
import pandas as pd
import numpy as np

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # This is the predicted value if the node is a leaf

def gini_index(groups, classes):
    total_samples = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = (group.iloc[:, -1] == class_val).sum() / size
            score += p * p
        gini += (1.0 - score) * (size / total_samples)
    return gini

def test_split(index, value, dataset):
    left = dataset[dataset.iloc[:, index] < value]
    right = dataset[dataset.iloc[:, index] >= value]
    return left, right

def get_split(dataset):
    class_values = np.unique(dataset.iloc[:, -1])
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(dataset.shape[1]-1):
        for row in dataset.itertuples(index=False):
            groups = test_split(index, getattr(row, dataset.columns[index]), dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, getattr(row, dataset.columns[index]), gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

def to_terminal(group):
    outcomes, counts = np.unique(group.iloc[:, -1], return_counts=True)
    return outcomes[np.argmax(counts)]

def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    if left.shape[0] == 0 or right.shape[0] == 0:
        node['left'] = node['right'] = to_terminal(pd.concat([left, right]))
        return
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    if left.shape[0] <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    if right.shape[0] <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('{}[X{} < {}]'.format(depth*' ', node['index'], node['value']))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('{}[{}]'.format(depth*' ', node))

# Load dataset from CSV file
data = pd.read_csv("C:/Users/shahc/Desktop/Programming/Python programming/Pandas/salaries.csv")

# Example usage:
tree = build_tree(data, 4, 1)
print_tree(tree)

[X0 < google]
 [X0 < abc pharma]
  [nan]
  [nan]
 [X0 < google]
  [nan]
  [nan]


In [6]:
import pandas as pd
import numpy as np

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # This is the predicted value if the node is a leaf

def gini_index(groups, classes):
    total_samples = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:
            continue
        score = 0.0
        for class_val in classes:
            p = (group.iloc[:, -1] == class_val).sum() / size
            score += p * p
        gini += (1.0 - score) * (size / total_samples)
    return gini

def test_split(index, value, dataset):
    left = dataset[dataset.iloc[:, index] < value]
    right = dataset[dataset.iloc[:, index] >= value]
    return left, right

def get_split(dataset):
    class_values = np.unique(dataset.iloc[:, -1])
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(dataset.shape[1]-1):
        for row in dataset.itertuples(index=False):
            groups = test_split(index, getattr(row, dataset.columns[index]), dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, getattr(row, dataset.columns[index]), gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

def to_terminal(group):
    outcomes, counts = np.unique(group.iloc[:, -1], return_counts=True)
    return outcomes[np.argmax(counts)]

def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    if left.shape[0] == 0 or right.shape[0] == 0:
        node['left'] = node['right'] = to_terminal(pd.concat([left, right]))
        return
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    if left.shape[0] <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    if right.shape[0] <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('{}[X{} < {}]'.format(depth*' ', node['index'], node['value']))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('{}[{}]'.format(depth*' ', node))

# Load dataset
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}
tennis_df = pd.DataFrame(data)

# Example usage:
tree = build_tree(tennis_df, 3, 1)
print_tree(tree)


[X0 < Rain]
 [X0 < Overcast]
  [Yes]
  [Yes]
 [X2 < Normal]
  [X0 < Sunny]
   [No]
   [No]
  [X3 < Weak]
   [No]
   [Yes]


In [9]:
pip install graphviz

Defaulting to user installation because normal site-packages is not writeable
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 47.0/47.0 kB 2.3 MB/s eta 0:00:00
Installing collected packages: graphviz
Successfully installed graphviz-0.20.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
import graphviz
def plot_tree(node, dot=None):
    if dot is None:
        dot = graphviz.Digraph(comment='Decision Tree')
    if node.value is not None:
        dot.node(str(id(node)), label=f"Class: {node.value}")
    else:
        dot.node(str(id(node)), label=f"Feature {node.feature} <= {node.threshold}")
        left_node = node.left
        right_node = node.right
        dot = plot_tree(left_node, dot)
        dot = plot_tree(right_node, dot)
        dot.edge(str(id(node)), str(id(left_node)), label='True')
        dot.edge(str(id(node)), str(id(right_node)), label='False')
    return dot
'decision_tree.png'
dot = plot_tree(tree)
dot.render("decision_tree", format="png", cleanup=True)


AttributeError: 'dict' object has no attribute 'value'