In [77]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [78]:
# Load the dataset in this cell so it runs on a fresh kernel: the original
# version read `X` and `iris`, which are only assigned in a later cell.
iris = load_iris()

# One row per flower: the four measurements plus the integer class label.
df = pd.DataFrame(data=iris['data'], columns=iris.feature_names)
df['target'] = iris['target']

df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [79]:
# Preview the first rows of the feature table (measurements plus `target`).
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [80]:
# Pull the feature matrix and the label vector out of the iris bundle.
iris = load_iris()
X, y = iris['data'], iris['target']

# First cut: keep 80% for training and set aside 20%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32
)

# Second cut: halve the set-aside portion into a test set and a validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=32
)

# Show the labels that ended up in the test and validation sets.
print(y_test)
print(y_val)

[0 0 0 2 0 1 2 1 0 0 0 2 2 2 0]
[1 1 0 1 2 2 0 1 2 1 2 0 1 1 0]


In [81]:
def calculate_entropy(y):
    """Return the Shannon entropy (in bits) of the class labels in ``y``.

    Labels are counted with ``np.unique`` rather than ``np.bincount``, so any
    label type that NumPy can sort works (non-negative ints as before, but
    also negative ints, floats or strings).

    Parameters
    ----------
    y : array-like
        Class labels, one per sample.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the labels present in ``y``; ``0.0`` for
        an empty input or when only one distinct label is present.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples means no uncertainty; also avoids dividing by zero.
        return 0.0

    # Only labels that actually occur are returned, so every probability is
    # strictly positive and log2 is always defined.
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size

    entropy = -np.sum(probabilities * np.log2(probabilities))
    # Adding 0.0 turns the "-0.0" produced by a single-label input into 0.0.
    return entropy + 0.0

In [83]:
def split_data(X, y, feature_index, threshold):
    """Partition sample positions by comparing one feature against a threshold.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, feature_index]``.
    y : unused here; kept so the signature stays stable for callers.
    feature_index : column of ``X`` to compare.
    threshold : value the column is compared against.

    Returns
    -------
    (left_indices, right_indices) : positions where the feature is
    ``<= threshold`` and positions where it is ``> threshold``.
    """
    column = X[:, feature_index]

    # Two separate comparisons, so a value that satisfies neither test
    # (e.g. NaN) lands on neither side.
    at_or_below = np.flatnonzero(column <= threshold)
    above = np.flatnonzero(column > threshold)

    return at_or_below, above

In [84]:
def find_best_split(X, y):
    """Exhaustively search for the split with the largest information gain.

    Every distinct value of every feature is tried as a threshold. A
    candidate is scored by the parent entropy minus the size-weighted
    entropy of the two children it produces.

    Returns
    -------
    (feature_index, threshold) of the best candidate, or ``(None, None)``
    when no candidate yields a strictly positive gain.
    """
    total, n_features = X.shape
    base_entropy = calculate_entropy(y)

    # A split must beat a gain of zero to be accepted.
    top_gain = 0
    top_feature = None
    top_threshold = None

    for col in range(n_features):
        for candidate in np.unique(X[:, col]):
            lhs, rhs = split_data(X, y, col, candidate)

            # A split that leaves one side empty separates nothing.
            if len(lhs) == 0 or len(rhs) == 0:
                continue

            lhs_entropy = calculate_entropy(y[lhs])
            rhs_entropy = calculate_entropy(y[rhs])

            # Weight each child's entropy by its share of the samples.
            children_entropy = (len(lhs) / total) * lhs_entropy + (len(rhs) / total) * rhs_entropy
            gain = base_entropy - children_entropy

            # Strict comparison: the first candidate reaching a given gain wins.
            if gain > top_gain:
                top_gain, top_feature, top_threshold = gain, col, candidate

    return top_feature, top_threshold

In [85]:
def _majority_label(y):
    """Return the most frequent label in ``y`` (the smallest label wins ties)."""
    labels, counts = np.unique(y, return_counts=True)
    return labels[counts.argmax()]


def build_decision_tree(X, y, max_depth, current_depth=0):
    """Recursively grow a decision tree over ``X`` / ``y``.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of class labels aligned with the rows of ``X``.
    max_depth : maximum number of split levels below the root.
    current_depth : depth of the node being built (callers normally leave
        this at its default of 0).

    Returns
    -------
    Either a leaf (the majority class label of ``y``) or a dict with keys
    ``'feature_index'``, ``'threshold'``, ``'left_subtree'`` and
    ``'right_subtree'``.

    Raises
    ------
    ValueError
        If ``y`` is empty. The previous implementation also raised
        ``ValueError`` here, but with an opaque "argmax of an empty
        sequence" message coming from NumPy.
    """
    num_samples = len(y)
    if num_samples == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # Stop at the depth limit or once the node holds a single class.
    if current_depth >= max_depth or len(np.unique(y)) == 1:
        return _majority_label(y)

    feature_index, threshold = find_best_split(X, y)
    if feature_index is None:
        # No split improves on the parent, so this node becomes a leaf.
        return _majority_label(y)

    left_indices, right_indices = split_data(X, y, feature_index, threshold)

    left_subtree = build_decision_tree(X[left_indices], y[left_indices], max_depth, current_depth + 1)
    right_subtree = build_decision_tree(X[right_indices], y[right_indices], max_depth, current_depth + 1)

    return {
        'feature_index': feature_index,
        'threshold': threshold,
        'left_subtree': left_subtree,
        'right_subtree': right_subtree
    }


def predict_sample(sample, tree):
    """Classify one sample by walking ``tree`` from the root down to a leaf.

    Internal nodes are dicts holding ``'feature_index'``, ``'threshold'``,
    ``'left_subtree'`` and ``'right_subtree'``; anything that is not a dict
    is treated as a leaf and returned as the prediction.
    """
    node = tree
    while isinstance(node, dict):
        # Go left when the tested feature is at or below the threshold.
        goes_left = sample[node['feature_index']] <= node['threshold']
        node = node['left_subtree'] if goes_left else node['right_subtree']
    return node


# Grow the hand-written tree on the training split, capped at three levels.
max_depth = 3
decision_tree = build_decision_tree(X_train, y_train, max_depth=max_depth)

# Classify every test row by walking the fitted tree.
y_pred = [predict_sample(row, decision_tree) for row in X_test]

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9333333333333333


In [86]:
# Show the fitted tree: internal nodes are dicts (feature_index, threshold,
# left_subtree, right_subtree) and leaves are plain class labels.
decision_tree

{'feature_index': 2,
 'threshold': 1.9,
 'left_subtree': 0,
 'right_subtree': {'feature_index': 3,
  'threshold': 1.7,
  'left_subtree': {'feature_index': 2,
   'threshold': 4.9,
   'left_subtree': 1,
   'right_subtree': 2},
  'right_subtree': {'feature_index': 2,
   'threshold': 4.8,
   'left_subtree': 1,
   'right_subtree': 2}}}

<!DOCTYPE html>
<html>
<head>
    <title>Decision Tree Visualization</title>
    <style>
        ul {
            list-style-type: none;
        }
        li::before {
            content: "• ";
            color: #555;
        }
        .decision {
            font-weight: bold;
        }
        .prediction {
            color: green;
            font-weight: bold;
        }
        .condition {
            color: blue;
        }
    </style>
</head>
<body>

<h2>Decision Tree</h2>
<ul>
    <li>
        <span class="decision">Is Petal Length (cm) ≤ 1.9?</span>
        <ul>
            <li>
                <span class="condition">Yes</span> ➔ Predict <span class="prediction">Class 0</span> (Iris-setosa)
            </li>
            <li>
                <span class="condition">No</span> ➔
                <ul>
                    <li>
                        <span class="decision">Is Petal Width (cm) ≤ 1.7?</span>
                        <ul>
                            <li>
                                <span class="condition">Yes</span> ➔
                                <ul>
                                    <li>
                                        <span class="decision">Is Petal Length (cm) ≤ 4.9?</span>
                                        <ul>
                                            <li>
                                                <span class="condition">Yes</span> ➔ Predict <span class="prediction">Class 1</span> (Iris-versicolor)
                                            </li>
                                            <li>
                                                <span class="condition">No</span> ➔ Predict <span class="prediction">Class 2</span> (Iris-virginica)
                                            </li>
                                        </ul>
                                    </li>
                                </ul>
                            </li>
                            <li>
                                <span class="condition">No</span> ➔
                                <ul>
                                    <li>
                                        <span class="decision">Is Petal Length (cm) ≤ 4.8?</span>
                                        <ul>
                                            <li>
                                                <span class="condition">Yes</span> ➔ Predict <span class="prediction">Class 1</span> (Iris-versicolor)
                                            </li>
                                            <li>
                                                <span class="condition">No</span> ➔ Predict <span class="prediction">Class 2</span> (Iris-virginica)
                                            </li>
                                        </ul>
                                    </li>
                                </ul>
                            </li>
                        </ul>
                    </li>
                </ul>
            </li>
        </ul>
    </li>
</ul>

</body>
</html>


### 2. Decision Tree in a Hurry

In [88]:
from sklearn.tree import DecisionTreeClassifier

# Fit scikit-learn's tree with the same depth limit as the hand-written one.
tree_classifier = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Score it on the same test split used for the hand-written tree.
y_pred = tree_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9333333333333333


In [92]:
# NOTE(review): this cell repeats the scikit-learn baseline cell directly above
# (same estimator settings, same training and test data). Consider removing it,
# or scoring on X_val / y_val, which are created earlier but never used.
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree with a fixed seed so the fit is repeatable.
tree_classifier = DecisionTreeClassifier(max_depth=3, random_state=42)  # Adjust max_depth as needed

tree_classifier.fit(X_train, y_train)

# Predict labels for the test split.
y_pred = tree_classifier.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9333333333333333
