In [2]:
import pandas as pd
import numpy as np
from collections import Counter

# Load the leakage records from the Excel workbook (relative path)
df = pd.read_excel('water_leakage_data.xlsx')

# 'Leakage' is the label column; every remaining column is used as a feature
y = df['Leakage'].values
X = df.drop('Leakage', axis=1).values

# Basic Decision Tree Algorithm (recursive split)
class DecisionTree:
    """Classification tree grown by greedy, exhaustive Gini-impurity splits.

    The fitted tree is stored in ``self.tree`` as nested dictionaries. An
    internal node is a dict with the keys ``'feature_index'``, ``'threshold'``,
    ``'left'`` and ``'right'``; a leaf is the bare class label. Samples whose
    feature value is ``<= threshold`` follow ``'left'``, all others ``'right'``.

    Parameters
    ----------
    max_depth : int or None, default 5
        Maximum number of splits on any root-to-leaf path. ``None`` disables
        the depth limit.
    """

    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.tree = None  # populated by fit()

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. Anything ``np.asarray`` accepts (including nested
            lists) is allowed.
        y : array-like of length n_samples
            Class label for every row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, if there are no samples, or if
            ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on zero samples")
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X and y disagree on sample count: {X.shape[0]} vs {y.shape[0]}"
            )
        self.tree = self._build_tree(X, y, depth=0)

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (first seen wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns a leaf label when the node is pure, the depth limit has been
        reached, or no threshold separates the samples; otherwise returns an
        internal-node dict.
        """
        unique_classes = np.unique(y)

        # Pure node: nothing left to separate.
        if len(unique_classes) == 1:
            return unique_classes[0]
        # Depth limit reached (None means "no limit").
        if self.max_depth is not None and depth >= self.max_depth:
            return self._majority_label(y)

        best_split = self._best_split(X, y)
        # No threshold yields two non-empty sides (e.g. all rows share the same
        # feature values while labels differ). Indexing with the None
        # placeholders would add an axis instead of selecting rows, so stop
        # here and vote.
        if best_split['feature_index'] is None:
            return self._majority_label(y)

        left_rows = best_split['left_indices']
        right_rows = best_split['right_indices']

        left_node = self._build_tree(X[left_rows], y[left_rows], depth + 1)
        right_node = self._build_tree(X[right_rows], y[right_rows], depth + 1)

        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': left_node,
            'right': right_node
        }

    def _best_split(self, X, y):
        """Search every (feature, observed value) pair for the lowest weighted Gini.

        Returns a dict with the keys ``'feature_index'``, ``'threshold'``,
        ``'left_indices'`` and ``'right_indices'``. All four values stay
        ``None`` when no candidate produces two non-empty partitions.
        """
        best_split = {'feature_index': None, 'threshold': None, 'left_indices': None, 'right_indices': None}
        best_gini = float('inf')

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_indices = np.where(column <= threshold)[0]
                right_indices = np.where(column > threshold)[0]

                # A one-sided partition is not a split.
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = self._gini_impurity(y[left_indices], y[right_indices])
                # Strict '<' keeps the first candidate found among ties.
                if gini < best_gini:
                    best_gini = gini
                    best_split['feature_index'] = feature_index
                    best_split['threshold'] = threshold
                    best_split['left_indices'] = left_indices
                    best_split['right_indices'] = right_indices

        return best_split

    @staticmethod
    def _gini(labels):
        """Gini impurity ``1 - sum(p_k ** 2)`` of a single label array."""
        size = len(labels)
        _, counts = np.unique(labels, return_counts=True)
        return 1 - sum((count / size) ** 2 for count in counts)

    def _gini_impurity(self, left_labels, right_labels):
        """Size-weighted average of the Gini impurity of both partitions."""
        left_size = len(left_labels)
        right_size = len(right_labels)
        total_size = left_size + right_size

        return (
            (left_size / total_size) * self._gini(left_labels)
            + (right_size / total_size) * self._gini(right_labels)
        )

    def predict(self, X):
        """Predict a class label for every sample in ``X``.

        Parameters
        ----------
        X : iterable of samples
            Each sample must be indexable by feature position (a NumPy row or
            a plain list both work).

        Returns
        -------
        numpy.ndarray
            One predicted label per sample.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return np.array([self._predict_single(sample, self.tree) for sample in X])

    def _predict_single(self, sample, tree):
        """Walk ``tree`` from the given node down to a leaf for one sample."""
        node = tree
        # Internal nodes are dicts; anything else is a leaf label.
        while isinstance(node, dict):
            if sample[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node

# Fit a depth-5 tree on the complete dataset
dt = DecisionTree(max_depth=5)
dt.fit(X, y)

# Score the very same rows the tree was fitted on
predictions = dt.predict(X)

# Share of matching labels. NOTE: this is accuracy on the training rows,
# not an estimate from held-out data.
accuracy = np.mean(predictions == y)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 100.00%


In [10]:
# Classify one hand-entered sample; values are matched to features by position
new_pred = dt.predict([
    [37, 65, 35.74646675, 69, 0],
])
new_pred

array([0], dtype=int64)

In [12]:
# Classify a second hand-entered sample; values are matched to features by position
new_pred1 = dt.predict([
    [47, 117, 22.02254077, 54, 1],
])
new_pred1

array([1], dtype=int64)