In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Measuring the Gini impurity of the labels in a data set

In [14]:
def gini_impurity(y):
    """Return the Gini impurity of the label collection ``y``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` carrying the k-th distinct label. A collection with a
    single distinct label therefore scores 0.
    """
    _, label_counts = np.unique(y, return_counts=True)
    fractions = label_counts / label_counts.sum()
    return 1 - np.sum(fractions ** 2)

Splitting the data based on the given threshold

In [15]:
def split_dataset(X, y, feature, threshold):
    """Partition ``X`` and ``y`` on one feature column.

    Rows whose value in column ``feature`` is ``<= threshold`` form the left
    part; rows whose value is ``> threshold`` form the right part.

    Returns:
        ``((X_left, y_left), (X_right, y_right))``.
    """
    column = X[:, feature]
    goes_left = column <= threshold
    goes_right = column > threshold
    left_part = (X[goes_left], y[goes_left])
    right_part = (X[goes_right], y[goes_right])
    return left_part, right_part

Obtaining the best threshold to split on

In [16]:
def best_split(X, y):
    """Find the (feature, threshold) pair with the lowest weighted Gini impurity.

    Every distinct value of every column of ``X`` is tried as a threshold.
    Candidates that leave either side empty are ignored. Among equally good
    candidates the first one encountered is kept.

    Returns:
        ``(feature, threshold, (X_left, y_left, X_right, y_right))`` for the
        best candidate, or ``(None, None, None)`` when no candidate produces
        two non-empty sides.
    """
    n_samples = len(y)
    lowest_gini = float('inf')
    chosen_feature = None
    chosen_threshold = None
    chosen_parts = None

    for col in range(X.shape[1]):
        for cut in np.unique(X[:, col]):
            (left_X, left_y), (right_X, right_y) = split_dataset(X, y, col, cut)

            # A candidate that puts everything on one side separates nothing.
            if len(left_y) == 0 or len(right_y) == 0:
                continue

            left_score = gini_impurity(left_y)
            right_score = gini_impurity(right_y)
            # Impurity of the two sides, weighted by how many samples each holds.
            weighted = (len(left_y) * left_score + len(right_y) * right_score) / n_samples

            if weighted < lowest_gini:
                lowest_gini = weighted
                chosen_feature, chosen_threshold = col, cut
                chosen_parts = (left_X, left_y, right_X, right_y)

    return chosen_feature, chosen_threshold, chosen_parts

building the decision tree

In [17]:
def build_tree(X, y, max_depth=None, min_samples_split=2, depth=0):
    """Recursively grow a decision tree from samples ``X`` and labels ``y``.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: 1-D array of labels aligned with the rows of ``X``. Any label type
            that ``np.unique`` can sort is accepted (ints, floats, strings).
        max_depth: Stop splitting once ``depth`` reaches this value;
            ``None`` means no depth limit.
        min_samples_split: Nodes holding fewer samples than this become leaves.
        depth: Depth of the current node; callers normally leave it at 0.

    Returns:
        A leaf, which is the majority label of ``y`` (the smallest label wins
        ties), or an internal node
        ``{"feature", "threshold", "left", "right"}`` whose ``left`` subtree
        handles samples with ``x[feature] <= threshold`` and whose ``right``
        subtree handles the rest.

    Raises:
        ValueError: If ``y`` is empty, since no majority label exists.
    """
    if len(y) == 0:
        raise ValueError("cannot build a decision tree from an empty label array")

    # One pass yields both the purity check and the majority label. Unlike
    # np.bincount this is not restricted to non-negative integer labels.
    labels, counts = np.unique(y, return_counts=True)
    majority = labels[np.argmax(counts)]

    # Stopping conditions: depth limit reached, too few samples, or pure node.
    depth_exhausted = max_depth is not None and depth >= max_depth
    if depth_exhausted or len(y) < min_samples_split or len(labels) == 1:
        return majority

    feature, threshold, splits = best_split(X, y)
    if splits is None:
        # No threshold separates the samples (e.g. all rows are identical).
        return majority

    X_left, y_left, X_right, y_right = splits

    # Grow both branches one level deeper.
    left_subtree = build_tree(X_left, y_left, max_depth, min_samples_split, depth + 1)
    right_subtree = build_tree(X_right, y_right, max_depth, min_samples_split, depth + 1)

    return {"feature": feature, "threshold": threshold, "left": left_subtree, "right": right_subtree}

Predicting labels for new data

In [18]:
def predict_sample(x, tree):
    """Return the leaf value reached by sample ``x`` in ``tree``.

    ``tree`` is either a leaf (any non-dict value, returned as is) or a dict
    with keys ``"feature"``, ``"threshold"``, ``"left"`` and ``"right"``.
    At each dict node the sample goes left when
    ``x[feature] <= threshold`` and right otherwise.
    """
    node = tree
    # Walk down until something that is not an internal node is reached.
    while isinstance(node, dict):
        branch = "left" if x[node["feature"]] <= node["threshold"] else "right"
        node = node[branch]
    return node

def predict(X, tree):
    """Run ``predict_sample`` on each row of ``X`` and return the results as a NumPy array."""
    outputs = []
    for row in X:
        outputs.append(predict_sample(row, tree))
    return np.array(outputs)

metric calculation

In [19]:
def calculate_f1_score(y_true, y_pred):
    """Compute the macro-averaged F1 score over the classes present in ``y_true``.

    For each distinct label in ``y_true`` the per-class precision, recall and
    F1 are computed; any of them whose denominator is zero counts as 0. The
    result is the unweighted mean of the per-class F1 values.

    Args:
        y_true: Ground-truth labels (array-like).
        y_pred: Predicted labels (array-like) with the same shape as ``y_true``.

    Returns:
        The macro F1 score, or ``0.0`` when there are no samples.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in shape.
    """
    # Coerce to arrays so `==` compares element-wise. With plain lists,
    # `y_pred == cls` would be a single False and every count would be 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        # No classes to average over; avoids np.mean([]) returning NaN.
        return 0.0

    f1_scores = []
    for cls in np.unique(y_true):
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0

        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

    return np.mean(f1_scores)

In [20]:
# Load the training CSV, hold out 20% of the rows, fit a depth-limited tree
# on the rest and report accuracy and macro F1 on the held-out rows.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# this exact file exists; consider a configurable data directory.
file_path = r"C:\Users\91944\OneDrive - Indian Institute of Technology Indian School of Mines Dhanbad\Desktop\ml\multi_classification_train.csv"
data = pd.read_csv(file_path)

# Features are every column except 'ID' and 'Class'; 'Class' is the target.
X = data.drop(columns=['ID', 'Class']).values
y = data['Class'].values

# Seed NumPy's global RNG so the shuffled split below is repeatable.
np.random.seed(42)
indices = np.random.permutation(len(X))
# First 80% of the shuffled indices train the tree; the remainder is the test set.
train_size = int(0.8 * len(X))
train_indices = indices[:train_size]
test_indices = indices[train_size:]

X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]


# Fit on the training rows only, capping the tree at depth 15.
tree = build_tree(X_train, y_train, max_depth=15, min_samples_split=2)


y_pred = predict(X_test, tree)

# Accuracy is the fraction of test predictions that equal the true label.
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy:.2f}")

f1_score = calculate_f1_score(y_test, y_pred)
print(f"Test F1 Score: {f1_score:.2f}")

  

Test Accuracy: 0.70
Test F1 Score: 0.65
