# Decision Tree (Classification)

In [3]:
import numpy as np
import pandas as pd

In [5]:
def gini_impurity(y):
    """Return the Gini impurity of the label collection ``y``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` that carry label ``k``. An empty ``y`` yields ``0``.
    """
    n = len(y)
    if not n:
        return 0

    # Tally how many times each distinct label appears.
    tally = {}
    for item in y:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1

    squared_shares = [(occurrences / n) ** 2 for occurrences in tally.values()]
    return 1 - sum(squared_shares)

In [7]:
def split_dataset(X, y, feature_index, threshold):
    """Partition ``X`` and ``y`` on one feature column.

    Rows whose value in column ``feature_index`` is ``<= threshold`` go to
    the left part; all remaining rows go to the right part.

    Returns:
        A 4-tuple ``(X_left, y_left, X_right, y_right)``.
    """
    column = X[:, feature_index]
    goes_left = column <= threshold
    goes_right = ~goes_left

    left_part = (X[goes_left], y[goes_left])
    right_part = (X[goes_right], y[goes_right])
    return (*left_part, *right_part)

In [9]:
def best_split(X, y, min_samples_split=2, num_thresholds=10):
    """Search every feature for the split with the lowest weighted Gini impurity.

    For each column of ``X``, ``num_thresholds`` evenly spaced candidate
    thresholds between the column's minimum and maximum are evaluated.
    A candidate is skipped when either side of the split would hold fewer
    than ``min_samples_split`` samples. On equal scores the candidate found
    first is kept.

    Returns:
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no candidate passed the size check.
    """
    chosen = (None, None)
    lowest_score = float('inf')
    n_total = len(y)

    for col in range(X.shape[1]):
        column = X[:, col]
        candidates = np.linspace(np.min(column), np.max(column), num_thresholds)

        for cut in candidates:
            _, labels_lo, _, labels_hi = split_dataset(X, y, col, cut)
            n_lo, n_hi = len(labels_lo), len(labels_hi)

            # Both children must be large enough for the split to count.
            if min(n_lo, n_hi) < min_samples_split:
                continue

            # Impurity of the two children, weighted by their share of the samples.
            score = (n_lo * gini_impurity(labels_lo) + n_hi * gini_impurity(labels_hi)) / n_total

            if score < lowest_score:
                lowest_score = score
                chosen = (col, cut)

    return chosen

In [11]:
def _majority_label(labels):
    """Return the label that occurs most often in ``labels``."""
    occurrences = list(labels).count
    return max(set(labels), key=occurrences)


def build_tree(X, y, depth=0, max_depth=20, min_samples_split=5, num_thresholds=10):
    """Recursively grow a classification tree.

    A node becomes a leaf (the majority label of ``y``) when ``depth`` has
    reached ``max_depth``, when ``y`` holds a single distinct label, when
    fewer than ``min_samples_split`` samples remain, or when ``best_split``
    finds no usable split.

    Returns:
        Either a leaf label, or a 4-tuple
        ``(feature_index, threshold, left_subtree, right_subtree)``.
    """
    depth_exhausted = depth >= max_depth
    if depth_exhausted or len(set(y)) == 1 or len(y) < min_samples_split:
        return _majority_label(y)

    split_feature, split_value = best_split(X, y, min_samples_split, num_thresholds)
    if split_feature is None:
        return _majority_label(y)

    X_lo, y_lo, X_hi, y_hi = split_dataset(X, y, split_feature, split_value)

    # Grow the left child first, then the right, one level deeper.
    next_depth = depth + 1
    children = [
        build_tree(part_X, part_y, next_depth, max_depth, min_samples_split, num_thresholds)
        for part_X, part_y in ((X_lo, y_lo), (X_hi, y_hi))
    ]
    return (split_feature, split_value, children[0], children[1])

In [13]:
def predict_tree(X, tree):
    """Predict a label for every row of ``X`` by walking ``tree``.

    Args:
        X: 2-D array indexed as ``X[:, feature_index]``.
        tree: Either a leaf label, or an internal node of the form
            ``(feature_index, threshold, left_subtree, right_subtree)``.

    Returns:
        A 1-D array with one prediction per row. For a bare leaf the array
        takes the leaf's own dtype; below an internal node, predictions are
        stored in an ``int`` array, so leaf labels are cast to ``int``.
    """
    # Identify internal nodes by structure rather than leaves by type, so a
    # leaf label that is a float or numpy bool (not only int / np.integer)
    # is not mistaken for a node and unpacked.
    if not isinstance(tree, (tuple, list)):
        return np.full(X.shape[0], tree)

    feature_index, threshold, left_tree, right_tree = tree
    left_mask = X[:, feature_index] <= threshold
    right_mask = ~left_mask

    predictions = np.zeros(X.shape[0], dtype=int)
    predictions[left_mask] = predict_tree(X[left_mask], left_tree)
    predictions[right_mask] = predict_tree(X[right_mask], right_tree)

    return predictions

In [15]:
def f1_score(y_true, y_pred):
    """Return the F1 score for binary labels, with ``1`` as the positive class.

    Args:
        y_true: Ground-truth labels (array-like of 0/1).
        y_pred: Predicted labels (array-like of 0/1), same length as ``y_true``.

    Returns:
        The harmonic mean of precision and recall as a float. Precision,
        recall and the final score each fall back to ``0`` when their
        denominator is zero.
    """
    # Coerce to arrays so the element-wise comparisons below also work for
    # plain Python lists (``[1, 0] == 1`` is a single ``False``, not a mask).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1

    true_positives = int(np.count_nonzero(actual_pos & predicted_pos))
    false_positives = int(np.count_nonzero((y_true == 0) & predicted_pos))
    false_negatives = int(np.count_nonzero(actual_pos & (y_pred == 0)))

    predicted_total = true_positives + false_positives
    actual_total = true_positives + false_negatives

    precision = true_positives / predicted_total if predicted_total > 0 else 0.0
    recall = true_positives / actual_total if actual_total > 0 else 0.0

    if precision + recall > 0:
        return 2 * (precision * recall) / (precision + recall)
    return 0.0

In [17]:
# Load the train and test splits from CSV files in the working directory.
train = pd.read_csv('binary_classification_train.csv')
test = pd.read_csv('binary_classification_test.csv')

# 'ID' is dropped from both frames so it is not used as a feature.
train_data = train.drop(columns='ID')
test_data = test.drop(columns='ID')

# The last remaining training column is taken as the label; every column
# before it is a feature. All remaining test columns are used as features.
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values
X_test = test_data.values

In [19]:
# Grow the tree on the full training set; these keyword values equal build_tree's own defaults.
tree = build_tree(X_train, y_train, max_depth=20, min_samples_split=5, num_thresholds=10)

In [20]:
# Predict the training rows themselves, so any score on these predictions measures fit to the training data.
y_train_pred = predict_tree(X_train, tree)

In [21]:
# F1 against the training labels; f1_score counts label 1 as the positive class.
f1 = f1_score(y_train, y_train_pred)
print(f"F1 Score on Training Data: {f1:.4f}")

F1 Score on Training Data: 0.9260


In [22]:
# Apply the fitted tree to the test feature matrix.
y_test_pred = predict_tree(X_test, tree)

In [23]:
# Keep the path in one place so the confirmation message always names the
# file that was actually written.
output_path = 'decision_tree_predictions(classification).csv'

# One row per test sample: its original ID and the predicted label as an int.
output = pd.DataFrame({
    'ID': test['ID'],
    'Prediction': y_test_pred.astype(int)
})
output.to_csv(output_path, index=False)
print(f"Test predictions saved to '{output_path}'.")

Test predictions saved to 'test_predictions.csv'.
