In [90]:
def read_csv(file_path):
    """Read a comma-separated file and split it into a header and data rows.

    Each line is stripped of surrounding whitespace, blank lines are skipped,
    and the remaining lines are parsed with the standard ``csv`` module so
    that quoted fields containing commas stay in one piece.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(header, rows)`` tuple. ``header`` is the first non-blank line as
        a list of strings; ``rows`` is a list with one list of strings per
        remaining non-blank line. Both are empty lists when the file holds
        no non-blank lines.
    """
    import csv  # local import so the cell does not depend on another cell

    # The context manager closes the file even if reading raises.
    with open(file_path, "r") as file:
        # Drop blank lines so a trailing newline does not become a bogus
        # one-field row that inflates the reported row count.
        stripped = (line.strip() for line in file)
        data = list(csv.reader(line for line in stripped if line))

    if not data:
        # An empty file has no header line to index.
        print("read_csv: header = [], number of rows = 0")
        return [], []

    print(f"read_csv: header = {data[0]}, number of rows = {len(data)-1}")
    return data[0], data[1:]

In [91]:
def fill_missing(row, index, default):
    """Return ``row[index]``, or ``default`` when that cell is an empty string.

    Args:
        row: Sequence of cell values.
        index: Position of the cell to read.
        default: Value returned in place of an empty-string cell.

    Returns:
        The cell value unchanged, or ``default`` if the cell equals ``""``.
    """
    value = row[index]
    if value == "":
        return default
    return value

In [92]:
def encode(row, col_idx, mapping):
    """Translate the cell at ``col_idx`` into its code from ``mapping``.

    Args:
        row: Sequence of cell values.
        col_idx: Position of the cell to encode.
        mapping: Dict from raw cell value to its code.

    Returns:
        ``mapping[row[col_idx]]`` when the cell value is a key of ``mapping``,
        otherwise ``0`` (so unknown and empty values share the code 0).
    """
    raw_value = row[col_idx]
    if raw_value in mapping:
        return mapping[raw_value]
    return 0

In [93]:
def preprocess(data, header):
    """Convert raw string rows into ``(features, label)`` training pairs.

    Columns are read by position. Per the header printed by the notebook run,
    indices 1-11 are Gender, Married, Dependents, Education, Self_Employed,
    ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term,
    Credit_History and Property_Area, and index 12 is Loan_Status.

    Categorical cells are encoded with ``encode`` (unknown or empty values
    become 0). Empty numeric cells are replaced with a default via
    ``fill_missing`` before conversion.

    Args:
        data: List of rows, each a list of strings.
        header: Column names. Currently unused; kept so existing callers
            keep working.

    Returns:
        A list of ``(features, loan_status)`` tuples, where ``features`` is a
        list of 11 numbers in column order and ``loan_status`` is 1 when
        column 12 equals ``"Y"`` and 0 otherwise. Rows that are too short or
        contain a non-numeric value in a numeric column are skipped, and the
        number skipped is printed.
    """
    processed = []
    skipped = 0
    for row in data:
        try:
            gender = encode(row, 1, {"Male": 1, "Female": 0})
            married = encode(row, 2, {"Yes": 1, "No": 0})
            dependents = encode(row, 3, {"0": 0, "1": 1, "2": 2, "3+": 3})
            education = encode(row, 4, {"Graduate": 1, "Not Graduate": 0})
            self_emp = encode(row, 5, {"Yes": 1, "No": 0})
            applicant_income = int(fill_missing(row, 6, "0"))
            coapplicant_income = float(fill_missing(row, 7, "0"))
            loan_amount = float(fill_missing(row, 8, "120"))  # Median approx
            loan_term = float(fill_missing(row, 9, "360"))
            credit_history = float(fill_missing(row, 10, "1"))
            property_area = encode(row, 11, {"Urban": 2, "Semiurban": 1, "Rural": 0})
            loan_status = 1 if row[12] == "Y" else 0
        except (IndexError, ValueError):
            # Only data problems (short row, unparsable number) are skipped;
            # any other exception is a real bug and is allowed to propagate.
            skipped += 1
            continue

        features = [
            gender, married, dependents, education, self_emp,
            applicant_income, coapplicant_income, loan_amount,
            loan_term, credit_history, property_area,
        ]
        processed.append((features, loan_status))

    print(f"preprocess: processed {len(processed)} rows")
    if skipped:
        # Make dropped rows visible instead of losing them silently.
        print(f"preprocess: skipped {skipped} malformed rows")
    return processed

In [94]:
def split_data(dataset, split_ratio=0.8):
    """Cut ``dataset`` into a leading training part and a trailing test part.

    No shuffling is done: the first ``int(len(dataset) * split_ratio)`` items
    form the training part and the rest form the test part.

    Args:
        dataset: Sliceable sequence of samples.
        split_ratio: Fraction of the samples placed in the training part.

    Returns:
        A ``(train, test)`` tuple of slices of ``dataset``.
    """
    cutoff = int(len(dataset) * split_ratio)
    train = dataset[:cutoff]
    test = dataset[cutoff:]
    print(f"split_data: train size = {len(train)}, test size = {len(test)}")
    return train, test

In [95]:
def majority_label(data):
    """Return the most frequent binary label in ``data``.

    Args:
        data: Iterable of ``(features, label)`` pairs whose labels are 0 or 1.

    Returns:
        1 when label 1 occurs at least as often as label 0 (ties and empty
        input therefore give 1), otherwise 0.
    """
    tally = {0: 0, 1: 0}
    for _features, target in data:
        tally[target] = tally[target] + 1
    if tally[0] > tally[1]:
        return 0
    return 1

In [96]:
def evaluate_split(data, feature_index):
    """Score a one-level split on whether a feature equals 1.

    Samples whose ``features[feature_index] == 1`` form one group and all
    other samples form the second group. Each group predicts its majority
    label, and the accuracy of those predictions over ``data`` is computed.

    Args:
        data: Sequence of ``(features, label)`` pairs.
        feature_index: Position in ``features`` of the value to test.

    Returns:
        ``(accuracy, left_pred, right_pred)`` where ``left_pred`` is the
        prediction for samples whose feature equals 1 and ``right_pred`` is
        the prediction for the rest. Returns ``(0, 0, 0)`` without printing
        when either group is empty.
    """
    equal_one, others = [], []
    for feats, target in data:
        bucket = equal_one if feats[feature_index] == 1 else others
        bucket.append((feats, target))

    # A feature that sends every sample to the same side cannot split.
    if not equal_one or not others:
        return 0, 0, 0

    left_pred = majority_label(equal_one)
    right_pred = majority_label(others)

    # Each sample is judged against the prediction of its own group.
    hits = sum(1 for _, target in equal_one if left_pred == target)
    hits += sum(1 for _, target in others if right_pred == target)

    accuracy = hits / len(data)
    print(f"evaluate_split: feature {feature_index}, accuracy {accuracy:.4f}, left_pred {left_pred}, right_pred {right_pred}")
    return accuracy, left_pred, right_pred

In [97]:
# Load the loan data, encode it, and hold out the tail as a test set.
header, raw_data = read_csv("loan_data.csv")
data = preprocess(raw_data, header)
train_data, test_data = split_data(data)

# Step 1: choose as root the feature whose "== 1" split scores the highest
# training accuracy; the first feature to reach the best score wins ties.
best_accuracy = 0
root_feature = -1
for i in range(11):
    acc = evaluate_split(train_data, i)[0]
    if acc <= best_accuracy:
        continue
    best_accuracy = acc
    root_feature = i
print(f"Best root feature: {root_feature} with accuracy {best_accuracy:.4f}")

# Step 2: split by root_feature. Samples whose root feature equals 1 go to
# right_data; every other sample goes to left_data.
left_data, right_data = [], []
for features, label in train_data:
    branch = right_data if features[root_feature] == 1 else left_data
    branch.append((features, label))
print(f"Split train data by root feature {root_feature}: left size {len(left_data)}, right size {len(right_data)}")

read_csv: header = ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'], number of rows = 614
preprocess: processed 614 rows
split_data: train size = 491, test size = 123
evaluate_split: feature 0, accuracy 0.6884, left_pred 1, right_pred 1
evaluate_split: feature 1, accuracy 0.6884, left_pred 1, right_pred 1
evaluate_split: feature 2, accuracy 0.6884, left_pred 1, right_pred 1
evaluate_split: feature 3, accuracy 0.6884, left_pred 1, right_pred 1
evaluate_split: feature 4, accuracy 0.6884, left_pred 1, right_pred 1
evaluate_split: feature 9, accuracy 0.8065, left_pred 1, right_pred 0
evaluate_split: feature 10, accuracy 0.6884, left_pred 1, right_pred 1
Best root feature: 9 with accuracy 0.8065
Split train data by root feature 9: left size 70, right size 421


In [98]:
def find_best_feature(data_subset):
    """Pick the feature whose ``== 1`` split is most accurate on a subset.

    Every feature index from 0 to 10 is scored with ``evaluate_split``; the
    first index reaching the highest accuracy is kept.

    Args:
        data_subset: Sequence of ``(features, label)`` pairs.

    Returns:
        ``(feature_index, pred_if_one, pred_otherwise)`` for the best split,
        or ``(-1, 0, 0)`` when no feature scores above 0.
    """
    # (accuracy, feature index, prediction when == 1, prediction otherwise)
    winner = (0, -1, 0, 0)
    for candidate in range(11):
        acc, lp, rp = evaluate_split(data_subset, candidate)
        if acc > winner[0]:
            winner = (acc, candidate, lp, rp)
    best_acc, best_fi, best_lp, best_rp = winner
    print(f"find_best_feature: best feature {best_fi} with accuracy {best_acc:.4f}")
    return best_fi, best_lp, best_rp

# Choose the second-level split for each branch of the root.
# left_data / right_data hold the training samples whose root feature is
# != 1 / == 1 respectively. For each branch, *_1 is the predicted label when
# that branch's feature equals 1 and *_0 is the predicted label otherwise.
left_feature, left_1, left_0 = find_best_feature(left_data)
right_feature, right_1, right_0 = find_best_feature(right_data)

evaluate_split: feature 0, accuracy 0.9143, left_pred 0, right_pred 0
evaluate_split: feature 1, accuracy 0.9143, left_pred 0, right_pred 0
evaluate_split: feature 2, accuracy 0.9143, left_pred 0, right_pred 0
evaluate_split: feature 3, accuracy 0.9143, left_pred 0, right_pred 0
evaluate_split: feature 4, accuracy 0.9143, left_pred 0, right_pred 0
evaluate_split: feature 10, accuracy 0.9143, left_pred 0, right_pred 0
find_best_feature: best feature 0 with accuracy 0.9143
evaluate_split: feature 0, accuracy 0.7886, left_pred 1, right_pred 1
evaluate_split: feature 1, accuracy 0.7886, left_pred 1, right_pred 1
evaluate_split: feature 2, accuracy 0.7886, left_pred 1, right_pred 1
evaluate_split: feature 3, accuracy 0.7886, left_pred 1, right_pred 1
evaluate_split: feature 4, accuracy 0.7886, left_pred 1, right_pred 1
evaluate_split: feature 10, accuracy 0.7886, left_pred 1, right_pred 1
find_best_feature: best feature 0 with accuracy 0.7886


In [99]:
def predict(features):
    """Classify one sample with the depth-2 tree stored in module globals.

    The root test is ``features[root_feature] == 1``. When it holds, the
    right branch decides using ``right_feature``; otherwise the left branch
    decides using ``left_feature``.

    Args:
        features: Feature list of a single sample.

    Returns:
        The branch's ``*_1`` label when its feature equals 1, otherwise its
        ``*_0`` label.
    """
    if features[root_feature] != 1:
        return left_1 if features[left_feature] == 1 else left_0
    return right_1 if features[right_feature] == 1 else right_0

In [100]:
def evaluate(dataset):
    """Measure the accuracy of ``predict`` on a labelled dataset.

    Args:
        dataset: Sequence of ``(features, label)`` pairs.

    Returns:
        The fraction of samples for which ``predict(features)`` equals the
        label. An empty dataset gives ``0.0`` instead of dividing by zero.
    """
    if not dataset:
        # Nothing to score; avoid ZeroDivisionError on len(dataset) == 0.
        accuracy = 0.0
    else:
        correct = sum(1 for features, label in dataset if predict(features) == label)
        accuracy = correct / len(dataset)
    print(f"evaluate: accuracy on dataset = {accuracy:.4f}")
    return accuracy

# Score the depth-2 tree on the held-out test split and report the result.
acc_test = evaluate(test_data)
# Accuracy is shown as a percentage rounded to two decimals.
print("Decision Tree Depth-2 Akurasi:", round(acc_test * 100, 2), "%")
print("Root Feature:", root_feature)
print("Left Feature:", left_feature, " Right Feature:", right_feature)

evaluate: accuracy on dataset = 0.8211
Decision Tree Depth-2 Akurasi: 82.11 %
Root Feature: 9
Left Feature: 0  Right Feature: 0
