In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

np.random.seed(42)

PROJECT_01_ROOT = "."
DATA_DIR = Path(PROJECT_01_ROOT).joinpath("CreditApproval")

training_data = pd.read_csv(DATA_DIR.joinpath("training.data"), header=None)
test_data = pd.read_csv(DATA_DIR.joinpath("test.data"), header=None)

In [2]:
len(training_data)

550

In [3]:
column_names = ["A" + str(i) for i in range(1, 17)] # A1 - A16
training_data.columns = column_names
test_data.columns = column_names

In [4]:
# pull labels off datasets
training_labels = training_data["A16"]
test_labels = test_data["A16"]

clean_data = training_data.drop("A16", axis=1)
clean_test_data = test_data.drop("A16", axis=1)

`training_data` will not be mutated. I will make copies and manipulate those to determine median values.

`clean_data` will be mutated to replace missing values with median values for appropriate columns. 

The below function handles updating attributes with their median values.

In [5]:
def update_attribute_w_median(attribute, is_numeric):
    """fills missing w median value for attribute, returns updated column"""
    # get a copy of dataset which lacks the missing values
    training_copy = training_data.copy()
    if not is_numeric:
        # cut missing values from copied training set when finding median
        training_copy = training_copy[training_copy[attribute] != "?"]
        attribute_values = training_copy[attribute].sort_values()
        attribute_median = attribute_values[len(attribute_values) // 2]
        
        # return col from original training set with replacement
        return training_data[attribute].replace("?", attribute_median)
    else:
        training_copy[attribute] = pd.to_numeric(training_copy[attribute], errors="coerce")
        attribute_median = training_copy[attribute].median()
        training_copy[attribute] = training_copy[attribute].fillna(attribute_median)
        return training_copy[attribute]

In [6]:
cols_with_missing_vals = training_data.columns[training_data.eq("?").any(axis=0)].tolist()
cols_with_missing_vals

['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14']

In [7]:
# below dict includes keys for each column with missing values
# the key is attribute name and value represents whether it is_numeric
missing_attributes = {
    "A1": False,
    "A2": True,
    "A4": False,
    "A5": False,
    "A6": False,
    "A7": False,
    "A14": True,
}

# call my custom function to update all columns in the clean_data df
for attribute, is_numeric in missing_attributes.items():
    filled_column = update_attribute_w_median(attribute, is_numeric)
    clean_data[attribute] = filled_column

for attribute, is_numeric in missing_attributes.items():
    filled_column = update_attribute_w_median(attribute, is_numeric)
    test_data[attribute] = filled_column

assert clean_data["A14"].dtype == 'float64'
assert test_data["A14"].dtype == 'float64'

At this stage, `clean_data` has been updated to replace all missing values with the appropriate median value. Now we can extract the datasets which will be relevant to our decision tree creation.

In [8]:
def sequentially_cross_validate(examples, labels, k=10):
    fold_size = len(examples) // k
    fold_msg = []
    print(f"cv {len(examples)} across {k} sets. folds are {fold_size} examples.")
    for i in range(k):
        # determine sequential indices for validation set
        v_start = i * fold_size
        v_end = (i + 1) * fold_size if i != k-1 else len(examples)
        
        X_validation = examples.iloc[v_start:v_end]
        y_validation = labels.iloc[v_start:v_end]

        # grab remaining data for training set
        X_train = pd.concat([examples.iloc[:v_start], examples.iloc[v_end:]])
        y_train = pd.concat([labels.iloc[:v_start], labels.iloc[v_end:]])

        print(f"Fold {i}\n\tValidation from {v_start} to {v_end - 1} of len {len(X_validation)}")
        # train the model on training set
        # run prediction on validation set
        


In [9]:
dataset = pd.concat(axis=1,objs=[clean_data, training_labels])

In [10]:
from decision_tree import DecisionTreeNode

quinlan_tree = DecisionTreeNode(algorithm="C4.5")
dataset = pd.concat(axis=1,objs=[clean_data, training_labels])
quinlan_tree.build_decision_tree(dataset)

<decision_tree.DecisionTreeNode at 0x7ffffd189cf0>

In [11]:
# quinlan_tree.predicted_value
res = quinlan_tree.predict(test_data)
res

0      -
1      -
2      -
3      -
4      +
      ..
135    +
136    +
137    +
138    +
139    +
Length: 140, dtype: object

In [12]:
cart_tree = DecisionTreeNode(algorithm="CART")
dataset = pd.concat(axis=1,objs=[clean_data, training_labels])
cart_tree.build_decision_tree(dataset)

<decision_tree.DecisionTreeNode at 0x7fffe051f6d0>

In [13]:
res = cart_tree.predict(test_data)
res

KeyError: 'ff'

In [13]:
quinlan_tree.print_tree()

[Split on A9]
--> A9 = f:
    [A2 <= 70.53999999999999]
    --> Left:
        [A15 <= 52776.0]
        --> Left:
            [A3 <= 0.1675]
            --> Left:
                [A14 <= 480.0]
                --> Left:
                    [A14 <= 280.0]
                    --> Left:
                        [A8 <= 1.25]
                        --> Left:
                            [A2 <= 16.21]
                            --> Left:
                                Predict: +
                            --> Right:
                                [A14 <= 152.0]
                                --> Left:
                                    Predict: -
                                --> Right:
                                    [Split on A4]
                                    --> A4 = u:
                                        [A2 <= 21.795]
                                        --> Left:
                                            [A2 <= 20.25]
                                           