In [22]:
import pandas as pd
import numpy as np
from pathlib import Path

np.random.seed(42)

PROJECT_01_ROOT = "."
DATA_DIR = Path(PROJECT_01_ROOT).joinpath("CreditApproval")

training_data = pd.read_csv(DATA_DIR.joinpath("training.data"), header=None)
test_data = pd.read_csv(DATA_DIR.joinpath("test.data"), header=None)

In [23]:
len(training_data)

550

In [24]:
column_names = ["A" + str(i) for i in range(1, 17)] # A1 - A16
training_data.columns = column_names
test_data.columns = column_names

In [25]:
# pull labels off datasets
training_labels = training_data["A16"]
test_labels = test_data["A16"]

clean_training = training_data.drop("A16", axis=1)
clean_test = test_data.drop("A16", axis=1)

`training_data` will not be mutated. I will make copies and manipulate those to determine median values.

`clean_training` will be mutated to replace missing values with median values for appropriate columns. 

The below function handles updating attributes with their median values.

In [26]:
def update_attribute_w_median(attribute, is_numeric):
    """fills missing w median value for attribute, returns updated column"""
    # get a copy of dataset which lacks the missing values
    training_copy = training_data.copy()
    if not is_numeric:
        # cut missing values from copied training set when finding median
        training_copy = training_copy[training_copy[attribute] != "?"]
        attribute_values = training_copy[attribute].sort_values()
        attribute_median = attribute_values[len(attribute_values) // 2]
        
        # return col from original training set with replacement
        return training_data[attribute].replace("?", attribute_median)
    else:
        training_copy[attribute] = pd.to_numeric(training_copy[attribute], errors="coerce")
        attribute_median = training_copy[attribute].median()
        training_copy[attribute] = training_copy[attribute].fillna(attribute_median)
        return training_copy[attribute]

In [27]:
cols_with_missing_vals = training_data.columns[training_data.eq("?").any(axis=0)].tolist()
cols_with_missing_vals

['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14']

In [29]:
# below dict includes keys for each column with missing values
# the key is attribute name and value represents whether it is_numeric
missing_attributes = {
    "A1": False,
    "A2": True,
    "A4": False,
    "A5": False,
    "A6": False,
    "A7": False,
    "A14": True,
}

# call my custom function to update all columns in the clean_training df
for attribute, is_numeric in missing_attributes.items():
    filled_column = update_attribute_w_median(attribute, is_numeric)
    clean_training[attribute] = filled_column

for attribute, is_numeric in missing_attributes.items():
    filled_column = update_attribute_w_median(attribute, is_numeric)
    clean_test[attribute] = filled_column

assert clean_training["A14"].dtype == 'float64'
assert clean_test["A14"].dtype == 'float64'

At this stage, `clean_training` and `clean_training` have been updated to replace all missing values with the appropriate median value. Now we can extract the datasets which will be relevant to our decision tree creation.

In [30]:
from sklearn.metrics import f1_score

def sequentially_cross_validate(model, examples, labels, k=10):    
    fold_size = len(examples) // k
    # print(f"cv {len(examples)} across {k} sets. folds are {fold_size} examples.")
    
    best_model = None
    best_score = -1
    
    for i in range(k):
        # determine sequential indices for validation set
        v_start = i * fold_size
        v_end = (i + 1) * fold_size if i != k-1 else len(examples)
        
        X_validation = examples.iloc[v_start:v_end]
        y_validation = labels.iloc[v_start:v_end]

        # grab remaining data for training set
        X_train = pd.concat([examples.iloc[:v_start], examples.iloc[v_end:]])
        y_train = pd.concat([labels.iloc[:v_start], labels.iloc[v_end:]])
        
        # print(f"Fold {i}\n\tValidation from {v_start} to {v_end - 1} of len {len(X_validation)}")
        # train the model on training set
        training_set = pd.concat(axis=1,objs=[X_train, y_train])
        model.build_decision_tree(training_set)
        
        # run prediction on validation set
        validation_set = pd.concat(axis=1,objs=[X_validation, y_validation])
        predicted_labels = model.predict(validation_set)

        # compute f1 score and store the best one
        f1 = f1_score(y_validation, predicted_labels, labels=['+', '-'], pos_label='+')
        print(f"Fold {i} model has F1 Score: {f1}")
        if f1 > best_score:
            best_score = f1
            best_model = model
    return best_model, best_score

In [19]:
from decision_tree import DecisionTreeNode

c45_tree = DecisionTreeNode(algorithm="C4.5")
best_c45, c45_score = sequentially_cross_validate(c45_tree, clean_training, training_labels)

Fold 0 model has F1 Score: 0.7692307692307692
Fold 1 model has F1 Score: 0.7659574468085107
Fold 2 model has F1 Score: 0.7916666666666667
Fold 3 model has F1 Score: 0.7659574468085107
Fold 4 model has F1 Score: 0.7272727272727272
Fold 5 model has F1 Score: 0.8571428571428572
Fold 6 model has F1 Score: 0.823529411764706
Fold 7 model has F1 Score: 0.8076923076923077
Fold 8 model has F1 Score: 0.847457627118644
Fold 9 model has F1 Score: 0.7346938775510203


In [20]:
cart_tree = DecisionTreeNode(algorithm="CART")
best_cart, cart_score = sequentially_cross_validate(cart_tree, clean_training, training_labels)

Fold 0 model has F1 Score: 0.7317073170731708
Fold 1 model has F1 Score: 0.72
Fold 2 model has F1 Score: 0.7727272727272727
Fold 3 model has F1 Score: 0.7659574468085107
Fold 4 model has F1 Score: 0.7441860465116279
Fold 5 model has F1 Score: 0.84
Fold 6 model has F1 Score: 0.6938775510204083
Fold 7 model has F1 Score: 0.8260869565217391
Fold 8 model has F1 Score: 0.7200000000000001
Fold 9 model has F1 Score: 0.6909090909090909


In [35]:
best_c45_predictions = best_c45.predict(clean_test)
c45_f1 = f1_score(test_labels, best_c45_predictions, labels=['+', '-'], pos_label='+')
c45_f1

0.7538461538461538

In [36]:
best_cart_predictions = best_cart.predict(clean_test)
cart_f1 = f1_score(test_labels, best_cart_predictions, labels=['+', '-'], pos_label='+')
cart_f1

0.7931034482758621