In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Seed NumPy's legacy global RNG so any later random draws repeat across runs.
np.random.seed(42)

# Dataset folder, expressed relative to the notebook's working directory.
PROJECT_01_ROOT = "."
DATA_DIR = Path(PROJECT_01_ROOT) / "CreditApproval"

# NOTE(review): read_csv treats the first row of each file as the header by
# default -- confirm the .data files really start with a header row; if they
# do not, the first record is being consumed as column names.
training_data = pd.read_csv(DATA_DIR / "training.data")
test_data = pd.read_csv(DATA_DIR / "test.data")

In [2]:
# Label the sixteen columns "A1" through "A16".
column_names = [f"A{i}" for i in range(1, 17)]
training_data.columns = column_names

In [3]:
# Split the frame: A16 holds the labels, everything else stays as features.
# drop() returns a new frame, so training_data itself keeps its A16 column.
training_labels = training_data["A16"]
clean_data = training_data.drop(columns="A16")

`training_data` will not be mutated. I will make copies and manipulate those to determine median values.

`clean_data` will be mutated to replace missing values with median values for appropriate columns. 

The function below returns a copy of a column in which every missing value (`?`) has been replaced by that column's median.

In [62]:
def update_attribute_w_median(attribute, is_numeric, data=None):
    """Return column `attribute` with its missing values replaced by the median.

    Missing values are marked in the data by the string "?".

    * Numeric columns are parsed with ``pd.to_numeric(errors="coerce")``, so
      "?" (and any other unparseable entry) becomes NaN and is then filled
      with the arithmetic median of the remaining values.
    * Non-numeric columns use the middle element of the sorted non-missing
      values (the upper-middle one when the count is even) as their "median".

    Parameters
    ----------
    attribute : str
        Name of the column to fill.
    is_numeric : bool
        True to treat the column as numeric, False to treat it as strings.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``training_data``.

    Returns
    -------
    pandas.Series
        A new, filled column; the source frame is not modified.
    """
    source = training_data if data is None else data
    column = source[attribute]

    if is_numeric:
        # Coercion turns the "?" markers into NaN, which median() skips.
        numeric_column = pd.to_numeric(column, errors="coerce")
        return numeric_column.fillna(numeric_column.median())

    # Ignore the "?" markers while locating the middle value.
    observed = column[column != "?"].sort_values()
    # sort_values() keeps the original row labels, so plain [] indexing would
    # look up a row *label* (an arbitrary, possibly filtered-out row) instead
    # of the middle of the sorted order. .iloc selects by position.
    attribute_median = observed.iloc[len(observed) // 2]
    return column.replace("?", attribute_median)

In [63]:
# Collect, in column order, the name of every column holding at least one "?".
cols_with_missing_vals = [
    name
    for name, has_marker in training_data.eq("?").any(axis=0).items()
    if has_marker
]
cols_with_missing_vals

['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14']

In [64]:
# One entry per column that contains the "?" missing-value marker.
# Key: attribute name. Value: whether the column is numeric (is_numeric).
missing_attributes = {
    "A1": False,
    "A2": True,
    "A4": False,
    "A5": False,
    "A6": False,
    "A7": False,
    "A14": True,
}

# Overwrite each listed column of clean_data with its median-filled version.
for attribute, is_numeric in missing_attributes.items():
    filled_column = update_attribute_w_median(attribute, is_numeric)
    clean_data[attribute] = filled_column

# Every column flagged numeric must have been parsed from strings to floats.
for attribute, is_numeric in missing_attributes.items():
    if is_numeric:
        assert clean_data[attribute].dtype == 'float64', (
            f"{attribute} was not converted to float64"
        )

# A surviving "?" means a column with missing values is absent from
# missing_attributes (the dict is maintained by hand).
unfilled = clean_data.columns[clean_data.eq("?").any(axis=0)].tolist()
assert not unfilled, f"columns still contain '?': {unfilled}"

At this stage, `clean_data` has been updated to replace all missing values with the appropriate median value. Now we can extract the datasets which will be relevant to our decision tree creation.

In [4]:
# Display the training frame; its columns were relabelled A1..A16 above.
training_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,48.50,4.250,u,g,m,v,0.125,t,f,0,t,g,00225,0,+
1,a,68.67,15.000,u,g,e,z,0.000,t,t,14,f,g,00000,3376,+
2,a,25.08,2.540,y,p,aa,v,0.250,t,f,0,t,g,00370,0,+
3,b,33.75,2.750,u,g,i,bb,0.000,f,f,0,f,g,00180,0,-
4,a,28.75,3.750,u,g,c,v,1.085,t,t,1,t,g,00371,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,b,23.17,0.000,u,g,cc,v,0.085,t,f,0,f,g,00000,0,+
545,b,48.58,0.205,y,p,k,v,0.250,t,t,11,f,g,00380,2732,+
546,b,20.42,7.000,u,g,c,v,1.625,t,t,3,f,g,00200,1391,+
547,a,19.67,0.210,u,g,q,h,0.290,t,t,11,f,g,00080,99,+
