In [31]:
import pandas as pd
import numpy as np
from pathlib import Path

# Seed NumPy's global random generator so reruns are repeatable.
np.random.seed(42)

# All data files sit in the CreditApproval folder under the project root.
PROJECT_01_ROOT = "."
DATA_DIR = Path(PROJECT_01_ROOT) / "CreditApproval"

# header=None: the first line is data, so pandas assigns integer column labels.
training_data = pd.read_csv(DATA_DIR / "training.data", header=None)
test_data = pd.read_csv(DATA_DIR / "test.data", header=None)

In [33]:
# Number of rows loaded from the training file.
training_data.shape[0]

550

In [34]:
# Label the sixteen columns A1 through A16.
column_names = [f"A{idx}" for idx in range(1, 17)]
training_data.columns = column_names

In [35]:
# Separate the label column (A16) from the feature columns.
training_labels = training_data["A16"]
clean_data = training_data.drop(columns=["A16"])

`training_data` will not be mutated. I will make copies and manipulate those to determine median values.

`clean_data` will be mutated: in every column that contains missing values, those values are replaced with the column's median.

The function below fills in an attribute's missing values with that attribute's median.

In [36]:
def update_attribute_w_median(attribute, is_numeric, data=None):
    """Return one column with its "?" placeholders replaced by the column median.

    Parameters
    ----------
    attribute : str
        Name of the column to fill.
    is_numeric : bool
        True  -> parse the column as numbers (entries that cannot be parsed,
                 including "?", become NaN) and fill with the numeric median.
        False -> treat the column as categorical and fill with the middle
                 element of the sorted non-missing values.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        `training_data` is used, which keeps two-argument calls working.

    Returns
    -------
    pandas.Series
        A new Series on the same index as the source column. The source
        frame itself is left untouched.
    """
    source = training_data if data is None else data
    column = source[attribute]

    if is_numeric:
        numeric_column = pd.to_numeric(column, errors="coerce")
        return numeric_column.fillna(numeric_column.median())

    # Drop the placeholders before looking for the middle value.
    observed = column[column != "?"].sort_values()
    if observed.empty:
        # Every entry is missing, so there is no median to substitute.
        return column.copy()

    # .iloc is positional. Plain [] on this Series would look up the original
    # row *label*, which after filtering and sorting is an arbitrary row (or a
    # KeyError when that row was one of the dropped "?" rows).
    attribute_median = observed.iloc[len(observed) // 2]
    return column.replace("?", attribute_median)

In [37]:
# Collect every column in which the "?" missing-value marker appears at least once.
cols_with_missing_vals = [
    col for col in training_data.columns if (training_data[col] == "?").any()
]
cols_with_missing_vals

['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14']

In [38]:
# One entry per column that contains missing values:
# attribute name -> True when the column is numeric, False otherwise.
missing_attributes = dict(
    A1=False,
    A2=True,
    A4=False,
    A5=False,
    A6=False,
    A7=False,
    A14=True,
)

# Overwrite each listed column of clean_data with its median-filled version.
for attribute in missing_attributes:
    is_numeric = missing_attributes[attribute]
    filled_column = update_attribute_w_median(attribute, is_numeric)
    clean_data[attribute] = filled_column

# The numeric branch should have left A14 as a float column.
assert clean_data["A14"].dtype == "float64"

At this stage, every missing value in `clean_data` has been replaced with the median of its column. Now we can extract the datasets needed to build our decision tree.

In [39]:
# Display the feature frame after the median fill.
clean_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,b,16.92,0.335,y,p,k,v,0.290,f,f,0,f,s,200.0,0
1,b,48.50,4.250,u,g,m,v,0.125,t,f,0,t,g,225.0,0
2,a,68.67,15.000,u,g,e,z,0.000,t,t,14,f,g,0.0,3376
3,a,25.08,2.540,y,p,aa,v,0.250,t,f,0,t,g,370.0,0
4,b,33.75,2.750,u,g,i,bb,0.000,f,f,0,f,g,180.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,b,23.17,0.000,u,g,cc,v,0.085,t,f,0,f,g,0.0,0
546,b,48.58,0.205,y,p,k,v,0.250,t,t,11,f,g,380.0,2732
547,b,20.42,7.000,u,g,c,v,1.625,t,t,3,f,g,200.0,1391
548,a,19.67,0.210,u,g,q,h,0.290,t,t,11,f,g,80.0,99


In [40]:
# from sklearn.model_selection import KFold

# kf = KFold(n_splits=10, shuffle=False)
# for fold, (train_idx, test_idx) in enumerate(kf.split(clean_data)):
#     print(f"Fold {fold}")
#     print(f"Train: Index={train_idx}")
#     print(f"Test: Index={test_idx}")

In [48]:
def sequentially_cross_validate(examples, labels, k=10):
    """Walk over `k` contiguous (unshuffled) train/validation splits.

    Fold i validates on rows [i * fold_size, (i + 1) * fold_size) and trains
    on everything else, where fold_size = len(examples) // k. The last fold
    also takes the remainder rows, so it can be larger than the others.

    Parameters
    ----------
    examples : pandas.DataFrame
        Feature rows, sliced positionally with .iloc.
    labels : pandas.Series
        One label per row of `examples`, in the same order.
    k : int, default 10
        Number of folds; must satisfy 1 <= k <= len(examples).

    Returns
    -------
    None
        The splits are built but model training and scoring are still TODO.

    Raises
    ------
    ValueError
        If `examples` and `labels` differ in length, or if `k` is outside
        1..len(examples) (which would give a zero fold size or a division
        by zero).
    """
    n_examples = len(examples)
    if len(labels) != n_examples:
        raise ValueError(
            f"examples and labels must have the same length "
            f"({n_examples} != {len(labels)})"
        )
    if not 1 <= k <= n_examples:
        raise ValueError(f"k must be between 1 and {n_examples}, got {k}")

    fold_size = n_examples // k
    print(f"cv {n_examples} across {k} sets. folds are {fold_size} examples.")
    for i in range(k):
        # Sequential bounds of the validation slice; the final fold runs to
        # the end so the remainder rows are not dropped.
        v_start = i * fold_size
        v_end = n_examples if i == k - 1 else v_start + fold_size

        X_validation = examples.iloc[v_start:v_end]
        y_validation = labels.iloc[v_start:v_end]

        # Everything outside the validation slice is the training set.
        X_train = pd.concat([examples.iloc[:v_start], examples.iloc[v_end:]])
        y_train = pd.concat([labels.iloc[:v_start], labels.iloc[v_end:]])

        # print(f"Fold {i}\n\tValidation from {v_start} to {v_end - 1} of len {len(X_validation)}")
        # TODO: train the model on training set
        # TODO: run prediction on validation set


In [46]:
# Run the sequential cross-validation over the median-filled training data.
sequentially_cross_validate(examples=clean_data, labels=training_labels)


cv 550 across 10 sets. folds are 55 examples.
Fold 0
	Validation from 0 to 54 of len 55
Fold 1
	Validation from 55 to 109 of len 55
Fold 2
	Validation from 110 to 164 of len 55
Fold 3
	Validation from 165 to 219 of len 55
Fold 4
	Validation from 220 to 274 of len 55
Fold 5
	Validation from 275 to 329 of len 55
Fold 6
	Validation from 330 to 384 of len 55
Fold 7
	Validation from 385 to 439 of len 55
Fold 8
	Validation from 440 to 494 of len 55
Fold 9
	Validation from 495 to 549 of len 55


In [27]:
# Row count of the cleaned feature frame.
print(clean_data.shape[0])

549
