In [4]:
import numpy as np
import pandas as pd
import dill

In [5]:
# Restore the two fitted trees (with and without zero statuses) from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# that come from a trusted source.
with open("model_evaluation/cred_tree.pkl", "br") as pklfile:
    pctree = dill.load(pklfile)
with open("model_evaluation/cred_tree_no_zeros.pkl", "br") as pklfile:
    zeroless_pctree = dill.load(pklfile)

# Read the matching train/test frames for both variants in one pass.
train, test, zeroless_train, zeroless_test = (
    pd.read_pickle(path)
    for path in (
        "data/train.pkl",
        "data/test_by_ID.pkl",
        "data/train_no_zeros.pkl",
        "data/test_by_ID_no_zeros.pkl",
    )
)

In [6]:
# Columns handed to the trees when asking for a cluster assignment.
predictors = [
    # applicant flags and household
    "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "CNT_CHILDREN",
    # income, education, family and housing
    "AMT_INCOME_TOTAL", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE",
    # age and employment length
    "DAYS_BIRTH", "DAYS_EMPLOYED",
    # contact flags
    "FLAG_MOBIL", "FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL",
    # occupation and family size
    "OCCUPATION_TYPE", "CNT_FAM_MEMBERS",
]

# One cluster prediction per test record, for each of the two trees.
cluster_preds = pctree.predict(test[predictors])
zeroless_cluster_preds = zeroless_pctree.predict(zeroless_test[predictors])

In [7]:
def calculate_pmf(df, idx):
    """Return the empirical pmf of ``df["STATUS"]`` laid out over ``idx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``STATUS`` column.
    idx : array-like
        Status values used as the index of the result. Every status that
        occurs in ``df`` is expected to be one of these values.

    Returns
    -------
    pandas.Series
        float64 Series indexed by ``idx``. Each entry is the number of rows
        with that status divided by ``df.shape[0]``; statuses that never
        occur (and every status when nothing is counted) are ``0.0``.
    """
    # Start from float zeros: the values written below are fractions, and
    # writing floats into an integer Series relies on silent upcasting that
    # recent pandas versions deprecate.
    pmf = pd.Series(0.0, index=idx)
    counts = df["STATUS"].value_counts()
    if counts.empty:
        # Nothing to count (e.g. an empty frame): avoid dividing by zero rows.
        return pmf
    # Assign positionally to the observed labels, in value_counts order.
    pmf.loc[counts.index] = (counts / df.shape[0]).to_numpy()
    return pmf

# Status values each pmf is laid out over (the zeroless variant has no 0).
STATUS_SUPPORT = np.arange(-6, 2)
ZEROLESS_STATUS_SUPPORT = (-6, -5, -4, -3, -2, -1, 1)


def _predicted_pmfs(tree, cluster_ids, idx):
    """Build one predicted-pmf row per entry of ``cluster_ids``.

    The pmf of a leaf node depends only on the cluster id, so it is computed
    once per distinct cluster and reused for every record assigned to it,
    instead of being recomputed for each record.

    Parameters
    ----------
    tree : fitted tree exposing ``leaf_nodes_``, indexable by cluster id.
    cluster_ids : pandas.Series of predicted cluster ids.
    idx : array-like of status values used as the pmf columns.

    Returns
    -------
    pandas.DataFrame indexed like ``cluster_ids`` with one column per status.
    """
    pmf_by_cluster = {}
    rows = []
    for clust in cluster_ids:
        if clust not in pmf_by_cluster:
            pmf_by_cluster[clust] = calculate_pmf(tree.leaf_nodes_[clust], idx=idx)
        rows.append(pmf_by_cluster[clust])
    return pd.DataFrame(rows, index=cluster_ids.index)


# get predicted pmfs and actual pmfs for test set with zeros
pred_pmfs = _predicted_pmfs(pctree, cluster_preds.loc[:, 0], STATUS_SUPPORT)
by_ID = test.groupby("ID")
actual_pmfs = by_ID.apply(calculate_pmf, idx=STATUS_SUPPORT)

# get predicted pmfs and actual pmfs for zeroless test set
zeroless_pred_pmfs = _predicted_pmfs(
    zeroless_pctree, zeroless_cluster_preds.loc[:, 0], ZEROLESS_STATUS_SUPPORT
)
zeroless_by_ID = zeroless_test.groupby("ID")
zeroless_actual_pmfs = zeroless_by_ID.apply(calculate_pmf, idx=ZEROLESS_STATUS_SUPPORT)

In [8]:
# Sanity check: predicted and actual frames should have matching shapes.
tuple(
    pmfs.shape
    for pmfs in (pred_pmfs, actual_pmfs, zeroless_pred_pmfs, zeroless_actual_pmfs)
)

((33718, 8), (33718, 8), (29507, 7), (29507, 7))

In [9]:
# Baseline ("null hypothesis") pmf: share of each STATUS value among all
# training rows, ordered by status value.
nullhypo_pmf = (train["STATUS"].value_counts() / len(train)).sort_index()

# Same baseline computed from the zeroless training set.
zeroless_nullhypo_pmf = (
    zeroless_train["STATUS"].value_counts() / len(zeroless_train)
).sort_index()

nullhypo_pmf, zeroless_nullhypo_pmf

(-6    0.001936
 -5    0.000261
 -4    0.000348
 -3    0.001051
 -2    0.011283
 -1    0.374029
  0    0.187431
  1    0.423662
 Name: STATUS, dtype: float64,
 -6    0.002382
 -5    0.000321
 -4    0.000428
 -3    0.001293
 -2    0.013885
 -1    0.460305
  1    0.521386
 Name: STATUS, dtype: float64)

In [10]:
# A pmf is flagged "mostly on time" when statuses 1 and 0 together carry
# more than half of its mass.
pred_on_time_mass = pred_pmfs.loc[:, 1] + pred_pmfs.loc[:, 0]
actual_on_time_mass = actual_pmfs.loc[:, 1] + actual_pmfs.loc[:, 0]
nullhypo_on_time_mass = nullhypo_pmf.loc[1] + nullhypo_pmf.loc[0]

pred_mostly_on_time = pred_on_time_mass > 0.5
actual_mostly_on_time = actual_on_time_mass > 0.5
# Scalar flag: the baseline makes the same call for every record.
nullhypo_mostly_on_time = nullhypo_on_time_mass > 0.5

# Fraction of records whose flag agrees with the actual flag.
model_accuracy = (pred_mostly_on_time == actual_mostly_on_time).mean()
nullhypo_accuracy = (nullhypo_mostly_on_time == actual_mostly_on_time).mean()

print("Model accuracy prediction at least 50% 1s or 0s: ", model_accuracy)
print("Null hypothesis at least 50% 1s or 0s: ", nullhypo_accuracy)

Model accuracy prediction at least 50% 1s or 0s:  0.5927397829052732
Null hypothesis at least 50% 1s or 0s:  0.5257725843762975


In [16]:
# Confusion-matrix rates for the model, with "mostly on time" as the
# positive class. Positive rates are normalised by the number of actual
# positives, negative rates by the number of actual negatives.
actual_not_on_time = ~actual_mostly_on_time
pred_not_on_time = ~pred_mostly_on_time

num_pos = actual_mostly_on_time.sum()
num_neg = actual_not_on_time.sum()

true_pos_rate = (pred_mostly_on_time & actual_mostly_on_time).sum() / num_pos
true_neg_rate = (pred_not_on_time & actual_not_on_time).sum() / num_neg
false_pos_rate = (pred_mostly_on_time & actual_not_on_time).sum() / num_neg
false_neg_rate = (pred_not_on_time & actual_mostly_on_time).sum() / num_pos

print("Model Prediction Rates")
print("--------------------")
for rate_label, rate_value in (
    ("True positive rate: ", true_pos_rate),
    ("True negative rate: ", true_neg_rate),
    ("False positive rate: ", false_pos_rate),
    ("False negative rate: ", false_neg_rate),
):
    print(rate_label, rate_value)

Model Prediction Rates
--------------------
True positive rate:  0.8383348375451264
True negative rate:  0.32045028142589116
False positive rate:  0.6795497185741088
False negative rate:  0.16166516245487364


In [17]:
# Same four rates for the baseline, whose single scalar flag is broadcast
# against every record's actual flag. Reuses num_pos / num_neg from the
# model-rates cell.
actual_not_on_time = ~actual_mostly_on_time
nullhypo_not_on_time = ~nullhypo_mostly_on_time

nullhypo_true_pos_rate = (
    (nullhypo_mostly_on_time & actual_mostly_on_time).sum() / num_pos
)
nullhypo_true_neg_rate = (
    (nullhypo_not_on_time & actual_not_on_time).sum() / num_neg
)
nullhypo_false_pos_rate = (
    (nullhypo_mostly_on_time & actual_not_on_time).sum() / num_neg
)
nullhypo_false_neg_rate = (
    (nullhypo_not_on_time & actual_mostly_on_time).sum() / num_pos
)

print("Null Hypothesis Accuracy Rates")
print("------------------------------")
for rate_label, rate_value in (
    ("True positive rate: ", nullhypo_true_pos_rate),
    ("True negative rate: ", nullhypo_true_neg_rate),
    ("False positive rate: ", nullhypo_false_pos_rate),
    ("False negative rate: ", nullhypo_false_neg_rate),
):
    print(rate_label, rate_value)

Null Hypothesis Accuracy Rates
------------------------------
True positive rate:  1.0
True negative rate:  0.0
False positive rate:  1.0
False negative rate:  0.0


In [13]:
# Zeroless variant: the pmfs have no 0 column, so "mostly on time" is decided
# by the mass on status 1 alone.
zeroless_pred_mostly_on_time = zeroless_pred_pmfs.loc[:, 1] > 0.5
zeroless_actual_mostly_on_time = zeroless_actual_pmfs.loc[:, 1] > 0.5
# Scalar flag: the baseline makes the same call for every record.
zeroless_nullhypo_mostly_on_time = zeroless_nullhypo_pmf.loc[1] > 0.5

print(
    "Zeroless model accuracy predicting at least 50% 1s: ",
    (zeroless_pred_mostly_on_time == zeroless_actual_mostly_on_time).mean()
)
# Label fixed: it previously said "1s or 0s" although only status 1 is
# tested here, and it did not say that the number is an accuracy.
print(
    "Zeroless null hypothesis accuracy predicting at least 50% 1s: ",
    (zeroless_nullhypo_mostly_on_time == zeroless_actual_mostly_on_time).mean()
)

Zeroless model accuracy predicting at least 50% 1s:  0.6156166333412411
Zeroless null hypothesis at least 50% 1s or 0s:  0.389873589317789


In [18]:
# Confusion-matrix rates for the zeroless model, with "mostly on time" as
# the positive class. Positive rates are normalised by the number of actual
# positives, negative rates by the number of actual negatives.
zeroless_actual_not_on_time = ~zeroless_actual_mostly_on_time
zeroless_pred_not_on_time = ~zeroless_pred_mostly_on_time

zeroless_num_pos = zeroless_actual_mostly_on_time.sum()
zeroless_num_neg = zeroless_actual_not_on_time.sum()

zeroless_true_pos_rate = (
    (zeroless_pred_mostly_on_time & zeroless_actual_mostly_on_time).sum()
    / zeroless_num_pos
)
zeroless_true_neg_rate = (
    (zeroless_pred_not_on_time & zeroless_actual_not_on_time).sum()
    / zeroless_num_neg
)
zeroless_false_pos_rate = (
    (zeroless_pred_mostly_on_time & zeroless_actual_not_on_time).sum()
    / zeroless_num_neg
)
zeroless_false_neg_rate = (
    (zeroless_pred_not_on_time & zeroless_actual_mostly_on_time).sum()
    / zeroless_num_pos
)

print("Zeroless Model Accuracy Rates")
print("------------------------------")
for rate_label, rate_value in (
    ("True positive rate: ", zeroless_true_pos_rate),
    ("True negative rate: ", zeroless_true_neg_rate),
    ("False positive rate: ", zeroless_false_pos_rate),
    ("False negative rate: ", zeroless_false_neg_rate),
):
    print(rate_label, rate_value)

Zeroless Model Accuracy Rates
------------------------------
True positive rate:  0.6494262865090403
True negative rate:  0.594012109092929
False positive rate:  0.40598789090707105
False negative rate:  0.35057371349095967


In [19]:
# Same four rates for the zeroless baseline, whose single scalar flag is
# broadcast against every record's actual flag. Reuses zeroless_num_pos /
# zeroless_num_neg from the zeroless model-rates cell.
zeroless_nullhypo_true_pos_rate = (zeroless_nullhypo_mostly_on_time & zeroless_actual_mostly_on_time).sum() / zeroless_num_pos
zeroless_nullhypo_true_neg_rate = (~zeroless_nullhypo_mostly_on_time & ~zeroless_actual_mostly_on_time).sum() / zeroless_num_neg
zeroless_nullhypo_false_pos_rate = (zeroless_nullhypo_mostly_on_time & ~zeroless_actual_mostly_on_time).sum() / zeroless_num_neg
zeroless_nullhypo_false_neg_rate = (~zeroless_nullhypo_mostly_on_time & zeroless_actual_mostly_on_time).sum() / zeroless_num_pos

# Header fixed: it used to be identical to the header printed for the
# baseline on the data with zeros, making the two outputs indistinguishable.
zeroless_nullhypo_header = "Zeroless Null Hypothesis Accuracy Rates"
print(zeroless_nullhypo_header)
print("-" * len(zeroless_nullhypo_header))
print("True positive rate: ", zeroless_nullhypo_true_pos_rate)
print("True negative rate: ", zeroless_nullhypo_true_neg_rate)
print("False positive rate: ", zeroless_nullhypo_false_pos_rate)
print("False negative rate: ", zeroless_nullhypo_false_neg_rate)

Null Hypothesis Accuracy Rates
------------------------------
True positive rate:  1.0
True negative rate:  0.0
False positive rate:  1.0
False negative rate:  0.0
