In [1]:
import PRF
from sklearn.ensemble import RandomForestClassifier
import numpy

In [2]:
# Load the feature matrix and the class labels from disk.
X = numpy.load('data/bootstrap_X.npy')
y = numpy.load('data/bootstrap_y.npy')
# Merge every label larger than 2 into class 2.
y[y > 2] = 2

n_objects = X.shape[0]
n_features = X.shape[1]
print(n_objects, 'objects,', n_features, 'features')

# Select the objects whose label is in the list below and shuffle their indices.
# (A previous shuffle of *all* indices was overwritten before being used and
# has been removed.)
# NOTE(review): labels were clipped to <= 2 just above, so the entries
# 4, 5, 6, 8 and 13 can never match here and the filter effectively keeps
# labels 1 and 2 only. Confirm whether the filter was meant to run before
# the clipping.
selected_labels = [1, 2, 4, 5, 6, 8, 13]
shuffled_inds = numpy.where(numpy.isin(y, selected_labels))[0]
# NOTE(review): no random seed is set, so the split differs on every run.
shuffled_inds = numpy.random.choice(shuffled_inds,len(shuffled_inds),replace=False)
n_train = 5000
n_test = 500
print('Train set size = {}, Test set size = {}'.format(n_train, n_test))

# nf is the number of leading feature columns kept (here: all of them).
nf = n_features
train_inds = shuffled_inds[:n_train]
X_train = X[train_inds][:,:nf]
y_train = y[train_inds]

# The test set is the next n_test shuffled indices, disjoint from the train set.
test_inds = shuffled_inds[n_train:(n_train + n_test)]
X_test = X[test_inds][:,:nf]
y_test = y[test_inds]

45879 objects, 17 features
Train set size = 5000, Test set size = 500


In [None]:
# Baseline without missing values: fit a single-tree PRF on the training set
# and evaluate it on the test set.
n_trees = 1
prf_cls = PRF.prf(n_estimators=n_trees,  bootstrap=True)
prf_cls.fit(X=X_train, y=y_train)
# Last expression of the cell, so the notebook displays the returned score.
prf_cls.score(X_test, y=y_test)

# Missing values
* The original data does not have missing values. We add missing values by setting randomly chosen elements of X to numpy.nan.
* We compare the results to the original RF (scikit-learn's RandomForestClassifier), for which the missing values are imputed.

In [None]:
from sklearn.impute import SimpleImputer as Imputer # for new versions for sklearn
#from sklearn.preprocessing import Imputer # old versions of sklearn

import matplotlib.pyplot as plt

def insert_nans(X_train, X_test, nan_frac):
    """Return NaN-corrupted and median-imputed copies of the train and test sets.

    For each set, ``int(n_elements * nan_frac)`` positions are drawn uniformly
    at random, with replacement, and set to ``numpy.nan``. Because a position
    can be drawn more than once, the realised fraction of NaNs is generally
    lower than ``nan_frac``, which is why values above 1 are still usable.

    Parameters
    ----------
    X_train, X_test : 2-D numpy arrays
        Feature matrices. They are copied, never modified. The arrays must be
        of a dtype that can hold NaN.
    nan_frac : float
        Number of NaN assignments per set, expressed as a fraction of the
        number of elements in that set.

    Returns
    -------
    X_train_w_nans, X_test_w_nans : numpy arrays
        Copies of the inputs with NaNs inserted.
    X_train_w_nans_imp, X_test_w_nans_imp : numpy arrays
        The NaN-containing sets after median imputation. The imputer is fitted
        on the train set only and then applied to both sets.
    """
    def _scatter_nans(X):
        # Row/column ranges come from the array itself rather than from
        # notebook-level globals, so any 2-D array can be passed in.
        X_w_nans = X.copy()
        n_rows, n_cols = X.shape
        nof_nans = int(X.size * nan_frac)
        if nof_nans > 0:
            rows = numpy.random.choice(n_rows, size=nof_nans)
            cols = numpy.random.choice(n_cols, size=nof_nans)
            X_w_nans[rows, cols] = numpy.nan
        return X_w_nans

    X_train_w_nans = _scatter_nans(X_train)
    X_test_w_nans = _scatter_nans(X_test)

    imp = Imputer(missing_values=numpy.nan, strategy='median')
    X_train_w_nans_imp = imp.fit_transform(X_train_w_nans)
    # Reuse the statistics learned on the train set; re-fitting on the test
    # set would impute with test-set medians instead.
    X_test_w_nans_imp = imp.transform(X_test_w_nans)

    return X_train_w_nans, X_test_w_nans, X_train_w_nans_imp, X_test_w_nans_imp

def prf_rf_nans_compare_single(X_train, X_test, n_trees, nan_frac):
    """Print the scores of one PRF and one RF on data with inserted NaNs.

    ``insert_nans`` is used to create NaN-containing and imputed versions of
    both sets. The PRF is fitted and scored on the NaN-containing sets; the
    ``RandomForestClassifier`` is fitted and scored on the imputed sets, so
    both models see the same corrupted data. Labels are read from the
    notebook-level ``y_train`` and ``y_test``.

    Parameters
    ----------
    X_train, X_test : 2-D numpy arrays
        Clean feature matrices.
    n_trees : int
        Number of trees for both forests.
    nan_frac : float
        Passed through to ``insert_nans``.

    Returns
    -------
    None
        Results are only printed.
    """
    X_train_w_nans, X_test_w_nans, X_train_w_nans_imp, X_test_w_nans_imp = insert_nans(X_train, X_test, nan_frac)

    # Report the NaN fraction actually realised in the train set.
    nof_nans = numpy.sum(numpy.isnan(X_train_w_nans))
    print('fraction of nans in X: {:.3f}'.format(nof_nans/numpy.prod(X_train.shape)))

    print('Accuracy for {} trees --- '.format(n_trees))

    prf_cls = PRF.prf(n_estimators=n_trees,  bootstrap=True)
    prf_cls.fit(X=X_train_w_nans, y=y_train)
    print('PRF: {}'.format(prf_cls.score(X=X_test_w_nans, y=y_test)))

    RF = RandomForestClassifier(n_estimators=n_trees,n_jobs=-1, bootstrap=True)
    # Train on the imputed train set (previously the clean X_train was used,
    # leaving X_train_w_nans_imp unused and the two models on different data).
    RF.fit(X_train_w_nans_imp, y_train)
    print('RF: {}'.format(RF.score(X_test_w_nans_imp,y_test)))

    return

def plot_prf_rf_cmpr(nan_frac_vec_true, prf_scores, prf_scores_stds, rf_scores, rf_scores_stds):
    """Draw the PRF and RF scores against the NaN fraction as error-bar curves.

    Parameters
    ----------
    nan_frac_vec_true : sequence of float
        x positions shared by both curves.
    prf_scores, rf_scores : sequence of float
        y values of the PRF and RF curves.
    prf_scores_stds, rf_scores_stds : sequence of float
        Error-bar half-lengths of the PRF and RF curves.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    font_size = 20
    errorbar_alpha = 0.3

    plt.figure(figsize = (10,7))

    curves = (
        ('PRF', prf_scores, prf_scores_stds),
        ('RF', rf_scores, rf_scores_stds),
    )
    for curve_label, means, stds in curves:
        _, _, bar_lines = plt.errorbar(
            x=nan_frac_vec_true,
            y=means,
            yerr=stds,
            capsize=10,
            label=curve_label,
            fmt='--*',
            markersize=15,
            capthick=3,
        )
        # Fade the third element returned by errorbar (the bar lines).
        for bar_line in bar_lines:
            bar_line.set_alpha(errorbar_alpha)

    plt.legend(fontsize=font_size)
    plt.xlabel('Fraction of NaNs in X', fontsize=font_size)
    plt.ylabel('Accuracy', fontsize=font_size)
    plt.xticks(fontsize=font_size)

    # Relabel the existing y ticks as percentages with one decimal.
    tick_positions = plt.gca().get_yticks()
    tick_labels = ['{}%'.format('%.1f' % (pos*100)) for pos in tick_positions]
    plt.yticks(tick_positions, tick_labels, fontsize=font_size)

    plt.tight_layout()
    plt.show()

    return

def prf_rf_nans_compare_full(X_train, X_test, n_itr, n_trees):
    """Sweep over NaN fractions with NaNs in both the train and the test set.

    For every entry of a fixed list of ``nan_frac`` values, ``insert_nans``
    produces NaN-containing and imputed versions of both sets. A PRF is fitted
    ``n_itr`` times on the NaN-containing train set and scored on the
    NaN-containing test set; a ``RandomForestClassifier`` is fitted ``n_itr``
    times on the imputed train set and scored on the imputed test set. Labels
    are read from the notebook-level ``y_train`` and ``y_test``.

    One summary line is printed per fraction and the resulting curves are
    drawn with ``plot_prf_rf_cmpr``.

    Parameters
    ----------
    X_train, X_test : 2-D numpy arrays
        Clean feature matrices.
    n_itr : int
        Number of fit/score repetitions per model and per NaN fraction.
    n_trees : int
        Number of trees for both forests.

    Returns
    -------
    tuple
        ``(nan_frac_vec_true, prf_scores, rf_scores, prf_scores_stds,
        rf_scores_stds)``, where ``nan_frac_vec_true`` holds the NaN fraction
        measured in the train set and the score lists hold the mean / standard
        deviation over the ``n_itr`` repetitions.
    """
    total_elements = numpy.prod(X_train.shape)

    nan_frac_vec = [0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.1, 1.5, 2]
    nan_frac_vec_true = numpy.zeros(len(nan_frac_vec))

    prf_scores, prf_scores_stds = [], []
    rf_scores, rf_scores_stds = [], []

    for n_idx, nan_frac in enumerate(nan_frac_vec):
        (X_train_w_nans, X_test_w_nans,
         X_train_w_nans_imp, X_test_w_nans_imp) = insert_nans(X_train, X_test, nan_frac)

        # PRF consumes the NaN-containing arrays directly.
        prf_runs = []
        for _ in range(n_itr):
            prf_cls = PRF.prf(n_estimators=n_trees, bootstrap=True, keep_proba=0.01)
            prf_cls.fit(X=X_train_w_nans, y=y_train)
            prf_runs.append(prf_cls.score(X=X_test_w_nans, y=y_test))
        prf_runs = numpy.array(prf_runs, dtype=float)
        prf_scores.append(prf_runs.mean())
        prf_scores_stds.append(prf_runs.std())

        # The regular RF is given the imputed arrays.
        rf_runs = []
        for _ in range(n_itr):
            RF = RandomForestClassifier(n_estimators=n_trees, n_jobs=-1, bootstrap=True)
            RF.fit(X_train_w_nans_imp, y_train)
            rf_runs.append(RF.score(X_test_w_nans_imp, y_test))
        rf_runs = numpy.array(rf_runs, dtype=float)
        rf_scores.append(rf_runs.mean())
        rf_scores_stds.append(rf_runs.std())

        # Fraction of NaNs actually present in the train set.
        nan_count = numpy.sum(numpy.isnan(X_train_w_nans))
        nan_frac_vec_true[n_idx] = nan_count/total_elements

        print('nan fraction:{:.2f}, PRF:{:.3f}, RF:{:.3f}'.format(nan_frac_vec_true[n_idx],prf_scores[-1], rf_scores[-1]))

    plot_prf_rf_cmpr(nan_frac_vec_true, prf_scores, prf_scores_stds, rf_scores, rf_scores_stds)

    return nan_frac_vec_true, prf_scores, rf_scores, prf_scores_stds, rf_scores_stds


def prf_rf_nans_test_set_only_compare_full(X_train, X_test, n_itr, n_trees):
    """Sweep over NaN fractions with NaNs in the test set only.

    For every entry of a fixed list of ``nan_frac`` values, ``insert_nans``
    produces a NaN-containing and an imputed version of the test set. Both
    models are fitted ``n_itr`` times on the clean ``X_train``; the PRF is
    scored on the NaN-containing test set and the ``RandomForestClassifier``
    on the imputed test set. Labels are read from the notebook-level
    ``y_train`` and ``y_test``.

    One summary line is printed per fraction and the resulting curves are
    drawn with ``plot_prf_rf_cmpr``.

    Parameters
    ----------
    X_train, X_test : 2-D numpy arrays
        Clean feature matrices.
    n_itr : int
        Number of fit/score repetitions per model and per NaN fraction.
    n_trees : int
        Number of trees for both forests.

    Returns
    -------
    tuple
        ``(nan_frac_vec_true, prf_scores, rf_scores, prf_scores_stds,
        rf_scores_stds)``, where ``nan_frac_vec_true`` holds the NaN fraction
        measured in the test set and the score lists hold the mean / standard
        deviation over the ``n_itr`` repetitions.
    """
    prf_scores = []
    prf_scores_stds = []
    rf_scores = []
    rf_scores_stds = []

    nan_frac_vec = [0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.1, 1.5, 2]
    nan_frac_vec_true = numpy.zeros(len(nan_frac_vec))
    for n_idx, nan_frac in enumerate(nan_frac_vec):

        X_train_w_nans, X_test_w_nans, X_train_w_nans_imp, X_test_w_nans_imp = insert_nans(X_train, X_test, nan_frac)

        scores = numpy.zeros(n_itr)
        for i in range(n_itr):
            prf_cls = PRF.prf(n_estimators=n_trees,  bootstrap=True, keep_proba=0.01)
            # Train on the clean data; only the test set carries NaNs.
            prf_cls.fit(X=X_train, y=y_train)
            scores[i] = prf_cls.score(X=X_test_w_nans, y=y_test)
        prf_scores += [scores.mean()]
        prf_scores_stds += [scores.std()]

        scores = numpy.zeros(n_itr)
        for i in range(n_itr):
            RF = RandomForestClassifier(n_estimators=n_trees,n_jobs=-1, bootstrap=True)
            RF.fit(X_train, y_train)
            scores[i] = RF.score(X_test_w_nans_imp,y_test)
        rf_scores += [scores.mean()]
        rf_scores_stds += [scores.std()]

        # Measure the NaN fraction on the test set, which is the data that
        # actually contains NaNs in this experiment (the NaN-containing train
        # set is not used by either model here).
        nof_nans = numpy.sum(numpy.isnan(X_test_w_nans))
        nan_frac_vec_true[n_idx] = nof_nans/numpy.prod(X_test.shape)

        print('nan fraction:{:.2f}, PRF:{:.3f}, RF:{:.3f}'.format(nan_frac_vec_true[n_idx],prf_scores[-1], rf_scores[-1]))

    plot_prf_rf_cmpr(nan_frac_vec_true, prf_scores, prf_scores_stds, rf_scores, rf_scores_stds)

    return nan_frac_vec_true, prf_scores, rf_scores, prf_scores_stds, rf_scores_stds

In [None]:
# Single comparison with 1 tree and nan_frac=0.5; the function prints its
# results and returns None, so `_` only absorbs the return value.
_ = prf_rf_nans_compare_single(X_train=X_train, X_test=X_test, n_trees=1, nan_frac=0.5)

In [None]:
# Same comparison with 10 trees and nan_frac=0.5; results are printed.
_ = prf_rf_nans_compare_single(X_train=X_train, X_test=X_test, n_trees=10, nan_frac=0.5)

In [None]:
# Same comparison with 100 trees and nan_frac=0.5; results are printed.
_ = prf_rf_nans_compare_single(X_train=X_train, X_test=X_test, n_trees=100, nan_frac=0.5)

# Missing values in both train and test sets
* The PRF accuracy is higher for a single tree, but the same as a regular RF for a large number of trees

In [None]:
# NaN-fraction sweep with single-tree forests, 10 repetitions per fraction.
# The returned tuple is discarded; the function prints and plots the results.
_ = prf_rf_nans_compare_full(X_train=X_train, X_test=X_test, n_itr=10, n_trees=1)

In [None]:
# NaN-fraction sweep with 25-tree forests, a single repetition per fraction
# (so the reported standard deviations are zero).
_ = prf_rf_nans_compare_full(X_train=X_train, X_test=X_test, n_itr = 1, n_trees = 25)

# Missing values in test set only
* The PRF accuracy is higher even for a large number of trees

In [None]:
# Test-set-only NaN sweep with single-tree forests, 10 repetitions per fraction.
# The returned tuple is discarded; the function prints and plots the results.
_ = prf_rf_nans_test_set_only_compare_full(X_train=X_train, X_test=X_test, n_itr = 10, n_trees = 1)

In [None]:
# Test-set-only NaN sweep with 10-tree forests, 5 repetitions per fraction.
_ = prf_rf_nans_test_set_only_compare_full(X_train=X_train, X_test=X_test, n_itr = 5, n_trees = 10)

In [None]:
# Test-set-only NaN sweep with 25-tree forests, a single repetition per
# fraction (so the reported standard deviations are zero).
_ = prf_rf_nans_test_set_only_compare_full(X_train=X_train, X_test=X_test, n_itr = 1, n_trees = 25)