# Imports

In [1]:
from scipy.stats import mannwhitneyu

import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# Visualizations for Accuracy Plots 
Experiments 2 and 5

In [None]:
def lineplots(remove, val_scores_in_model, val_scores_out_model, title, palette1, palette2, png_name):
    """Plot in-model and out-model accuracy side by side and save the figure.

    Input:
      remove (array-like): fractions of train data removed; shown in percent
        on the x axis
      val_scores_in_model (array-like): accuracies (as fractions) for the
        left panel
      val_scores_out_model (array-like): accuracies (as fractions) for the
        right panel
      title (str): bold super-title of the figure
      palette1, palette2: forwarded to ``sns.lineplot`` as ``palette`` for the
        left / right panel
      png_name (str): path handed to ``fig.savefig``
    Output:
      None; the figure is written to ``png_name`` and then shown
    """
    # Set the style before any axes are created so that both panels are
    # drawn with the same style (it was previously set after the first
    # subplot had already been created).
    sns.set_style("white")

    fig = plt.figure(1, figsize=(30, 12))
    fig.suptitle(title, fontsize=20, fontweight='bold')

    removed_percent = np.array(remove) * 100
    panels = (
        (121, np.array(val_scores_in_model), palette1),   # in-model accuracy
        (122, np.array(val_scores_out_model), palette2),  # out-model accuracy
    )

    for position, scores, palette in panels:
        plt.subplot(position)
        # NOTE(review): no `hue` is given, so seaborn may ignore `palette`
        # here -- confirm whether a single `color` was intended.
        sns.lineplot(x=removed_percent, y=scores * 100, palette=palette, linewidth=4)

        plt.xlabel('Fraction of train data removed (%)', fontsize=20)
        plt.ylabel('Prediction accuracy (%)', fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)

    fig.savefig(png_name)

    plt.show()

In [None]:
def lineplot(remove, val_scores_diff, title, palette, png_name):
    """Plot a single score curve against the removed fraction and save it.

    Input:
      remove (array-like): fractions of train data removed; shown in percent
        on the x axis
      val_scores_diff (array-like): scores (as fractions) to plot; shown in
        percent on the y axis
      title (str): bold super-title of the figure
      palette: forwarded to ``sns.lineplot`` as ``palette``
      png_name (str): path handed to ``fig.savefig``
    Output:
      None; the figure is written to ``png_name`` and then shown
    """
    # Set the style before the axes are created so that it applies to them.
    sns.set_style("white")

    fig = plt.figure(1, figsize=(30, 12))
    fig.suptitle(title, fontsize=20, fontweight='bold')

    arr_remove = np.array(remove)
    arr_val_scores_diff = np.array(val_scores_diff)

    # The curve occupies the left half of the figure, the same slot that
    # lineplots() uses for its first panel.
    plt.subplot(121)
    # NOTE(review): no `hue` is given, so seaborn may ignore `palette`
    # here -- confirm whether a single `color` was intended.
    sns.lineplot(x=arr_remove * 100, y=arr_val_scores_diff * 100, palette=palette, linewidth=4)

    plt.xlabel('Fraction of train data removed (%)', fontsize=20)
    plt.ylabel('Prediction accuracy (%)', fontsize=20)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)

    fig.savefig(png_name)

    # Was `plt.show` without parentheses, which never displayed the figure.
    plt.show()

# Mann Whitney U Test

## Functions

In [5]:
def standardize_columns(data):
    '''
    Standardize every numeric feature column and keep 'target' as last column.

    Input:
      data (data frame): must contain a 'target' column; non-numeric
        feature columns are dropped from the result
    Output:
      data frame with each numeric feature column standardized to 0-mean and
      unit variance (pandas' default sample standard deviation), followed by
      the untouched 'target' column. A constant feature column has zero
      standard deviation and therefore becomes NaN.
    '''
    numeric_data = data.select_dtypes(include=np.number)
    # 'target' is absent from the numeric selection when it holds
    # non-numeric labels, so its removal must not be mandatory.
    feature_data = numeric_data.drop(columns=['target'], errors='ignore')

    standardized_data = (feature_data - feature_data.mean()) / feature_data.std()
    # Append after the columns that actually survived the numeric filter;
    # using the column count of the input frame overshoots (and raises) as
    # soon as a non-numeric column has been dropped.
    standardized_data.insert(len(standardized_data.columns), 'target', data['target'])

    return standardized_data

In [6]:
def add_col_to_df(data, col_name, col):
    '''
    Input:
      data (data frame): frame to extend; it is not modified
      col_name (str): name of the new column
      col (array-like): values of the new column
    Output:
      data frame, a deep copy of data with the new column placed directly
      in front of the last column
    '''
    extended = data.copy(deep=True)
    # The last column stays last; the new one goes right before it.
    position_before_last = data.shape[1] - 1
    extended.insert(position_before_last, col_name, col)
    return extended

In [7]:
def get_feature_pairs(num_features, col_names):
    '''
    Input:
      num_features (int): kept for backward compatibility; the result is
        determined by col_names alone
      col_names (sequence): column names to pair up
    Output:
      list of (x_name, y_name) tuples, one per ordered pair of col_names,
      with x_name varying fastest. Entry k therefore belongs to panel k+1 of
      a row-major len(col_names) x len(col_names) subplot grid.
    '''
    # Each pair is emitted exactly once. The former outer loop over
    # num_features**2 only repeated this same grid over and over.
    return [(x_name, y_name) for y_name in col_names for x_name in col_names]

In [2]:
def create_boot_and_oob(training_folds, num_boot_samples):
    '''
    Input:
      training_folds (data frame): data to draw bootstrap samples from
      num_boot_samples (int): number of bootstrap samples to draw
    Output:
      list of (boot, oob) tuples; boot is a sample of training_folds drawn
      with replacement, oob holds the rows of training_folds whose index
      label does not occur in boot (in their original order)
    '''
    all_boot_and_oob = []

    for _ in range(num_boot_samples):
        boot = resample(training_folds, replace=True)
        # Select the out-of-bag rows with one boolean mask instead of
        # rebuilding the frame row by row: this keeps the column dtypes and
        # the column layout even when no row is out of bag.
        in_boot = training_folds.index.isin(boot.index)
        oob = training_folds.loc[~in_boot]

        all_boot_and_oob.append((boot, oob))

    return all_boot_and_oob

In [3]:
def compute_accuracies_for_every_instance(training_folds, num_boot_samples, clf):
    '''
    Input:
      training_folds (data frame): features with the target as last column
      num_boot_samples (int): number of bootstrap samples to draw
      clf: classifier offering fit(X, y) and predict(X); it is refit on
        every bootstrap sample
    Output:
      list with one (in_model_acc, out_model_acc) tuple per row of
      training_folds, in index order. Both entries are float arrays:
        in_model_acc  - OOB accuracy of every model whose bootstrap sample
                        contained the instance
        out_model_acc - OOB accuracy, computed without the instance itself,
                        of every model whose bootstrap sample did not
                        contain it
    '''
    all_boot_and_oob = create_boot_and_oob(training_folds, num_boot_samples)

    instance_labels = training_folds.index
    in_model_acc = [[] for _ in range(len(instance_labels))]
    out_model_acc = [[] for _ in range(len(instance_labels))]

    for boot, oob in all_boot_and_oob:
        # The fitted model depends only on the bootstrap sample, so it is
        # trained once per sample rather than once per (instance, sample)
        # pair. NOTE: for a classifier without a fixed random state this
        # means every instance now sees the same fit for a given sample.
        clf.fit(boot.iloc[:, :-1], boot.iloc[:, -1])  # last column: target

        # test = oob
        oob_labels = oob.index
        y_test = oob.iloc[:, -1].to_numpy()
        y_predicted_test = clf.predict(oob.iloc[:, :-1])
        full_oob_accuracy = accuracy_score(y_test, y_predicted_test)

        is_oob = instance_labels.isin(oob_labels)
        for position, x in enumerate(instance_labels):
            if not is_oob[position]:
                # x was part of the training data --> in-model accuracy
                in_model_acc[position].append(full_oob_accuracy)
            else:
                # x was left out --> out-model accuracy; remove the target
                # data point from the test set before scoring
                keep = oob_labels != x
                out_model_acc[position].append(
                    accuracy_score(y_test[keep], y_predicted_test[keep])
                )

    return [
        (np.array(in_acc, dtype=float), np.array(out_acc, dtype=float))
        for in_acc, out_acc in zip(in_model_acc, out_model_acc)
    ]

In [8]:
def scatters_of_multiD_data(data, color_colname, feature_pairs, num_features, setting, cmap,png_name, dot_size=10):
    '''
    Draw a num_features x num_features grid of scatter plots with a shared
    color bar, save the figure and show it.

    Input:
      data (data frame): holds the plotted columns and the color column
      color_colname (str): column whose values color the dots
      feature_pairs (list of tuples): (x column, y column) for each panel;
        entry n-1 is drawn in panel n (panels are numbered row by row)
      num_features (int): number of rows and of columns of the grid
      setting (str): suffix appended to color_colname in the color bar label
      cmap: colormap handed to the scatter plots
      png_name (str): path handed to fig.savefig
      dot_size (number): marker size of the dots
    Output:
      None
    '''
    num_panels = num_features * num_features
    fig = plt.figure(1, figsize=(18, 12))

    # values for color bar
    colors = data[color_colname]
    last_scatter = None

    for n in range(1, num_panels + 1):
        # nrows, ncols, plot_number
        ax = plt.subplot(num_features, num_features, n)
        ax.set_facecolor('white')
        for side in ('bottom', 'top', 'left', 'right'):
            ax.spines[side].set_color('black')

        x_name, y_name = feature_pairs[n - 1]
        last_scatter = ax.scatter(x=data[x_name], y=data[y_name], s=dot_size, c=colors, cmap=cmap)

        plt.xticks(fontsize=16, fontweight='bold')
        plt.yticks(fontsize=16, fontweight='bold')

        # Only the bottom row keeps its x tick labels. The comparison must
        # include the last panel of the second-to-last row, hence `<=`.
        if n <= num_panels - num_features:
            ax.xaxis.set_ticklabels([])
            ax.set_xlabel('')

        # Only the first column (panels 1, 1+num_features, ...) keeps its
        # y tick labels.
        if (n - 1) % num_features != 0:
            ax.yaxis.set_ticklabels([])
            ax.set_ylabel('')

    plt.subplots_adjust(bottom=0.1, right=0.83, top=0.95, wspace=0.05, hspace=0.05)

    # Dedicated axes for the color bar to the right of the grid. All panels
    # share the same color values, so any of the scatters can serve as the
    # mappable. `aspect` is not passed: the color bar fills the explicitly
    # given axes.
    cax = plt.axes([0.85, 0.1, 0.020, 0.85])
    cbar = plt.colorbar(last_scatter, cax=cax)
    cbar.set_label(label=str(color_colname) + setting, fontsize=20)
    cbar.ax.tick_params(labelsize=20)

    fig.savefig(png_name)

    plt.show()

## Test

In [9]:
# Load the Haberman survival data from the working directory.
# NOTE(review): no `header` argument is given, so the first line of the file
# is presumably consumed as column names -- if the raw file has no header row,
# its first record is lost here; confirm against the data file.
Haberman_Survival = pd.read_csv("haberman.data")
haberman_col_names =['Age', 'year', '# positive axillary nodes', 'target']  #target: 'Survival Status'
# Writes back to the same path that was just read, with the names above as header.
Haberman_Survival.to_csv("haberman.data", header=haberman_col_names, index=False)

# Re-read the rewritten file, then standardize the feature columns.
Haberman_Survival = pd.read_csv("haberman.data")
prep_Haberman_Survival = standardize_columns(Haberman_Survival)

In [None]:
# In-model / out-model accuracies of every instance of the standardized
# Haberman data, based on 10000 bootstrap samples and a decision tree.
both_accuracies_for_every_instance_10k_HS = compute_accuracies_for_every_instance(
    training_folds=prep_Haberman_Survival,
    num_boot_samples=10000,
    clf=DecisionTreeClassifier(random_state=22),
)

In [None]:
# Mann-Whitney U test per instance: in-model vs. out-model accuracies.
# The accuracy list holds one (in-model, out-model) pair per row of
# prep_Haberman_Survival, so it is iterated directly.
results_every_instance_HS_10k = []
U_HS_10k = []
p_HS_10k = []
for in_model_acc, out_model_acc in both_accuracies_for_every_instance_10k_HS:
    # U statistic and p-value for this instance
    U, p = mannwhitneyu(in_model_acc, out_model_acc)
    U_HS_10k.append(U)
    p_HS_10k.append(p)
    results_every_instance_HS_10k.append((U, p))

## Visualizing Test Results

In [None]:
# 3 x 3 scatter grid of the Haberman features, colored by the U statistic.
scatters_of_multiD_data(
    data=viz_HS_10k,
    color_colname='U',
    feature_pairs=feature_pairs_HS,
    num_features=3,
    setting='-statistic',
    cmap='jet',
    png_name='WhitneyU_Habermas_10k_final',
    dot_size=60,
)