# Import

In [9]:
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Code for Experiments

In [10]:
class Prep_Bdv_Experiments():
    """Compute bootstrap data values (bdv) and build thresholded training sets.

    For every row of the training data, bootstrap samples are used to estimate
    - the in-model accuracy: mean out-of-bag (oob) accuracy of classifiers whose
      bootstrap sample contains the row, and
    - the out-model accuracy: mean oob accuracy (the row itself excluded from
      the test set) of classifiers whose bootstrap sample does not contain it.
    The bootstrap data value of a row is the difference of the two.

    The values are then used to remove a given % of the rows with the lowest or
    highest value (thresholding experiments).

    The target is expected in the last column of the dataframe; all other
    columns are used as features.
    """

    # Accepted values of the string options.
    _ACC_KINDS = ('in-model', 'out-model', 'difference')
    _REMOVE_KINDS = ('low', 'high')
    # Name of the temporary column holding the data values.
    _BDV_COLUMN = 'Bootstrap Data Values'

    ######## 1. Constructor ########

    def __init__(self,
                 training_folds,
                 acc_kind='in-model',
                 clf=None,
                 num_boot_samples=1000,
                 remove=None,
                 remove_kind='low'
                 ):
        """Create a new class instance.

        training_folds    training dataset (composed of k-folds); format: dataframe
                          with a unique index and the target in the last column
        acc_kind          value stored per data point: 'in-model' accuracy,
                          'out-model' accuracy or their 'difference' (=bdv)
        clf               a classifier with fit/predict, e.g., decision tree; it is
                          fitted in place. Default (None): a new
                          DecisionTreeClassifier(random_state=22) per instance
        num_boot_samples  # bootstrap samples used for data valuation method
        remove            fractions (0-1) of data to be removed for thresholding
                          experiments. Default (None): [0, 0.05, 0.10, 0.15]
        remove_kind       'low' removes the rows with the lowest values,
                          'high' the rows with the highest values

        Raises ValueError if acc_kind or remove_kind is not an accepted value.
        """
        # Defaults are created per instance so that neither the fitted classifier
        # nor the list handed out by get_remove() is shared between instances.
        if clf is None:
            clf = DecisionTreeClassifier(random_state=22)
        if remove is None:
            remove = [0, 0.05, 0.10, 0.15]

        # Fail before the expensive bootstrap computation, not after it.
        self._check_choice('acc_kind', acc_kind, self._ACC_KINDS)
        self._check_choice('remove_kind', remove_kind, self._REMOVE_KINDS)

        self._num_boot_samples = num_boot_samples
        self._remove = remove
        self._bdv = self.compute_bdv(training_folds, acc_kind, num_boot_samples, clf)
        self._train_data_final_clf = self.compute_train_data_final_clf(training_folds, remove, remove_kind)

    @staticmethod
    def _check_choice(name, value, allowed):
        """Raise ValueError if value is not one of the allowed option strings."""
        if value not in allowed:
            raise ValueError(
                "{} must be one of {}, got {!r}".format(name, allowed, value)
            )

    @staticmethod
    def _mean_or_nan(values):
        """Return the mean of values, or NaN (without a warning) if there are none."""
        return np.mean(values) if values else np.nan

    ######## 2. Access attributes ########

    def get_num_boot_samples(self):
        """Return the number of bootstrap samples."""
        return self._num_boot_samples

    def get_remove(self):
        """Return % of data to be removed from training data."""
        return self._remove

    def get_bdv(self):
        """Return bootstrap data values for each data point."""
        return self._bdv

    def get_train_data_final_clf(self):
        """Return processed training data for final classifier.

        Format: tuple (list of X, list of y, list of removed indexes) with one
        entry per removal fraction.
        """
        return self._train_data_final_clf

    ######## 3. Compute Bootstrap Data Values ########

    # 3.1 Generate bootstrap samples and out-of-bag instances (oob)
    def create_boot_and_oob(self, training_folds, num_boot_samples):
        """Draw bootstrap samples and collect their out-of-bag rows.

        Returns a list of num_boot_samples tuples (boot, oob). boot is a sample
        of training_folds drawn with replacement; oob holds the rows of
        training_folds whose index label does not occur in boot, in their
        original order and with their original dtypes.
        """
        all_boot_and_oob = []

        for _ in range(num_boot_samples):
            boot = resample(training_folds, replace=True)
            # One vectorised membership test instead of a per-row lookup.
            in_bag = training_folds.index.isin(boot.index)
            oob = training_folds.loc[~in_bag]
            all_boot_and_oob.append((boot, oob))

        return all_boot_and_oob

    # 3.2 Compute Bootstrap Data Values (bdv)
    def compute_bdv(self, training_folds, acc_kind, num_boot_samples, clf):
        """Return one value per row of training_folds (in row order).

        Depending on acc_kind the value is the mean in-model accuracy, the mean
        out-model accuracy or their difference (the bootstrap data value).
        A mean over no observations is NaN, e.g. the out-model accuracy of a row
        that is part of every bootstrap sample.

        Bootstrap samples with an empty oob set are skipped, as is the out-model
        accuracy for a row that is the only oob row of a sample: there is no
        test data to score in either case.

        Raises ValueError for an unknown acc_kind or a non-unique index.
        """
        self._check_choice('acc_kind', acc_kind, self._ACC_KINDS)
        if not training_folds.index.is_unique:
            # Rows are matched to bootstrap samples by index label.
            raise ValueError("training_folds must have a unique index")

        all_boot_and_oob = self.create_boot_and_oob(training_folds, num_boot_samples)

        labels = list(training_folds.index)
        in_model_acc = [[] for _ in labels]   # per row: accuracies of models trained with it
        out_model_acc = [[] for _ in labels]  # per row: accuracies of models trained without it

        for boot, oob in all_boot_and_oob:
            num_oob = len(oob.index)
            if num_oob == 0:
                continue

            # The fitted model only depends on the bootstrap sample, not on the
            # row being valued, so fit and predict once per sample.
            clf.fit(boot.iloc[:, :-1], boot.iloc[:, -1])  # last column: target
            predicted = clf.predict(oob.iloc[:, :-1])
            correct = np.asarray(predicted) == oob.iloc[:, -1].to_numpy()
            num_correct = int(correct.sum())

            # Accuracy on the complete oob set (used for rows inside the sample).
            full_oob_acc = num_correct / num_oob

            # Accuracy on the oob set without one row (used for that oob row):
            # remove the row's own hit/miss from the count of correct predictions.
            if num_oob > 1:
                without_row_acc = dict(
                    zip(oob.index, (num_correct - correct) / (num_oob - 1))
                )
            else:
                without_row_acc = {}

            oob_labels = set(oob.index)
            for position, label in enumerate(labels):
                if label not in oob_labels:
                    in_model_acc[position].append(full_oob_acc)
                elif without_row_acc:
                    out_model_acc[position].append(without_row_acc[label])

        bdv = []  # list of bootstrap data values or in-model or out-model accuracies
        for in_acc, out_acc in zip(in_model_acc, out_model_acc):
            mean_in_model_acc = self._mean_or_nan(in_acc)
            mean_out_model_acc = self._mean_or_nan(out_acc)

            # user selects type of accuracy needed for experiment
            if acc_kind == 'difference':  # difference = bootstrap data values
                bdv.append(mean_in_model_acc - mean_out_model_acc)
            elif acc_kind == 'in-model':
                bdv.append(mean_in_model_acc)
            else:  # 'out-model'
                bdv.append(mean_out_model_acc)

        return bdv

    ######## 4. Training Data for Final Classifier ########

    def add_bdv_to_df(self, training_folds):
        """Add bdv column to training_folds dataset before the target column.

        Returns a deep copy; training_folds itself is not modified.
        """
        new_data = training_folds.copy(deep=True)
        new_data.insert(len(training_folds.columns) - 1, self._BDV_COLUMN, self._bdv)
        return new_data

    def compute_train_data_final_clf(self, training_folds, remove, remove_kind):
        """Removes a certain % of data with lowest or highest bdv.

        training_folds  dataframe the stored data values belong to
        remove          iterable of fractions (0-1) of rows to remove
        remove_kind     'low' or 'high': which end of the ranking is removed

        Returns a tuple of three lists with one entry per fraction in remove:
        feature dataframes, target series and the index labels that were removed.

        Raises ValueError for an unknown remove_kind.
        """
        self._check_choice('remove_kind', remove_kind, self._REMOVE_KINDS)

        all_removed_indexes = []
        all_X_train_data_final_clf = []
        all_y_train_data_final_clf = []

        dataset_with_bdv = self.add_bdv_to_df(training_folds)

        # The ranking does not depend on the fraction, so sort only once:
        # ascending for 'low' (lowest values first), descending for 'high'.
        ranked_indexes = dataset_with_bdv.loc[:, self._BDV_COLUMN].sort_values(
            ascending=(remove_kind == 'low')
        ).index
        features_and_target = dataset_with_bdv.drop(columns=self._BDV_COLUMN)
        num_rows = len(features_and_target.index)

        for fraction in remove:
            # round number in case it is a fraction
            num_rows_remove = int(round(fraction * num_rows))
            removed_indexes = ranked_indexes[:num_rows_remove]

            train_data_final_clf = features_and_target.drop(index=removed_indexes)

            all_removed_indexes.append(removed_indexes)
            all_X_train_data_final_clf.append(train_data_final_clf.iloc[:, :-1])
            all_y_train_data_final_clf.append(train_data_final_clf.iloc[:, -1])

        return all_X_train_data_final_clf, all_y_train_data_final_clf, all_removed_indexes