# Data Valuation Viz Code

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

## Adding Results to Dataframes

In [2]:
def add_bvd_to_df(data, bvd):
    '''
    Return a deep copy of data with a 'Bootstrap Data Values' column.

    Input:
        data: DataFrame; it is left unmodified
        bvd: values for the new column, one per row of data
    Output: the copy, with the new column placed directly before the
        column that was last in data
    '''
    augmented = data.copy()

    # Position of the original last column; inserting here pushes it right.
    slot = len(data.columns) - 1
    augmented.insert(loc=slot, column='Bootstrap Data Values', value=bvd)

    return augmented

In [3]:
def add_bvd_and_p_to_df(data, bvd, p_values):
    '''
    Return a deep copy of data with 'Bootstrap Data Values' and 'p-values' columns.

    Input:
        data: DataFrame; it is left unmodified
        bvd: values for the 'Bootstrap Data Values' column
        p_values: values for the 'p-values' column
    Output: the copy, whose trailing columns are
        'Bootstrap Data Values', 'p-values', <original last column>
    '''
    augmented = data.copy()
    slot = len(data.columns) - 1

    # Both columns go in at the same index, so the one inserted second
    # ('Bootstrap Data Values') ends up to the left of 'p-values'.
    additions = (('p-values', p_values), ('Bootstrap Data Values', bvd))
    for column_name, column_values in additions:
        augmented.insert(slot, column_name, column_values)

    return augmented

In [4]:
def add_p_to_df(data, p_values):
    '''
    Return a deep copy of data with a 'p-values' column.

    Input:
        data: DataFrame; it is left unmodified
        p_values: values for the new column, one per row of data
    Output: the copy, with 'p-values' placed directly before the column
        that was last in data
    '''
    augmented = data.copy()

    slot = len(data.columns) - 1
    augmented.insert(loc=slot, column='p-values', value=p_values)

    return augmented

In [5]:
def add_acc_to_df(data, accuracy, accuracy_kind):
    '''
    Return a deep copy of data with an accuracy column.

    Input:
        data: DataFrame; it is left unmodified
        accuracy: values for the new column, one per row of data
        accuracy_kind: name given to the new column
    Output: the copy, with the new column placed directly before the
        column that was last in data
    '''
    augmented = data.copy()

    slot = len(data.columns) - 1
    augmented.insert(loc=slot, column=accuracy_kind, value=accuracy)

    return augmented

## Other Preprocessing for VIZ

In [6]:
#for scatters of data with more than 2 dimensions
def get_feature_pairs(num_features, col_names):
    '''
    Build the (x, y) column-name pairs used for a grid of 2d scatterplots.

    Input:
        num_features: the complete pair sequence is repeated
            num_features * num_features times
        col_names: indexable sequence of column names
    Output: list of tuples. Within one pass the first element of the tuple
        cycles through col_names fastest, the second element slowest.
    '''
    # NOTE(review): one pass already contains every ordered pair; the
    # repetition is kept only because it determines the returned length.
    pairs = []
    for _ in range(num_features * num_features):
        pairs.extend(
            (col_names[fast], col_names[slow])
            for slow in range(len(col_names))
            for fast in range(len(col_names))
        )
    return pairs

## Visualizing Results

### Scatterplots

In [7]:
# viz 3 scatters of the same data
def plot_3_scatters(data1, data2, data3, color_colname, x, y, cmap):
    '''
    Show three scatterplots of the same feature pair with one shared colour scale.

    The panels fill the first three cells of a 2x2 grid and are titled
    'Original Setting with Decision Trees', 'Modified Setting with Decision
    Trees' and 'Modified Setting with Stumps'. All three scatters use the
    same colour limits, so the single colorbar on the right is valid for
    every panel and point colours can be compared across panels.

    Input:
        data1, data2, data3: df with p-values and/or bootstrap data values;
            each is indexed with x, y and color_colname
        color_colname: name of the column whose values colour the points
        x, y: names of the columns drawn on the horizontal / vertical axis
        cmap: colormap handed to plt.scatter
    Output: None; the figure is displayed with plt.show()
    '''
    panels = (
        (221, data1, 'Original Setting with Decision Trees'),
        (222, data2, 'Modified Setting with Decision Trees'),
        (223, data3, 'Modified Setting with Stumps'),
    )

    # Common colour limits over all three frames. Without them every scatter
    # normalises its own values and the colorbar would only describe the
    # last panel.
    try:
        stacked = np.concatenate(
            [np.asarray(frame[color_colname], dtype=float) for _, frame, _ in panels])
        finite = stacked[np.isfinite(stacked)]
    except (TypeError, ValueError):
        # Colour values that are not numeric have no common numeric range.
        finite = np.empty(0)

    if finite.size:
        vmin, vmax = finite.min(), finite.max()
    else:
        # Nothing to derive limits from; fall back to matplotlib's defaults.
        vmin = vmax = None

    fig = plt.figure(1, figsize=(30,12))
    fig.suptitle('Scatterplots of Original Setting vs Modified Settings', fontsize = 20, fontweight = 'bold')

    points = None
    for position, frame, title in panels:
        # nrows, ncols, plot_number
        plt.subplot(position)
        points = plt.scatter(x = frame[x], y = frame[y], s = 60,
                             c = frame[color_colname], cmap = cmap,
                             vmin = vmin, vmax = vmax)

        plt.title(title, fontsize = 14, fontweight = 'bold')
        plt.xlabel(x)
        plt.ylabel(y)

    plt.subplots_adjust(bottom=0.1, right=0.8, top=0.9)

    # Dedicated axes for the colorbar: [left, bottom, width, height] in
    # figure coordinates.
    cax = plt.axes([0.85, 0.1, 0.075, 0.8])
    # Any of the scatters can serve as mappable because they share limits.
    cbar = plt.colorbar(points, cax=cax)
    cbar.set_label(label = color_colname)

    plt.show()

In [8]:
# visualize data with 4 features in 16x 2d scatterplots
def plot_4d_scatters(data, color_colname, feature_pairs, setting, cmap):
    '''
    Show a fixed 4x4 grid of 2d scatterplots coloured by one column.

    Input:
        data: df with p-values and/or bootstrap data values
        color_colname: name of the column whose values colour the points
        feature_pairs: sequence of (x column, y column) tuples; the first 16
            entries are drawn, row by row
        setting: text appended to the figure title
        cmap: colormap handed to plt.scatter
    Output: None; the figure is displayed with plt.show()
    '''
    grid = 4

    figure = plt.figure(1, figsize=(30,26))
    figure.suptitle('Scatterplots of ' + color_colname + " " + setting,
                    fontsize = 30,
                    fontweight = 'bold')

    for idx in range(grid * grid):
        # nrows, ncols, plot_number (1-based)
        panel = plt.subplot(grid, grid, idx + 1)
        row, col = divmod(idx, grid)

        # values for color bar
        point_colors = data[color_colname]
        x_name, y_name = feature_pairs[idx]
        plt.scatter(x = data[x_name], y = data[y_name], s = 60, c = point_colors, cmap = cmap)

        plt.xlabel(x_name, fontsize=25)
        plt.xticks(fontsize = 15, fontweight = 'bold')
        plt.ylabel(y_name, fontsize=25)
        plt.yticks(fontsize = 15, fontweight = 'bold')

        # Only the bottom row keeps its x tick labels and x label.
        if row < grid - 1:
            panel.xaxis.set_ticklabels([])
            plt.xlabel('')

        # Only the left-most column keeps its y tick labels and y label.
        if col != 0:
            panel.yaxis.set_ticklabels([])
            plt.ylabel('')

    plt.subplots_adjust(bottom=0.1, right=0.83, top=0.95, wspace=0.05, hspace=0.05)

    # Separate axes on the right-hand side to hold the colorbar.
    colorbar_axes = plt.axes([0.85, 0.1, 0.075, 0.85])
    #aspect not working
    colorbar = plt.colorbar(cax=colorbar_axes, aspect=20)
    colorbar.set_label(label = color_colname, fontsize=20)
    colorbar.ax.tick_params(labelsize=15)

    plt.show()

In [9]:
# can replace function: plot 4d scatters
def scatters_of_multiD_data(data, color_colname, feature_pairs, num_features, setting, cmap, dot_size=10):
    '''
    Show a num_features x num_features grid of 2d scatterplots coloured by one column.

    Axis labels and tick labels are shown only on the bottom row (x) and on
    the left-most column (y); every other subplot has them blanked.

    Input:
        data: df with p-values and/or bootstrap data values
        color_colname: name of the column whose values colour the points
        feature_pairs: sequence of (x column, y column) tuples; entry n-1 is
            drawn in subplot n, subplots being numbered row by row
        num_features: number of rows and of columns of the grid
        setting: text appended to the figure title
        cmap: colormap handed to plt.scatter
        dot_size: marker size handed to plt.scatter
    Output: None; the figure is displayed with plt.show()
    '''
    fig = plt.figure(1, figsize=(30,26))
    fig.suptitle(f'Scatterplots of {color_colname} {setting}',
                 fontsize = 30,
                 fontweight = 'bold')

    #values for color bar
    colors = data[color_colname]
    bottom_row = num_features - 1
    points = None

    for n in range(1, num_features*num_features + 1):
        # nrows, ncols, plot_number
        ax = plt.subplot(num_features, num_features, n)
        row, col = divmod(n - 1, num_features)

        x_name, y_name = feature_pairs[n-1]
        points = plt.scatter(x = data[x_name], y = data[y_name], s = dot_size, c = colors, cmap = cmap)

        plt.xlabel(x_name, fontsize=12)
        plt.xticks(fontsize = 10, fontweight = 'bold')
        plt.ylabel(y_name, fontsize=12)
        plt.yticks(fontsize = 10, fontweight = 'bold')

        # Every row above the bottom one loses its x labels. Testing the row
        # index also covers the last subplot of the second-to-last row.
        if row != bottom_row:
            ax.xaxis.set_ticklabels([])
            plt.xlabel('')

        # Only the first column keeps its y labels.
        if col != 0:
            ax.yaxis.set_ticklabels([])
            plt.ylabel('')

    plt.subplots_adjust(bottom=0.1, right=0.83, top=0.95, wspace=0.05, hspace=0.05)

    # Dedicated axes for the colorbar: [left, bottom, width, height] in
    # figure coordinates. The bar fills these axes, so its proportions are
    # set here rather than through an aspect argument.
    cax = plt.axes([0.85, 0.1, 0.075, 0.85])
    # All scatters use the same colour values, so any one can be the mappable.
    cbar = plt.colorbar(points, cax=cax)
    cbar.set_label(label = color_colname, fontsize=20)
    cbar.ax.tick_params(labelsize=15)

    plt.show()

## Pipe Data Valuation VIZ

In [10]:
def pipe_3_scatters(data, x, y, color_colname, cmap, bvd1, bvd2, bvd3, p_values1, p_values2, p_values3):
    '''
    Attach three sets of results to copies of data and plot them side by side.

    Input:
        data: DataFrame shared by all three panels
        x, y: names of the columns to plot against each other
        color_colname: name of the column whose values colour the points
        cmap: colormap for the scatters
        bvd1..bvd3, p_values1..p_values3: bootstrap data values and p-values
            for panel 1, 2 and 3 respectively
    Output: None; the plotting is done by plot_3_scatters
    '''
    result_sets = ((bvd1, p_values1), (bvd2, p_values2), (bvd3, p_values3))
    frames = [add_bvd_and_p_to_df(data, bvd, p_values) for bvd, p_values in result_sets]

    plot_3_scatters(*frames, color_colname, x, y, cmap)


In [11]:
def pipe_4d_scatters(data, bvd, p_values, color_colname, feature_pairs,setting, cmap):
    '''
    Attach bootstrap data values and p-values to a copy of data and draw
    the 4x4 scatterplot grid.

    Input:
        data: DataFrame with the feature columns
        bvd, p_values: results added as columns before plotting
        color_colname, feature_pairs, setting, cmap: passed unchanged to
            plot_4d_scatters
    Output: None
    '''
    augmented = add_bvd_and_p_to_df(data, bvd, p_values)
    plot_4d_scatters(
        augmented,
        color_colname,
        feature_pairs,
        setting,
        cmap,
    )

In [12]:
def pipe_multidiemsnion_scatters(data, bvd, p_values,num_features, col_names, color_colname, setting, cmap, dot_size):
    '''
    Attach bootstrap data values and p-values to a copy of data, derive the
    feature pairs and draw the num_features x num_features scatterplot grid.

    Input:
        data: DataFrame with the feature columns
        bvd, p_values: results added as columns before plotting
        num_features, col_names: passed to get_feature_pairs
        color_colname, num_features, setting, cmap, dot_size: passed to
            scatters_of_multiD_data
    Output: None
    '''
    augmented = add_bvd_and_p_to_df(data, bvd, p_values)
    pairs = get_feature_pairs(num_features, col_names)

    scatters_of_multiD_data(
        augmented,
        color_colname,
        pairs,
        num_features,
        setting,
        cmap,
        dot_size,
    )
    

In [13]:
def pipe_multidiemsnion_scatters_bothacc(data, accuracy, accuracy_kind, num_features, col_names, color_colname, setting, cmap, dot_size):
    '''
    Attach an accuracy column to a copy of data, derive the feature pairs
    and draw the num_features x num_features scatterplot grid.

    Input:
        data: DataFrame with the feature columns
        accuracy, accuracy_kind: values and name of the column added before
            plotting
        num_features, col_names: passed to get_feature_pairs
        color_colname, num_features, setting, cmap, dot_size: passed to
            scatters_of_multiD_data
    Output: None
    '''
    augmented = add_acc_to_df(data, accuracy, accuracy_kind)
    pairs = get_feature_pairs(num_features, col_names)

    scatters_of_multiD_data(
        augmented,
        color_colname,
        pairs,
        num_features,
        setting,
        cmap,
        dot_size,
    )