# Imports

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from sklearn import datasets

from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier

# Default plot configurations
%matplotlib inline
plt.rcParams['figure.figsize'] = (14,8)
plt.rcParams['figure.dpi'] = 150
sns.set()

# Some Visualizations

In [1]:
def scatters_acc_color(data, 
                      x, 
                      y, 
                      remove_list,  
                      all_removed_indexes, 
                      title,
                      palette,
                      dot_size=10):
    '''
    Draw a 4x2 grid of scatterplots, one per entry of remove_list, in which
    the rows listed as removed are marked with a different size and style.

    Input:
      data (data frame): must contain the columns named by x and y plus a
        'target' column, which drives the point colour. It is NOT modified;
        the helper 'removed' column is added to an internal copy only.
      x, y (str): names of the columns plotted on the horizontal/vertical axis
      remove_list (sequence of numbers): one value per panel; each value is
        multiplied by 100 and shown as a percentage in the panel title
      all_removed_indexes (sequence): all_removed_indexes[n] holds the index
        labels of data that are flagged as removed in panel n
      title (str): figure-level title
      palette: forwarded to seaborn.scatterplot
      dot_size: forwarded to seaborn.scatterplot as s
    Output:
      None. The figure is displayed with plt.show().
    '''
    # Work on a copy so the temporary 'removed' flag never leaks into the
    # caller's data frame.
    plot_data = data.copy()

    fig = plt.figure(1, figsize=(15,20))
    fig.suptitle(title, fontsize = 18)

    # Panels (1-based) that keep their tick labels in the fixed 4x2 layout.
    bottom_row = (7, 8)
    left_column = (1, 3, 5, 7)

    for n, removed_share in enumerate(remove_list):
        panel = n + 1
        # nrows, ncols, plot_number
        ax = plt.subplot(4, 2, panel)

        # 1 for rows whose index label was removed in this panel, else 0.
        plot_data['removed'] = plot_data.index.isin(all_removed_indexes[n]).astype(int)

        sns.scatterplot(x = plot_data[x], y = plot_data[y], hue = plot_data['target'],
                        size = plot_data['removed'], style = plot_data['removed'], edgecolor="k",
                        linewidth=1, s = dot_size, palette=palette,
                        legend = False)

        plt.xlabel(x, fontsize=12)
        plt.xticks(fontsize = 10, fontweight = 'bold')
        plt.ylabel(y, fontsize=12)
        plt.yticks(fontsize = 10, fontweight = 'bold')
        plt.title('-' + str(removed_share*100) + '% of Data',
                  fontsize=12, fontweight = 'bold')

        # Only the bottom row keeps x tick labels and only the left column
        # keeps y tick labels; inner panels share them visually.
        if panel not in bottom_row:
            ax.xaxis.set_ticklabels([])
            plt.xlabel('')
        if panel not in left_column:
            ax.yaxis.set_ticklabels([])
            plt.ylabel('')

    plt.subplots_adjust(bottom=0.1, right=0.83, top=0.93, wspace=0.05, hspace=0.1)

    plt.show()

In [3]:
def save_heat_scatter(data, x, y, dot_size, cmap, palette, bar_label, title, png_name):
    '''
    Draw two side-by-side scatterplots of the same data, save the figure to
    png_name and show it.

    Left panel: points coloured by the 'target' column using palette.
    Right panel: points coloured by the 'BDV' column using cmap, with a
    colourbar labelled bar_label.

    Input: df with column of bootstrap data values ('BDV'), a 'target'
    column, and the columns named by x and y.
    '''
    def _white_panel(position):
        # Select the subplot and give it a white background and black frame.
        panel = plt.subplot(position)
        panel.set_facecolor('white')
        for side in ('bottom', 'top', 'left', 'right'):
            panel.spines[side].set_color('black')
        return panel

    def _big_ticks():
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)

    figure = plt.figure(1, figsize=(30, 12))
    figure.suptitle(title, fontsize=20, fontweight='bold')

    # Left panel: class membership.
    _white_panel(121)
    sns.scatterplot(data=data, x=x, y=y, hue='target', s=dot_size,
                    palette=palette, linewidth=0, legend=False)
    plt.xlabel('')
    plt.ylabel('')
    _big_ticks()

    # Right panel: heat map of the 'BDV' values.
    _white_panel(122)
    heat_values = data['BDV']
    plt.scatter(x=data[x], y=data[y], s=dot_size, c=heat_values, cmap=cmap)
    _big_ticks()

    plt.subplots_adjust(wspace=0.1)

    # Dedicated axes on the right edge of the figure for the colourbar.
    bar_axes = plt.axes([0.92, 0.125, 0.015, 0.75])
    bar = plt.colorbar(cax=bar_axes)
    bar.set_label(label=bar_label, fontsize=20, labelpad=15)
    bar.outline.set_color('black')
    bar.outline.set_linewidth(1.5)
    bar.ax.tick_params(labelsize=20)

    figure.savefig(png_name)

    plt.show()

In [4]:
def save_separate_heat_scatter(class_1, class_2, x, y, dot_size, cmap, cbar_label, title, png_name):
    '''
    Draw one 'BDV' heat scatterplot per class side by side, save the figure
    to png_name and show it.

    Both panels are coloured on the SAME scale (the min/max of 'BDV' over
    both classes), so the single colourbar on the right is valid for either
    panel. Without a shared scale each panel would be normalised to its own
    range while the colourbar only described the right-hand one.

    Input:
      class_1, class_2 (data frame): df with column of bootstrap data values
        ('BDV') and the columns named by x and y; class_1 is drawn on the
        left, class_2 on the right
      x, y (str): names of the plotted columns
      dot_size: marker size passed to plt.scatter as s
      cmap: colormap passed to plt.scatter
      cbar_label (str): colourbar label
      title (str): figure-level title
      png_name: destination passed to fig.savefig
    Output:
      None. The figure is saved and displayed.
    '''
    def _white_panel(position):
        # Select the subplot and give it a white background and black frame.
        panel = plt.subplot(position)
        panel.set_facecolor('white')
        for side in ('bottom', 'top', 'left', 'right'):
            panel.spines[side].set_color('black')
        return panel

    fig = plt.figure(1, figsize=(30,12))
    fig.suptitle(title, fontsize = 20, fontweight = 'bold')

    # Common colour limits across both classes; left unset when there is no
    # finite value to derive them from (matplotlib then autoscales).
    all_bdv = pd.concat([class_1['BDV'], class_2['BDV']])
    if all_bdv.notna().any():
        color_limits = {'vmin': all_bdv.min(), 'vmax': all_bdv.max()}
    else:
        color_limits = {}

    points = None
    # nrows, ncols, plot_number
    for position, frame in ((121, class_1), (122, class_2)):
        _white_panel(position)
        points = plt.scatter(x = frame[x], y = frame[y], s = dot_size,
                             c = frame['BDV'], cmap = cmap, **color_limits)
        plt.xticks(fontsize = 20)
        plt.yticks(fontsize = 20)

    plt.subplots_adjust(wspace = 0.1)

    # Dedicated axes on the right edge of the figure for the colourbar; the
    # mappable is passed explicitly rather than relying on the current image.
    cax = plt.axes([0.93, 0.125, 0.015, 0.75])
    cbar = plt.colorbar(points, cax=cax)
    cbar.set_label(label = cbar_label, fontsize = 20, labelpad=15)
    cbar.outline.set_color('black')
    cbar.outline.set_linewidth(1.5)
    cbar.ax.tick_params(labelsize=20)

    fig.savefig(png_name)

    plt.show()

In [2]:
def all_synthetic_datasets(dataset_list, x, y, dot_size, cmap, palette, bar_label, title):
    '''
    Draw the given datasets next to each other in a single row of up to four
    scatterplots, coloured by their 'target' column, and show the figure.

    Input:
      dataset_list (sequence of data frames): each needs a 'target' column
        and the columns named by x and y; the layout is fixed at 1 row by
        4 columns
      x, y (str): names of the plotted columns
      dot_size: marker size forwarded to seaborn.scatterplot as s
      cmap: kept for signature compatibility; not used by this function
      palette: forwarded to seaborn.scatterplot
      bar_label (str): colourbar label, used only when a colourbar is drawn
      title (str): figure-level title
    Output:
      None. The figure is displayed with plt.show().
    '''
    fig = plt.figure(1, figsize=(30,12))
    fig.suptitle(title, fontsize = 20, fontweight = 'bold')

    for position, dataset in enumerate(dataset_list, start=1):
        ax = plt.subplot(1, 4, position)
        ax.set_facecolor('white')
        for side in ('bottom', 'top', 'left', 'right'):
            ax.spines[side].set_color('black')
        sns.scatterplot(data=dataset, x = x, y=y, hue='target', s = dot_size, palette = palette , linewidth=0, legend = False)
        plt.xlabel('')
        plt.ylabel('')
        plt.xticks(fontsize = 20)
        plt.yticks(fontsize = 20)

    plt.subplots_adjust(wspace = 0.1)

    # seaborn draws through Axes.scatter, which does not register a "current
    # image"; calling plt.colorbar() unconditionally therefore raises
    # RuntimeError ("No mappable was found ..."). Only add the colourbar when
    # the figure actually holds a mappable to describe.
    mappable = plt.gci()
    if mappable is not None:
        cax = plt.axes([0.92, 0.125, 0.015, 0.75])
        cbar = plt.colorbar(mappable, cax=cax)
        cbar.set_label(label = bar_label, fontsize = 20, labelpad=15)
        cbar.outline.set_color('black')
        cbar.outline.set_linewidth(1.5)
        cbar.ax.tick_params(labelsize=20)

    plt.show()

In [12]:
def all_synthetic_datasets_cube(dataset_list, dataset_names, x, y, dot_size, palette, title, png_name):
    '''
    Draw the datasets in a 2x2 grid of scatterplots coloured by 'target',
    label each panel with its entry of dataset_names, save the figure to
    png_name and show it.

    Input:
      dataset_list: data frames with a 'target' column and the columns
        named by x and y
      dataset_names: one x-axis label per dataset, in the same order
    '''
    canvas = plt.figure(1, figsize=(12, 12))  # figsize=(width, height)
    canvas.suptitle(title, fontsize=20, fontweight='bold')

    for slot, frame in enumerate(dataset_list, start=1):
        # White background with a black frame around every panel.
        panel = plt.subplot(2, 2, slot)
        panel.set_facecolor('white')
        for edge in ('bottom', 'top', 'left', 'right'):
            panel.spines[edge].set_color('black')

        sns.scatterplot(data=frame, x=x, y=y, hue='target', s=dot_size,
                        palette=palette, linewidth=0, legend=False)

        # The dataset name doubles as the x-axis label; no y-axis label.
        plt.xlabel(dataset_names[slot - 1], fontsize=20)
        plt.ylabel('')
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)

    plt.subplots_adjust(wspace=0.1)

    canvas.savefig(png_name)

    plt.show()

# Functions

In [7]:
def standardize_columns(data):
    '''
    Standardize the numeric feature columns of data and re-attach 'target'.

    Input:
      data (data frame): must contain a 'target' column; every other numeric
        column is treated as a feature. Non-numeric columns are left out of
        the result. data itself is not modified.
    Output:
      data frame, the numeric feature columns standardized to have 0-mean
      and unit variance (sample standard deviation), followed by the
      untouched 'target' column as the last column. A constant feature
      column has zero standard deviation and cannot be scaled; it is
      returned as all zeros instead of NaN.
    '''
    # 'target' is only present here when it is numeric, so ignore a miss
    # instead of raising for string/categorical labels.
    features = data.select_dtypes(include=np.number).drop(columns=['target'], errors='ignore')

    spread = features.std()
    # Avoid 0/0 -> NaN for constant columns: dividing the (all-zero) centred
    # values by 1 keeps them at 0.
    spread = spread.where(spread != 0, 1.0)

    standardized_data = (features - features.mean()) / spread
    # Append after the features that actually remain; positioning by
    # len(data.columns) overshoots when non-numeric columns were dropped.
    standardized_data.insert(len(standardized_data.columns), 'target', data['target'])

    return standardized_data

# Generating Synthetic Data for Experiments 

## Dataset A

In [11]:
# Dataset A: two blobs (cluster_std=1) centred at (-2, 0) and (2, 0).
# NOTE: X2 / y2 are also assigned by the Dataset B cell, so they hold
# whichever of the two cells ran last.
centers = [[-2, 0], [2, 0]]

X2, y2 = make_blobs(
    n_samples=100,
    centers=centers,
    n_features=2,
    cluster_std=1,
    random_state=0,
)
distant = pd.DataFrame({
    'feature1': X2[:, 0],
    'feature2': X2[:, 1],
    'target': y2,
})
prep_distant = standardize_columns(distant)

## Dataset B

In [9]:
# Dataset B: two blobs (cluster_std=1) centred at (0, 0) and (2, 0).
centers = [[0, 0], [2, 0]]

X2, y2 = make_blobs(
    n_samples=100,
    centers=centers,
    n_features=2,
    cluster_std=1,
    random_state=0,
)
data_2 = pd.DataFrame({
    'feature1': X2[:, 0],
    'feature2': X2[:, 1],
    'target': y2,
})
prep_data_2 = standardize_columns(data_2)

## Dataset C

In [8]:
# Dataset C: same centres as Dataset B, but tighter blobs
# (cluster_std=0.55) and n_samples=200.
centers = [[0, 0], [2, 0]]

X1b, y1b = make_blobs(
    n_samples=200,
    centers=centers,
    n_features=2,
    cluster_std=0.55,
    random_state=0,
)
data_1b = pd.DataFrame({
    'feature1': X1b[:, 0],
    'feature2': X1b[:, 1],
    'target': y1b,
})
prep_data_1b = standardize_columns(data_1b)

## Dataset D

In [10]:
# Dataset D: same centres and cluster_std as Dataset B, with n_samples=500.
centers = [[0, 0], [2, 0]]

X3, y3 = make_blobs(
    n_samples=500,
    centers=centers,
    n_features=2,
    cluster_std=1,
    random_state=0,
)
data_3 = pd.DataFrame({
    'feature1': X3[:, 0],
    'feature2': X3[:, 1],
    'target': y3,
})
prep_data_3 = standardize_columns(data_3)