# Distances

This notebook contains
- necessary imports
- distance measures for distributions
- methods for getting distances between conditions
- boxplots

In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import random
from tqdm import tqdm

## Distance measures:

In [2]:
# function to turn arrays into discrete probability distributions

def make_prob_distr(array1, array2):
    """Turn two arrays into discrete probability distributions on a shared support.

    The support is the sorted set of all values that occur in either array.
    For every support value, the relative frequency of that value within each
    array is computed.

    Parameters
    ----------
    array1, array2 : numpy.ndarray
        Arrays of observed values (any shape; `.flat` is used to iterate them).

    Returns
    -------
    (list, list)
        Relative frequencies of the support values in `array1` and in
        `array2`, both ordered by ascending support value.
    """
    # sorted union of the values observed in either array
    support = sorted(set(array1.flat) | set(array2.flat))

    size1 = len(array1.flat)
    size2 = len(array2.flat)

    # relative frequency of each support value within the respective array
    probdistr1 = [(array1 == value).sum() / size1 for value in support]
    probdistr2 = [(array2 == value).sum() / size2 for value in support]

    return probdistr1, probdistr2

In [3]:
##########################################
# 7 different Distance measure functions #
##########################################


# kullback leibler divergence
def kl_divergence(array1, array2):
    """Kullback-Leibler divergence between the value distributions of two arrays.

    Both arrays are first converted to discrete probability distributions on
    their shared support via `make_prob_distr`. The divergence is then the sum
    of `p * log(p / q)` over the support.

    Bins in which either probability is zero are skipped, so the result stays
    finite even when the second distribution has no mass where the first has.

    Returns the sum as a NumPy float (0.0 if no bin contributes).
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    # leading 0.0 keeps the sum well defined when every bin is skipped
    terms = [0.0]
    terms.extend(
        p * np.log(p / q)
        for p, q in zip(probdistr1, probdistr2)
        if p * q != 0.0
    )
    return np.sum(terms)

# Jensen–Shannon divergence
def js_divergence(array1, array2):
    """Jensen-Shannon divergence between the value distributions of two arrays.

    The arrays are converted to discrete probability distributions on their
    shared support via `make_prob_distr`. The mixture distribution is the
    bin-wise mean of the two, and the result is
    `0.5 * KL(P || M) + 0.5 * KL(Q || M)`.

    As in `kl_divergence`, bins in which either probability is zero are
    skipped when summing the KL terms.
    """

    def _kl_terms_sum(p_distr, q_distr):
        # sum of p * log(p / q) over the bins where both probabilities are non-zero
        terms = [0.0]
        for p, q in zip(p_distr, q_distr):
            if p * q != 0.0:
                terms.append(p * np.log(p / q))
        return np.sum(terms)

    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    # mixture distribution: bin-wise mean of the two distributions
    mix_prob_distr = [np.mean([p, q]) for p, q in zip(probdistr1, probdistr2)]

    divergence_1 = _kl_terms_sum(probdistr1, mix_prob_distr)
    divergence_2 = _kl_terms_sum(probdistr2, mix_prob_distr)
    return 0.5 * divergence_1 + 0.5 * divergence_2

# Wasserstein distance (earth mover's distance)
def wasserstein_distance(array1, array2):
    """Wasserstein-1 (earth mover's) distance between the value distributions of two arrays.

    The observed values of each array are treated as samples of an empirical
    distribution, and SciPy computes the distance between the two empirical
    distributions. This equals the distance between the discrete probability
    distributions on the shared support (support values as positions,
    relative frequencies as weights).

    An earlier version passed the relative frequencies as `u_values`/`v_values`
    and the support values as weights. That measured a distance between the
    probability values themselves, not between the two distributions.

    Parameters
    ----------
    array1, array2 : array-like
        Observed values (any shape; they are flattened).

    Returns
    -------
    float
        The Wasserstein-1 distance, in the units of the observed values.
    """
    # Imported locally under an alias: at module level the name
    # `wasserstein_distance` refers to this wrapper, and a bare `import scipy`
    # does not load `scipy.stats` on SciPy versions without lazy submodule loading.
    from scipy.stats import wasserstein_distance as _scipy_wasserstein_distance

    samples1 = np.asarray(array1).ravel()
    samples2 = np.asarray(array2).ravel()

    return _scipy_wasserstein_distance(samples1, samples2)

# bhattacharyya distance
def bhattacharyya(array1, array2):
    """Bhattacharyya distance between the value distributions of two arrays.

    The arrays are converted to discrete probability distributions on their
    shared support via `make_prob_distr`. The Bhattacharyya coefficient is the
    sum over bins of `sqrt(p * q)`; the distance is its negative logarithm.

    Returns `np.nan` when the coefficient is exactly 0.0 (no bin has mass in
    both distributions), otherwise `-log(coefficient)`.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    overlap_per_bin = np.sqrt(np.multiply(probdistr1, probdistr2))
    coefficient = np.sum(overlap_per_bin)

    # no overlap at all: report NaN rather than taking log(0)
    return np.nan if coefficient == 0.0 else -np.log(coefficient)


# hellinger distance
def hellinger_distance(array1, array2):
    """Hellinger distance between the value distributions of two arrays.

    The arrays are converted to discrete probability distributions on their
    shared support via `make_prob_distr`. With the Bhattacharyya coefficient
    `bc = sum(sqrt(p * q))`, the distance is `sqrt(1 - bc)`.

    Returns
    -------
    numpy.float64
        The distance; 0.0 for identical distributions. NaN is returned only
        if the coefficient itself is NaN.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    bc = np.sum(np.sqrt(np.multiply(probdistr1, probdistr2)))

    # Floating-point rounding can push bc marginally above 1 for (near-)identical
    # distributions, which would make sqrt() return NaN. Clamp at zero;
    # np.maximum (unlike the builtin max) still propagates a NaN coefficient.
    return np.sqrt(np.maximum(0.0, 1 - bc))

def histogram_intersection(array1, array2):
    """Histogram-intersection distance between the value distributions of two arrays.

    The arrays are converted to discrete probability distributions on their
    shared support via `make_prob_distr`. The shared ratio is the sum over
    bins of the smaller of the two probabilities; the returned distance is
    one minus that ratio.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    # probability mass that both distributions have in common, bin by bin
    shared_ratio = np.sum(np.minimum(probdistr1, probdistr2))

    return 1 - shared_ratio

def histogram_correlation(array1, array2):
    """Correlation-based distance between the value distributions of two arrays.

    The arrays are converted to discrete probability distributions on their
    shared support via `make_prob_distr`. The correlation coefficient `r` of
    the two probability vectors is computed and mapped to a distance with
    `d = 0.5 - 0.5 * r`, so r = 1 gives 0 and r = -1 gives 1.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    # deviations of each bin probability from the mean bin probability
    deviations1 = np.asarray(probdistr1) - np.mean(probdistr1)
    deviations2 = np.asarray(probdistr2) - np.mean(probdistr2)

    numerator = np.sum(deviations1 * deviations2)
    denominator = np.sqrt(np.sum(deviations1 * deviations1) * np.sum(deviations2 * deviations2))

    # the correlation coefficient (between -1 and 1)
    r = numerator / denominator

    # turning it into a distance measure
    return r * (-0.5) + 0.5

In [4]:
# total variation distance



In [5]:
# chi square statistics



## Extracting distances depending on conditions:

In [6]:
# Number of networks per condition: `distances_for_distributions` iterates
# range(wts_per_dataset) and uses each index as the number `n` in the
# sign-distribution file names.
wts_per_dataset = 15

In [7]:
# getting distances for a certain distance measure, layer and sign_distribution variable

def distances_for_distributions(distance_measure, layer, variable):
    """Compute pairwise distances between sign distributions for all conditions.

    Parameters
    ----------
    distance_measure : callable
        Function taking two arrays and returning a distance.
    layer : str
        Value of the "layer" column used to filter each sign-distribution file.
    variable : str
        Name of the column whose values form the distribution.

    Returns
    -------
    dict[str, list]
        Lists of distances keyed by condition:
        - "CIFAR", "CINIC", "SVHN", "RSN": within-condition pairs (i < j only),
        - "RSN_<dataset>": every RSN index against every index of that dataset,
        - "CIFAR_SVHN", "CINIC_CIFAR", "SVHN_CINIC": every index pair between
          the two datasets,
        - "within_WTs", "within_conditions", "between_WTs_and_RSN",
          "between_WT_datasets": concatenations of the lists above.

    Notes
    -----
    Each sign-distribution file is read and filtered only once per call and
    then reused for every pair, instead of being re-read inside the pair
    loops. The cached arrays are handed to `distance_measure` repeatedly, so
    the measure must not modify its inputs in place.
    """
    loaded_distributions = {}

    # extracting a sign distribution given a number and a dataset (memoised)
    def get_sign_distr(n, d_set):
        key = (n, d_set)
        if key not in loaded_distributions:
            if d_set == "RSN":
                sign_distr = pd.read_csv(f'2b Sign distributions/RSN_{n}_sign_distr.csv')
            else:
                sign_distr = pd.read_csv(f'2b Sign distributions/{d_set}_IMP_{n}_sign_distr.csv')
            loaded_distributions[key] = np.array(sign_distr[sign_distr["layer"] == layer][[variable]])
        return loaded_distributions[key]

    distances = {"CIFAR":[], "CINIC":[], "SVHN":[], "RSN":[],
                 "RSN_CIFAR":[], "RSN_CINIC":[], "RSN_SVHN":[],
                 "CIFAR_SVHN":[], "CINIC_CIFAR":[], "SVHN_CINIC":[]}

    for i in range(wts_per_dataset):
        for j in range(wts_per_dataset):

            # comparing within conditions: each unordered pair once, never a network with itself
            if i < j:
                for c in ["CIFAR", "CINIC", "SVHN", "RSN"]:
                    distances[c].append(distance_measure(get_sign_distr(i, c), get_sign_distr(j, c)))

            # comparing RSNs and WTs: all (i, j) combinations
            for c in ["CIFAR", "CINIC", "SVHN"]:
                distances[f"RSN_{c}"].append(distance_measure(get_sign_distr(i, "RSN"), get_sign_distr(j, c)))

            # comparing between datasets: all (i, j) combinations
            distances["CIFAR_SVHN"].append(distance_measure(get_sign_distr(i, "CIFAR"), get_sign_distr(j, "SVHN")))
            distances["CINIC_CIFAR"].append(distance_measure(get_sign_distr(i, "CINIC"), get_sign_distr(j, "CIFAR")))
            distances["SVHN_CINIC"].append(distance_measure(get_sign_distr(i, "SVHN"), get_sign_distr(j, "CINIC")))

    # add 4 collective distance conditions
    distances["within_WTs"] = distances["CIFAR"]+distances["CINIC"]+distances["SVHN"]
    distances["within_conditions"] = distances["CIFAR"]+distances["CINIC"]+distances["SVHN"]+distances["RSN"]
    distances["between_WTs_and_RSN"] = distances["RSN_CIFAR"]+distances["RSN_CINIC"]+distances["RSN_SVHN"]
    distances["between_WT_datasets"] = distances["CIFAR_SVHN"]+distances["CINIC_CIFAR"]+distances["SVHN_CINIC"]

    return distances

In [8]:
# all the different plot parameters:

# distance functions to evaluate, in the order used for all outputs
distance_measures = [
    kl_divergence,
    js_divergence,
    wasserstein_distance,
    hellinger_distance,
    bhattacharyya,
    histogram_correlation,
    histogram_intersection,
]

# name of each measure (identical to the function name), same order as above
distance_measure_names = [measure.__name__ for measure in distance_measures]

# short labels, same order as `distance_measures`
distances_short = ["kl", "jsd", "wsd", "hd", "bd", "hc", "hi"]

# columns of the sign-distribution files that are compared
variables = [
    "prune_rate_in",
    "prune_rate_out",
    "sign_rate_in",
    "sign_rate_out",
]

# values of the "layer" column that are compared
layers = ["dense1", "dense2"]

In [9]:
# store all the distances

def store_all_distances():
    """Compute the distances for every measure/layer/variable and store them as text files.

    For each combination of `distance_measures`, `layers` and `variables`, the
    dictionary returned by `distances_for_distributions` is written (via
    `str`) to '4b Distances/{layer}_{variable}_{measure name}_distances.txt'.
    Existing files are overwritten.
    """
    for d_measure, d_measure_name in tqdm(zip(distance_measures, distance_measure_names), leave=False, desc="distance_measures"):
        for layer in layers:
            for variable in variables:

                # get distance
                distances = distances_for_distributions(d_measure, layer, variable)

                # store distance; the context manager closes the file even if writing fails
                with open(f'4b Distances/{layer}_{variable}_{d_measure_name}_distances.txt', 'w') as distances_file:
                    distances_file.write(str(distances))

store_all_distances()

                                          

## Collecting mean and std of all distance conditions:

In [10]:
# calculate std and mean for each condition and collect in a giant dataframe

def get_mean_std_distances():
    """Collect mean and standard deviation of the distances for every condition.

    For each combination of `distance_measures`, `layers` and `variables`, the
    distances of all conditions are computed with
    `distances_for_distributions` and summarised.

    Returns
    -------
    pandas.DataFrame
        One row per (distance measure, layer, variable, condition), indexed by
        the condition name, with the columns "mean", "std", "layer",
        "variable" and "distance_measure".
    """
    # Rows are gathered in plain lists and turned into a DataFrame once at the
    # end; growing a DataFrame with pd.concat inside the loops copies all
    # previous rows on every iteration.
    condition_names = []
    records = []

    for d_measure, d_measure_name in tqdm(zip(distance_measures, distance_measure_names), leave=False, desc="distance_measures"):
        for layer in layers:
            for variable in variables:

                # get distances for all conditions
                distances = distances_for_distributions(d_measure, layer, variable)

                # iterate through all conditions and collect their mean and std
                for c_name, d in distances.items():
                    condition_names.append(c_name)
                    records.append({
                        "mean": np.mean(d),
                        "std": np.std(d),
                        "layer": layer,
                        "variable": variable,
                        "distance_measure": d_measure_name,
                    })

    if not records:
        # nothing to summarise: same empty frame as before
        return pd.DataFrame()

    return pd.DataFrame(records, index=condition_names)

In [11]:

# compute mean and std of the distances for every condition
distance_statistics = get_mean_std_distances()

# write the statistics to disk; the index (condition names) is kept as the first column
distance_statistics.to_csv(
    '4b Distances/mean_std_distances.csv',
    index=True,
)

                                          

## Hypothesis testing

Hypotheses:  

1. Distances within WTs are smaller than between WTs and RSNs
2. Distances within RSNs are smaller than between WTs and RSNs
3. Distances within WTs are smaller than between WT datasets
4. Distances between WT datasets are smaller than between WTs and RSNs

In [15]:
# one (expected smaller, expected larger) pair of distance conditions per hypothesis,
# listed in the order of the hypotheses above
hypothesis_pairs = [
    ("within_WTs", "between_WTs_and_RSN"),
    ("RSN", "between_WTs_and_RSN"),
    ("within_WTs", "between_WT_datasets"),
    ("between_WT_datasets", "between_WTs_and_RSN"),
]

In [16]:

def do_t_tests():
    """Run the one-sided Welch t-tests for all hypotheses and parameter combinations.

    For each combination of `distance_measures`, `layers` and `variables`, the
    distances of all conditions are computed with
    `distances_for_distributions`. For every pair `(d1, d2)` in
    `hypothesis_pairs` a one-sided two-sample Welch's t-test checks whether
    the mean of `distances[d1]` is less than the mean of `distances[d2]`.

    Returns
    -------
    pandas.DataFrame
        One row per (distance measure, layer, variable, hypothesis), indexed
        by the 1-based hypothesis number, with the columns "hypothesis"
        ("H1", "H2", ...), "accepted" (True when the p-value is <= 0.05),
        "p-value", "layer", "variable" and "distance_measure".
    """
    # Make sure the stats subpackage is loaded: a bare `import scipy` does not
    # expose `scipy.stats` on SciPy versions without lazy submodule loading.
    import scipy.stats

    # Rows are gathered in plain lists and turned into a DataFrame once at the
    # end instead of growing a DataFrame with pd.concat inside the loops.
    hypothesis_numbers = []
    records = []

    for d_measure, d_measure_name in tqdm(zip(distance_measures, distance_measure_names), leave=False, desc="distance_measures"):
        for layer in layers:
            for variable in variables:

                # get distances for all conditions
                distances = distances_for_distributions(d_measure, layer, variable)

                # iterate through hypotheses; numbering follows the list itself, so
                # every entry of `hypothesis_pairs` is tested (not just the first four)
                for h, (d1, d2) in enumerate(hypothesis_pairs, start=1):

                    # perform one sided two sample Welch's t-test
                    h_results = scipy.stats.ttest_ind(distances[d1], distances[d2], alternative="less", equal_var=False)
                    p_value = h_results.pvalue

                    hypothesis_numbers.append(h)
                    records.append({
                        "hypothesis": f"H{h}",
                        "accepted": p_value<=0.05,
                        "p-value": p_value,
                        "layer": layer,
                        "variable": variable,
                        "distance_measure": d_measure_name,
                    })

    if not records:
        # nothing was tested: same empty frame as before
        return pd.DataFrame()

    return pd.DataFrame(records, index=hypothesis_numbers)

In [17]:

# run the hypothesis tests for every measure, layer and variable
significance_test_results = do_t_tests()

# write the test results to disk; the index (hypothesis number) is not stored
significance_test_results.to_csv(
    '4b Distances/significance_test_results.csv',
    index=False,
)

                                          