# Distances

This notebook contains
- necessary imports
- distance measures for distributions
- metfods for getting distances between conditions
- boxplots

In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import random

## Distance measures:

In [2]:
# function to turn arrays into discrete probability distributions

def make_prob_distr(array1, array2):

    union = []
    union.extend(array1.flat)
    union.extend(array2.flat)
    unique_values = list(set(union))
    unique_values.sort()

    probdistr1 = []
    probdistr2 = []
    for v in unique_values:
        probdistr1.append((array1 == v).sum()/len(array1.flat))
        probdistr2.append((array2 == v).sum()/len(array2.flat))
        
    return probdistr1, probdistr2

In [3]:
##########################################
# 5 different Distance measure functions #
##########################################


# kullback leibler divergence
def kl_divergence(array1, array2):
    
    def kl(array1, array2):
        l_sum = [0.0]
        for p,q in zip(array1, array2):
            if p*q != 0.0:
                l_sum.append(p * np.log(p / q))
        return np.sum(l_sum)
    
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    return kl(probdistr1, probdistr2)

# Jensen–Shannon divergence
def js_divergence(array1, array2):
    
    def make_mixture_distribution(array1, array2):
    
        union = []
        union.extend(array1.flat)
        union.extend(array2.flat)
        unique_values = list(set(union))
        unique_values.sort()

        probdistr = []
        for v in unique_values:
            probdistr1 = (array1 == v).sum() / len(array1.flat)
            probdistr2 = (array2 == v).sum() / len(array2.flat)
            probdistr.append(np.mean([probdistr1,probdistr2]))

        return probdistr
    
    def kl(array1, array2):
        l_sum = [0.0]
        for p,q in zip(array1, array2):
            if p*q != 0.0:
                l_sum.append(p * np.log(p / q))
        return np.sum(l_sum)
    
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    mix_prob_distr = make_mixture_distribution(array1, array2)
    js = 0.5*kl(probdistr1, mix_prob_distr) + 0.5*kl(probdistr2, mix_prob_distr)
                          
    return js

# wasserstein distance or earths mover's distance
def wasserstein_distance(array1, array2):
    
    union = []
    union.extend(array1.flat)
    union.extend(array2.flat)
    unique_values = list(set(union))
    unique_values.sort()

    probdistr1 = []
    probdistr2 = []
    for v in unique_values:
        probdistr1.append((array1 == v).sum()/len(array1.flat))
        probdistr2.append((array2 == v).sum()/len(array2.flat))

    return scipy.stats.wasserstein_distance(u_values=probdistr1, v_values=probdistr2, u_weights=unique_values, v_weights=unique_values)

# bhattacharyya distance
def bhattacharyya(array1, array2):
    
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    bc = np.sum(np.sqrt(np.multiply(probdistr1, probdistr2)))
    if bc == 0.0:
        return np.nan
    return - np.log(bc)


# hellinger distance
def hellinger_distance(array1, array2):
    
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    bc = np.sum(np.sqrt(np.multiply(probdistr1, probdistr2)))                    
    return np.sqrt(1-bc)

def histogram_intersection(array1, array2):
    
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    mins = []
    for v1, v2 in zip(probdistr1, probdistr2):
        mins.append(np.min([v1,v2]))
    shared_ratio = np.sum(mins)/1
    return 1 - shared_ratio

def histogram_correlation(array1, array2):
    
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    
    mean1 = np.mean(probdistr1)
    mean2 = np.mean(probdistr2)
    numerator = []
    denominator_a = []
    denominator_b = []
    
    for v1,v2 in zip(probdistr1, probdistr2):
        numerator.append((v1-mean1)*(v2-mean2))
        denominator_a.append((v1-mean1)*(v1-mean1))
        denominator_b.append((v2-mean2)*(v2-mean2))
    numerator = np.sum(numerator)
    denominator = np.sqrt(np.sum(denominator_a) * np.sum(denominator_b))
    
    # the correlation coefficient (between -1 and 1)
    r = numerator/denominator
    
    # turning it into a distance measure
    d = r*(-0.5) + 0.5
    
    return d

In [4]:
# total variation distance



In [5]:
# chi square statistics



## Extracting distances depending on conditions:

In [6]:
# getting distances for a certain distance measure, layer and sign_distribution variable

def distances_for_distributions(distance_measure, layer, variable):
    
    # extracting a sign distribution given a number and a dataset
    def get_sign_distr(n, d_set):
        sign_distr = None
        if d_set == "RSN":
            sign_distr = pd.read_csv(f'2b Sign distributions/RSN_{n}_sign_distr.csv')
        else:
            sign_distr = pd.read_csv(f'2b Sign distributions/{d_set}_IMP_{n}_sign_distr.csv')
        return np.array(sign_distr[sign_distr["layer"]==layer][[variable]])
    
    
    distances = {"CIFAR":[], "CINIC":[], "SVHN":[], "RSN":[],
                 "RSN_CIFAR":[], "RSN_CINIC":[], "RSN_SVHN":[],
                 "CIFAR_SVHN":[], "CINIC_CIFAR":[], "SVHN_CINIC":[]}
    
    # taking pairs, but not both ways
    for i in range(10):
        for j in range(10):
            
            # comparing within conditions
            if i < j:
                for c in ["CIFAR", "CINIC", "SVHN", "RSN"]:
                    distances[c].append(distance_measure(get_sign_distr(i, c),get_sign_distr(j, c)))
                    if distance_measure(get_sign_distr(i, c),get_sign_distr(j, c)) == np.nan:
                        print(i, " and ", j , " in ", c)
                    
            # comparing RSNs and WTs
            for c in ["CIFAR", "CINIC", "SVHN"]:
                distances[f"RSN_{c}"].append(distance_measure(get_sign_distr(i, "RSN"),get_sign_distr(j, c)))
                if distance_measure(get_sign_distr(i, "RSN"),get_sign_distr(j, c)) == np.nan:
                    print(i, " and ", j , " between ", c, " WTs")
            
            # comparing between datasets
            distances["CIFAR_SVHN"].append(distance_measure(get_sign_distr(i, "CIFAR"),get_sign_distr(j, "SVHN")))
            distances["CINIC_CIFAR"].append(distance_measure(get_sign_distr(i, "CINIC"),get_sign_distr(j, "CIFAR")))
            distances["SVHN_CINIC"].append(distance_measure(get_sign_distr(i, "SVHN"),get_sign_distr(j, "CINIC")))
    
    # add 4 collective distance conditions
    distances["within_WTs"] = distances["CIFAR"]+distances["CINIC"]+distances["SVHN"]
    distances["within_conditions"] = distances["CIFAR"]+distances["CINIC"]+distances["SVHN"]+distances["RSN"]
    distances["between_WTs_and_RSN"] = distances["RSN_CIFAR"]+distances["RSN_CINIC"]+distances["RSN_SVHN"]
    distances["between_WT_datasets"] = distances["CIFAR_SVHN"]+distances["CINIC_CIFAR"]+distances["SVHN_CINIC"]
            
    return distances

## Collecting mean and std of all distanc conditions:

In [9]:
# all the different plot parameters:

distance_measures = [kl_divergence,
                     js_divergence, 
                     wasserstein_distance, 
                     hellinger_distance, 
                     bhattacharyya, 
                     histogram_correlation, 
                     histogram_intersection]

distance_measure_names = ["kl_divergence",
                          "js_divergence", 
                          "wasserstein_distance",  
                          "hellinger_distance", 
                          "bhattacharyya", 
                          "histogram_correlation", 
                          "histogram_intersection"]

distances_short = ["kl","jsd", "wsd",  "hd", "bd", "hc", "hi"]

variables = ["prune_rate_in", "prune_rate_out", "sign_rate_in", "sign_rate_out"]
layers = ["dense1", "dense2"]

In [None]:
# store all the distances

def store_all_distances():
distances = distances_for_distributions(wasserstein_distance, "dense1", "prune_rate_in")

    for d_measure, d_measure_name in zip(distance_measures, distance_measure_names):
        for layer in layers:
            for variable in variables:
            
                # get distance
                distances = distances_for_distributions(d_measure, layer, variable)

                # store distance
                myFile = open(f'4b Distances/{layer}_{variable}_{d_measure_name}_distances.txt', 'w')
                myFile.write(str(distances))
                myFile.close()

store_all_distances()

In [10]:
# calculate std and mean for each condition and collect in a giant dataframe

distance_statistics = pd.DataFrame()

for d_measure, d_measure_name in zip(distance_measures, distance_measure_names):
    for layer in layers:
        for variable in variables:
            
            # get distances for all conditions
            distances = distances_for_distributions(d_measure, layer, variable)
            
            # iterate thorugh all conditions and collect their mean and std in a dataframe
            new_distance_statistics = pd.DataFrame()
            for c_name, d in distances.items():
                dic = {}
                
                dic["mean"] = np.mean(d)
                dic["std"] = np.std(d)
                dic["layer"] = layer
                dic["variable"] = variable
                dic["distance_measure"] = d_measure_name
                
                # add new row as dataframe to statistics
                df = pd.DataFrame(data = dic, index = [c_name])
                new_distance_statistics = pd.concat([new_distance_statistics, df], axis=0)

            # add collected statistics do big data frame
            distance_statistics = pd.concat([distance_statistics, new_distance_statistics], axis=0)
            

In [11]:
# print dataframes with grouping by
distance_statistics.to_csv(f'4b Distances/mean_std_distances.csv', index=True)
print(distance_statistics[0:14])

                         mean       std   layer       variable  \
CIFAR                0.042779  0.042334  dense1  prune_rate_in   
CINIC                0.015865  0.034366  dense1  prune_rate_in   
SVHN                 0.019767  0.063155  dense1  prune_rate_in   
RSN                  0.209310  0.047393  dense1  prune_rate_in   
RSN_CIFAR            0.117937  0.046689  dense1  prune_rate_in   
RSN_CINIC            0.078041  0.046744  dense1  prune_rate_in   
RSN_SVHN             0.083331  0.047207  dense1  prune_rate_in   
CIFAR_SVHN          -0.029581  0.017096  dense1  prune_rate_in   
CINIC_CIFAR          0.273065  0.114693  dense1  prune_rate_in   
SVHN_CINIC          -0.033932  0.033337  dense1  prune_rate_in   
within_WTs           0.026137  0.049615  dense1  prune_rate_in   
within_conditions    0.071930  0.093267  dense1  prune_rate_in   
between_WTs_and_RSN  0.093103  0.050108  dense1  prune_rate_in   
between_WT_datasets  0.069851  0.159700  dense1  prune_rate_in   

         

## Hypothesis testing

Hypothesis:  

1. Distances within WTs are smaller than between WTs and RSNs
2. Distances within RSNs are smaller than between WTs and RSNs
3. Distances between WT datasets are smaller than between WTs and RSNs
4. Distances within WTs are smaller than between WT datasts

In [12]:
hypothesis_pairs = [("within_WTs","between_WTs_and_RSN"),("RSN","between_WTs_and_RSN"),("between_WT_datasets","between_WTs_and_RSN"),("within_WTs","between_WT_datasets")]

In [13]:
significance_test_results = pd.DataFrame()

for d_measure, d_measure_name in zip(distance_measures, distance_measure_names):
    for layer in layers:
        for variable in variables:
            
            new_results = pd.DataFrame()
            
            # get distances for all conditions
            distances = distances_for_distributions(d_measure, layer, variable)
            
            # iterate thorugh important conditions and perform corresponding hypothesis tests
            
            # iterate thorugh hypotheses
            for (d1,d2), h in zip(hypothesis_pairs, range(1,5)):
                
                # perform one sided two sample Welch’s t-test
                h_results = scipy.stats.ttest_ind(distances[d1], distances[d2], alternative="less", equal_var=False)
                p_value = h_results.pvalue
                
                # add test results as row in dictionary
                dic = {}
                dic["hypothesis"] = f"H{h}"
                dic["accepted"] = p_value<=0.05
                dic["p-value"] = p_value
                dic["layer"] = layer
                dic["variable"] = variable
                dic["distance_measure"] = d_measure_name
                
                # add row to datframe
                df = pd.DataFrame(data = dic, index = [h])
                new_results = pd.concat([new_results, df], axis=0)

            # add collected statistics do big data frame
            significance_test_results = pd.concat([significance_test_results, new_results], axis=0)

In [14]:
# store significance test results in a file

significance_test_results.to_csv(f'4b Distances/significance_test_results.csv', index=False)

## Displaying results helpfully

In [15]:
# all the different plot parameters:

distance_measures = [kl_divergence,
                     js_divergence, 
                     wasserstein_distance, 
                     hellinger_distance, 
                     bhattacharyya, 
                     histogram_correlation, 
                     histogram_intersection]

distance_measure_names = ["kl_divergence",
                          "js_divergence", 
                          "wasserstein_distance",  
                          "hellinger_distance", 
                          "bhattacharyya", 
                          "histogram_correlation", 
                          "histogram_intersection"]

distances_short = ["kl","jsd", "wsd",  "hd", "bd", "hc", "hi"]

variables = ["prune_rate_in", "prune_rate_out", "sign_rate_in", "sign_rate_out"]
layers = ["dense1", "dense2"]

In [16]:
# load significance test results from file

test_results = pd.read_csv(f'4b Distances/significance_test_results.csv')

In [17]:
def hypothesis_tables(results):
    
    new_dfs = []
    for layer in layers:

        results_l = results[results["layer"] == layer]
        
        # make new dataframe with the indexi
        new_df = results_l[results_l["distance_measure"] == "wasserstein_distance"][["variable","hypothesis"]]
        new_df = new_df.reset_index(drop=True)

        # apppend test results to dataframe
        for dm, dm_short in zip(distance_measure_names, distances_short):
            df = results_l[results_l["distance_measure"] == dm]
            df = df[["accepted", "p-value"]]
            df.rename(columns = {"accepted":f"accepted_{dm_short}", "p-value":f"p-value_{dm_short}"}, inplace = True)
            df = df.reset_index(drop=True)
            new_df = pd.concat([new_df, df], axis=1)

        new_df.set_index(["variable","hypothesis"])
        new_dfs.append(new_df)
        
        display(new_df)
        
    return new_dfs

In [20]:
def count_hypothesis_accepted(tables):
    
    h_count_tables= []
    for table in tables:
        # get the column names of the accepted values
        column_names = []
        for dm, dm_short in zip(distance_measure_names, distances_short):
            column_names.append("accepted_"+dm_short)

        # get the counts for each variable and hypothesis combination
        accepted_counts = []
        for index, row in table.iterrows():
            accepted = []
            for c in column_names:
                accepted.append(row[c])
            accepted_counts.append(np.sum(accepted))

        # turn into pandas series and append to dataframe
        counts = pd.Series(accepted_counts, index=range(16))
        h_counts = pd.concat([table[["variable","hypothesis"]], counts], axis=1)
        h_count_tables.append(h_counts)
        
        # display dataframe
        display(h_counts)
        
    return h_count_tables

In [22]:
tables = hypothesis_tables(test_results)
h_counts = count_hypothesis_accepted(tables)

Unnamed: 0,variable,hypothesis,accepted_kl,p-value_kl,accepted_jsd,p-value_jsd,accepted_wsd,p-value_wsd,accepted_hd,p-value_hd,accepted_bd,p-value_bd,accepted_hc,p-value_hc,accepted_hi,p-value_hi
0,prune_rate_in,H1,True,3.2427969999999996e-30,True,2.264624e-115,True,0.0,True,3.909581e-111,True,1.057272e-162,True,3.318753e-45,True,2.049454e-103
1,prune_rate_in,H2,False,1.0,True,1.037312e-80,True,4.550841e-72,True,7.786107e-63,True,3.734437e-237,True,3.766119e-56,True,1.6801400000000001e-66
2,prune_rate_in,H3,True,0.008403302,True,1.051166e-156,True,0.0,True,1.265478e-167,True,2.400772e-144,True,3.373873e-164,True,8.109988e-126
3,prune_rate_in,H4,True,1.105343e-05,True,3.249551e-24,True,4.476025e-43,True,3.0729939999999998e-21,True,8.261932e-24,False,0.4839805,True,2.816054e-33
4,prune_rate_out,H1,True,3.280467e-126,True,5.658682e-101,True,6.249928e-294,True,8.425627e-119,True,1.968376e-92,False,0.1028907,True,1.313467e-110
5,prune_rate_out,H2,True,8.894449e-139,True,1.834225e-129,True,5.63156e-66,True,1.165327e-100,True,2.3750620000000001e-119,True,2.69592e-57,True,7.045144e-96
6,prune_rate_out,H3,True,4.383373e-99,True,2.177003e-25,True,2.114577e-260,True,8.914856e-26,True,2.5732260000000003e-25,False,1.0,True,1.149665e-31
7,prune_rate_out,H4,True,1.310618e-18,True,1.818872e-63,True,7.284963e-65,True,4.5982350000000006e-62,True,3.3249609999999997e-63,True,1.25079e-29,True,6.714643e-57
8,sign_rate_in,H1,False,0.9976715,True,9.752461e-50,True,1.651886e-59,True,2.715083e-49,False,,True,3.57896e-55,True,3.185572e-46
9,sign_rate_in,H2,True,0.0001534007,True,9.580203000000001e-27,True,4.5211480000000004e-76,True,1.109805e-26,False,,False,1.0,True,2.832004e-25


Unnamed: 0,variable,hypothesis,accepted_kl,p-value_kl,accepted_jsd,p-value_jsd,accepted_wsd,p-value_wsd,accepted_hd,p-value_hd,accepted_bd,p-value_bd,accepted_hc,p-value_hc,accepted_hi,p-value_hi
0,prune_rate_in,H1,True,7.998345999999999e-42,True,6.0925060000000005e-31,True,2.041633e-118,True,1.742875e-33,True,6.549042e-31,True,1.583207e-20,True,3.702318e-29
1,prune_rate_in,H2,True,8.296978e-51,True,6.110272e-44,True,8.645133999999999e-19,True,4.521358e-53,True,6.235870999999999e-42,True,1.746481e-22,True,9.994236999999999e-42
2,prune_rate_in,H3,False,0.2461236,False,0.9999325,True,9.630036e-94,False,0.999946,False,0.9973952,False,1.0,False,0.9999999
3,prune_rate_in,H4,True,2.948274e-50,True,2.4558219999999997e-64,True,6.478480999999999e-19,True,5.007106e-65,True,1.7317610000000002e-62,True,1.473042e-62,True,1.874312e-64
4,prune_rate_out,H1,True,5.244927e-11,True,9.067777e-13,True,4.31599e-07,True,2.452786e-11,True,1.771878e-12,True,0.01121075,True,2.242696e-06
5,prune_rate_out,H2,True,2.116072e-17,True,1.957259e-16,True,6.425582e-09,True,7.206852e-10,True,6.733355e-13,True,6.517979e-11,True,1.61466e-11
6,prune_rate_out,H3,False,0.9417273,False,0.8120881,False,0.5677963,False,0.6106684,False,0.5697354,False,1.0,False,0.9998741
7,prune_rate_out,H4,True,1.467927e-13,True,3.504345e-14,True,2.458836e-07,True,3.999614e-12,True,1.168149e-12,True,4.210849e-13,True,4.585934e-13
8,sign_rate_in,H1,False,0.6597075,True,4.089268e-26,True,1.5025540000000001e-18,True,1.924143e-25,True,6.044886e-27,False,0.2488103,True,4.07545e-27
9,sign_rate_in,H2,False,0.9990706,True,2.113243e-33,True,0.01195451,True,3.59429e-32,True,4.607734e-41,True,1.636774e-06,True,2.7420690000000003e-29


Unnamed: 0,variable,hypothesis,0
0,prune_rate_in,H1,7
1,prune_rate_in,H2,6
2,prune_rate_in,H3,7
3,prune_rate_in,H4,6
4,prune_rate_out,H1,6
5,prune_rate_out,H2,7
6,prune_rate_out,H3,6
7,prune_rate_out,H4,7
8,sign_rate_in,H1,5
9,sign_rate_in,H2,5


Unnamed: 0,variable,hypothesis,0
0,prune_rate_in,H1,7
1,prune_rate_in,H2,7
2,prune_rate_in,H3,1
3,prune_rate_in,H4,7
4,prune_rate_out,H1,7
5,prune_rate_out,H2,7
6,prune_rate_out,H3,0
7,prune_rate_out,H4,7
8,sign_rate_in,H1,5
9,sign_rate_in,H2,6


Hypothesis:  

1. Distances within WTs are smaller than between WTs and RSNs
2. Distances within RSNs are smaller than between WTs and RSNs
3. Distances between WT datasets are smaller than between WTs and RSNs
4. Distances within WTs are smaller than between WT datasts