# Similarities


This notebook contains
- necessary imports
- similarity measures for distributions
- methods for getting similarity scores between conditions
- boxplots

In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import ast
import random
from tqdm import tqdm

## Similarity measures:

In [2]:
# function to turn arrays into discrete probability distributions

def make_prob_distr(array1, array2):
    """Turn two sample arrays into discrete distributions over a shared support.

    The support is the sorted set of all values occurring in either array.
    For every support value the relative frequency in each array is computed
    (number of matching entries divided by the total number of entries).

    Returns:
        Two lists (one per input array) of equal length, aligned on the
        shared support.
    """
    # sorted union of all observed values
    support = sorted(set(array1.flat) | set(array2.flat))

    size1 = len(array1.flat)
    size2 = len(array2.flat)

    probdistr1 = [(array1 == value).sum() / size1 for value in support]
    probdistr2 = [(array2 == value).sum() / size2 for value in support]

    return probdistr1, probdistr2

In [3]:
##########################################
# different Similarity measure functions #
##########################################


# kullback leibler divergence
def kl_divergence(array1, array2):
    """Return the negated Kullback-Leibler divergence KL(P1 || P2).

    P1 and P2 are the discrete distributions that `make_prob_distr` derives
    from `array1` and `array2`. The natural logarithm is used. The value is
    multiplied by -1 so that larger results mean "more similar".
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    # support points where either probability is zero are left out of the sum
    terms = [0.0]
    terms.extend(
        p * np.log(p / q)
        for p, q in zip(probdistr1, probdistr2)
        if p * q != 0.0
    )

    return -1 * np.sum(terms)

# Jensen–Shannon divergence
def js_divergence(array1, array2):
    """Return the negated Jensen-Shannon divergence of the two sample arrays.

    Both arrays are converted to discrete distributions on a shared support;
    the mixture M is their point-wise mean and the result is
    -(0.5 * KL(P1 || M) + 0.5 * KL(P2 || M)), using the natural logarithm.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    # point-wise average of both distributions
    mixture = [np.mean([p, q]) for p, q in zip(probdistr1, probdistr2)]

    def _kl_to_mixture(distr):
        # KL(distr || mixture); terms with a zero probability are skipped
        terms = [0.0]
        for p, m in zip(distr, mixture):
            if p * m != 0.0:
                terms.append(p * np.log(p / m))
        return np.sum(terms)

    js = 0.5 * _kl_to_mixture(probdistr1) + 0.5 * _kl_to_mixture(probdistr2)

    return -1 * js

# Wasserstein distance, also known as earth mover's distance
def wasserstein_distance(array1, array2):
    """Return the negated 1-D Wasserstein (earth mover's) distance.

    Each array is treated as an empirical distribution: its distinct values
    are the support points and their occurrence counts are the weights
    (SciPy normalises the weights internally).

    Args:
        array1, array2: array-likes of samples; they are flattened first.

    Returns:
        -1 * distance, so that larger results mean "more similar".
    """
    # Local import: the notebook only does `import scipy`, which on older SciPy
    # versions does not make the `scipy.stats` submodule available.
    from scipy.stats import wasserstein_distance as _scipy_wasserstein

    values1, counts1 = np.unique(np.ravel(array1), return_counts=True)
    values2, counts2 = np.unique(np.ravel(array2), return_counts=True)

    # The observed values are the support (`*_values`) and their frequencies
    # are the masses (`*_weights`). Passing them the other way round would
    # measure a distance between probability values instead of between samples.
    distance = _scipy_wasserstein(
        u_values=values1,
        v_values=values2,
        u_weights=counts1,
        v_weights=counts2,
    )

    return -1 * distance

# bhattacharyya distance
def bhattacharyya(array1, array2):
    """Return the negated Bhattacharyya distance of the two sample arrays.

    The Bhattacharyya coefficient is the sum of sqrt(p * q) over the shared
    support. If the coefficient is zero, NaN is returned instead of taking
    the logarithm of zero.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    coefficient = np.sqrt(np.multiply(probdistr1, probdistr2)).sum()

    # the distance is -log(coefficient); it is negated to act as a similarity
    return np.nan if coefficient == 0.0 else -1 * (- np.log(coefficient))


# hellinger distance
def hellinger_distance(array1, array2):
    """Return the negated Hellinger distance of the two sample arrays.

    The distance is sqrt(1 - BC), where BC is the Bhattacharyya coefficient
    (sum of sqrt(p * q) over the shared support). It is multiplied by -1 so
    that larger results mean "more similar".
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    bc = np.sum(np.sqrt(np.multiply(probdistr1, probdistr2)))

    # Floating-point rounding can push bc marginally above 1 for (nearly)
    # identical distributions; clamp so the square root does not yield NaN.
    return -1 * np.sqrt(np.maximum(0.0, 1 - bc))

# histogram_intersection
def histogram_intersection(array1, array2):
    """Return the histogram intersection of the two sample arrays.

    The intersection is the sum over the shared support of min(p, q): the
    probability mass both distributions have in common. It is 1 for
    identical distributions and 0 for distributions with disjoint support.

    NOTE: the previous version returned `1 - intersection`, which is a
    distance (larger = less similar). Every other measure in this notebook
    returns a similarity (larger = more similar), and the one-sided t-tests
    rely on that orientation, so the intersection itself is returned here.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    shared_ratio = np.sum(np.minimum(probdistr1, probdistr2))

    return shared_ratio

# histogram_correlation
def histogram_correlation(array1, array2):
    """Return the Pearson correlation between the two discrete distributions.

    Both sample arrays are converted to probability vectors on a shared
    support; the correlation coefficient of these two vectors (between -1
    and 1) is returned. If one of the vectors has zero variance, the
    division yields NaN.
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)

    # deviations of every probability from the mean of its own distribution
    deviation1 = np.array(probdistr1) - np.mean(probdistr1)
    deviation2 = np.array(probdistr2) - np.mean(probdistr2)

    covariance_term = np.sum(deviation1 * deviation2)
    scale = np.sqrt(np.sum(deviation1 * deviation1) * np.sum(deviation2 * deviation2))

    return covariance_term / scale

# total variation distance
def total_variation(array1, array2):
    """Return the negated total variation distance of the two sample arrays.

    For discrete distributions the total variation distance is half the L1
    distance between the probability vectors, 0.5 * sum(|p - q|). The
    largest single difference, max(|p - q|), is only a lower bound of it and
    is not the total variation distance.

    The value is multiplied by -1 so that larger results mean "more similar".
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    tv = 0.5 * np.sum(np.abs(np.array(probdistr1) - np.array(probdistr2)))

    return -1 * tv

# chi square statistics
def chi2_distance(array1, array2):
    """Return the negated symmetric chi-square distance.

    Computed on the discrete distributions of both sample arrays as
    0.5 * sum((p - q)^2 / (p + q)) and multiplied by -1 so that larger
    results mean "more similar".
    """
    probdistr1, probdistr2 = make_prob_distr(array1, array2)
    p = np.array(probdistr1)
    q = np.array(probdistr2)

    # every support value occurs in at least one array, so p + q is non-zero
    chi = 0.5 * np.sum((p - q) ** 2 / (p + q))

    return -1 * chi


## Extracting similarities depending on conditions:

In [4]:
# number of networks per condition; the indices 0 .. wts_per_dataset-1 are used
# to build the file names of the sign distributions that get compared
wts_per_dataset = 15

In [5]:
# getting similarities for a certain similarity measure, layer and sign_distribution variable

def similarities_for_distributions(similarity_measure, layer, variable):
    """Collect pairwise similarities of sign distributions for all conditions.

    Args:
        similarity_measure: callable taking two arrays and returning a score.
        layer: value of the "layer" column to select in every CSV file.
        variable: name of the CSV column holding the distribution to compare.

    Returns:
        Dict mapping a condition name to a list of scores:
        - "CIFAR", "CINIC", "SVHN", "RSN": unordered pairs (i < j) within
          one condition,
        - "RSN_<dataset>": every RSN against every network of that dataset,
        - "CIFAR_SVHN", "CINIC_CIFAR", "SVHN_CINIC": every network of the
          first dataset against every network of the second,
        - "within_WTs", "within_conditions", "between_WTs_and_RSN",
          "between_WT_datasets": concatenations of the lists above.
    """
    # Every distribution takes part in many comparisons. Read and filter each
    # CSV once and reuse the array, instead of hitting the disk for every pair.
    loaded = {}

    # extracting a sign distribution given a number and a dataset
    def get_sign_distr(n, d_set):
        key = (n, d_set)
        if key not in loaded:
            if d_set == "RSN":
                path = f'2b Sign distributions/RSN_{n}_sign_distr.csv'
            else:
                path = f'2b Sign distributions/{d_set}_IMP_{n}_sign_distr.csv'
            sign_distr = pd.read_csv(path)
            # one-column selection -> 2-D array with a single column
            loaded[key] = np.array(sign_distr[sign_distr["layer"] == layer][[variable]])
        return loaded[key]

    similarities = {"CIFAR": [], "CINIC": [], "SVHN": [], "RSN": [],
                    "RSN_CIFAR": [], "RSN_CINIC": [], "RSN_SVHN": [],
                    "CIFAR_SVHN": [], "CINIC_CIFAR": [], "SVHN_CINIC": []}

    for i in range(wts_per_dataset):
        for j in range(wts_per_dataset):

            # comparing within conditions: each unordered pair once, never a
            # network with itself
            if i < j:
                for c in ["CIFAR", "CINIC", "SVHN", "RSN"]:
                    similarities[c].append(similarity_measure(get_sign_distr(i, c), get_sign_distr(j, c)))

            # comparing RSNs and WTs: index i and j refer to different
            # networks here, so all (i, j) combinations are used
            for c in ["CIFAR", "CINIC", "SVHN"]:
                similarities[f"RSN_{c}"].append(similarity_measure(get_sign_distr(i, "RSN"), get_sign_distr(j, c)))

            # comparing between datasets
            similarities["CIFAR_SVHN"].append(similarity_measure(get_sign_distr(i, "CIFAR"), get_sign_distr(j, "SVHN")))
            similarities["CINIC_CIFAR"].append(similarity_measure(get_sign_distr(i, "CINIC"), get_sign_distr(j, "CIFAR")))
            similarities["SVHN_CINIC"].append(similarity_measure(get_sign_distr(i, "SVHN"), get_sign_distr(j, "CINIC")))

    # add 4 collective distance conditions
    similarities["within_WTs"] = similarities["CIFAR"] + similarities["CINIC"] + similarities["SVHN"]
    similarities["within_conditions"] = similarities["CIFAR"] + similarities["CINIC"] + similarities["SVHN"] + similarities["RSN"]
    similarities["between_WTs_and_RSN"] = similarities["RSN_CIFAR"] + similarities["RSN_CINIC"] + similarities["RSN_SVHN"]
    similarities["between_WT_datasets"] = similarities["CIFAR_SVHN"] + similarities["CINIC_CIFAR"] + similarities["SVHN_CINIC"]

    return similarities

In [6]:
# all the different plot parameters:

# similarity functions, in the order in which they are evaluated
similarity_measures = [
    kl_divergence,
    js_divergence,
    wasserstein_distance,
    hellinger_distance,
    bhattacharyya,
    histogram_correlation,
    histogram_intersection,
]

# the measure names are the function names, so derive them instead of
# maintaining a second hand-written list
similarity_measure_names = [measure.__name__ for measure in similarity_measures]

# short codes used in the file names, aligned with `similarity_measures`
similarity_short = [
    "kl",
    "jsd",
    "wsd",
    "hd",
    "bd",
    "hc",
    "hi",
]

# CSV columns and layers for which similarities are computed
variables = [
    "prune_rate_in",
    "prune_rate_out",
    "sign_rate_in",
    "sign_rate_out",
]
layers = [
    "dense1",
    "dense2",
]

In [7]:
# the similarity measures that are actually used; this replaces the lists
# defined in the previous cell

similarity_measures = [
    js_divergence,
    wasserstein_distance,
    hellinger_distance,
    total_variation,
    chi2_distance,
    histogram_correlation,
    histogram_intersection,
]

# the measure names are the function names
similarity_measure_names = [measure.__name__ for measure in similarity_measures]

# short codes used in the file names, aligned with `similarity_measures`
similarity_short = [
    "jsd",
    "wsd",
    "hd",
    "tv",
    "chi2",
    "hc",
    "hi",
]

In [8]:
# store all the similarities

def store_all_similarities():
    """Compute the similarities for every measure/layer/variable and store them.

    For each combination the dict returned by `similarities_for_distributions`
    is written as text to
    '4b Similarities/<measure name>/<layer>_<variable>_<short>_similarities.txt'.
    The target directories must already exist.
    """
    measure_specs = zip(similarity_measures, similarity_measure_names, similarity_short)

    for s_measure, s_measure_name, s_short in tqdm(measure_specs, total=len(similarity_measures), leave=False, desc="similarity_measures"):
        for layer in layers:
            for variable in variables:

                # get distance
                similarities = similarities_for_distributions(s_measure, layer, variable)

                # Convert NumPy scalars to plain floats before stringifying:
                # NumPy >= 2 represents them as `np.float64(...)`, which
                # `ast.literal_eval` cannot read back.
                # NOTE(review): NaN scores are written as `nan`, which
                # `ast.literal_eval` cannot parse either - confirm how these
                # should be handled.
                serialisable = {
                    condition: [float(score) for score in scores]
                    for condition, scores in similarities.items()
                }

                # store distance; `with` closes the file even if writing fails
                path = f'4b Similarities/{s_measure_name}/{layer}_{variable}_{s_short}_similarities.txt'
                with open(path, 'w') as my_file:
                    my_file.write(str(serialisable))

#store_all_similarities()

In [9]:
def get_similarity(s_measure_name, s_measure_short, layer, variable):
    """Load the stored similarities of one measure/layer/variable combination.

    Reads
    '4b Similarities/<s_measure_name>/<layer>_<variable>_<s_measure_short>_similarities.txt'
    and parses its content as a Python literal.

    Returns:
        The parsed object (the files written by this notebook contain a dict
        mapping condition names to lists of scores).

    Raises:
        FileNotFoundError: if the file does not exist.
        ValueError / SyntaxError: if the content is not a valid Python literal.
    """
    path = f'4b Similarities/{s_measure_name}/{layer}_{variable}_{s_measure_short}_similarities.txt'

    # `with` makes sure the file handle is closed again
    with open(path, 'r') as my_file:
        similarity = my_file.read()

    return ast.literal_eval(similarity)

## Collecting mean and std of all distance conditions:

In [10]:
# calculate std and mean for each condition and collect in a giant dataframe

def get_mean_std_similarities():
    """Collect mean and standard deviation of the stored similarities.

    For every similarity measure, layer and variable the stored similarities
    are loaded with `get_similarity`, and for every condition one row with
    the columns "mean", "std", "layer", "variable" and "similarity_measure"
    is produced.

    Returns:
        DataFrame with one row per (measure, layer, variable, condition);
        the index holds the condition name.
    """
    columns = ["mean", "std", "layer", "variable", "similarity_measure"]
    rows = []
    conditions = []

    for _, s_measure_name, s_measure_short in tqdm(zip(similarity_measures, similarity_measure_names, similarity_short), leave=False, desc="similarity_measures"):
        for layer in layers:
            for variable in variables:

                # get similarities for all conditions
                similarities = get_similarity(s_measure_name, s_measure_short, layer, variable)

                # iterate through all conditions and collect their mean and std
                for c_name, s in similarities.items():
                    conditions.append(c_name)
                    rows.append({
                        "mean": np.mean(s),
                        "std": np.std(s),
                        "layer": layer,
                        "variable": variable,
                        "similarity_measure": s_measure_name,
                    })

    if not rows:
        return pd.DataFrame()

    # Build the frame once at the end: growing it with pd.concat inside the
    # loops copies all previous rows on every iteration.
    return pd.DataFrame(rows, index=conditions, columns=columns)

In [11]:

# compute the table with mean and std per condition
similarity_statistics = get_mean_std_similarities()

# write the table to disk; the index carries the condition names, so it is
# written as well
similarity_statistics.to_csv('4b Similarities/mean_std_similarities.csv', index=True)

                                           

## Hypothesis testing

Hypotheses:  

1. WTs are more similar to each other than WTs are to RSNs.
2. RSNs are more similar to each other than WTs are to RSNs.
3. WTs of the same dataset are more similar to each other than WTs of different datasets.
4. WTs of different datasets are more similar to each other than WTs are to RSNs.

In [12]:
# One (expected more similar, expected less similar) pair of condition names
# per hypothesis, in the order H1 .. H4.
hypothesis_pairs = [
    ("within_WTs", "between_WTs_and_RSN"),           # H1
    ("RSN", "between_WTs_and_RSN"),                  # H2
    ("within_WTs", "between_WT_datasets"),           # H3
    ("between_WT_datasets", "between_WTs_and_RSN"),  # H4
]

In [17]:

def do_t_tests():
    """Run the hypothesis tests for every measure, layer and variable.

    For each pair (s1, s2) in `hypothesis_pairs` a one-sided Welch's t-test
    (unequal variances) checks whether the mean of the stored similarities
    of condition s1 is greater than that of condition s2.

    Returns:
        DataFrame with the columns "hypothesis" ("H1", "H2", ...),
        "accepted" (p-value <= 0.05), "p-value", "layer", "variable" and
        "similarity_measure"; the index holds the hypothesis number.
    """
    columns = ["hypothesis", "accepted", "p-value", "layer", "variable", "similarity_measure"]
    rows = []
    hypothesis_numbers = []

    for _, s_measure_name, s_measure_short in tqdm(zip(similarity_measures, similarity_measure_names, similarity_short), leave=False, desc="similarity_measures"):
        for layer in layers:
            for variable in variables:

                # get similarities for all conditions
                similarities = get_similarity(s_measure_name, s_measure_short, layer, variable)

                # Number the hypotheses from the list itself, so that pairs
                # added to `hypothesis_pairs` are not silently skipped.
                for h, (s1, s2) in enumerate(hypothesis_pairs, start=1):

                    # perform one sided two sample Welch's t-test
                    h_results = scipy.stats.ttest_ind(similarities[s1], similarities[s2], alternative="greater", equal_var=False)
                    p_value = h_results.pvalue

                    hypothesis_numbers.append(h)
                    rows.append({
                        "hypothesis": f"H{h}",
                        "accepted": p_value <= 0.05,
                        "p-value": p_value,
                        "layer": layer,
                        "variable": variable,
                        "similarity_measure": s_measure_name,
                    })

    if not rows:
        return pd.DataFrame()

    # Build the frame once at the end: growing it with pd.concat inside the
    # loops copies all previous rows on every iteration.
    return pd.DataFrame(rows, index=hypothesis_numbers, columns=columns)

In [18]:

# run all hypothesis tests
significance_test_results = do_t_tests()

# write the results to disk; the index is left out of the file
significance_test_results.to_csv('4b Similarities/significance_test_results.csv', index=False)

                                           