# Clustering different pruning rates

In [None]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import copy

from sklearn.cluster import SpectralClustering, KMeans, AgglomerativeClustering

In [None]:
# Pruning rates to analyse (presumably percentages - confirm against the
# pruning experiments). Each value is substituted into the ``IMP{p_rate}``
# part of the sign-distribution CSV file names when the data is loaded.
p_rates = [30, 51, 65, 76, 80, 83, 88, 92, 94, 96, 97]

In [None]:
# get collective sign distributions for p_rates

def get_collective_sign_distr(n, dataset_name, layer, p_rate):
    """Load the sign distributions of ``n`` runs and keep one layer's rows.

    Reads the files
    ``2b Sign distributions/more_p_rates/{dataset_name}_IMP{p_rate}_{i}_sign_distr.csv``
    for every ``i`` in ``range(n)`` (the path is relative to the current
    working directory), stacks them, and keeps only the rows whose ``layer``
    column equals ``layer``.

    Args:
        n: Number of files to read; the file index runs from 0 to ``n - 1``.
        dataset_name: Prefix used in the CSV file names.
        layer: Value of the ``layer`` column to keep.
        p_rate: Pruning rate used in the ``IMP{p_rate}`` part of the file names.

    Returns:
        A DataFrame with the columns ``prune_rate_in``, ``prune_rate_out``,
        ``sign_rate_in`` and ``sign_rate_out``. The row index is not reset, so
        index labels repeat across the stacked files. If ``n <= 0`` an empty
        frame with these four columns is returned.

    Raises:
        FileNotFoundError: If one of the expected CSV files does not exist.
        KeyError: If the files lack the ``layer`` column or one of the four
            value columns.
    """
    columns = ["prune_rate_in", "prune_rate_out", "sign_rate_in", "sign_rate_out"]

    # Nothing to read: return an empty result instead of failing on the
    # missing "layer" column of an empty frame.
    if n <= 0:
        return pd.DataFrame(columns=columns)

    # Read everything first and concatenate once; concatenating inside the
    # loop would copy all previously collected rows on every iteration.
    frames = [
        pd.read_csv(f'2b Sign distributions/more_p_rates/{dataset_name}_IMP{p_rate}_{i}_sign_distr.csv')
        for i in range(n)
    ]
    coll_sign_distr = pd.concat(frames)

    # separate the dataframe by layer
    return coll_sign_distr.loc[coll_sign_distr["layer"] == layer, columns]

In [None]:
# get clusters and plot them

def cluster_sign_distr(sign_distr, n_clusters=2, algorithm=AgglomerativeClustering, print_matrix=False):
    """Cluster the rows of ``sign_distr`` and attach the cluster labels.

    Args:
        sign_distr: DataFrame whose columns are all used as clustering
            features. The frame passed in is not modified.
        n_clusters: Number of clusters, forwarded to ``algorithm`` as its
            ``n_clusters`` keyword argument.
        algorithm: Clustering class. It is instantiated as
            ``algorithm(n_clusters=n_clusters)`` and must provide ``fit`` and
            a ``labels_`` attribute after fitting.
        print_matrix: If true, draw a seaborn pair plot of the result with
            the points coloured by cluster.

    Returns:
        A new DataFrame holding the rows of ``sign_distr`` with the index
        reset to ``0..len-1`` and an added ``cluster`` column containing the
        fitted labels.
    """
    # Every column is a feature, so the frame converts to the feature matrix
    # in one step; no need to walk the rows with iterrows().
    features = sign_distr.to_numpy()

    # clustering
    clustering = algorithm(n_clusters=n_clusters)
    clustering.fit(features)

    # reset_index returns a new frame, so adding the label column below
    # leaves the caller's DataFrame untouched. The positional index also
    # lines up with the order of the fitted labels.
    clustered = sign_distr.reset_index(drop=True)
    clustered["cluster"] = clustering.labels_

    # show scatterplot matrix
    if print_matrix:
        sns.pairplot(clustered, hue="cluster", height=1.75, kind='scatter', plot_kws={'alpha':0.8})

    return clustered

In [None]:
# get cluster observations table

def get_cluster_obs(cluster_data, n_clusters=4):
    """Summarise every cluster with mean, std and relative size per variable.

    Args:
        cluster_data: DataFrame with the columns ``prune_rate_in``,
            ``prune_rate_out``, ``sign_rate_in``, ``sign_rate_out`` and a
            ``cluster`` column holding the cluster label of each row.
        n_clusters: Number of clusters to report; the labels
            ``0 .. n_clusters - 1`` are looked up in the ``cluster`` column.

    Returns:
        A DataFrame indexed by the four variable names. For each cluster
        ``k`` it has the columns ``mean_c{k}``, ``std_c{k}`` and
        ``ratio_c{k}``, where the ratio is the share of all rows that belong
        to the cluster (the same value in every row of that column). Labels
        with no rows yield NaN mean/std and a ratio of 0. If ``cluster_data``
        has no rows at all, the ratio is NaN as well. With
        ``n_clusters <= 0`` an empty DataFrame is returned.

    Raises:
        KeyError: If ``cluster`` or one of the four variable columns is
            missing.
    """
    variables = ["prune_rate_in", "prune_rate_out", "sign_rate_in", "sign_rate_out"]

    if n_clusters <= 0:
        return pd.DataFrame()

    total_rows = len(cluster_data)

    stats = {}
    for cluster in range(n_clusters):
        # The membership filter is the same for every variable, so it is
        # computed once per cluster.
        members = cluster_data[cluster_data["cluster"] == cluster]

        # Without any rows the share is undefined; report NaN, in line with
        # the NaN mean/std, instead of dividing by zero.
        ratio = len(members) / total_rows if total_rows else np.nan

        stats[f"mean_c{cluster}"] = [np.mean(members[v]) for v in variables]
        stats[f"std_c{cluster}"] = [np.std(members[v]) for v in variables]
        stats[f"ratio_c{cluster}"] = [ratio] * len(variables)

    # dict insertion order gives the column order mean/std/ratio per cluster
    return pd.DataFrame(stats, index=variables)

In [None]:
# Settings shared by all pruning rates. The cluster count is needed by both
# the clustering and the summary step, so it is defined once and passed to
# both instead of relying on get_cluster_obs' default matching it.
N_RUNS = 15
N_CLUSTERS = 4

cluster_obs = []

# print cluster matrices
for p_rate in p_rates:

    # cluster collective sign distributions
    coll_sign_distr = get_collective_sign_distr(N_RUNS, "CIFAR", "dense1", p_rate)
    cluster_data = cluster_sign_distr(
        coll_sign_distr,
        n_clusters=N_CLUSTERS,
        algorithm=AgglomerativeClustering,
        print_matrix=True,
    )

    # collect cluster observations
    obs = get_cluster_obs(cluster_data, n_clusters=N_CLUSTERS)
    cluster_obs.append(obs)

# print cluster obs (display is not imported here; presumably the
# IPython/Jupyter built-in - confirm when running outside a notebook)
for obs in cluster_obs:
    display(obs)