Compute basis functions for clustering of dimer network outputs. 

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import eqtk

#For plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sn
import plotnine as p9 #easier for some things

#For clustering
from fastdist import fastdist
import scipy.cluster.hierarchy as hcluster
from scipy.cluster.vq import whiten
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree
from scipy.spatial.distance import pdist



In [2]:
# Make the project's `code/` directory importable. Despite its name, `pwd` is not the
# working directory: it is the absolute path of the directory two levels above it.
pwd = os.path.abspath('../..')
sys.path.append(os.path.join(pwd, 'code/')) 
# `utilities` is presumably the local module found via the path appended above.
import utilities
# NOTE(review): the star import adds every public name of `utilities` to this namespace,
# so names used below may originate there - confirm before narrowing it.
from utilities import *

In [None]:
#Set plotting defaults: seaborn 'ticks' style, black text/labels/ticks, 100 dpi for
#on-screen figures, and 300 dpi, tightly-cropped, transparent saved figures.
sn.set_style('ticks')
mpl.rcParams.update({'text.color': 'black', 'axes.labelcolor': 'black', 
                     'xtick.color': 'black', 'ytick.color': 'black', 'figure.dpi':100, 'savefig.dpi':300,
                     'savefig.bbox': 'tight', 'savefig.transparent': True, 'mathtext.default':'regular'})
sn.set_context('notebook')
#Render figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [6]:
#Root directory of the dataset: the output arrays and the clustering label arrays are loaded from here.
indir = '../../data/20220617_1input_randomParams_highRes/'

In [9]:
def scale_axis(mat, axis=0):
    """Standardize an array along one axis (z-score).

    Subtracts the mean and divides by the standard deviation, both computed
    along `axis`, so every 1-D slice along that axis ends up with mean 0 and
    standard deviation 1.

    Parameters
    ----------
    mat : numpy.ndarray
        Array to standardize.
    axis : int, default 0
        Axis along which the mean and standard deviation are computed.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `mat`.

    Notes
    -----
    The standard deviation is numpy's default (population, ``ddof=0``).
    A slice with zero standard deviation is divided by zero and therefore
    yields NaN/inf values (numpy emits a RuntimeWarning).
    """
    # keepdims=True keeps the reduced axis with length 1 so both statistics
    # broadcast against `mat`. The standard deviation uses the same axis as the
    # mean (it was previously always taken along axis 0).
    mat_mn = mat.mean(axis=axis, keepdims=True)
    mat_sd = mat.std(axis=axis, keepdims=True)
    return (mat - mat_mn) / mat_sd

In [10]:
#For each of the four `{m+3}M` datasets, collect
#  cluster_counts_all[m]: number of distinct cluster labels in every universe
#  cluster_basis_all[m]:  list with one (n_clusters x n_points) array of
#                         cluster-averaged curves per universe
cluster_counts_all = []
cluster_basis_all = []
for m in range(4):
    clusters_mat = np.load(os.path.join(indir, f'hierarchical_clustering/hc_{m+3}M_1000univ_clusters_thresh3.npy'))
    outmat = np.load(os.path.join(indir, f'output_{m+3}M_LHSsample_1000k.npy'))

    #Count the distinct labels in each row of the label matrix (one row per universe)
    cluster_counts_all.append(
        np.apply_along_axis(lambda labels: len(np.unique(labels)), 1, clusters_mat)
    )

    #Basis curves: mean of the standardized curves belonging to each cluster
    n_univ = clusters_mat.shape[0]
    basis_per_univ = []
    for univ in range(n_univ):
        curves = scale_axis(outmat[:, univ, :], axis=0)
        labels = clusters_mat[univ, :]
        label_set = np.unique(labels)

        basis = np.zeros((len(label_set), curves.shape[0]))
        for row, label in enumerate(label_set):
            #Boolean mask selects the columns (curves) assigned to this cluster
            basis[row, :] = curves[:, labels == label].mean(axis=1)
        basis_per_univ.append(basis)

    cluster_basis_all.append(basis_per_univ)

In [45]:
#Directory the basis-function arrays are saved to (the `hierarchical_clustering/` subfolder of the dataset directory).
outdir = '../../data/20220617_1input_randomParams_highRes/hierarchical_clustering/'

In [46]:
#For each dataset, save all basis curves in one array together with a vector that
#records which universe each basis curve came from.
for m in range(4):
    #Stack the per-universe (n_clusters x n_points) arrays row-wise, then transpose
    #so that each column of the saved array is one basis curve.
    cluster_basis = np.vstack(cluster_basis_all[m]).T

    #Universe index repeated once per cluster of that universe. The number of
    #universes comes from the counts themselves, so it always matches the data.
    counts = cluster_counts_all[m]
    univ_labels = np.repeat(np.arange(len(counts)), counts)

    #The file names carry `m+3`, matching the `{m+3}M` input files these arrays were
    #computed from (using bare `m` would label them three sizes too small).
    #NOTE(review): any reader that expects the old `hc_{m}M_...` names must be updated.
    np.save(os.path.join(outdir, f'hc_{m+3}M_1000univ_clusterBasis_thresh3.npy'), cluster_basis)
    np.save(os.path.join(outdir, f'hc_{m+3}M_1000univ_clusterUnivLabels_thresh3.npy'), univ_labels)


For each network size, save the basis-function array for the universe with the greatest number of clusters, and plot the clustered curves of that universe.

In [58]:
def plot_outcurves_clustered2(outmat, cluster_labels, shaded='max-min', clusters = None,
                             percentiles=(0.05, 0.95), sample_n = 10, xticks = [0,4,9],
                             xticklabels = [-3,0,3], add_legend = False, plot_width=7, plot_height=10):
    """
    Plot clustered curves: one column of axes per cluster, two rows.

    The top row shows the cluster mean with a shaded band; the bottom row
    shows individual curves of the cluster (a random sample when the cluster
    has more than `sample_n` curves).

    Parameters
    ----------
    outmat : numpy.ndarray, shape (n_points, n_curves)
        Curves to plot, one per column; rows are the x positions.
    cluster_labels : numpy.ndarray, shape (n_curves,)
        Cluster label of each column of `outmat`.
    shaded : {'max-min', 'percentile'}
        What the shaded band spans: the per-point min/max of the cluster, or
        the per-point lower/upper percentile given by `percentiles`.
    clusters : sequence or None
        Labels of the clusters to plot. None plots every label present in
        `cluster_labels`.
    percentiles : tuple of two numbers
        Lower and upper bound of the band when `shaded='percentile'`. If both
        values are <= 1 they are read as fractions (0.05 -> 5th percentile);
        otherwise they are read as percentages in [0, 100].
    sample_n : int or None
        Maximum number of example curves per cluster; None plots all of them.
        Sampling uses numpy's global random state.
    xticks : list
        Tick positions, as row indices of `outmat`.
    xticklabels : list
        Exponents shown at the ticks; each is rendered as a power of ten.
    add_legend : bool
        Whether to add a legend to each top-row axis.
    plot_width, plot_height : number
        Width of one cluster column and total figure height, in inches.

    Returns
    -------
    fig : matplotlib.figure.Figure
    axes : numpy.ndarray of Axes, shape (2, n_clusters)

    Raises
    ------
    ValueError
        If `shaded` is not one of the supported options.
    """
    # Fail early with a clear message instead of hitting an unbound variable later.
    if shaded not in ('max-min', 'percentile'):
        raise ValueError(f"shaded must be 'max-min' or 'percentile', got {shaded!r}")

    if clusters is None: #plot all clusters
        clusters = np.unique(cluster_labels)

    n_clusters = len(clusters)
    n_titration = outmat.shape[0]
    x = np.arange(n_titration)

    xticklabels = [f'$10^{{{i}}}$' for i in xticklabels]
    fig, axes = plt.subplots(2, n_clusters, figsize=(n_clusters*plot_width, plot_height), squeeze=False, constrained_layout=True)
    for i, clust in enumerate(clusters):
        cluster_ind = np.nonzero(cluster_labels == clust)[0]
        outmat_clust = outmat[:,cluster_ind]
        # Share of all curves that fall in this cluster, in percent
        clust_percent = np.round(outmat_clust.shape[1]/outmat.shape[1] * 100, 2)

        #Plot cluster average on the top row
        cluster_mean = outmat_clust.mean(axis=1)

        if shaded == 'max-min':
            cluster_lb = outmat_clust.min(axis=1)
            cluster_ub = outmat_clust.max(axis=1)
            label = 'max-min'
        else: # shaded == 'percentile'
            # np.percentile expects values in [0, 100]; fractional bounds such as
            # the default (0.05, 0.95) must go through np.quantile instead.
            bound_fn = np.quantile if max(percentiles) <= 1 else np.percentile
            cluster_lb = bound_fn(outmat_clust, percentiles[0], axis = 1)
            cluster_ub = bound_fn(outmat_clust, percentiles[1], axis = 1)
            label = f'{percentiles[0]}-{percentiles[1]} percentile'

        axes[0, i].plot(x, cluster_mean, label='average')
        axes[0, i].fill_between(x, cluster_lb, cluster_ub, alpha=0.2, label=label)

        #plot example curves on the bottom row
        if sample_n is not None and sample_n < outmat_clust.shape[1]:
            rand_ind = np.random.choice(np.arange(outmat_clust.shape[1]), size=sample_n, replace=False)
            outmat_clust = outmat_clust[:,rand_ind]
        for j in range(outmat_clust.shape[1]):
            axes[1, i].plot(x, outmat_clust[:,j])

        #Format axes
        axes[0, i].set(ylabel='out',xticks = xticks, xticklabels = xticklabels,
                       title=f'cluster: {clust}, {clust_percent}%')
        axes[1, i].set(ylabel='out',xticks = xticks, xticklabels = xticklabels,
                       title='example curves:')
        if add_legend:
            axes[0, i].legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)

    return fig, axes
        

In [74]:
indir

'../../data/20220617_1input_randomParams_highRes/'

In [80]:
#For each dataset, take the universe with the most clusters, save its basis curves
#and save a PDF of its clustered (standardized) curves.
plot_dir = '../../plots/20220617_1input_randomParams_highRes/hierarchical_clustering/'
os.makedirs(plot_dir, exist_ok=True) #savefig fails if the target directory is missing

for m in range(4):
    clusters_mat = np.load(os.path.join(indir, f'hierarchical_clustering/hc_{m+3}M_1000univ_clusters_thresh3.npy'))

    #Universe indices sorted by descending cluster count; the first is the top one
    clust_sorted_desc_ind= np.argsort(-1*cluster_counts_all[m])
    top_univ = clust_sorted_desc_ind[0]

    #save basis functions for the universe with the most clusters
    cluster_basis_top_univ = cluster_basis_all[m][top_univ]
    np.save(os.path.join(outdir, f'hc_{m+3}M_clusterBasis_univ{top_univ}_thresh3.npy'), cluster_basis_top_univ)

    #Plot basis curves for the top univ
    outmat = np.load(os.path.join(indir, f'output_{m+3}M_LHSsample_1000k.npy'))
    outmat_scaled = scale_axis(outmat[:,top_univ,:])

    #NOTE(review): ticks at 0/49/99 labelled 10^-3/10^0/10^3 presumably assume 100
    #titration points spanning those input values - confirm against the data.
    fig1, axes1 = plot_outcurves_clustered2(outmat_scaled, clusters_mat[top_univ,:],
                                            shaded='max-min',
                                            xticks = [0,49,99],
                                            xticklabels = [-3,0,3], add_legend = True)
    fig1.suptitle(f'Universe {top_univ}')
    fig1.savefig(os.path.join(plot_dir, f'hc_curves_{m+3}M_univ{top_univ}_thresh3.pdf'))
    #Close this specific figure rather than whatever pyplot considers current,
    #so figures do not accumulate across iterations.
    plt.close(fig1)