## Figure 7

Analysis of feature correlation in the topological featurization, and of a featurization based on the largest subtribes extracted from volumetric samples.

In [None]:
from toposample import Config, data
import json
import h5py
import numpy as np

"""
Paths to relevant data.
"""
# Common configuration file, given as a relative path.
common_config_fn = "../working_dir/config/common_config.json"
cfg = Config(common_config_fn)

# Path of the JSON file indexing the feature results
# (config entry 'analyzed' -> 'features'); it is read by load_features.
analyzed_entries = cfg._cfg['analyzed']
features_fn = analyzed_entries['features']


In [None]:
"""
Helper functions
"""

def load_features(ftr_file, strat):
    """
    Load the feature arrays of one sampling strategy, pooled over stimuli.

    Parameters
    ----------
    ftr_file : str
        Path to a JSON file indexed as [strategy][specifier]['0']['data_fn'],
        where 'data_fn' is the path of an HDF5 file holding a 'per_stimulus'
        group with one dataset per stimulus.
    strat : str
        Key of the sampling strategy to load from the JSON file.

    Returns
    -------
    ftr_values : list of numpy.ndarray
        One array per specifier, in sorted specifier order: the datasets of
        all stimuli concatenated along axis 2.
    spec_names : numpy.ndarray of str
        Sorted unique specifier names (the part before the first '@').
    spec_idx : numpy.ndarray of int, shape (n_specifiers, n_parts)
        Column 0 is the index of each specifier's name in spec_names; any
        further columns are the integer sub-indices following the '@'
        separators (there are none if the specifiers contain no '@').

    Raises
    ------
    ValueError
        If the specifiers do not all have the same number of '@'-separated
        parts, or if a sub-index is not an integer.
    """
    with open(ftr_file, 'r') as fid:
        ftr_res = json.load(fid)

    specs = sorted(ftr_res[strat].keys())
    ftr_values = []
    for spec in specs:
        # Assume that samples are pooled to single element at index '0' for featurization-based results
        data_fn = ftr_res[strat][spec]['0']['data_fn']
        # Distinct handle name: do not shadow the `ftr_file` parameter.
        with h5py.File(data_fn, 'r') as h5:
            per_stim = h5['per_stimulus']
            ftr_per_stim = [np.array(per_stim[stim]) for stim in per_stim.keys()]
            ftr_values.append(np.concatenate(ftr_per_stim, 2))  # Pool all stimuli together

    spec_parts = [spec.split('@') for spec in specs]
    n_parts = {len(parts) for parts in spec_parts}
    if len(n_parts) > 1:
        raise ValueError("Specifiers of strategy '%s' have inconsistent numbers of '@'-separated parts!" % strat)
    n_cols = n_parts.pop() if n_parts else 1

    spec_names, spec_inv = np.unique(np.array([parts[0] for parts in spec_parts], dtype=str),
                                     return_inverse=True)
    # Build the index table from Python ints. Writing the inverse index back
    # into the fixed-width string array would silently truncate multi-digit
    # indices whenever all specifier parts are short strings.
    rows = [[int(inv)] + [int(sub) for sub in parts[1:]]
            for inv, parts in zip(spec_inv, spec_parts)]
    spec_idx = np.array(rows, dtype=int).reshape(len(specs), n_cols)

    return ftr_values, spec_names, spec_idx

In [None]:
"""
Loading the data
"""

# Features (Euler characteristic values)
ftr_volumetric, ftr_specs_volumetric, ftr_specs_idx_volumetric = load_features(features_fn, 'Radius')
assert ftr_specs_idx_volumetric.shape[1]==1, 'ERROR: No sub-indices expected for volumetric samples!'

ftr_subtribes, ftr_specs_subtribes, ftr_specs_idx_subtribes = load_features(features_fn, 'subtribes')
# Consistency check: both strategies must provide the same specifier names.
# np.array_equal also compares the shapes, so arrays of different lengths fail
# the assertion cleanly instead of being broadcast against each other.
assert np.array_equal(ftr_specs_volumetric, ftr_specs_subtribes), 'ERROR: Specifier mismatch!'

# ftr_parametric...TODO

# Accuracies
# TODO


In [None]:
"""
Some analysis of the data
"""

def _feature_correlations(ftr_values):
    """
    Compute per-trial, trial-averaged and pair-averaged feature correlations.

    Parameters
    ----------
    ftr_values : list of numpy.ndarray
        One 3-d array per specifier. For each index `trial` along axis 2,
        np.corrcoef is applied to `arr[:, :, trial].T`, i.e. the entries along
        axis 1 are the correlated variables and axis 0 holds their
        observations.

    Returns
    -------
    cc : list of list of numpy.ndarray
        Per specifier, the correlation matrix of every single trial.
    mcc : list of numpy.ndarray
        Per specifier, the NaN-ignoring mean of the correlation matrices over
        trials.
    mmcc : list of float
        Per specifier, the sum of the strict upper triangle of the mean
        matrix divided by the number of off-diagonal pairs.
    """
    cc, mcc, mmcc = [], [], []
    for ftr in ftr_values:
        cc_trials = [np.corrcoef(ftr[:, :, trial].T) for trial in range(ftr.shape[2])]  # Correlation within single trials
        mean_cc = np.nanmean(cc_trials, 0)  # Mean over trials
        n_pairs = (np.prod(mean_cc.shape) - mean_cc.shape[0]) / 2
        # NOTE(review): np.sum propagates NaN, so one pair that is NaN in all
        # trials makes the pair mean NaN -- confirm that this is intended.
        cc.append(cc_trials)
        mcc.append(mean_cc)
        mmcc.append(np.sum(np.triu(mean_cc, 1)) / n_pairs)  # Mean over pairs
    return cc, mcc, mmcc


# Feature (Euler characteristic) correlation
ec_cc_volumetric, ec_mcc_volumetric, ec_mmcc_volumetric = _feature_correlations(ftr_volumetric)

ec_cc_subtribes, ec_mcc_subtribes, ec_mmcc_subtribes = _feature_correlations(ftr_subtribes)

# ec_cc_parametric...TODO

In [None]:
"""
Panel A: Two examples of sets of tribes with uncorrelated and highly correlated features
"""
# TODO: code for Panel A to be added.

In [None]:
"""
Panel B: Average feature correlation for champion and volumetric samples
"""
# TODO: code for Panel B to be added.

In [None]:
"""
Panel C: High average correlation leads to poor classification
"""
# TODO: code for Panel C to be added.

In [None]:
"""
Panel D: Overview of accuracies when using the largest 25 subtribes in volumetric samples
"""
# TODO: code for Panel D to be added.

In [None]:
"""
Panel E: Feature correlation vs accuracy based on subtribes in volumetric samples
"""
# TODO: code for Panel E to be added.