In [176]:
"""
use pandas.DataFrame
"""
import itertools
import logging
from enum import Enum
from pathlib import Path

import neurom as nm
import numpy as np
import pandas as pd
from lxml import etree
from neurom import NeuriteType
from scipy import stats

L = logging.getLogger(__name__)
pd.options.display.width = 0


class DISCRETE_FEATURE_NAMES(Enum):
    """Feature names that are reduced to a single number per neurite type.

    Each member's value is the feature-name string handed to `nm.get` by
    `get_features`; `get_discrete_features` then collapses every cell with `np.sum`.
    """
    LEN = 'total_length'
    SURFACE_AREA = 'total_area_per_neurite'
    VOLUMES = 'neurite_volumes'
    NUMBER_OF_SECTIONS = 'number_of_sections'
    NUMBER_OF_BIFURCATIONS = 'number_of_bifurcations'
    NUMBER_OF_TERMINATIONS = 'number_of_terminations'


class CONTINUOUS_FEATURE_NAMES(Enum):
    """Feature names that are kept as a list of values per neurite type.

    Each member's value is the feature-name string handed to `nm.get` by
    `get_features`; `get_continuous_features` stores the values unreduced so that
    they can be compared as samples by the KS helpers in this file.
    """
    SECTION_LEN = 'section_lengths'
    SECTION_RADIAL_DISTANCES = 'section_radial_distances'
    SECTION_PATH_DISTANCES = 'section_path_distances'
    PARTITION_ASYMMETRY = 'partition_asymmetry'
    SEGMENT_RADII = 'segment_radii'


# Neurite types for which features are extracted; this is also the row order of the
# per-neuron tables built by `get_features`.
NEURITES = (
    NeuriteType.soma,
    NeuriteType.axon,
    NeuriteType.basal_dendrite,
    NeuriteType.apical_dendrite,
    NeuriteType.undefined,
)
NEURITE_NAMES = [neurite_type.name for neurite_type in NEURITES]

# Overrides of the default `nm.get(feature, neuron, neurite_type=...)` call, keyed by
# feature name and then by neurite name. Each override is called with the neuron only.
CUSTOM_FEATURE_LOAD = {
    DISCRETE_FEATURE_NAMES.SURFACE_AREA.value: {
        NeuriteType.soma.name: lambda neuron: nm.get('soma_surface_areas', neuron),
    },
}

# File suffixes (as returned by `Path.suffix`) that are treated as morphology files.
MORPH_FILETYPES = ['.h5', '.swc', '.asc']


class Features(object):
    """Discrete and continuous feature tables for a collection of morphologies.

    Attributes:
        discrete: the `discrete` frames concatenated under the given keys.
        continuous: the `continuous` frames concatenated under the given keys.

    The row index levels of both tables are named after the members of `INDEX`.
    """

    class INDEX(Enum):
        """Names of the row index levels of the concatenated tables."""
        MTYPE = 'mtype'
        FILENAME = 'filename'
        NEURITE = 'neurite'

    # Level names in declaration order, as given to `pd.concat(names=...)`.
    _INDEX_NAMES = [level.value for level in INDEX]

    def __init__(self, index, discrete, continuous):
        """Concatenate per-morphology frames into two keyed tables.

        Args:
            index: one key per frame; the loaders in this file pass
                (mtype, filename) tuples.
            discrete: list of per-morphology discrete feature frames.
            continuous: list of per-morphology continuous feature frames.
        """
        level_names = self._INDEX_NAMES
        self.discrete = pd.concat(discrete, keys=index, names=level_names)
        self.continuous = pd.concat(continuous, keys=index, names=level_names)


def get_discrete_features(neuron) -> pd.DataFrame:
    """Build the table of discrete features of one neuron.

    Args:
        neuron: the neuron object forwarded to `get_features`.

    Returns:
        A frame with one row per neurite name plus a `NeuriteType.all.name` row,
        and one column per `DISCRETE_FEATURE_NAMES` value. Every cell is the
        `np.sum` of the values extracted for that neurite; the extra row is the
        column-wise sum of the other rows.
    """
    names = [feature.value for feature in DISCRETE_FEATURE_NAMES]
    totals = get_features(neuron, names).applymap(np.sum)
    # Append the across-neurites total as an additional row.
    totals.loc[NeuriteType.all.name] = totals.sum()
    return totals


def get_continuous_features(neuron) -> pd.DataFrame:
    """Build the table of continuous features of one neuron.

    Args:
        neuron: the neuron object forwarded to `get_features`.

    Returns:
        A frame with one row per neurite name plus a `NeuriteType.all.name` row,
        and one column per `CONTINUOUS_FEATURE_NAMES` value. Every cell holds a
        list of values; the extra row holds, per column, the concatenation of the
        lists of all other rows.
    """
    feature_names = [name.value for name in CONTINUOUS_FEATURE_NAMES]
    df = get_features(neuron, feature_names)
    # Build the "all" row explicitly as a 1-D object array with one list per column.
    # Going through `df.aggregate` with a list-returning function is fragile: when the
    # returned lists happen to have equal lengths (e.g. all empty) pandas may expand
    # them into a DataFrame instead of a Series of lists, and the row assignment
    # below then fails or broadcasts.
    merged = np.empty(len(df.columns), dtype=object)
    for position, column in enumerate(df.columns):
        merged[position] = np.concatenate(df[column].tolist()).tolist()
    df.loc[NeuriteType.all.name] = pd.Series(merged, index=df.columns)
    return df


def get_features(neuron, feature_names) -> pd.DataFrame:
    """Extract the requested features of a neuron for every neurite in `NEURITES`.

    Args:
        neuron: the neuron object passed to `nm.get` / the custom loaders.
        feature_names: feature-name strings; they become the columns of the result.

    Returns:
        A frame indexed by `NEURITE_NAMES` with one column per feature name. Each
        cell is the extracted value converted with `.tolist()`.
    """
    table = pd.DataFrame(index=NEURITE_NAMES, columns=feature_names)
    for neurite in NEURITES:
        for feature_name in feature_names:
            # A custom loader, when registered for this feature/neurite pair, takes
            # precedence; if there is none (or it yields None) use the generic call.
            loader = CUSTOM_FEATURE_LOAD.get(feature_name, {}).get(neurite.name)
            val = loader(neuron) if loader is not None else None
            if val is None:
                val = nm.get(feature_name, neuron, neurite_type=neurite)
            table.loc[neurite.name, feature_name] = val.tolist()
    return table


def get_mtype_dict(db_file: Path) -> dict:
    """Read the morphology-name -> mtype mapping from an XML database file.

    Every `morphology` element found anywhere under the root contributes its
    `name` and `mtype` child texts. Empty names/mtypes and conflicting mtypes for
    the same name are only logged as warnings; the last entry read wins.

    Args:
        db_file: path of the XML file.

    Returns:
        dict mapping morphology name to mtype.
    """
    tree = etree.parse(str(db_file))
    mtype_dict = {}
    for node in tree.getroot().iterfind('.//morphology'):
        name = node.findtext('name')
        mtype = node.findtext('mtype')
        if not name:
            L.warning('Empty morphology name in %s', db_file)
        if not mtype:
            L.warning('Empty morphology mtype in %s', db_file)
        # Defaulting to the current mtype means unseen names never count as a conflict.
        previous = mtype_dict.get(name, mtype)
        if previous != mtype:
            L.warning('Multiple mtypes %s %s for %s', mtype, previous, name)
        mtype_dict[name] = mtype
    return mtype_dict


def get_valid_morph_features(morph_dirpath: Path) -> Features:
    """Load the features of all morphology files that sit directly in a directory.

    The mtype of each morphology is looked up by neuron name in the
    `neuronDB.xml` file located in the same directory.

    Args:
        morph_dirpath: directory holding the morphology files and `neuronDB.xml`.

    Returns:
        `Features` keyed by (mtype, neuron name).

    Raises:
        ValueError: if `morph_dirpath` is not a directory.
        KeyError: if a loaded neuron's name has no entry in `neuronDB.xml`.
    """
    if not morph_dirpath.is_dir():
        raise ValueError(
            '"{}" must be a directory with morphology files'.format(morph_dirpath))
    mtype_dict = get_mtype_dict(morph_dirpath / 'neuronDB.xml')

    keys = []
    discrete_frames = []
    continuous_frames = []
    for path in morph_dirpath.iterdir():
        if path.suffix not in MORPH_FILETYPES:
            continue
        neuron = nm.load_neuron(str(path))
        keys.append((mtype_dict[neuron.name], neuron.name))
        discrete_frames.append(get_discrete_features(neuron))
        continuous_frames.append(get_continuous_features(neuron))
    return Features(keys, discrete_frames, continuous_frames)


def get_test_morph_features(morph_dirpath: Path) -> Features:
    """Load the features of test morphologies arranged as `<root>/<mtype>/<file>`.

    The name of each sub-directory of `morph_dirpath` is taken as the mtype of the
    morphology files it contains.

    Args:
        morph_dirpath: root directory with one sub-directory per mtype.

    Returns:
        `Features` keyed by (mtype, neuron name).

    Raises:
        ValueError: if `morph_dirpath` is not a directory.
    """
    if not morph_dirpath.is_dir():
        raise ValueError(
            '"{}" must be a directory'.format(morph_dirpath))
    index, discrete, continuous = [], [], []
    for mtype_dir in morph_dirpath.iterdir():
        # Stray files next to the mtype directories (README, .DS_Store, ...) are not
        # mtypes; calling iterdir() on them would raise NotADirectoryError.
        if not mtype_dir.is_dir():
            continue
        mtype = mtype_dir.name
        for file in mtype_dir.iterdir():
            if file.suffix in MORPH_FILETYPES:
                neuron = nm.load_neuron(str(file))
                index.append((mtype, neuron.name))
                discrete.append(get_discrete_features(neuron))
                continuous.append(get_continuous_features(neuron))
    return Features(index, discrete, continuous)


def ks_2samp(a, b):
    """Two-sample KS test of `a` against `b`, with the size of `a` appended.

    Returns:
        A plain 3-tuple: the two values returned by `scipy.stats.ks_2samp`
        followed by `len(a)`.
    """
    statistic, p_value = stats.ks_2samp(a, b)
    return statistic, p_value, len(a)


def expand_ks_tuples(ks_as_tuples, ks_columns):
    """Spread tuple-valued KS cells over separate numeric columns.

    Args:
        ks_as_tuples: frame whose cells are either a (distance, p, sample_size)
            sequence or a missing marker (None / NaN).
        ks_columns: the columns of `ks_as_tuples` to expand.

    Returns:
        A float frame with the same row index and two-level columns
        (feature, 'distance' | 'p' | 'sample_size'). Missing cells become NaN in
        all three sub-columns.
    """
    stat_names = ['distance', 'p', 'sample_size']
    missing = (np.nan,) * len(stat_names)
    expanded = []
    for col in ks_columns:
        # Build the rows directly instead of a row-wise `apply(pd.Series)`: that
        # produced a zero-column frame (and a length-mismatch error on the column
        # assignment) whenever every cell of the column was missing.
        rows = [tuple(cell) if pd.api.types.is_list_like(cell) else missing
                for cell in ks_as_tuples[col]]
        columns = pd.MultiIndex.from_product([[col], stat_names])
        expanded.append(
            pd.DataFrame(rows, index=ks_as_tuples.index, columns=columns, dtype=float))
    return pd.concat(expanded, axis=1)


def get_ks_of_valid_features(valid_features):
    """Leave-one-out KS test of every valid morphology against its peers.

    For each (mtype, neurite) group and each continuous feature, the values of one
    morphology are tested against the pooled values of all other morphologies of
    the group.

    Args:
        valid_features: `Features` of the valid morphologies.

    Returns:
        The frame produced by `expand_ks_tuples`; cells are missing where either
        side of the comparison has no values.
    """
    def ks_valid(feature_series):
        def ks(a, b):
            # `b` is the list of the other morphologies' value lists. It is empty
            # when the group holds a single morphology, and np.concatenate raises
            # ValueError on an empty sequence, so bail out before pooling.
            if not a or not b:
                return None
            b = np.concatenate(b)
            if b.size:
                return ks_2samp(a, b)
            return None

        fs_list = feature_series.to_list()
        return [ks(fs_list[i], fs_list[:i] + fs_list[i + 1:]) for i in range(0, len(fs_list))]

    ks_as_tuples = valid_features.continuous \
        .groupby([Features.INDEX.MTYPE.value, Features.INDEX.NEURITE.value]) \
        .transform(ks_valid)
    return expand_ks_tuples(ks_as_tuples, valid_features.continuous.columns)


def get_ks_of_test_features(test_features, valid_features):
    """KS test of every test morphology against the valid set of the same mtype.

    The valid continuous features are pooled per (mtype, neurite); each test
    morphology's values for a neurite are then tested against the pooled values of
    the same neurite.

    Args:
        test_features: `Features` of the morphologies under test.
        valid_features: `Features` of the valid morphologies.

    Returns:
        The frame produced by `expand_ks_tuples`; cells are missing where either
        side of the comparison has no values.
    """
    def ks_test(file_series):
        mtype = file_series.index.get_level_values(Features.INDEX.MTYPE.value).unique()[0]
        if mtype not in neurite_feature_distr.index.levels[0]:
            return None
        # Reference samples of this feature for the mtype, indexed by neurite name.
        mtype_series = neurite_feature_distr.loc[mtype][file_series.name]
        neurites = file_series.index.get_level_values(Features.INDEX.NEURITE.value)
        results = []
        for neurite, sample in zip(neurites, file_series):
            # Pair by neurite label rather than by position, so the comparison does
            # not depend on the groupby result having the same row order as the file.
            reference = mtype_series.loc[neurite] if neurite in mtype_series.index else None
            results.append(ks_2samp(sample, reference) if sample and reference else None)
        return results

    neurite_feature_distr = valid_features.continuous \
        .groupby([Features.INDEX.MTYPE.value, Features.INDEX.NEURITE.value]) \
        .agg(lambda feature: np.concatenate(feature).tolist())

    ks_as_tuples = test_features.continuous \
        .groupby([Features.INDEX.MTYPE.value, Features.INDEX.FILENAME.value]).transform(ks_test)
    return expand_ks_tuples(ks_as_tuples, test_features.continuous.columns)


def discrete_z_score(valid_features: Features, test_features: Features):
    """Z-score the discrete test features against the valid ones.

    Mean and standard deviation are taken over the valid morphologies of each
    (mtype, neurite) group.

    Returns:
        `(test - mean) / std` with the rows that are entirely NaN dropped.
    """
    group_keys = [Features.INDEX.MTYPE.value, Features.INDEX.NEURITE.value]
    per_group = valid_features.discrete.groupby(group_keys)
    centered = test_features.discrete - per_group.mean()
    scaled = centered / per_group.std()
    return scaled.dropna(how='all')


def discrete_report(z_score: pd.DataFrame, p_value=0.05):
    """Tell, per (filename, mtype), whether all z-scores are within the threshold.

    Args:
        z_score: frame of z-scores whose row index has `filename` and `mtype` levels.
        p_value: two-sided significance level in [0, 1]; the threshold is
            `|norm.ppf(p_value / 2)|`.

    Returns:
        Boolean Series indexed by (filename, mtype): True when no row of that file
        has a cell whose absolute z-score exceeds the threshold.
    """
    assert 0. <= p_value <= 1.
    critical_z = abs(stats.norm.ppf(p_value / 2.))
    # NaN cells compare as False, so they never mark a row as failed; that is why
    # the check is "any cell exceeds" negated rather than "all cells within".
    row_failed = z_score.abs().gt(critical_z).any(axis=1)
    row_passed = ~row_failed
    return row_passed.groupby(['filename', 'mtype']).all()


def valid_ks_neurite_distr(valid_ks):
    """Group `valid_ks` by (mtype, neurite), printing each group and returning it as is.

    NOTE(review): the per-group function only prints and returns its input, so this
    looks like exploratory scaffolding rather than a finished computation - confirm.
    """
    def echo_group(group_df):
        print(group_df)
        return group_df

    group_keys = [Features.INDEX.MTYPE.value, Features.INDEX.NEURITE.value]
    return valid_ks.groupby(group_keys).apply(echo_group)



In [71]:
# valid_features = build_valid_morphologies(Path('../tests/data/morphologies/valid/all'))
# import shutil
# L23_BTC
# filenames = set(valid_features.discrete.loc['L5_TPC'].index.get_level_values('filename'))
# filenames
# for filename in filenames:
#     shutil.copy2(
#         '/home/sanin/workspace/morph-validator/tests/data/morphologies/valid/all/' + filename + '.h5',
#         '/home/sanin/workspace/morph-validator/tests/data/morphologies/valid/mini')

In [177]:
# Load the reference ("valid") morphologies. The loader defined in this notebook is
# `get_valid_morph_features`; the previously used name `build_valid_morphologies` is
# not defined anywhere here and fails with NameError on a fresh kernel.
valid_features = get_valid_morph_features(Path('../tests/data/morphologies/valid/mini'))

In [178]:
# Load the morphologies under test (one sub-directory per mtype). The loader defined in
# this notebook is `get_test_morph_features`; the previously used name
# `build_test_morphologies` is not defined anywhere here and fails with NameError on a
# fresh kernel.
test_features = get_test_morph_features(Path('../tests/data/morphologies/test'))

In [173]:
# Z-score the discrete features of the test morphologies against the valid set of the
# same mtype and neurite, then show per file whether every z-score is within the
# default threshold.
z_score = discrete_z_score(valid_features, test_features)
discrete_report(z_score)


filename        mtype  
C040426         L5_MC      True
C050896A-I      L5_MC      True
C290500C-I4     L5_MC      True
mtC031100A_idB  L23_BTC    True
dtype: bool

In [179]:
# Leave-one-out KS statistics of each valid morphology against the other valid
# morphologies of its mtype, per neurite and continuous feature.
valid_ks = get_ks_of_valid_features(valid_features)
valid_ks

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,section_lengths,section_lengths,section_lengths,section_radial_distances,section_radial_distances,section_radial_distances,section_path_distances,section_path_distances,section_path_distances,partition_asymmetry,partition_asymmetry,partition_asymmetry,segment_radii,segment_radii,segment_radii
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,distance,p,sample_size,distance,p,sample_size,distance,p,sample_size,distance,p,sample_size,distance,p,sample_size
mtype,filename,neurite,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
L23_BTC,C210401C,soma,,,,,,,,,,,,,,,
L23_BTC,C210401C,axon,0.194428,4.224599e-09,281.0,0.474104,2.738167e-51,281.0,0.433077,7.238952e-43,281.0,0.039715,9.830557e-01,137.0,0.659635,0.000000e+00,4800.0
L23_BTC,C210401C,basal_dendrite,0.143301,2.271957e-01,55.0,0.096142,7.011391e-01,55.0,0.099731,6.577285e-01,55.0,0.066620,9.997144e-01,25.0,0.297540,1.506568e-139,2015.0
L23_BTC,C210401C,apical_dendrite,,,,,,,,,,,,,,,
L23_BTC,C210401C,undefined,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
L5_TPC,rat_20170523_E1_LH2_cell1,axon,0.255941,9.893197e-13,245.0,0.103868,1.877043e-02,245.0,0.118934,4.432607e-03,245.0,0.332354,6.831069e-11,122.0,0.043005,5.823998e-20,16889.0
L5_TPC,rat_20170523_E1_LH2_cell1,basal_dendrite,0.096367,5.597696e-01,90.0,0.183044,2.484403e-02,90.0,0.129121,2.193805e-01,90.0,0.166667,3.697270e-01,40.0,0.489286,0.000000e+00,5809.0
L5_TPC,rat_20170523_E1_LH2_cell1,apical_dendrite,0.158956,4.322941e-02,99.0,0.170711,2.419461e-02,99.0,0.080449,6.930935e-01,99.0,0.119219,6.295643e-01,49.0,0.476484,0.000000e+00,8633.0
L5_TPC,rat_20170523_E1_LH2_cell1,undefined,,,,,,,,,,,,,,,


In [180]:
# KS statistics of each test morphology against the pooled valid morphologies of the
# same mtype, per neurite and continuous feature.
test_ks = get_ks_of_test_features(test_features, valid_features)
test_ks

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,section_lengths,section_lengths,section_lengths,section_radial_distances,section_radial_distances,section_radial_distances,section_path_distances,section_path_distances,section_path_distances,partition_asymmetry,partition_asymmetry,partition_asymmetry,segment_radii,segment_radii,segment_radii
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,distance,p,sample_size,distance,p,sample_size,distance,p,sample_size,distance,p,sample_size,distance,p,sample_size
mtype,filename,neurite,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
L5_MC,C040426,soma,,,,,,,,,,,,,,,
L5_MC,C040426,axon,0.139116,0.000354,232.0,0.169889,5.0804e-06,232.0,0.221053,6.416586e-10,232.0,0.106251,0.161341,114.0,0.702872,0.0,5868.0
L5_MC,C040426,basal_dendrite,0.243609,0.038656,33.0,0.307278,0.003818319,33.0,0.3531,0.0005027685,33.0,0.222351,0.411887,15.0,0.352281,0.0,4245.0
L5_MC,C040426,apical_dendrite,,,,,,,,,,,,,,,
L5_MC,C040426,undefined,,,,,,,,,,,,,,,
L5_MC,C040426,all,0.139815,9.4e-05,265.0,0.138481,0.0001132425,265.0,0.183495,6.902218e-08,265.0,0.076703,0.446899,129.0,0.59439,0.0,10113.0
L5_MC,C050896A-I,soma,,,,,,,,,,,,,,,
L5_MC,C050896A-I,axon,0.11193,0.000435,361.0,0.31241,1.243338e-28,361.0,0.197341,7.940093e-12,361.0,0.142531,0.002101,180.0,0.482605,0.0,3093.0
L5_MC,C050896A-I,basal_dendrite,0.216981,0.007007,63.0,0.166367,0.07104325,63.0,0.156334,0.1044794,63.0,0.071098,0.997022,30.0,0.170079,1.974304e-91,4419.0
L5_MC,C050896A-I,apical_dendrite,,,,,,,,,,,,,,,
