In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import scipy.io as sio
from cplAE_MET.utils.load_config import load_config
from cplAE_MET.models.augmentations import undo_radial_correction
from cplAE_MET.models.augmentations import do_radial_correction

In [2]:
# Name of the TOML config file from which all input/output paths are loaded
config_file = "config_preproc.toml"

In [3]:
# Build every input/output path used by this notebook from the config file
def set_paths(config_file=None):
    """Load the path configuration and expand its entries into full paths.

    Args:
        config_file: name of the config file handed to ``load_config``.

    Returns:
        The mapping returned by ``load_config`` with these entries set:
        ``input`` (the data directory as a string), ``arbor_density_file``,
        ``specimen_ids`` and ``t_anno`` (located directly under ``input``),
        ``m_data_folder`` (under ``input``), and ``m_anno`` and
        ``hist2d_120x4`` (under ``m_data_folder``). Paths are joined with "/".
    """
    paths, _ = load_config(config_file=config_file, verbose=False)

    data_root = str(paths["data_dir"])
    paths['input'] = data_root

    # Entries that live directly in the data directory.
    paths['arbor_density_file'] = data_root + '/' + str(paths["arbor_density_file"])
    paths['specimen_ids'] = data_root + '/' + str(paths["specimen_ids_file"])

    # The m data folder is resolved first because the annotation file and the
    # histogram folder are located inside it.
    m_root = data_root + '/' + str(paths["m_data_folder"])
    paths['m_data_folder'] = m_root
    paths['m_anno'] = m_root + '/' + str(paths["m_anno"])
    paths['hist2d_120x4'] = m_root + '/' + str(paths["hist2d_120x4_folder"])

    paths['t_anno'] = data_root + '/' + "anno.feather"
    return paths

In [4]:
# Map a cell class to the arbor density channel names used in its file names
def get_file_apendix(exc_or_inh):
    """Return the two channel names for a cell class.

    Args:
        exc_or_inh: cell class, "inh" or "exc".

    Returns:
        ["axon", "dendrite"] for "inh", ["apical", "basal"] for "exc",
        and an empty list for any other value.
    """
    if exc_or_inh == "inh":
        return ["axon", "dendrite"]
    if exc_or_inh == "exc":
        return ["apical", "basal"]
    return []

In [5]:
# Function to find the cells that have only few non-zero pixels in their arbor density images
def get_cell_ids_of_abnormal_images(specimen_ids, image_path, m_anno,  min_nonzero_pixels=5):
    '''
    Get all the specimen_ids whose arbor density images have few nonzero pixels

    A cell is only inspected when it is listed in ``m_anno`` and the image file
    of its first channel exists; both channel images are then read and the cell
    is reported if either of them has too few nonzero pixels.

    Args:
        specimen_ids: iterable of specimen ids (as strings) to check
        image_path: folder containing the ``hist2d_120x4_{channel}_{specimen_id}.csv`` images
        m_anno: annotation dataframe with the columns ``specimen_id`` and ``class``
                (``class`` is "inh" or "exc" and selects the channel names)
        min_nonzero_pixels: the cell is abnormal if one of its two images has
                less than this number of nonzero pixels

    Returns:
        list of specimen ids of the abnormal cell images
    '''

    # Build the id -> class lookup once, keyed on string ids. The ids are
    # compared as strings everywhere, so a numeric ``specimen_id`` column works
    # too, and the annotation is no longer rescanned for every cell.
    # ``setdefault`` keeps the first row when an id is listed more than once.
    class_by_id = {}
    for anno_id, cell_class in zip(m_anno['specimen_id'].astype(str), m_anno['class']):
        class_by_id.setdefault(anno_id, cell_class)

    ab_spec_id = []
    for spec_id in tqdm(specimen_ids):
        if spec_id not in class_by_id:
            continue

        app = get_file_apendix(class_by_id[spec_id])
        im0_file = image_path + f'/hist2d_120x4_{app[0]}_{spec_id}.csv'
        im1_file = image_path + f'/hist2d_120x4_{app[1]}_{spec_id}.csv'

        # Only the first channel is probed; the second one is expected to exist
        # alongside it (reading it raises if it does not).
        if not os.path.isfile(im0_file):
            continue

        im0 = pd.read_csv(im0_file, header=None).values
        im1 = pd.read_csv(im1_file, header=None).values

        if np.count_nonzero(im0) < min_nonzero_pixels or np.count_nonzero(im1) < min_nonzero_pixels:
            ab_spec_id.append(spec_id)
    return ab_spec_id

In [6]:
# Resolve all paths from the config, then pick out the m cell metadata file
# (m_anno) and the folder holding the 120x4 arbor density histograms
dir_pth = set_paths(config_file=config_file)
m_anno_path, hist2d_120x4_path = (dir_pth[key] for key in ('m_anno', 'hist2d_120x4'))

In [7]:
# Load the specimen id list and the m cell metadata, then collect the cells
# whose arbor density images have too few nonzero pixels
ids = pd.read_csv(dir_pth['specimen_ids'])
specimen_ids = ids['specimen_id'].astype(str).to_list()

# m_anno provides the soma depth and the class type (exc/inh) of each m cell
m_anno = pd.read_csv(m_anno_path)

ab_spec_id = get_cell_ids_of_abnormal_images(
    specimen_ids,
    hist2d_120x4_path,
    m_anno,
    min_nonzero_pixels=5,
)
# These cells are the ones excluded in the following steps
drop_spec_id = ab_spec_id
print(len(drop_spec_id), "cells will be dropped because of the few non zero pixels")

16703it [05:09, 53.90it/s]

31 cells will be dropped because of the few non zero pixels





In [8]:
print("...................................................")
print("Generating image for all the locked dataset, for those that we dont have M, we put zeros")
# NOTE: despite the message above, cells without usable M data are stored as NaN, not zeros.
hist_shape = (1, 120, 4, 1)
im_shape = (1, 120, 4, 4)

# Start from all-NaN arrays: a cell keeps its NaN image and NaN soma depth when
# it is dropped, is missing from m_anno, or has no image file. Only cells with
# usable M data are overwritten in the loop below.
im = np.full((len(specimen_ids), 120, 4, 4), np.nan, dtype=float)
soma_depth = np.full((len(specimen_ids),), np.nan)
c = 0

# Loop-invariant lookups, built once instead of once per cell. The annotation
# ids are compared as strings everywhere so that the membership test and the
# row selection always agree, whatever the dtype of the specimen_id column.
drop_spec_id_set = set(drop_spec_id)
m_anno_ids = m_anno['specimen_id'].astype(str)
m_anno_id_set = set(m_anno_ids)

for i, spec_id in tqdm(enumerate(specimen_ids)):
    if spec_id in drop_spec_id_set or spec_id not in m_anno_id_set:
        continue

    cell_rows = m_anno.loc[m_anno_ids == spec_id]
    exc_or_inh = cell_rows['class'].values[0]
    app = get_file_apendix(exc_or_inh)
    im0_file = hist2d_120x4_path + f'/hist2d_120x4_{app[0]}_{spec_id}.csv'
    im1_file = hist2d_120x4_path + f'/hist2d_120x4_{app[1]}_{spec_id}.csv'

    # Only the first channel is probed; the second one is expected to exist with it.
    if not os.path.isfile(im0_file):
        continue

    c += 1
    im0 = pd.read_csv(im0_file, header=None).values
    im1 = pd.read_csv(im1_file, header=None).values

    # convert arbor density to arbor mass
    mass0 = undo_radial_correction(im0)
    mass1 = undo_radial_correction(im1)

    # Normalize so that the mass sum is 350
    mass0 = mass0 * 350 / np.sum(mass0)
    mass1 = mass1 * 350 / np.sum(mass1)

    # compute the arbor density from the arbor mass again
    # NOTE(review): this recomputed density is replaced by the normalized mass
    # right below, so the stored images hold arbor mass. The calls are kept
    # because it cannot be confirmed from here that do_radial_correction has no
    # side effects - confirm which quantity is intended and drop the unused step.
    im0 = do_radial_correction(mass0)
    im1 = do_radial_correction(mass1)

    im0 = mass0
    im1 = mass1

    # Channels 0:2 hold the inh cells (axon, dendrite) and channels 2:4 hold the
    # other cells (apical, basal); the two channels not used by a cell are zero.
    two_channels = np.concatenate([im0.reshape(hist_shape), im1.reshape(hist_shape)], axis=3)
    if exc_or_inh == "inh":
        im[i, :, :, 0:2] = two_channels
        im[i, :, :, 2:] = 0.
    else:
        im[i, :, :, 2:] = two_channels
        im[i, :, :, 0:2] = 0.

    soma_depth[i] = np.squeeze(cell_rows['soma_depth'].values)

print("so far in total", c, "cells have m data available")

...................................................
Generating image for all the locked dataset, for those that we dont have M, we put zeros


16703it [04:31, 61.44it/s]

so far in total 10246 cells have m data available





In [9]:
# Write the images, the soma depths and the matching specimen ids (as strings)
# into a single compressed .mat file
mat_contents = {
    'hist_ax_de_api_bas': im,
    'soma_depth': soma_depth,
    'specimen_id': ids['specimen_id'].astype(str).to_list(),
}
sio.savemat(dir_pth['arbor_density_file'], mat_contents, do_compression=True)