# Notebook to analyze and display cell type data


### Load important modules

In [1]:
# Standard modules
import numpy as np
import os
import pandas as pd 
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import logging
from numba import njit
from sklearn import linear_model


# Move to the repository root for easier module handling. The guard keeps this cell safe to
# re-run: an unconditional chdir would climb two more directories at every execution and break
# the `modules.*` imports below. The root is recognised by its `modules` directory.
if not os.path.isdir("modules"):
    os.chdir("../..")
print(os.listdir("."))

#LBAE imports
from modules.maldi_data import MaldiData
from modules.figures import Figures
from modules.atlas import Atlas
from modules.storage import Storage
from modules.scRNAseq import ScRNAseq

# multithreading/multiprocessing
from multiprocessing import Pool
import multiprocessing
from threadpoolctl import threadpool_limits

# Set the thread limit to 16 through threadpoolctl. The call is made bare (not as a context
# manager), and the object it returns is what gets displayed as the cell output.
# NOTE(review): presumably this keeps the limit in place for the rest of the session - confirm
# against the threadpoolctl documentation.
threadpool_limits(16)


['.git', '.gitattributes', '.vscode', 'LICENSE.md', 'TODO.py', 'app.py', 'assets', 'config.py', 'data_sample', 'documentation', 'index.py', 'js', 'main.py', 'modules', 'notebooks', 'pages', 'readme', 'style', 'data', 'README.md', '.ipynb_checkpoints', '.gitignore', 'Dockerfile', 'nohup.out', '.mypy_cache', 'requirements.txt', '__pycache__']


<threadpoolctl.threadpool_limits at 0x7ff65811fb50>

#### Load LBAE objects

In [2]:
# Paths are relative to the repository root (the working directory set in the first cell)
path_data = "data/whole_dataset/"
path_annotations = "data/annotations/"
path_db = "data/app_data/data.db"

# Load shelve database
storage = Storage(path_db)

# Load data
data = MaldiData(path_data, path_annotations)

# Load Atlas and Figures objects. At first launch, many objects will be precomputed and shelved in
# the classes Atlas and Figures.
# The order matters: Atlas needs `data` and `storage`, and Figures needs `atlas` and `scRNAseq`.
atlas = Atlas(data, storage, resolution=25, sample = True)
scRNAseq = ScRNAseq()
figures = Figures(data, storage, atlas, scRNAseq, sample = True)

### Load cell type data

In [3]:
# Keep the first column of the sheet (column 0) and columns 4 to 9 (the six cell-type densities
# shown in the output below)
l_columns_kept = [0] + list(range(4, 10))
table_cells = pd.read_csv(
    'notebooks/cell_type_atlas/data/Data_Sheet_2_A Cell Atlas for the Mouse Brain.csv',
    usecols=l_columns_kept,
)
table_cells

Unnamed: 0,Regions,Excitatory [mm-3],Inhibitory [mm-3],Modulatory [mm-3],Astrocytes [mm-3],Oligodendrocytes [mm-3],Microglia [mm-3]
0,root,137352.601147,11751.903955,849.961392,12782.383276,43770.289886,25613.012771
1,Basic cell groups and regions,153231.404562,13110.496156,948.222144,12058.873983,38918.967718,24809.444601
2,Cerebrum,79759.103521,14290.585238,514.529938,10005.391872,32256.018628,23460.962106
3,Cerebral cortex,90968.098789,10860.004085,492.496350,9855.058423,31800.904552,26231.659016
4,Cortical plate,91783.848364,10988.904301,487.060649,9882.181587,31371.949122,26817.575208
...,...,...,...,...,...,...,...
950,mammillary peduncle,0.000000,0.000000,0.000000,21523.131673,93115.065243,683.274021
951,epithalamus related,0.000000,0.000000,0.000000,21679.418886,74844.640106,44423.860885
952,stria medullaris,0.000000,0.000000,0.000000,26794.563438,80137.764179,48436.063474
953,fasciculus retroflexus,0.000000,0.000000,0.000000,9203.914308,63826.500926,36811.425549


In [4]:
# Collect the leaves of the structure hierarchy: a structure is treated as a leaf when its
# collection of children ids holds exactly one id. An empty collection is considered an error.
l_leafs_ids, l_leafs_acronyms, l_leafs_names = [], [], []

for acronym, children_ids in atlas.dic_acronym_children_id.items():
    n_children = len(children_ids)
    if n_children == 0:
        raise ValueError("No leaf found for structure: " + acronym)
    if n_children == 1:
        l_leafs_ids.extend(children_ids)
        l_leafs_acronyms.append(acronym)
        l_leafs_names.append(atlas.dic_acronym_name[acronym])

# Map each leaf name to its id
dic_names_id = dict(zip(l_leafs_names, l_leafs_ids))
      

In [5]:
# Restrict the table to leaf regions, so that no region overlaps with another one
is_leaf_region = table_cells['Regions'].isin(l_leafs_names)
table_cells = table_cells.loc[is_leaf_region]
table_cells

Unnamed: 0,Regions,Excitatory [mm-3],Inhibitory [mm-3],Modulatory [mm-3],Astrocytes [mm-3],Oligodendrocytes [mm-3],Microglia [mm-3]
270,Ectorhinal area/Layer 1,0.000000,54217.228792,65.750071,1835.406238,5625.128418,23225.163391
271,Ectorhinal area/Layer 2/3,107547.686691,6218.964878,267.907385,3672.527141,18883.517931,42262.609627
272,Ectorhinal area/Layer 5,122254.027187,3989.699982,647.703455,6073.657105,36820.408408,36673.812803
273,Ectorhinal area/Layer 6a,134478.115867,2997.008767,712.944817,5866.391611,54271.823964,29816.056387
274,Ectorhinal area/Layer 6b,92443.399000,6341.664216,790.355778,2672.155248,68243.457806,18168.773890
...,...,...,...,...,...,...,...
949,mammillotegmental tract,0.000000,0.000000,0.000000,13082.290981,95431.204740,3591.836735
950,mammillary peduncle,0.000000,0.000000,0.000000,21523.131673,93115.065243,683.274021
952,stria medullaris,0.000000,0.000000,0.000000,26794.563438,80137.764179,48436.063474
953,fasciculus retroflexus,0.000000,0.000000,0.000000,9203.914308,63826.500926,36811.425549


#### Get an array of lipid expression for each region

In [6]:

def compute_array_exp_lipids(l_id_regions, brain_1=False, decrease_resolution_factor=5):
    """Compute the average expression of every annotated lipid in each requested region.

    The function loops over all (name, structure, cation) combinations found in the MAIA
    transformed lipid annotations, looks up the matching annotation in every slice of the
    selected brain, and, for each region, averages the interpolated 3D array returned by
    `figures.compute_3D_volume_figure`.

    Relies on the notebook-level `figures` object.

    Args:
        l_id_regions (list): Ids of the regions; each one is passed alone as `set_id_regions`.
        brain_1 (bool): If True, use the slices and annotations of brain 1, else those of
            brain 2. Defaults to False.
        decrease_resolution_factor (int): Forwarded to `compute_3D_volume_figure` as
            `decrease_dimensionality_factor`. Defaults to 5.

    Returns:
        np.ndarray, list(str): Array of mean expressions with one row per region and one column
            per lipid, and the list of lipid names ("name structure cation") in column order.
    """
    maldi_data = figures._data

    # These lookups do not depend on the lipid being processed, so they are done once up front
    # (assumes the getters return the same tables on every call).
    df_maia = maldi_data.get_annotations_MAIA_transformed_lipids(brain_1=brain_1)
    df_annotations = maldi_data.get_annotations()
    l_slices = maldi_data.get_slice_list(indices="brain_1" if brain_1 else "brain_2")

    ll_exp_lipids = []
    l_name_lipids = []

    # Simulate a click on all lipid names
    for name in sorted(df_maia.name.unique()):
        df_name = df_maia[df_maia["name"] == name]
        for structure in sorted(df_name.structure.unique()):
            cations = df_name[df_name["structure"] == structure].cation.unique()
            for cation in sorted(cations):
                mask_lipid = (
                    (df_annotations["name"] == name)
                    & (df_annotations["structure"] == structure)
                    & (df_annotations["cation"] == cation)
                )

                # One annotation index per slice, -1 when the lipid is absent from the slice
                l_selected_lipids = []
                for slice_index in l_slices:
                    l_lipid_loc = df_annotations.index[
                        mask_lipid & (df_annotations["slice"] == slice_index)
                    ].tolist()

                    # If several lipids correspond to the selection, we have a problem...
                    # The last match is kept.
                    if len(l_lipid_loc) > 1:
                        logging.warning("More than one lipid corresponds to the selection")

                    l_selected_lipids.append(l_lipid_loc[-1] if l_lipid_loc else -1)

                # Skip lipids that are absent from every slice
                if all(index == -1 for index in l_selected_lipids):
                    continue

                # Get final lipid name
                lipid_string = name + " " + structure + " " + cation

                # Build the list of mz boundaries for each slice. Each slice holds three entries
                # and only the first one is used here; the other two are always None.
                # NOTE(review): the indices come from `.index` (labels) but are consumed by
                # `.iloc` (positions); both agree only for a default RangeIndex - confirm.
                lll_lipid_bounds = [
                    [
                        [
                            (
                                float(df_annotations.iloc[index]["min"]),
                                float(df_annotations.iloc[index]["max"]),
                            )
                        ]
                        if index != -1
                        else None,
                        None,
                        None,
                    ]
                    for index in l_selected_lipids
                ]

                print("getting data for lipid ", lipid_string)
                l_name_lipids.append(lipid_string)

                # Average the interpolated volume restricted to each region in turn
                l_expr = []
                for id_region in l_id_regions:
                    interpolated_array = figures.compute_3D_volume_figure(
                        ll_t_bounds=lll_lipid_bounds,
                        name_lipid_1=lipid_string,
                        decrease_dimensionality_factor=decrease_resolution_factor,
                        return_interpolated_array=True,
                        structure_guided_interpolation=False,
                        set_id_regions={id_region},
                    )
                    l_expr.append(np.mean(interpolated_array))
                ll_exp_lipids.append(l_expr)

    # Transpose from (lipids, regions) to (regions, lipids)
    return np.array(ll_exp_lipids).T, l_name_lipids



In [None]:
# Set to True to reload the arrays saved by a previous run instead of recomputing them
load_from_save = False
l_name_regions = list(table_cells['Regions'])
l_name_cells = table_cells.columns[1:]
if not load_from_save:
    array_exp_lipids_brain_2, l_name_lipids_brain_2 = compute_array_exp_lipids(
        l_id_regions=[dic_names_id[x] for x in l_name_regions], brain_1=False
    )
else:
    array_exp_lipids_brain_2 = np.load('notebooks/cell_type_atlas/data/array_exp_lipids_False.npy')
    array_name_lipids_False = np.load('notebooks/cell_type_atlas/data/array_name_lipids_False.npy')
    # Expose the lipid names under the same variable as the compute branch, so that the cells
    # below work whichever branch was taken
    l_name_lipids_brain_2 = array_name_lipids_False.tolist()

### Save lipid data in numpy arrays

In [None]:
# Save the lipid expression array and the corresponding lipid names for brain 2.
# Nothing is written when the data was loaded from these very files (load_from_save is True),
# since rewriting them would be pointless.
if not load_from_save:
    # Save array of lipid expression for brain 2
    with open('notebooks/cell_type_atlas/data/array_exp_lipids_False.npy', 'wb') as f:
        np.save(f, array_exp_lipids_brain_2)

    # Save corresponding names for brain 2
    with open('notebooks/cell_type_atlas/data/array_name_lipids_False.npy', 'wb') as f:
        np.save(f, np.array(l_name_lipids_brain_2))

### Get cell type data as an array

In [10]:
# Drop the first column (the region names) and convert the remaining columns to a numpy array
df_cell_densities = table_cells.iloc[:, 1:]
array_exp_cells = df_cell_densities.to_numpy()
array_exp_cells

array([[0.00000000e+00, 5.42172288e+04, 6.57500710e+01, 1.83540624e+03,
        5.62512842e+03, 2.32251634e+04],
       [1.07547687e+05, 6.21896488e+03, 2.67907385e+02, 3.67252714e+03,
        1.88835179e+04, 4.22626096e+04],
       [1.22254027e+05, 3.98969998e+03, 6.47703455e+02, 6.07365711e+03,
        3.68204084e+04, 3.66738128e+04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.67945634e+04,
        8.01377642e+04, 4.84360635e+04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.20391431e+03,
        6.38265009e+04, 3.68114255e+04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.20945455e+04,
        8.29672727e+04, 4.54981818e+04]])

##### Fit an elastic net regression to explain lipid expression in terms of cell type

In [None]:
def compute_regression_all_lipids(array_exp_lipids, array_exp_cells, alpha=0.01):
    """Fit one elastic net regression per column of `array_exp_lipids`.

    Each column of `array_exp_lipids` is used in turn as the regression target, with the columns
    of `array_exp_cells` as predictors. Nothing in the computation is specific to lipids or cell
    types, so the two arrays can be swapped to regress in the other direction.

    Args:
        array_exp_lipids (np.ndarray): 2D array of targets, one row per sample and one column
            per target (e.g. one column per lipid).
        array_exp_cells (np.ndarray): 2D array of predictors, one row per sample and one column
            per predictor (e.g. one column per cell type).
        alpha (float): Regularization strength passed to `ElasticNet`. Defaults to 0.01.

    Returns:
        np.ndarray, list(float): Array of coefficients with one row per target and one column
            per predictor, and list of the scores returned by `ElasticNet.score` for each target,
            computed on the data used for the fit.
    """
    ll_coef = []
    l_score = []

    # Fit an independent model for each target column
    for index_lipid in range(array_exp_lipids.shape[1]):
        target = array_exp_lipids[:, index_lipid]
        clf = linear_model.ElasticNet(fit_intercept=True, alpha=alpha, positive=False)
        clf.fit(array_exp_cells, target)

        # Store the coefficients and the score of the regression
        ll_coef.append(clf.coef_)
        l_score.append(clf.score(array_exp_cells, target))

    # Return result
    return np.array(ll_coef), l_score

In [None]:
# Fit the elastic net regressions in both directions:
# - first call: each lipid is explained in terms of the cell types
# - second call (arguments swapped): each cell type is explained in terms of the lipids
array_coef_brain_2, l_score_brain_2 = compute_regression_all_lipids(array_exp_lipids_brain_2, array_exp_cells)
array_coef_brain_2_reversed, l_score_brain_2_reversed = compute_regression_all_lipids(array_exp_cells, array_exp_lipids_brain_2)

#### Save the regression coefficients and scores

In [None]:

# Write each regression output (coefficients and scores, in both directions) to its own .npy file
dic_path_to_array = {
    'notebooks/cell_type_atlas/data/array_coef.npy': array_coef_brain_2,
    'notebooks/cell_type_atlas/data/array_score.npy': np.array(l_score_brain_2),
    'notebooks/cell_type_atlas/data/array_coef_reversed.npy': array_coef_brain_2_reversed,
    'notebooks/cell_type_atlas/data/array_score_reversed.npy': np.array(l_score_brain_2_reversed),
}

for path_output, array_output in dic_path_to_array.items():
    with open(path_output, 'wb') as f:
        np.save(f, array_output)