In [None]:

import spherpro.bro as spb
import spherpro.db as db
import pathlib
import scanpy as sc
import anndata
import pandas as pd
from sqlalchemy.orm import aliased
import shutil

In [None]:
import spherpro.configuration as conf

In [None]:
# Short alias for the `snakemake` object. It is not defined in this file;
# presumably it is injected by Snakemake when the notebook runs as a rule -- TODO confirm.
sm = snakemake

# Aim: export the Mean Intensity data


In [None]:
class C:
    """Paths used by this export, read from the Snakemake inputs and outputs."""

    # Configuration file that is handed to `spb.get_bro`.
    fn_config = sm.input.fn_config
    # Folder the export is written to.
    fol_export = pathlib.Path(sm.output.fol_export)
    # Input folder `fol_cp` and its mask / image subfolders.
    fol_cpout = pathlib.Path(sm.input.fol_cp)
    fol_masks = fol_cpout.joinpath('masks')
    fol_images = fol_cpout.joinpath('imgs')

In [None]:
# Create the export folder. `parents=True` also creates missing parent
# folders; `exist_ok=True` keeps a re-run from failing on an existing folder.
C.fol_export.mkdir(parents=True, exist_ok=True)

In [None]:
# Path of the README written by the `%%writefile` cell below.
fn_readme = C.fol_export / 'README.md'


In [None]:
%%writefile $fn_readme


## Export Oexp Analysis
by Vito Zanotelli et al, Bodenmiller Lab UZH, 2020

This is the export of the overexpression dataset from the paper:
"A quantitative analysis of the interplay of environment, neighborhood and cell state in 3D spheroids"
Raw data: 10.5281/zenodo.4055780
Please cite the paper if you use this data!

### Experimental design (more details in the paper):
- Overexpressing 51 signaling constructs, 4 control constructs (2x GFP,
    1x HcRed, 1x Luciferase) and 1 'empty' mock transfection control
    grown in 5 replicates on 5 different plates ('empty' control has 35
    replicates).

- Most signaling constructs have a GFP tag. Typically only a subset of cells per sphere were overexpressing.

- 4 plates were pooled into one block with 240 well barcoding, 2 plates in one block with 120 well barcoding.

- A pellet of each pool was generated and cut into several 6um thick sections

- A subset of these sections (='sites') were stained with an IMC panel and acquired as 1 or more 'acquisitions' containing
  multiple spheres each.

- Spheres in these acquisitions were identified via computer vision and cropped into individual 'images'

- In each image the following 'objects' were identified via computer vision:
    - 'cell's (cell sections)
    - 'nucleiexp' (slightly expanded cell centers around nuclei)
    - 'cyto' (cytoplasm, cell region without nuclei)
    -> In the manuscript only 'cell' level data was used.

- The data was exported using the 'anndata' csv format:
   https://anndata.readthedocs.io/en/stable/anndata.AnnData.html

Some notes on the files and their columns:

- **{object}_X.csv**:
    - The data matrix
    - Shape: #objects x #features
    - The measurements of this export contain compensated mean intensities (MeanIntensityComp) of an IMC image stack
    (FullStackFiltered). In the case of 'cell' objects there are also measurements of min/max and mean intensities from a pixel-wise compensated IMC image stack (FullStackComp) as well as an immunofluorescence image
     stack (IfStack, Dapi+GFP channel) and a pixel-probability stack (ProbPos, channels: prop-pos, prop-neg).
    - column metadata: **{object}_var.csv** table
    - row metadata: **{object}_obs.csv** table

- **{object}_var.csv**:
    - Variable metadata
    - Shape: #features x #columns
    - Columns:
        - measurement_id: unique measurement id
        - measurement_name: Name of measurement (this export: all compensated mean intensity)
        - measurement_type: Type of measurement (this export: only Intensity features)
        - channel_name, metal: Isotope name
        - stack_name: multicolor image stack containing this channel
        - ref_plane_number: position of the measured channel in its image stack
        - goodname: The name of the marker
               no prefix: total protein
               p-: phospho protein
               []: phospho residue
               BC: barcoding metal
        - Antibody Clone: antibody clone name
        - is_cc: bool, indication if this marker is considered a classical cell cycle marker
        - working: bool, indicates if the marker is working and of biological value. I would only look at markers with working=1
        Not important:
        - scale: scale of raw data (data is already scaled)
        - plane_id: database id for image plane.

- **{object}_obs.csv**:
    - Object (cell/nuclei/cytoplasma section) level metadata
    - Shape: #objects x #columns
    - Columns:
        - object_id: Unique object id (unique also across object types)
        - image_id: The key linking to the 'image_meta.csv' table
        - object_number: id corresponding to the object value in the segmentation mask
        - dist-rim: Estimated distance to sphere border -> unit 'um'
        - Center_X/Y: Centroid of object in image -> unit 'um'
        - dist-sphere: distance to estimated spheroid section border
        - dist-other: distance to other spheroid section in image
        - dist-bg: distance to background pixels

- **relations_{source}_{target}.csv**:
    - Cell relationship graphs
    - Shape: #relations x #columns
    - Encoding relations between objects:
        - cell_neighbors: Neighbourhood graph:
            - object_id_cell: id of cell
            - object_id_neighbor: id of neighbor
        - cell_nucleiexp: Relationship between cells and nuclei
            - object_id_cell
            - object_id_nucleiexp
            -> This is not necessarily a 1:1 correspondence
        - cell_cyto: Relationship between cells and cytoplasm
            - object_id_cell
            - object_id_cyto
            -> This is not necessarily a 1:1 correspondence


- **image_meta.csv**:
    - Image (=spheroid section) metadata
    - Shape: #images x #columns
    - Columns:
        - Image metadata:
            - image_id: The unique key of this table. Each row corresponds to a single spheroid section
            - image_shape_h/w: height/width of image in pixels/um
            - acquisition_id: unique id of IMC acquisition this image was cropped from
            - site_id: unique id of the section this sphere cut comes from.
                    All cuts in the same section were stained together.
            - slide_id: unique id for a single slide containing 1 or more sites
            - sampleblock_id: unique id of the sample block this sphere was pooled and processed in.

            Not important:
            - image_number: original cellprofiler image number
            - crop_number: object number of the sphere that was used for this crop
            - image_pos_x/y: top left coordinate of crop of sphere from original acquisition
            - bc_depth: cells within this distance from border were considered for debarcoding
            - bc_invalid: number of invalid debarcoded objects in this sphere crop
            - bc_highest_count: number of cells assigned to the main barcode of this crop
            - bc_second_count: number of cells assigned to the second most frequent barcode of this crop
            - barcode: dictionary containing the barcode
            - bc_plate, bc_x, bc_y: barcode metadata
            - acquisition_mcd_acid: original MCD acquisition id
            - site_mcd_panoramaid: original MCD panorama id
            - acquisition_mcd_roiid: original MCD roiid
            - slideac_id/name: unique id for each acquisition of a slide. Corresponds to a single mcd file
            - slide_number: original number of slide this acquisition comes from

        - Experimental metadata:
            - condition_id: id of the physical spheroid the slice belongs to. Unique to each sphere replicate.
            - condition_name: name of the growth condition this sphere came from
            - plate_id: id of the plate the spheroid was grown in
            - well_name: position of the well the spheroid was grown in
            - sampleblock_id/sampleblock_name: id/name of the pooled block the spheroid was processed in
            - site_id: corresponds to the site the spheroid slice was located on. All spheroid slices in the same site were stained together.
            - file_name: filename of the segmentation mask found in masks_cell

        - Filenames:
            - mask_filename_{object}: filename of the object mask corresponding to this image
            - image_stack_filename_{imagestack}: filename of the image stack with this name.
                Note: all mean intensity measurements are usually done in the
                'FullStackFiltered' (raw image only filtered for strong outliers)
                and then compensated for metal impurities (as recommended in Chevrier, Zanotelli and Crowell 2018).
                For visualization and Min/Max measurements 'FullStackComp'
                can be used, as there the image itself was corrected for metal
                impurities.
                The channel order is the same for both stacks.

- Folder **masks**:
    - Folder containing the segmentation masks (See image_meta -> Filenames)

- Folder **images**:
    - Folder containing the image stacks (See image_meta -> Filenames)
    - The mapping between channels and image planes number is given through
        the 'ref_plane_number' from the {object}_var.csv metadata.

In [None]:
# Open the spherpro handle from the config file; all database queries in
# this notebook go through `bro`.
bro = spb.get_bro(C.fn_config)

## Build an object level metadata table

Add filters

In [None]:
# Name of the object filter whose value is attached to every object row.
fil_name = 'modelfitcond_v1'

# Base query: metadata of all objects of type 'cell'.
q_obj = bro.data.get_objectmeta_query().filter(db.objects.object_type == 'cell')

# Subquery with the rows of the chosen object filter.
fil = (
    bro.session.query(db.object_filters)
    .join(db.object_filter_names)
    .filter(db.object_filter_names.object_filter_name == fil_name)
    .subquery()
)

# Inner join: only objects that have a value for this filter are kept, and
# the filter value is added as a column named after the filter.
q_obj = q_obj.join(fil, fil.c.object_id == db.objects.object_id)
q_obj = q_obj.add_columns(fil.c.filter_value.label(fil_name))

Add distance to border

In [None]:
# Select the location measurements (distance to rim and centroid) of the
# 'object' channel in the 'ObjectStack'.
fil = bro.filters.measurements.get_measmeta_filter_statements(
    channel_names=['object'],
    stack_names=['ObjectStack'],
    measurement_names=[('dist-rim', 'Center_X', 'Center_Y')],
    measurement_types=['Location'],
)

# `add_columns` replaces the deprecated `Query.add_column` and matches the
# other queries in this notebook.
q_meas = (
    bro.data.get_measmeta_query()
    .filter(fil)
    .add_columns(db.ref_stacks.scale)
)

# Fetch the measurements for the selected objects, apply the scaling, and
# flatten to a table with one row per object_id and one column per measurement name.
tdat = bro.io.objmeasurements.get_measurements(q_obj=q_obj, q_meas=q_meas)
tdat = bro.io.objmeasurements.scale_anndata(tdat)
dat_location = pd.DataFrame(
    tdat.X,
    index=tdat.obs.object_id,
    columns=tdat.var.measurement_name,
).reset_index()

Add distance to other spheres in the image

In [None]:
# Select the mean intensities of every channel of the 'DistStack'.
fil = bro.filters.measurements.get_measmeta_filter_statements(
    channel_names=[None],
    stack_names=['DistStack'],
    measurement_names=['MeanIntensity'],
    measurement_types=['Intensity'],
)

# The channel name is needed below to label the columns.
q_meas = (
    bro.data.get_measmeta_query()
    .filter(fil)
    .add_columns(
        db.ref_stacks.scale,
        db.ref_planes.channel_name,
    )
)

# Fetch and scale, then flatten to one row per object_id and one column per channel name.
tdat = bro.io.objmeasurements.get_measurements(q_obj=q_obj, q_meas=q_meas)
tdat = bro.io.objmeasurements.scale_anndata(tdat)
dat_dist = pd.DataFrame(
    tdat.X,
    index=tdat.obs.object_id,
    columns=tdat.var.channel_name,
).reset_index()

In [None]:
# Object metadata, extended step by step with the location and distance
# tables. Each merge uses pandas' default: inner join on all shared columns.
dat_obs = bro.doquery(q_obj)
dat_obs = dat_obs.merge(dat_location)
dat_obs = dat_obs.merge(dat_dist)

In [None]:
# Preview the first rows of the merged object metadata.
dat_obs.head()

Rename modelfitcondition classes

In [None]:
# Labels for the integer filter values; the position in the list is the code.
class_labels = ['doubt', 'ctrl', 'oexp-NB', 'oexp']

# Replace the integer codes in the filter column by a labelled categorical.
dat_obs[fil_name] = pd.Categorical.from_codes(dat_obs[fil_name], categories=class_labels)

In [None]:
# Preview again to check the relabelled filter column.
dat_obs.head()

## Build a measurement metadata table

In [None]:
# Show all rows of the stacks table.
bro.doquery(bro.session.query(db.stacks))

In [None]:
# Two selections, paired by list position:
#   1. 'MeanIntensityComp' in 'FullStackFiltered'
#   2. min / max / mean intensity in 'FullStackComp', 'IfStack' and 'ProbPos'
# `None` leaves the channel name and measurement type unrestricted.
fil = bro.filters.measurements.get_measmeta_filter_statements(
    channel_names=[None, None],
    stack_names=['FullStackFiltered', ('FullStackComp', 'IfStack', 'ProbPos')],
    measurement_names=['MeanIntensityComp', ('MinIntensity', 'MaxIntensity', 'MeanIntensity')],
    measurement_types=[None, None],
)

# Measurement metadata plus the channel, scale, plane number and stack name.
q_meas = bro.data.get_measmeta_query().filter(fil).add_columns(
    db.ref_planes.channel_name,
    db.ref_stacks.scale,
    db.planes.ref_plane_number,
    db.stacks.stack_name,
)

# Panel columns that are exported alongside each measurement.
dat_panel = bro.data.pannel[['metal', 'working', 'goodname', 'Antibody Clone', 'is_cc']]

# Left join so that measurements without a panel entry are kept.
dat_measmeta = bro.doquery(q_meas).merge(
    dat_panel,
    how='left',
    left_on=db.ref_planes.channel_name.key,
    right_on='metal',
)

dat_measmeta

## Build an image level metadata table

In [None]:
# Join every image with its condition, acquisition, site, slide acquisition
# and slide; the join on `valid_images` restricts the result to valid images.
q_imgmeta = (
    bro.session.query(
        db.images,
        db.conditions,
        db.acquisitions,
        db.sites,
        db.slideacs,
        db.slides,
    )
    .join(db.conditions)
    .join(db.acquisitions)
    .join(db.sites)
    .join(db.slideacs)
    .join(db.slides)
    .join(db.valid_images)
)
dat_imagemeta = bro.doquery(q_imgmeta)
# Populates `bro.data.experiment_layout`, which is merged in below.
bro.data._read_experiment_layout()

# Keep the first of each duplicated column name, drop columns that are
# entirely empty, then add the experiment layout without its control columns.
dat_imagemeta = dat_imagemeta.loc[:, ~dat_imagemeta.columns.duplicated()]
dat_imagemeta = dat_imagemeta.dropna(how='all', axis=1)
dat_imagemeta = dat_imagemeta.merge(
    bro.data.experiment_layout.drop(columns=['iscontrol', 'control_name'])
)
                 


In [None]:
# Show the first row to inspect the available image metadata columns.
dat_imagemeta.iloc[0,:]

In [None]:
q_imagestack_meta = (
    bro.session.query(db.image_stacks, db.stacks.stack_name)
    .join(db.stacks)
    # The 'FullStack' images are not exported, so listing their filenames
    # here would be confusing.
    .filter(db.stacks.stack_name != 'FullStack')
)

# One row per image, one filename column per stack. The `pivot` arguments are
# passed by keyword because pandas >= 2.0 no longer accepts them positionally.
dat_stack_filenames = (
    bro.doquery(q_imagestack_meta)
    .pivot(
        index=db.images.image_id.key,
        columns=db.stacks.stack_name.key,
        values=db.image_stacks.image_stack_filename.key,
    )
    # Prefix every stack column with the filename column name.
    .rename(columns=lambda x: f'{db.image_stacks.image_stack_filename.key}_{x}')
    .reset_index()
)

In [None]:
# Show the first row to check the generated stack filename columns.
dat_stack_filenames.iloc[0,:]

In [None]:
# No filter on object type: the masks of every object type are queried and
# each type gets its own filename column below.
q_mask_meta = bro.session.query(db.masks)

# One row per image, one mask filename column per object type. The `pivot`
# arguments are passed by keyword because pandas >= 2.0 no longer accepts
# them positionally.
dat_mask_filenames = (
    bro.doquery(q_mask_meta)
    .pivot(
        index=db.images.image_id.key,
        columns=db.masks.object_type.key,
        values=db.masks.mask_filename.key,
    )
    # Prefix every object type column with the mask filename column name.
    .rename(columns=lambda x: f'{db.masks.mask_filename.key}_{x}')
    .reset_index()
)


In [None]:
# Show the first row to check the generated mask filename columns.
dat_mask_filenames.iloc[0,:]

## Query the actual cell slice data

In [None]:
%%time
# Fetch the selected measurements for the cell objects from the prepared
# object and measurement tables, then apply the scaling.
ad_cells = bro.io.objmeasurements.get_measurements(dat_obj=dat_obs, dat_meas=dat_measmeta)
ad_cells = bro.io.objmeasurements.scale_anndata(ad_cells)

### Also query matching nuclei and cytoplasma

In [None]:
# List the distinct object types present in the database.
q = bro.session.query(db.objects.object_type).distinct()
bro.doquery(q)

In [None]:
# List the available object relation types.
q=bro.session.query(db.object_relation_types)
bro.doquery(q)

Query cytoplasma which is a child of a cell

In [None]:
# Second alias of the objects table, used for the related (child) object.
obj_alias = aliased(db.objects)

# Starting from the selected cells, follow the object relations to the
# 'cyto' objects and return those objects (the alias) instead of the cells.
# NOTE(review): this filters on relation type 'Parent', whereas the analogous
# nucleiexp query in this notebook filters on 'Child' -- confirm which is intended.
subq_cyto = (
    q_obj
    .join(db.object_relations, db.object_relations.object_id_parent == db.objects.object_id)
    .join(db.object_relation_types)
    .filter(db.object_relation_types.object_relationtype_name == 'Parent')
    .join(obj_alias, db.object_relations.object_id_child == obj_alias.object_id)
    .filter(obj_alias.object_type == 'cyto')
    .with_entities(obj_alias)
    .distinct()
)

dat_obs_cyto = bro.doquery(subq_cyto)

In [None]:
# Display the cytoplasm object table.
dat_obs_cyto

Query nuclei which is a child of the cell

In [None]:
# Second alias of the objects table, used for the related (child) object.
obj_alias = aliased(db.objects)

# Starting from the selected cells, follow the object relations to the
# 'nucleiexp' objects and return those objects (the alias) instead of the cells.
# The result is bound to `subq_nuc` only; it no longer overwrites `subq_cyto`.
# NOTE(review): this filters on relation type 'Child', whereas the analogous
# cyto query in this notebook filters on 'Parent' -- confirm which is intended.
subq_nuc = (
    q_obj
    .join(db.object_relations, db.object_relations.object_id_parent == db.objects.object_id)
    .join(db.object_relation_types)
    .filter(db.object_relation_types.object_relationtype_name == 'Child')
    .join(obj_alias, db.object_relations.object_id_child == obj_alias.object_id)
    .filter(obj_alias.object_type == 'nucleiexp')
    .with_entities(obj_alias)
    .distinct()
)

dat_obs_nuclei = bro.doquery(subq_nuc)

In [None]:
# Display the nuclei object table.
dat_obs_nuclei

In [None]:
# Fetch the same measurements for the nuclei objects, then apply the scaling.
ad_nuclei = bro.io.objmeasurements.get_measurements(dat_obj=dat_obs_nuclei, dat_meas=dat_measmeta)
ad_nuclei = bro.io.objmeasurements.scale_anndata(ad_nuclei)

In [None]:
# Fetch the same measurements for the cytoplasm objects, then apply the scaling.
ad_cyto = bro.io.objmeasurements.get_measurements(dat_obj=dat_obs_cyto, dat_meas=dat_measmeta)
ad_cyto = bro.io.objmeasurements.scale_anndata(ad_cyto)

In [None]:
# Export order: nuclei, cytoplasm, cells. Every `write_csvs` call writes
# the same file names into the export folder, so the three tables of each
# object type are renamed with an object type prefix before the next call.
for obj_type, ad in (('nucleiexp', ad_nuclei), ('cyto', ad_cyto), ('cell', ad_cells)):
    ad.write_csvs(C.fol_export, skip_data=False)
    for x in ('X.csv', 'var.csv', 'obs.csv'):
        fn = C.fol_export / x
        fn.rename(fn.with_name(f'{obj_type}_{x}'))


In [None]:
# Remove the by-products that are not part of the export (presumably left
# behind by `write_csvs`): the 'uns' folder and the varm / obsm tables.
# `rmdir` only succeeds if the folder is empty.
(C.fol_export / 'uns').rmdir()
for x in ('varm.csv', 'obsm.csv'):
    fn = C.fol_export / x
    fn.unlink()


In [None]:
# Print every entry currently in the export folder as a quick check.
for fn in C.fol_export.glob('*'):
    print(fn)

In [None]:
# Final image table: image metadata plus the mask and image stack
# filenames. Each merge uses pandas' default: inner join on all shared columns.
dat_img_meta_final = dat_imagemeta.merge(dat_mask_filenames)
dat_img_meta_final = dat_img_meta_final.merge(dat_stack_filenames)
dat_img_meta_final.columns

In [None]:
# Write the image metadata table without the DataFrame index.
dat_img_meta_final.to_csv(C.fol_export /'image_meta.csv',index=False)

### Export neighbourhood graph

In [None]:
# List the available object relation types.
bro.doquery(bro.session.query(db.object_relation_types))

In [None]:
# All 'Neighbors' relations whose parent object is a cell, as pairs of
# (parent id, child id).
q = (
    bro.session.query(
        db.object_relations.object_id_parent,
        db.object_relations.object_id_child,
    )
    .join(db.objects, db.objects.object_id == db.object_relations.object_id_parent)
    .filter(db.objects.object_type == 'cell')
    .join(db.object_relation_types)
    .filter(db.object_relation_types.object_relationtype_name == 'Neighbors')
)

# Rename the generic parent / child columns to the exported column names.
dat_neighbour = bro.doquery(q).rename(
    columns={
        'object_id_parent': 'object_id_cell',
        'object_id_child': 'object_id_neighbor',
    }
)

In [None]:
# Show the first neighbour relation as a quick check.
dat_neighbour.iloc[0]

In [None]:
# Write the neighbourhood graph without the DataFrame index.
dat_neighbour.to_csv(C.fol_export /'relations_cell_neighbors.csv',index=False)

In [None]:
### Export object relations

In [None]:
%%time
# Second alias of the objects table, used for the related (child) object.
obj_alias = aliased(db.objects)

# Same relation walk as for the cyto object table, but returning the
# (cell id, cyto id) pairs instead of the cyto objects.
# NOTE(review): this filters on relation type 'Parent', whereas the analogous
# nucleiexp mapping in this notebook filters on 'Child' -- confirm which is intended.
subq_cyto = (
    q_obj
    .join(db.object_relations, db.object_relations.object_id_parent == db.objects.object_id)
    .join(db.object_relation_types)
    .filter(db.object_relation_types.object_relationtype_name == 'Parent')
    .join(obj_alias, db.object_relations.object_id_child == obj_alias.object_id)
    .filter(obj_alias.object_type == 'cyto')
    .with_entities(db.object_relations.object_id_parent, db.object_relations.object_id_child)
    .distinct()
)

# Rename the generic parent / child columns to the exported column names.
dat_map_cell_cyto = bro.doquery(subq_cyto).rename(
    columns={
        'object_id_parent': 'object_id_cell',
        'object_id_child': 'object_id_cyto',
    }
)

In [None]:
# Write the cell to cytoplasm mapping without the DataFrame index.
dat_map_cell_cyto.to_csv(C.fol_export /'relations_cell_cyto.csv',index=False)

In [None]:
%%time
# Second alias of the objects table, used for the related (child) object.
obj_alias = aliased(db.objects)

# Same relation walk as for the nuclei object table, but returning the
# (cell id, nucleiexp id) pairs instead of the nuclei objects. The query is
# named `subq_nuc` because it selects nuclei, not cytoplasm.
# NOTE(review): this filters on relation type 'Child', whereas the analogous
# cyto mapping in this notebook filters on 'Parent' -- confirm which is intended.
subq_nuc = (
    q_obj
    .join(db.object_relations, db.object_relations.object_id_parent == db.objects.object_id)
    .join(db.object_relation_types)
    .filter(db.object_relation_types.object_relationtype_name == 'Child')
    .join(obj_alias, db.object_relations.object_id_child == obj_alias.object_id)
    .filter(obj_alias.object_type == 'nucleiexp')
    .with_entities(db.object_relations.object_id_parent, db.object_relations.object_id_child)
    .distinct()
)

# Rename the generic parent / child columns to the exported column names.
dat_map_cell_nucleiexp = bro.doquery(subq_nuc).rename(
    columns={
        'object_id_parent': 'object_id_cell',
        'object_id_child': 'object_id_nucleiexp',
    }
)

In [None]:
# Write the cell to nucleiexp mapping without the DataFrame index.
dat_map_cell_nucleiexp.to_csv(C.fol_export /'relations_cell_nucleiexp.csv',index=False)

## Copy images and masks

In [None]:
%%time
# Copy the image stack folder into the export as 'images'. If the target
# folder already exists, `copytree` raises before copying anything; in that
# case only a message is printed and the existing folder is left as it is.
try:
    shutil.copytree(C.fol_images, C.fol_export /'images')
except FileExistsError:
    print('Folder already exists')

In [None]:
%%time
# Copy the mask folder into the export as 'masks'. If the target folder
# already exists, `copytree` raises before copying anything; in that case
# only a message is printed and the existing folder is left as it is.
try:
    shutil.copytree(C.fol_masks, C.fol_export /'masks')
except FileExistsError:
    print('Folder already exists')