In [None]:
# `snakemake` is not defined in this notebook; presumably it is injected by
# Snakemake's notebook integration — TODO confirm. Aliased for brevity.
sm = snakemake

In [None]:
import pandas as pd

import spherpro.bro as spb
import spherpro.db as db

import numpy as np
import matplotlib.pyplot as plt
import plotnine as gg

import scanpy as sc

import pathlib

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from src.config import Conf
from src.variables import Vars

In [None]:
class C(Conf):
    """Notebook configuration: Snakemake input/output paths plus channel constants."""
    # Config file handed to `spb.get_bro` further down.
    fn_config = sm.input.fn_config
    # Folder that receives all saved figures.
    fol_paper = pathlib.Path(sm.output.fol_plots)
    # Channel name of the pHer2 measurement; always added to the channel selection.
    chan_pher2 = 'Gd155'
    # Not referenced in the visible part of this notebook — presumably
    # dead-cell marker channels; TODO confirm.
    chans_dead = ['Yb172', 'Yb174']
    # CSV file later read with `pd.read_csv` to order the panel for plotting.
    fn_panel_ord = sm.input.fn_panel_ord

In [None]:
# Create the figure output folder. `exist_ok=True` keeps this cell idempotent:
# a plain `mkdir()` raises FileExistsError whenever the folder is already there
# (e.g. on a re-run). `parents=True` also creates missing parent folders.
# `C.fn_panel_ord` is already assigned in the class body of `C`, so it is not
# reassigned here.
C.fol_paper.mkdir(parents=True, exist_ok=True)

In [None]:
class V(Vars):
    """Column and filter names used in this notebook, on top of the project-wide `Vars`."""
    # obs column that holds the leiden cluster label.
    COL_LEIDEN = 'leiden'
    # Names of the object filters loaded from the database further down.
    FIL_MITOSIS = 'is_mitotic'
    FIL_APOPTOSIS = 'is_apoptotic'

In [None]:
# Make sure the output folder exists; no error if it is already there.
C.fol_paper.mkdir(exist_ok=True)

In [None]:
# Open the spherpro project described by the config file; `bro` is used for
# all database queries and helpers below.
bro = spb.get_bro(C.fn_config)

In [None]:
# Direct scanpy's `save=...` outputs into the figure folder.
sc.settings.figdir = str( C.fol_paper)
# NOTE(review): private scanpy setting; presumably rasterizes scatter points
# in vector output, like `set_figure_params(vector_friendly=True)` — confirm.
sc.settings._vector_friendly = True

In [None]:
import spherpro.bromodules.helpers_vz as helpers_vz

# Helper object bound to the open project; used below for `get_d2rim()`.
hpr = helpers_vz.HelperVZ(bro)

In [None]:

# Filter statement selecting the measurements to load.
# NOTE(review): the lists are presumably matched position-wise (stack i with
# the measurement name(s) i, `None` meaning "no restriction") — confirm
# against `get_measmeta_filter_statements`.
fil = bro.filters.measurements.get_measmeta_filter_statements(
    channel_names=[None, None],
    stack_names=['FullStackFiltered', 'FullStackComp'],
    measurement_names=[('MeanIntensityComp', 'NbMeanMeanIntensityComp', 'NbMaxMeanIntensityComp'), 'MeanIntensity'],
    measurement_types=[None, None])

# Measurement metadata query, extended by the stack scale, channel name and
# stack name columns.
q_meas = (bro.data.get_measmeta_query()
          .filter(fil)
          .add_columns(db.ref_stacks.scale, db.ref_planes.channel_name,
                      db.stacks.stack_name)
         )

# Object metadata query restricted to objects of type 'cell', joined with the
# condition table and extended by acquisition/site/slide entities.
q_obj = (bro.data.get_objectmeta_query()
         .join(db.conditions, db.conditions.condition_id == db.images.condition_id)
         .filter(db.objects.object_type == 'cell')
         .add_entity(db.conditions)
         .add_entity(db.acquisitions)
         .add_entity(db.sites)
         .add_entity(db.slideacs)
         .add_entity(db.slides)
        )

Query data

In [None]:
%%time
# Load the selected measurements for the selected objects; the result is used
# as an AnnData object below (`dat.X`, `dat.obs`, `dat.var`).
dat = bro.io.objmeasurements.get_measurements(q_meas=q_meas, q_obj=q_obj)
# NOTE(review): presumably rescales the intensities using the stack `scale`
# column added to `q_meas` — confirm in spherpro.
dat = bro.io.objmeasurements.scale_anndata(dat)

In [None]:
# Drop repeated column names from the obs and var tables, keeping the first
# occurrence of each name.
dat.obs = dat.obs.loc[:,~dat.obs.columns.duplicated()]
dat.var = dat.var.loc[:,~dat.var.columns.duplicated()]

Transform data

In [None]:
def censor_dat(x, q=99.9):
    """Return a copy of ``x`` with both tails clipped at percentile limits.

    Values above the ``q``-th percentile are set to that percentile. The lower
    limit, the ``(100 - q)``-th percentile, is computed on the already
    upper-clipped copy, and smaller values are raised to it. The input array
    itself is not modified.
    """
    censored = np.array(x, copy=True)
    upper = np.percentile(censored, q=q)
    censored[censored > upper] = upper
    lower = np.percentile(censored, q=100 - q)
    censored[censored < lower] = lower
    return censored

def cur_logtransf(x):
    """Return ``log10(x + 0.1)``; the 0.1 offset keeps an input of 0 finite (it maps to -1)."""
    shifted = x + 0.1
    return np.log10(shifted)

def cur_transf(x):
    """Censor ``x`` at the 99.9 / 0.1 percentiles, then apply the log10(x + 0.1) transform."""
    return cur_logtransf(censor_dat(x, 99.9))

In [None]:
# Censor and log-transform each column of the data matrix on its own
# (axis 0: the function receives one column at a time).
dat.X = np.apply_along_axis(cur_transf, 0, dat.X)

Add metadata

In [None]:
# NOTE(review): private spherpro call; presumably populates
# `bro.data.experiment_layout`, which is joined onto `dat.obs` below — confirm.
bro.data._read_experiment_layout()

In [None]:
# Per-object table from the helper; presumably the distance of each cell to
# the sphere rim (column `distrim` is plotted later) — TODO confirm.
dat_d2rim = hpr.get_d2rim()

In [None]:
# Add the table to `dat.obs`; the three dropped columns are removed first,
# presumably because `dat.obs` already carries them — TODO confirm.
bro.helpers.anndata.add_anndata_obsmeta(dat, dat_d2rim.drop(columns=['object_number', 'image_id', 'object_type']))

In [None]:
# Add the experiment layout table to the per-cell metadata.
bro.helpers.anndata.add_anndata_obsmeta(dat, bro.data.experiment_layout)

In [None]:
# Add the panel table to `dat.var`; its `metal` column is renamed to the
# channel-name key so it lines up with the channel column of `dat.var`.
bro.helpers.anndata.add_anndata_varmeta(dat, bro.data.pannel.rename(columns={'metal': db.ref_planes.channel_name.key}))

In [None]:
# Select the variables used for clustering and embedding: channels flagged as
# working in the panel plus the pHer2 channel, restricted to the
# 'MeanIntensityComp' measurement on the 'FullStackFiltered' stack.
fil = (dat.var[db.ref_planes.channel_name.key].isin(list(bro.data.pannel.query('working==1')['metal'])+[C.chan_pher2]) &
       (dat.var[db.measurement_names.measurement_name.key] == 'MeanIntensityComp') &
       (dat.var[db.stacks.stack_name.key] == 'FullStackFiltered'))
# Slicing an AnnData returns a view. The cells below write neighbors, UMAP,
# leiden labels and obs columns into `datf`, so take an explicit copy instead
# of relying on an implicit copy-on-modify of the view.
datf = dat[:,fil].copy()

In [None]:
# Neighborhood graph computed directly on the transformed intensities in
# `datf.X` (`use_rep='X'`), not on a reduced representation.
sc.pp.neighbors(datf, use_rep='X')

In [None]:
# UMAP embedding based on the neighbor graph; coordinates are read from
# `datf.obsm['X_umap']` further down.
sc.tl.umap(datf)

In [None]:
# UMAP colored by cell line.
# NOTE(review): without `return_fig=True` this call likely returns None, so
# `fig` probably does not hold a figure — confirm.
fig = sc.pl.umap(datf, color=['cellline'], s=1,)

In [None]:
# Same plot, additionally saved into `sc.settings.figdir`.
sc.pl.umap(datf, color=['cellline'], s=1, save='_celline_overview.pdf')

In [None]:
# Leiden clustering on the neighbor graph; labels are stored in
# `datf.obs['leiden']` (the same name as `V.COL_LEIDEN`).
sc.tl.leiden(datf, resolution=1, key_added='leiden')

In [None]:
# Marker distributions per leiden cluster; variables are addressed by their
# `goodname` column of `datf.var`.
sc.pl.stacked_violin(datf, var_names=datf.var.goodname, groupby='leiden', swap_axes=True, dendrogram=False, gene_symbols='goodname')

In [None]:
# Number of cells per (leiden cluster, cell line) combination, as a long table.
n_cells_clust = datf.obs.groupby(['leiden', 'cellline']).size().rename('n').reset_index()
# Dodged bars: one bar per cell line within each cluster.
(
    gg.ggplot(n_cells_clust, gg.aes(x='leiden', y='n'))
    + gg.geom_bar(gg.aes(fill='cellline'), stat='identity', position='dodge')
    + gg.theme(figure_size=(20, 5))
)

In [None]:
# Number of cells per (leiden cluster, cell line) combination, as a long table.
n_cells_facet = datf.obs.groupby(['leiden', 'cellline']).size().rename('n').reset_index()
# One panel per cluster with free scales, one bar per cell line.
(
    gg.ggplot(n_cells_facet, gg.aes(x='cellline', y='n'))
    + gg.facet_wrap('leiden', scales='free')
    + gg.geom_bar(gg.aes(fill='cellline'), stat='identity', position='dodge')
    + gg.theme(figure_size=(20, 5))
)

Calculate, for each image, what fraction of its cells comes from a cluster that is dominated by the image's own cell line

In [None]:
# Cell counts per (leiden cluster, cell line) combination.
n_per_clust_line = datf.obs.groupby(['leiden', 'cellline']).size().rename('n')
# Within each cluster, flag the cell line(s) holding the maximal count.
# `transform('max')` returns a result aligned with the original index.
# `groupby(...).apply(...)` with a same-indexed result prepends the group key
# in pandas >= 2.0, which makes the following `reset_index()` fail on the
# duplicated 'leiden' level.
is_majority = (n_per_clust_line
               == n_per_clust_line.groupby('leiden').transform('max')).rename('is_majority')
# One row per cell, carrying the majority flag of its (cluster, cell line)
# pair. The merge keys are spelled out; they are the only shared columns.
tdat = (is_majority
 .reset_index()
 .merge(datf.obs[['cellline', 'leiden', 'image_id', 'object_id', 'object_number']],
        on=['leiden', 'cellline'])
)

In [None]:
# Per image (and its cell line): fraction of cells whose cluster is dominated
# by their own cell line (mean of the boolean `is_majority` flag).
tdat2 = (tdat
 .groupby(['image_id','cellline'],observed=True)['is_majority'].mean()

)
# Text listing of the fractions, lowest first.
print(tdat2.sort_values())
# Histogram of the per-image fractions, zoomed to 0.75-1.0, sqrt-scaled counts.
(tdat2.reset_index() >>
  gg.ggplot(gg.aes(x='is_majority', fill='cellline'))+
 gg.geom_histogram()+
 gg.coord_cartesian(xlim=(0.75,1.0))+
 gg.scale_y_sqrt()
 
)

In [None]:
# Small-format version of the histogram above (not split by cell line),
# saved into the figure folder.
p = (tdat2.reset_index() >>
  gg.ggplot(gg.aes(x='is_majority'))
  + gg.geom_histogram()
 + gg.ylab('Number of sphere slices in bin\n [sqrt scaled axis]')
 + gg.xlab('Fraction of cells per sphere slice\n'
           'belonging to a cluster with\n a consistent cell line label')
  + gg.coord_cartesian(xlim=(0.75,1.0))
  + gg.scale_y_sqrt()
  #+ gg.guides(fill=gg.guide_legend(title='Cell Line'))
  + gg.theme(figure_size=(2,1),text=gg.element_text(size=6))
)
gg.ggsave(p, C.fol_paper /'db_qc_cell_assignment.pdf')
# Display the plot as the cell output.
p

In [None]:
# Small-format bar plot of the cell counts per cluster and cell line, saved
# into the figure folder.
p = (datf.obs.groupby(['leiden', 'cellline']).size().rename('n').reset_index() >> 
 gg.ggplot(gg.aes(x='leiden', y='n'))
 #gg.facet_wrap('leiden',scales='free')+
 + gg.geom_bar( gg.aes(fill='cellline'),stat='identity', position='dodge')
 #gg.coord_cartesian(ylim=(0,20))+
 + gg.ylab('Number of cells')
 + gg.xlab('Cluster number')
 + gg.guides(fill=gg.guide_legend(title='Cell Line'))
 + gg.theme(figure_size=(2,1),text=gg.element_text(size=6))
)
gg.ggsave(p, C.fol_paper /'db_qc_cluster_assignment.pdf')
# Display the plot as the cell output.
p

In [None]:
# UMAP colored by leiden cluster, saved into `sc.settings.figdir`.
sc.pl.umap(datf, color=['leiden'], s=1, save='_celline_leiden.pdf')

In [None]:
# Summary statistics of the per-image majority fractions.
tdat2.describe()

In [None]:
# Number of distinct images in the table.
tdat['image_id'].nunique()

-> In any image, at most 12% of the cells do not belong to a cluster dominated by their own cell line

## There seems to be a split of the UMAP/clusters due to pHer2
This is because mean pHer2 levels can often be exactly 0.

Check if whole spheres are pHer2 positive or if spheres can have both pos and neg slices

Check clusters with median > 0 pHer2

In [None]:
# Median transformed pHer2 value per leiden cluster.
# NOTE(review): the boolean column mask yields a 2-D slice of `datf.X`;
# this relies on exactly one variable matching `C.chan_pher2` — confirm.
d=(datf.obs[[V.COL_LEIDEN]]
    .assign(**{V.COL_VALUE: datf.X[:, datf.var[V.COL_CHANNELNAME] == C.chan_pher2]})
    .groupby(V.COL_LEIDEN).median()
    
)
# Histogram of the cluster medians.
d.hist()

In [None]:
# Clusters whose median transformed pHer2 value exceeds -0.5 are treated as
# pHer2 positive. NOTE(review): the threshold is hard-coded, presumably read
# off the histogram of the cluster medians — confirm.
clust_her2pos = d.query(f'{V.COL_VALUE} > -0.5').index.values

In [None]:
# Per image: fraction of cells that belong to a pHer2-positive cluster.
# The grouping keys beyond `image_id` are carried along as metadata.
t_her2 = (tdat
 # 1.0 if the cell's cluster is in `clust_her2pos`, else 0.0
 .assign(**{'is_her2': lambda x: x['leiden'].isin(clust_her2pos).astype('float')})
 # no `on=` given: pandas merges on all columns the two tables share
 .merge(datf.obs)
 # date parsed from the part of `slideac_name` before the first underscore
 .assign(**{'date': lambda d: d['slideac_name'].map(lambda x: pd.to_datetime(x.split('_')[0]))})
 .groupby(['image_id']+['cellline',
                        'condition_id',
                        'time_point',
                        'plate_id',
                        'hastelox',
                        'well_name','bc_x', 'bc_y', 
                        'slideac_name','date'
                        ],observed=True)[['is_her2']].mean()

)

In [None]:
# Distribution of the per-image pHer2-positive fraction.
t_her2['is_her2'].hist(bins=30)
plt.suptitle('Fraction of cells of an image belonging to a pHer2 positive cluster')

-> There seem to be some images with exclusively positive cells

In [None]:
# Histogram of the per-image pHer2-positive fraction, one panel per
# plate (rows) and cell line (columns).
(
    gg.ggplot(t_her2.reset_index(), gg.aes(x='is_her2'))
    + gg.facet_grid('plate_id~cellline')
    + gg.geom_histogram(size=1)
)

It seems that all T47D images are positive; for the other cell lines there is quite a spread in the
fraction of pHer2 positive cells per image.

How does this look per sphere?

In [None]:
# Per sphere (all keys except the image): summary statistics of the flag
# "more than 50% of the image's cells are pHer2 positive". The `count` column
# is the number of images per sphere, `mean` the fraction of flagged images.
# The builtin `float` replaces `np.float`, an alias of the same type that was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
t_her2_cond =(t_her2
    .assign(**{'all_her2': lambda x: (x['is_her2'] > 0.50).astype(float)})
    .groupby(['cellline',
                        'condition_id',
                        'time_point',
                        'plate_id',
                        'hastelox',
                        'well_name','bc_x', 'bc_y'
                        ], observed=True)['all_her2'].describe().reset_index()
)

In [None]:
# Distribution, over spheres, of the fraction of flagged images.
t_her2_cond['mean'].hist(bins=30)
plt.title('Fraction of images per sphere with more than 50% of Her2 positive cells')

In [None]:
# Fraction of flagged images against the number of images per sphere,
# one panel per plate (rows) and cell line (columns).
(
    gg.ggplot(t_her2_cond, gg.aes(x='count', y='mean'))
    + gg.facet_grid('plate_id~cellline')
    + gg.geom_point(size=1)
    + gg.xlab('Nr of images per sphere')
    + gg.ylab('Fraction of images with > 50% pHer2 positive cells')
)

This seems to be quite random - is there a pattern on the plates?

In [None]:
# Plate view: one point per sphere at its (bc_x, bc_y) position, filled by
# the fraction of flagged images, one panel per plate.
# NOTE(review): `bc_x`/`bc_y` are presumably the barcode/well coordinates on
# the plate — confirm.
(t_her2_cond >>
    gg.ggplot(gg.aes(x='bc_x', y='bc_y',fill='mean'))+
     gg.facet_wrap('plate_id',ncol=2)+
     gg.geom_point(size=10)+
     gg.ggtitle('Fraction of images with > 50% pHer2 positive cells over plate')+
     gg.theme(figure_size=(10,2))
)

It seems that there are more pHer2 negative images in plate 176 (96h)

Is there a trend over the acquisition time?

In [None]:
# pHer2-positive fraction per image against acquisition order.
# The x position has to be each image's own rank by date. `np.argsort(date)`
# returns the sorting permutation (which row comes first, second, ...), so it
# pairs every image with an x position unrelated to its own date.
(t_her2.reset_index() >>
  gg.ggplot(gg.aes(x='date.rank(method="first")', y='is_her2'))+
 gg.facet_wrap('cellline')+
 gg.geom_point()+
 gg.geom_smooth()+
 gg.xlab('Acquisition order (rank of date)')
)

Doesn't seem to be the case, at least not strongly

Instead of looking via clusters, how does this look if we directly look for cells
with 0 mean counts:  
Her2 <= log10(0 + 0.1) = -1, i.e. the value a raw mean of 0 takes after the `log10(x + 0.1)` transform

In [None]:
# Inspect which metadata columns are available per variable.
datf.var.columns

In [None]:
# Per image: fraction of cells whose own transformed pHer2 value is above -1.
# -1 is log10(0 + 0.1), i.e. what `cur_logtransf` returns for a raw value of 0.
# NOTE: this overwrites the cluster-based `t_her2` computed earlier.
t_her2 = (datf.obs
 # transformed pHer2 value per cell, aligned on the obs index
 .join(pd.DataFrame({'ct_her2': datf[:, datf.var.channel_name == C.chan_pher2].X.squeeze()}, index=datf.obs.index))
 .assign(**{'is_her2': lambda x: x['ct_her2'] >-1})
 # date parsed from the part of `slideac_name` before the first underscore
 .assign(**{'date': lambda d: d['slideac_name'].map(lambda x: pd.to_datetime(x.split('_')[0]))})
 .groupby(['image_id']+['cellline',
                        'condition_id',
                        'time_point',
                        'plate_id',
                        'hastelox',
                        'well_name','bc_x', 'bc_y', 
                        'slideac_name','date'
                        ],observed=True)[['is_her2']].mean()
)

In [None]:
# pHer2-positive fraction per image against acquisition order, one panel per
# cell line (rows) and plate (columns).
# The x position has to be each image's own rank by date. `np.argsort(date)`
# returns the sorting permutation, so it pairs every image with an x position
# unrelated to its own date.
(t_her2.reset_index() >>
  gg.ggplot(gg.aes(x='date.rank(method="first")', y='is_her2'))+
 gg.facet_grid('cellline~plate_id')+
 gg.geom_point()+
 gg.geom_smooth()+
 gg.xlab('Acquisition order (rank of date)'))

In [None]:
# One box per condition, ordered by condition id. A dense rank gives every
# image of a condition the same x position. `np.argsort(condition_id)`
# returns the sorting permutation, which scatters the images of one condition
# over unrelated x positions.
(t_her2.reset_index() >>
  gg.ggplot(gg.aes(x='condition_id.rank(method="dense")', y='is_her2', group='condition_id'))+
 gg.facet_grid('cellline~plate_id')+
 gg.geom_boxplot()+
 gg.xlab('Condition (rank of condition_id)'))

-> Looks similar

Conclusion: pHer2 sometimes being zero does not seem to be something that drifts systematically over acquisition time. It also does not seem to affect only certain images or spheres.

Thus I conclude that pHer2 being sometimes low may be something biological and I leave pHer2 in the data.

## Continue plotting:

In [None]:
# UMAP colored by leiden cluster (display only).
sc.pl.umap(datf, color=['leiden'], s=1,)

In [None]:
# One scatter plot of the UMAP coordinates per cell line.
for cur_line in datf.obs.cellline.unique():
    in_line = datf.obs.cellline==cur_line
    coords = datf.obsm['X_umap'][in_line,:]
    # rasterized points keep vector output small
    plt.scatter(coords[:,0], coords[:,1], s=1, rasterized=True)
    plt.title(cur_line)
    plt.show()
    plt.close()

In [None]:
# UMAP colored by experimental metadata columns of `datf.obs`.
sc.pl.umap(datf, color=['cellline',
                        'concentration',
                        'time_point',
                        'plate_id',
                        'hastelox',
                        'well_name','bc_x', 'bc_y'
                        ], s=1)

In [None]:
# UMAP colored by every marker (addressed via `goodname`) plus selected
# metadata columns.
sc.pl.umap(datf, color=list(datf.var.goodname)+['cellline',
                        'concentration',
                        'time_point',
                        'plate_id',
                        'distrim'
                        ], gene_symbols='goodname', s=1)

In [None]:
# NOTE(review): this call is identical to the one in the preceding cell;
# presumably a leftover duplicate — confirm before removing.
sc.pl.umap(datf, color=list(datf.var.goodname)+['cellline',
                        'concentration',
                        'time_point',
                        'plate_id',
                        'distrim'
                        ], gene_symbols='goodname', s=1)

Load Mitosis/Apoptosis Filter

In [None]:
# Query the object filter values together with their filter names,
# restricted to the mitosis and apoptosis filters.
q = (bro.session.query(db.object_filters,
                   db.object_filter_names.object_filter_name)
    .join(db.object_filter_names)
     .filter(db.object_filter_names.object_filter_name.in_([V.FIL_MITOSIS,
                                        V.FIL_APOPTOSIS]))
)
# Reshape to one row per object with one column per filter name.
dat_fil = (bro.doquery(q)
           .pivot_table(values=V.COL_FILTERVAL, index=V.COL_OBJID,
                        columns=db.object_filter_names.object_filter_name.key)
           .reset_index()
          )

In [None]:
# Add the mitosis/apoptosis filter columns to `datf.obs`.
# NOTE(review): earlier calls of `add_anndata_obsmeta` ignore the return
# value, whereas here it is assigned back to `datf`; this only works if the
# helper returns the AnnData object — confirm, otherwise `datf` becomes None.
datf = bro.helpers.anndata.add_anndata_obsmeta(datf, dat_fil)

In [None]:
# UMAP colored by all markers, the mitosis/apoptosis filters and selected
# metadata; saved into `sc.settings.figdir`.
sc.pl.umap(datf, color=list(datf.var.goodname)+['cellline', V.FIL_MITOSIS, V.FIL_APOPTOSIS,
                        'concentration',
                        'time_point',
                        'plate_id',
                        'distrim'
                        ],
           gene_symbols='goodname', s=1, save='_markers.pdf')

In [None]:
# Switch scanpy to small panels (1x1 inch, 6 pt font, 600 dpi when saving);
# the settings stay in effect for the following plots.
sc.set_figure_params(dpi_save=600,vector_friendly=True, figsize=(1, 1), fontsize=6)
# Same coloring as in the previous cell, display only.
sc.pl.umap(datf, color=list(datf.var.goodname)+['cellline', V.FIL_MITOSIS, V.FIL_APOPTOSIS,
                            'concentration',
                            'time_point',
                            'plate_id',
                            'distrim'
                            ],
               gene_symbols='goodname', s=1)

In [None]:
# Table used below to order the markers; it is read for the columns
# 'metal', 'class' and 'ord_within_class'.
dat_panel_ord = pd.read_csv(C.fn_panel_ord)

In [None]:
# Column names of the panel order table.
COL_CLASS = 'class'
COL_ORD_WITHIN = 'ord_within_class'
COL_METAL = 'metal'
# Display order of the marker classes; making the column categorical with
# these categories lets `sort_values` follow this order. Values not listed
# here become NaN.
class_ord = ['tag','egf','mtor', 'cellcycle', 'stress', 'apoptosis','total']
dat_panel_ord[COL_CLASS] = pd.Categorical(dat_panel_ord[COL_CLASS], categories=class_ord)

In [None]:
# Marker names (`goodname`) of the selected variables, ordered by class and
# then by the within-class order; duplicates removed, order preserved.
ord_pltname = datf.var.merge(dat_panel_ord[[COL_METAL, COL_CLASS, COL_ORD_WITHIN]], left_on=V.COL_CHANNELNAME,
                             right_on=COL_METAL).sort_values([ COL_CLASS, COL_ORD_WITHIN])[V.COL_GOODNAME].unique()

In [None]:
# Even smaller panels (0.8x0.8 inch) for the SVG export.
sc.set_figure_params(dpi_save=600,vector_friendly=True, figsize=(0.8, 0.8), fontsize=6)
# UMAP colored by cell line, `distrim` and every marker; saved as SVG.
sc.pl.umap(datf, color=['cellline', 'distrim']+ list(datf.var.goodname),
               gene_symbols='goodname', s=0.1, save='_markers.svg')

In [None]:
# Same panels, but keep the figure object (`return_fig=True`) so the axes
# can be adjusted in the next cells.
fig = sc.pl.umap(datf, color=['cellline', 'distrim']+ list(datf.var.goodname),
               gene_symbols='goodname', s=0.1,return_fig=True)

In [None]:
# Remove the axis labels of every panel and re-position the panel titles.
for panel in fig.axes:
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.xaxis.labelpad = 0.0
    panel.title.set_position([.5, 1.0])

In [None]:
# Re-pack the panels after the labels were removed.
fig.tight_layout()

In [None]:
# Display the adjusted figure as the cell output.
fig

In [None]:
# Turn the numeric design variables into categoricals whose categories are
# the sorted unique values, so they are plotted as discrete groups.
for design_col in ('time_point', 'concentration'):
    sorted_levels = sorted(datf.obs[design_col].unique())
    datf.obs[design_col] = datf.obs[design_col].astype(pd.CategoricalDtype(categories=sorted_levels))

In [None]:
# Variables to show, one UMAP panel each: cell line, the ordered markers,
# then metadata and the mitosis/apoptosis filters.
marks = ['cellline']+ list(ord_pltname) + ['distrim', 'concentration', 'time_point', V.FIL_MITOSIS, V.FIL_APOPTOSIS]
# Interactive mode off while the grid is assembled; switched back on below.
plt.ioff()
ncol = 5
# Enough rows to hold all panels at `ncol` panels per row.
fig, axs = plt.subplots(nrows=int(np.ceil(len(marks)/ncol)), ncols=ncol, figsize=(5,6))

# `zip` stops at the shorter sequence, so surplus axes of the grid stay empty.
for ax, m in zip(axs.flatten(), marks):
    sc.pl.umap(datf, color=m,
               gene_symbols='goodname', s=0.1, ax=ax,title='')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.xaxis.labelpad=0.0 
    ttl = ax.title
    ttl.set_position([.5, 0.5])
    ax.set_aspect('equal')
    # Use the variable name as panel title.
    ax.set_title(m, pad=0)
plt.ion()

In [None]:
import anndata as ad


In [None]:
# Random permutation of the cell (row) positions; used below to build a
# row-shuffled copy of `datf` for plotting.
# A seeded generator replaces the unseeded `np.random.shuffle`, so the
# permutation, and with it the saved figure, is identical on every run.
x = np.random.default_rng(0).permutation(datf.shape[0])

In [None]:
# UMAP coordinates re-ordered with the same permutation as the cells.
obsm = {'X_umap': datf.obsm['X_umap'][x,:]}
obsm

In [None]:
# Row-shuffled copy of `datf` (data, obs and UMAP coordinates permuted alike;
# var unchanged), so the plotting order of the cells is random.
datf2 = ad.AnnData(datf.X[x,:], obs=datf.obs.iloc[x,:], var=datf.var, obsm=obsm)

In [None]:

sc.set_figure_params(dpi_save=600,vector_friendly=True, figsize=(1.5, 1.5), fontsize=6)
# Variables to show, one UMAP panel each, starting with `distrim`.
marks = ['distrim'] + list(ord_pltname) + ['cellline', 'concentration', 'time_point', V.FIL_MITOSIS, V.FIL_APOPTOSIS]
# Interactive mode off while the grid is assembled; switched back on below.
plt.ioff()
ncol = 5
# Enough rows to hold all panels at `ncol` panels per row.
fig, axs = plt.subplots(nrows=int(np.ceil(len(marks)/ncol)), ncols=ncol, figsize=(6,7))

# Same panel layout as the previous grid, drawn from the row-shuffled
# `datf2`. `zip` stops at the shorter sequence, so surplus axes stay empty.
for ax, m in zip(axs.flatten(), marks):
    sc.pl.umap(datf2, color=m,
               gene_symbols='goodname', s=0.1, ax=ax,title='')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.xaxis.labelpad=0.0 
    ttl = ax.title
    ttl.set_position([.5, 0.5])
    ax.set_aspect('equal')
    # Use the variable name as panel title.
    ax.set_title(m, pad=0)
plt.ion()

In [None]:
#fig.tight_layout()

In [None]:
# Write the marker grid (the most recently created `fig`) into the figure folder.
fig.savefig(C.fol_paper / 'umap_markers.svg', dpi=600)