In [None]:
sm = snakemake

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
import spherpro.bro as spb
import spherpro.datastore as spd
import spherpro.library as spl
import spherpro.configuration as conf
import spherpro.db as db
import imp
import pycytools as pct
import pycytools.library as pclib
import re
import os
import pandas as pd
import numpy as np
import spherpro.library as lib
import matplotlib.pyplot as plt
import plotnine as gg
import seaborn as sns
import pathlib

# Aim: identify overexpressing cells and their neighbours

The aim of this notebook is to identify overexpressing cells as well as their neighbours.

For constructs with GFP we have 3 ways to do this:
- two IMC GFP antibodies
- primary GFP fluorescence
- A pixel classifier trained for 'positive' pixels (visual) based on both IMC antibodies + primary fluorescence

For constructs with FLAG tag we only have the FLAG tag measured by IMC



Setup the paths/configuration

In [None]:
class C:
    """Static configuration of this notebook: paths, channels and thresholds.

    The class is only used as a namespace (``C.<name>``) in this notebook.
    """
    # input files (taken from the snakemake object `sm`)
    fn_config = sm.input.fn_config
    fn_constructs = pathlib.Path(sm.input.fn_constructs)

    # output files
    fol_out = pathlib.Path(sm.params.fol_figures)
    fol_plots = fol_out

    # module specific
    metal_flag = ['Yb176']
    fdr_flag = 0.01
    metal_gfp = ['Er167', 'Tm169']
    fdr_gfp = 0.05
    measurement_name = 'MeanIntensityComp'
    cell_type = 'cell'
    # names under which the object filters are written to the database
    FIL_GFP = 'is-gfppos'
    FIL_FLAG = 'is-flagpos'

    @staticmethod
    def transform(x):
        """Return ``log10(x + 0.1)``; the 0.1 offset keeps zeros finite.

        Declared as a static method so that it behaves the same whether it is
        called as ``C.transform(x)`` or through an instance (a bare lambda
        stored on the class would receive the instance as ``x``).
        """
        return np.log10(x + 0.1)

Get the bro and setup some helpers


In [None]:
bro = spb.get_bro(C.fn_config)

In [None]:
import spherpro.bromodules.helpers_vz as helpers_vz
hpr = helpers_vz.HelperVZ(bro)

In [None]:
from src.variables import Vars as V

In [None]:
V.COL_METAL = 'metal'
V.COL_TAGFLAG = 'TagFLAG'
V.COL_TAGGFP = 'TagGFP'
V.COL_ISFLAG = 'isFLAG'
V.COL_ISGFP = 'isGFP'
V.COL_CO = 'co'
V.COL_ISSIG = 'is-sig'

Query metadata

In [None]:
dat_pannelcsv = hpr.get_pannelcsv()

dat_measmeta = hpr.get_measuremeta(dat_pannelcsv, measurement_names=[C.measurement_name])

dat_imgmeta = hpr.get_imgmeta()
#dat_imgmeta[V.COL_SITELEVEL] = dat_imgmeta[V.COL_SITELEVEL].astype('str')

fil_good_meas = hpr.get_fil_good_meas(dat_measmeta)

In [None]:
dat_constructs = pd.read_csv(C.fn_constructs)
dat_constructs.head()

In [None]:
bro.doquery(bro.session.query(db.conditions)).head()

Generate a metadata table for the overexpression conditions

In [None]:
dat_condmeta = (bro.doquery(bro.session.query(db.conditions))
                .merge(dat_constructs)
               )

We have either C-terminally FLAG-tagged or FLAG-GFP-tagged constructs.

Setup a boolean indicator if we expect a GFP or FLAG signal

In [None]:
# A FLAG signal is expected if either tag column is set, a GFP signal only if
# the GFP tag column is set. Column-wise operations replace the row-wise
# `apply`, which computed exactly the same values one row at a time.
dat_condmeta[V.COL_ISFLAG] = dat_condmeta[V.COL_TAGGFP] | dat_condmeta[V.COL_TAGFLAG]
dat_condmeta[V.COL_ISGFP] = dat_condmeta[V.COL_TAGGFP]

Prepare queries that identify specific measurements & objects in the database

In [None]:
fil_m = bro.filters.measurements.get_measmeta_filter_statements(
        channel_names = [('GFP', 'DAPI'),  'prop-pos', tuple(C.metal_gfp+C.metal_flag)],
        stack_names = ['IfStack', 'ProbPos', 'FullStackFiltered'],
        measurement_names= ['MeanIntensity', ('UpperQuartileIntensity', 'MeanIntensity', 'NbAllMaxMeanIntensity'),
                            ('MeanIntensityComp', 'NbAllMaxMeanIntensityComp')],
        measurement_types=['Intensity', 'Intensity', 'Intensity'])

q_meas = (bro.data.get_measmeta_query()
            #.filter(db.stacks.stack_name.in_(['ifstack','gfpclass']))
            .filter(fil_m)
            .add_columns(db.ref_planes.channel_name,
                        db.ref_stacks.scale)
           )

q_obj = (bro.data.get_objectmeta_query()
        .filter(db.objects.object_type == 'cell')
        .add_columns(db.objects.object_number))

Query the data measurements/objects from the database as an anndata object

In [None]:
dat = bro.io.objmeasurements.get_measurements(q_obj=q_obj, q_meas=q_meas)
dat = bro.io.objmeasurements.scale_anndata(dat)

Convert the data into a tidy format to quickly plot via plotnine

In [None]:
fil = dat.var.measurement_name.isin(['MeanIntensity', 'MeanIntensityComp'])
pdat = pd.DataFrame(dat[:,fil].X, index=dat.obs.image_id, columns=dat[:,fil].var.channel_name).reset_index().merge(dat_imgmeta).merge(dat_condmeta)

In [None]:
(gg.ggplot(pdat, gg.aes(x='np.log10(GFP*2**16+300)', y='np.log10(Tm169+Er167+0.1)'))+
     gg.facet_grid(f'.~{V.COL_ISGFP}+{V.COL_ISFLAG}')+
     # gg.facet_grid('doxocyline~.')+
     #gg.geom_point(alpha=1, size=0.01)+
     gg.geom_bin2d(bins=200)+
     gg.geom_smooth()+
     gg.xlab('GFP primary fluorescence')+
     gg.ylab('GFP IMC antibodies')+
     gg.coord_equal()
)

-> Seems like the sum of the GFP antibodies correlates with the GFP IF primary fluorescence.

It looks like the IMC antibodies have a higher sensitivity

In [None]:
(gg.ggplot(pdat, gg.aes(x='np.log10(Er167+0.1)', y='np.log10(Tm169+0.1)'))+
     gg.facet_grid(f'.~{V.COL_ISGFP}+{V.COL_ISFLAG}')+
    # gg.facet_grid('doxocyline~.')+
         #gg.geom_point(alpha=1, size=0.01)+
         gg.geom_bin2d(bins=200)+
         gg.geom_smooth()+
         gg.coord_equal()
)



-> The two antibodies are also correlating strongly

In [None]:
# 2D histogram of the classifier probability ('prop-pos') against the Tm169
# signal. The column is renamed because 'prop-pos' is not a valid identifier
# inside an aes expression.
# geom_bin2d encodes the bin count in the *fill* aesthetic, so the log10
# colour map has to be set on the fill scale to have any effect.
(gg.ggplot(pdat.rename(columns={'prop-pos':'pos'}), gg.aes(x='pos', y='np.log10(Tm169+0.1)'))+
    gg.facet_grid(f'.~{V.COL_ISGFP}+{V.COL_ISFLAG}')+
    gg.geom_bin2d(bins=200)+
    gg.scale_fill_cmap(cmap_name='inferno', trans='log10')
)

-> The trained classifier seems to be quite specific.

Some potential false positives can be seen in the Control cells.

Interestingly the cells which had only a FLAG tag had a higher number of false positives

In [None]:
(gg.ggplot(pdat.rename(columns={'prop-pos':'pos'}),
           gg.aes(x='pos'))+
    gg.facet_grid(f'.~{V.COL_ISGFP}')+
    # gg.facet_grid('doxocyline~.')+
         #gg.geom_point(alpha=1, size=0.01)+
         gg.geom_histogram(gg.aes(y='..density..'), bins=200)+
     gg.scale_y_sqrt()+
 gg.xlab('Average positive pixel probability [a.u.]')+
 gg.ylab('Density [sqrt scale]') +
 gg.theme(figure_size=(5,3))
)

In [None]:
# Histogram of the classifier probability on a *linear* y axis that is
# clipped to (0, 10) to zoom into the low-density range. The y label is
# corrected accordingly: no sqrt scale is applied in this plot.
(gg.ggplot(pdat.rename(columns={'prop-pos':'pos'}),
           gg.aes(x='pos'))
 + gg.facet_grid(f'.~{V.COL_ISGFP}')
 + gg.geom_histogram(gg.aes(y='..density..'), bins=200)
 + gg.coord_cartesian(ylim=(0,10))
 + gg.xlab('Average positive pixel probability [a.u.]')
 + gg.ylab('Density [linear scale, clipped]')
 + gg.theme(figure_size=(5,3))
)

Plots the sum of both IMC antibodies on X, the GFP IF on Y and the probability for 'overexpression' as color.


In [None]:
d=pdat.query(f'{V.COL_TAGGFP}==True')
plt.figure()
plt.hexbin(C.transform(d['Er167']+d['Tm169']), C.transform(d['GFP']*2**16+300), C= d['prop-pos'], gridsize=80)
plt.colorbar()

Now plot some example images to see how this looks in practice:

In [None]:
from src.plots.plot_ad_image import AnndataImagePlotter

In [None]:
ap = AnndataImagePlotter(bro)

In [None]:
a = ap.plot_anndata_subplots(1, dat, figsize=2)

In [None]:
plt.close('all')

In [None]:
a = ap.plot_anndata_subplots(12, dat, figsize=2)

Query some imageids for plotting

In [None]:
(bro.session.query(db.images.image_id, db.conditions.condition_name)
            .join(db.conditions)
            .filter(db.conditions.condition_name.startswith('GFP_GFP'))
            .join(db.valid_images)).all()

In [None]:
imid = 504
a = ap.plot_anndata_subplots(imid, dat, figsize=1.5)

Adjust some plotting parameters

In [None]:
ax = a[1][0]
im = ax.images[0]
im.set_clim(0,5)
ax = a[1][4]
im = ax.images[0]
im.set_clim(0,5)

In [None]:
ax = a[1][3]
im = ax.images[0]
im.set_clim(0,3000)

In [None]:
img = a[1][1].images[0]

In [None]:
img.set_clim(0,1)

In [None]:
ax.get_figure()

Now decide on a cutoff

In [None]:
def score(x, y, ta=0.1, tb=0.3):
    """Classify each element of ``x`` as negative (0), doubtful (1) or positive (2).

    :param x: array of values to classify
    :param y: array of reference values, compared element-wise against ``x``
    :param ta: absolute threshold; elements with ``x <= ta`` are negative (0)
    :param tb: relative threshold; elements above ``ta`` are positive (2) only
        if additionally ``x > y * tb``, otherwise they are doubtful (1)
    :return: float array of length ``len(x)`` holding 0, 1 or 2
    """
    above_abs = x > ta
    above_rel = x > (y * tb)
    # everything that is not above the absolute threshold keeps the initial 0
    labels = np.zeros(len(x))
    labels[above_abs & ~above_rel] = 1
    labels[above_abs & above_rel] = 2
    return labels
    
    
    
    

In [None]:


tdat = dat[dat.obs.query(f'{V.COL_IMGID}=={imid}').index,:][:, dat.var.query(f'({V.COL_CHANNELNAME}=="prop-pos") & {V.COL_MEASNAME}=="MeanIntensity"').index]



    

In [None]:
(bro.session.query(db.images.image_id, db.conditions.condition_name)
            .join(db.conditions)
            .filter(db.conditions.condition_name.startswith('TNF'))
            .join(db.valid_images)).all()

In [None]:
imid = 1111
tdat = dat[dat.obs.query(f'{V.COL_IMGID}=={imid}').index,:][:,dat.var.channel_name.isin(['Er167', 'GFP', 'Tm169', 'prop-pos'])]
a = ap.plot_anndata_subplots(imid, tdat, figsize=3)


mask = bro.io.masks.get_mask(imid, 'cell')

In [None]:
imid = 1111
tdat = dat[dat.obs.query(f'{V.COL_IMGID}=={imid}').index,:][:,dat.var.channel_name.isin(['Er167', 'GFP', 'Tm169', 'prop-pos']) & (
dat.var.measurement_name.isin(['MeanIntensity', 'MeanIntensityComp', 'NbAllMaxMeanIntensity']))]
a = ap.plot_anndata_subplots(imid, tdat, figsize=3)


mask = bro.io.masks.get_mask(imid, 'cell')
ax = a[1][0]
im = ax.images[0]
im.set_clim(0,4)
ax = a[1][1]
im = ax.images[0]
im.set_clim(0,2000)
ax = a[1][2]
im = ax.images[0]
im.set_clim(0,4)

Try cutoffs

In [None]:
score_ta = 0.01
score_tb = 0.3

In [None]:
val = score(tdat[:, tdat.var.query(f'({V.COL_CHANNELNAME}=="prop-pos") & {V.COL_MEASNAME}=="MeanIntensity"').index].X.squeeze(),
     tdat[:, tdat.var.query(f'({V.COL_CHANNELNAME}=="prop-pos") & {V.COL_MEASNAME}=="NbAllMaxMeanIntensity"').index].X.squeeze(),
    ta = score_ta, tb=score_tb)

for ax in a.flatten():
    ap._add_contour(mask, val, objnr=tdat.obs[db.objects.object_number.key], ax=ax, cmap=['white','red', 'green'],linestyles=':', linewidths=1)

In [None]:
ax.get_figure()

-> Green = Overexpression

   Red = unsure due to spatial spillover

In [None]:
val_gfp = score(dat[:, dat.var.query(f'({V.COL_CHANNELNAME}=="prop-pos") & {V.COL_MEASNAME}=="MeanIntensity"').index].X.squeeze(),
     dat[:, dat.var.query(f'({V.COL_CHANNELNAME}=="prop-pos") & {V.COL_MEASNAME}=="NbAllMaxMeanIntensity"').index].X.squeeze(),
    ta =score_ta , tb=score_tb)

obj_gfp = dat.obs.object_id
dat_gfpfil = pd.DataFrame({db.object_filters.object_id.key: obj_gfp, 
                           db.object_filters.filter_value.key: val_gfp})

In [None]:
bro.filters.objectfilterlib.write_filter_to_db(dat_gfpfil, C.FIL_GFP, drop=True)

In [None]:
# Number of cells per score class over the whole dataset, i.e. for the values
# that were just written as the GFP filter. The previous `val` only held the
# scores of the single example image scored further up.
unique, counts = np.unique(val_gfp, return_counts=True)
unique, counts

In [None]:
import seaborn as sns

In [None]:
t=score_ta
x = (pdat.groupby(by=[V.COL_ISGFP, V.COL_ISFLAG])['prop-pos'].apply(lambda x: {'npos': np.sum(x > t),
                                                                  'fracpos': np.mean(x>t),
                                                                 'n': len(x)})
     .unstack()
     .assign(**{'exp_pos': lambda x:  x['n']*x.loc[(False, False), 'fracpos']})
     .assign(**{'fdr': lambda x: x['exp_pos']/x['npos']})
    )

x

In [None]:
dat

Find also score for FLAG

In [None]:

def find_fdr_cutoff(values, neg_values, fdr=0.05):
    """
    Find the empirical FDR cutoff of ``values`` given a true-negative sample.

    Every observed value (from both samples) is considered as a candidate
    cutoff, from the highest to the lowest. For a candidate ``co``:

    - ``fpr`` is the fraction of ``neg_values`` that are ``>= co``
    - ``e_fdr`` is ``fpr * len(values)`` divided by the number of ``values``
      that are ``>= co``

    :param values: 1D array-like with the measurements to threshold
    :param neg_values: 1D array-like with measurements of true negatives
    :param fdr: false discovery rate that must not be exceeded
    :return: the highest candidate cutoff with ``fpr > 0`` at which ``e_fdr``
        exceeds ``fdr``, or ``None`` if no candidate exceeds it
    :raises ValueError: if ``neg_values`` is empty (no ``fpr`` can be computed)
    """
    values = np.asarray(values).ravel()
    neg_values = np.asarray(neg_values).ravel()
    n_neg = neg_values.size
    n_vals = values.size
    if n_neg == 0:
        raise ValueError('neg_values is empty: cannot estimate the false positive rate')

    # candidate cutoffs in descending order
    cutoffs = np.sort(np.concatenate([values, neg_values]))[::-1]
    # number of elements >= cutoff, obtained by binary search in the sorted
    # samples instead of re-scanning both arrays for every candidate
    n_neg_pass = n_neg - np.searchsorted(np.sort(neg_values), cutoffs, side='left')
    n_vals_pass = n_vals - np.searchsorted(np.sort(values), cutoffs, side='left')

    fpr = n_neg_pass / n_neg
    # n_vals_pass can be 0 for cutoffs that only occur in neg_values; the
    # resulting inf/nan is intended and handled by the comparison below
    with np.errstate(divide='ignore', invalid='ignore'):
        e_fdr = (fpr * n_vals) / n_vals_pass

    exceeded = np.flatnonzero((fpr > 0) & (e_fdr > fdr))
    if exceeded.size == 0:
        return None
    return cutoffs[exceeded[0]].item()

In [None]:
tdat = dat[:,dat.var.query(f'({V.COL_CHANNELNAME} in {C.metal_flag}) & ({V.COL_MEASNAME}=="MeanIntensityComp")').index]
tdat_max = dat[:,dat.var.query(f'({V.COL_CHANNELNAME} in {C.metal_flag}) & ({V.COL_MEASNAME}=="NbAllMaxMeanIntensityComp")').index]

In [None]:
is_empty = dat_imgmeta.merge(dat_condmeta.query(f'({V.COL_TAGFLAG} | {V.COL_TAGGFP}) == False'))[V.COL_IMGID]

In [None]:
is_flag = dat_imgmeta.merge(dat_condmeta.query(f'({V.COL_TAGFLAG})'))[V.COL_IMGID]

In [None]:
# Flattened intensities of `tdat` for the cells of images without any tag
# (`is_empty`) and of images with a FLAG tag (`is_flag`); the cells of the
# untagged images serve as the true-negative sample for the cutoff.
# NOTE(review): the FDR is hard-coded to 0.05 here although `C.fdr_flag = 0.01`
# is defined in the configuration — confirm which value is intended.
x_empty =  tdat[tdat.obs[V.COL_IMGID].isin(is_empty),:].X.squeeze()
x_flag = tdat[tdat.obs[V.COL_IMGID].isin(is_flag),:].X.squeeze()
co = find_fdr_cutoff(x_flag, x_empty, fdr=0.05)
co

In [None]:
tdat.obs[V.COL_IMGID]

In [None]:
plt.figure()
plt.hist([C.transform(np.array(x_empty)),C.transform(np.array(x_flag))],bins=100,density=True)
plt.gca().axvline(C.transform(co))

In [None]:
C.transform(co)

In [None]:
np.sum(x_flag > co)

In [None]:
val_flag = score(tdat.X.squeeze(), tdat_max.X.squeeze(), ta=co, tb=0.1)
obj_flag = tdat.obs[V.COL_OBJID]

dat_flagfil = pd.DataFrame({db.object_filters.object_id.key: obj_flag, 
                           db.object_filters.filter_value.key: val_flag})

In [None]:
bro.filters.objectfilterlib.write_filter_to_db(dat_flagfil, C.FIL_FLAG, drop=True)

In [None]:
imid = int(dat[dat.obs.object_id.isin(obj_flag[val_flag==2]) & dat.obs.image_id.isin(is_flag),:].obs.image_id.unique()[10])
tdat = dat[dat.obs.query(f'{V.COL_IMGID}=={imid}').index,:][:,dat.var.channel_name.isin(C.metal_flag+['GFP', 'Tm169', 'prop-pos'])]
a = ap.plot_anndata_subplots(imid, tdat, figsize=1.7)


mask = bro.io.masks.get_mask(imid, 'cell')

In [None]:
a[1][0].images[0].set_clim(0,3)

In [None]:
a[0][0].images[0].set_clim(0,3)

In [None]:
dic_flagval = {o: v for o, v in zip(obj_flag, val_flag)} 

In [None]:
val = np.array(list(map(dic_flagval.get, tdat.obs.object_id.values)))
for ax in a.flatten():
    ap._add_contour(mask, val, objnr=tdat.obs[db.objects.object_number.key], ax=ax, cmap=['white','red', 'green'],linestyles=':', linewidths=1)

Propagate the filters to neighbouring cells

In [None]:
def get_fildat(filname, obj_type='cell'):
    """Query the stored values of one object filter from the database.

    :param filname: name of the object filter to load
    :param obj_type: object type the result is restricted to
    :return: result of ``bro.doquery`` for the object filter rows (plus the
        filter name), restricted to valid objects in valid images
    """
    # NOTE: the order of the joins/filters is kept exactly as it was
    query = (
        bro.session.query(db.object_filters, db.object_filter_names.object_filter_name)
        .join(db.object_filter_names)
        .filter(db.object_filter_names.object_filter_name == filname)
        .join(db.objects)
        .join(db.images)
        .filter(db.objects.object_type == obj_type)
        .join(db.valid_objects)
        .join(db.valid_images)
    )
    return bro.doquery(query)

In [None]:
nbagg = bro.processing.nb_aggregation
VAL_NB = 'Neighbors'

In [None]:
dat_nb = nbagg.get_nb_dat(VAL_NB,obj_type='cell')

In [None]:
def get_nb_filterval(fil_name, dat_nb, exclude_selfepos=False, pos_val=1):
    """
    Count, for every object of a filter, its positive related objects.

    The filter values are attached to the parent side of the relations in
    ``dat_nb``; for every child object the number of parents whose filter
    value equals ``pos_val`` is counted.

    :param fil_name: name of the object filter, loaded via ``get_fildat``
    :param dat_nb: table of object relations with parent/child object id columns
    :param exclude_selfepos: if True, objects that are positive themselves are
        removed as children before counting and therefore end up with 0
    :param pos_val: filter value that is considered positive
    :return: DataFrame with one row per object of the filter, holding the
        object id and the count in the filter value column (0 if the object
        has no remaining relation)
    """
    col_obj = db.objects.object_id.key
    col_parent = db.object_relations.object_id_parent.key
    col_child = db.object_relations.object_id_child.key
    col_val = db.object_filters.filter_value.key

    dat_fil = get_fildat(fil_name)[[col_obj, col_val]]

    if exclude_selfepos:
        pos_ids = dat_fil.query(f'{col_val} == {pos_val}')[col_obj]
        # drop the children that are positive themselves
        dat_nb = dat_nb.loc[~dat_nb[col_child].isin(pos_ids), :]

    # attach the filter value of the parent to each relation and re-key the
    # table by the child object id
    relations = dat_fil.merge(dat_nb, left_on=col_obj, right_on=col_parent)
    relations = relations.drop(col_parent, axis=1).drop(col_obj, axis=1)
    relations = relations.rename(columns={col_child: col_obj})
    relations = relations.assign(**{col_val: relations[col_val] == pos_val})
    n_pos = relations.groupby(col_obj)[col_val].sum().reset_index()

    # objects without any remaining relation are re-added with a count of 0
    nb_fil = dat_fil[[col_obj]].merge(n_pos, how='left')
    nb_fil[col_val] = nb_fil[col_val].fillna(0)
    return nb_fil

In [None]:
x= get_nb_filterval(C.FIL_FLAG, dat_nb, pos_val=2, exclude_selfepos=False)

In [None]:
x.filter_value.median()

In [None]:
y = get_fildat(C.FIL_FLAG)

In [None]:
np.unique(y.query('filter_value==2').merge(x, on='object_id')['filter_value_y'], return_counts=True)

In [None]:
np.unique(x.filter_value, return_counts=True)

In [None]:
fils = [C.FIL_FLAG, C.FIL_GFP]
fil_suffix = 'Nb'

for f in fils:
    dat_filnb = get_nb_filterval(f, dat_nb, pos_val=2)
    print(f)
    print(dat_filnb[db.object_filters.filter_value.key].sum())
    bro.filters.objectfilterlib.write_filter_to_db(dat_filnb, f+fil_suffix, drop=True)

In [None]:
fils = [C.FIL_FLAG, C.FIL_GFP]
fil_suffix = 'NbWeak'

for f in fils:
    dat_filnb = get_nb_filterval(f, dat_nb, pos_val=1)
    print(f)
    print(dat_filnb[db.object_filters.filter_value.key].sum())
    bro.filters.objectfilterlib.write_filter_to_db(dat_filnb, f+fil_suffix, drop=True)

Quick look at where the false positives from the FLAG-tagged constructs come from:

In [None]:
(pdat.query(f'{V.COL_ISGFP} == False')
 .assign(**{'pos': lambda x: x['prop-pos']>0.01})
 .groupby([ V.COL_CONDNAME, 'modification', 'tag', V.COL_CONDID])['pos'].sum()
 .reset_index()
  .sort_values('pos', ascending=False)
 >> gg.ggplot(gg.aes(x='pos', fill=V.COL_CONDNAME))
 + gg.geom_histogram()
 + gg.coord_cartesian(ylim=(0,25))
 + gg.ggtitle('GFP positives per sphere')
)

-> There seem to be multiple spheres positive for both myrFLAG-AKT and myr-FLAG-PI3K

In [None]:
tpdat = (pdat.query(f'{V.COL_ISGFP} == False')
 .assign(**{'pos': lambda x: x['prop-pos']>0.01})
 .groupby([ V.COL_CONDNAME, 'modification', 'tag', V.COL_CONDID, V.COL_IMGID])['pos'].sum()
 .reset_index()
  .sort_values('pos', ascending=False)
)

In [None]:
(tpdat
 >> gg.ggplot(gg.aes(x='pos', fill=V.COL_CONDNAME))
 + gg.geom_histogram()
 + gg.coord_cartesian(ylim=(0,25))
  + gg.ggtitle('GFP positives per image')
)

-> Looks like the positive cells also come from multiple images

I actually did a re-sequencing of the plasmids with C-ter FLAG tags as well as some Nter GFP constructs as control. I used an EGFP N rev and an Nter GFP fwd primer and found:
- GFP sequence with both primers for Nter GFP constructs -> as expected
- A PI3K sequence for the EGFP N rev primer but no sequence for the Nter GFP fwd primer -> This suggests - as one would expect - that no N-ter GFP is present, but there seems to be a C-ter GFP -> We don't have an expression vector with C-ter GFP in the lab -> likely the entry clone was contaminated :/
- For the other FLAG constructs I found no evidence for an unexpected GFP

-> It seems that the PI3K construct somehow has an unexpected C-ter GFP

In [None]:
tpdat.head()

In [None]:
imid=998
tdat = dat[dat.obs.query(f'{V.COL_IMGID}=={imid}').index,:][:,dat.var.channel_name.isin(C.metal_flag+['GFP', 'Tm169', 'prop-pos'])]
a = ap.plot_anndata_subplots(imid, tdat, figsize=1.7)

ax = a[1][0]
im = ax.images[0]
im.set_clim(0,1)
ax = a[1][1]
im = ax.images[0]
im.set_clim(0,1)
ax = a[1][2]
im = ax.images[0]
im.set_clim(0,1000)

-> I think these cells in an 'empty' sphere are real false positives, e.g. because of an air bubble with higher background in the tissue:
Both GFP antibodies seem to be slightly higher in this area, which can explain why the classifier classified the pixels as positive.
