In [None]:
# Short alias for the `snakemake` object. It is not defined anywhere in this
# notebook, so it is presumably injected by Snakemake at run time — confirm.
sm = snakemake

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import spherpro.bro as sb
import spherpro.db as db

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotnine as gg
import scipy as sp
import pathlib

%matplotlib inline

In [None]:
from src.variables import Vars
from src.config import Conf

## Aim

Check whether the results of the variability analysis are related to how strongly markers depend on overexpression.

## 0) Setup configuration and bro

In [None]:
class V(Vars):
    """Column-name constants specific to this notebook, on top of the shared `Vars`."""
    # Column whose values become the pivot columns when the R2 delta is built.
    COL_MODELCLASS = 'modelclass'
    # Not referenced in the visible part of this notebook.
    COL_MODELFITCOND = 'modelfitcond'

In [None]:
class C(Conf):
    """Notebook configuration: Snakemake inputs/outputs plus analysis constants."""
    # Input files handed in through the Snakemake object.
    fn_config = sm.input.fn_config
    fn_mod_stat = sm.input.fn_modstats
    fn_out_sig = sm.input.fn_sigstats

    # Folder that receives the figures written by this notebook.
    fol_plots = pathlib.Path(sm.output.fol_plots)

    # Channels removed from the model statistics before the R2 delta is computed.
    chan_oexp = ['Yb176', 'Er167', 'Tm169']

    # Reference condition name; in the visible code it is only mentioned in a
    # commented-out filter.
    refcond = 'Empty_nan'

    # NOTE(review): `C.COL_R2` is read further down but is not set in this
    # class; presumably it is inherited from `Conf` — confirm.

In [None]:
# Show the output folder path received from Snakemake.
sm.output.fol_plots

In [None]:
# Create the plot folder. `parents=True` also creates missing parent folders
# (without it mkdir raises FileNotFoundError when the parent does not exist);
# `exist_ok=True` keeps re-runs from failing on an existing folder.
C.fol_plots.mkdir(parents=True, exist_ok=True)

In [None]:
class M:
    """String constants 'dist' and 'dist_nb'.

    These match the column labels that appear after pivoting the model
    statistics on the model class further down in the notebook.
    """
    dist_nb = 'dist_nb'
    dist = 'dist'

In [None]:
# Build the spherpro `bro` object from the config file; in the visible code it
# is only used for its `bro.data.pannel` table.
bro = sb.get_bro(C.fn_config)

## 1) Analysis

In [None]:
# Significance statistics (Snakemake input `fn_sigstats`); filtered and summed
# per channel further down.
dat_sig = pd.read_csv(C.fn_out_sig)

In [None]:
# Model statistics (Snakemake input `fn_modstats`). The pivot further down
# reads a condition, a channel, a model-class and an R2 column from it.
dat_mod = pd.read_csv(C.fn_mod_stat)

In [None]:
# Inspect the loaded model statistics.
dat_mod

Hypothesis:  
Markers for which a large share of variability is explained by the neighbourhood on top of the global environment (R² of `dist_nb` minus R² of `dist`) are more often affected by overexpression.

In [None]:
# List the distinct condition names present in the model statistics.
dat_mod[V.COL_CONDNAME].unique()

In [None]:
# Conditions that enter the comparison (the three T-REx-293 `tp96` conditions).
conds_delta = [
    'T-REx-293_c0.25_tp96',
    'T-REx-293_c0.5_tp96',
    'T-REx-293_c1.0_tp96',
]

# Per channel: mean over the selected conditions of R2(dist_nb) - R2(dist).
dat_delta = (
    dat_mod
    # Drop the overexpression channels themselves.
    .query(f'{V.COL_CHANNELNAME} not in {C.chan_oexp}')
    # The mask is computed on the frame being filtered (callable) rather than
    # on the outer `dat_mod`, so it does not depend on implicit index
    # alignment with the rows that survived the query above.
    .loc[lambda d: d[V.COL_CONDNAME].isin(conds_delta), :]
    # One row per (condition, channel), one R2 column per model class.
    .pivot_table(index=[V.COL_CONDNAME, V.COL_CHANNELNAME],
                 columns=V.COL_MODELCLASS,
                 values=C.COL_R2)
    .assign(**{V.COL_DELTA: lambda d: d[M.dist_nb] - d[M.dist]})
    .groupby(V.COL_CHANNELNAME)[V.COL_DELTA].mean()
    .reset_index()
)

In [None]:
# Per-channel delta in ascending order, joined with the panel's
# `V.COL_GOODNAME` column via the metal/channel name.
(
    dat_delta
    .sort_values(V.COL_DELTA)
    .merge(
        bro.data.pannel[[V.COL_METAL, V.COL_GOODNAME]],
        left_on=V.COL_CHANNELNAME,
        right_on=V.COL_METAL,
    )
)

In [None]:
# Per channel: sum of the significance flag over all rows whose condition name
# does not end in 'C-TER FLAG' and whose `V.COL_ISNB` is "oexp-NB" or "ctrl".
dat_fracsig = (
    dat_sig
    .loc[lambda d: ~d[V.COL_CONDNAME].str.endswith('C-TER FLAG'), :]
    .query(f'{V.COL_ISNB} in ["oexp-NB", "ctrl"]')
    .groupby(V.COL_CHANNELNAME)[V.COL_ISSIG]
    .sum()
    .reset_index()
)

In [None]:
# Table for the scatter plot: delta and significance count per channel, plus
# the panel's name column.
dat_scatter = (
    dat_delta
    .sort_values(V.COL_DELTA)
    .merge(dat_fracsig)
    .merge(
        bro.data.pannel[[V.COL_METAL, V.COL_GOODNAME]],
        left_on=V.COL_CHANNELNAME,
        right_on=V.COL_METAL,
    )
)

# Scatter of significance count against delta with a linear fit.
(
    dat_scatter >> (
        gg.ggplot(gg.aes(x=V.COL_DELTA, y=V.COL_ISSIG))
        + gg.geom_point()
        + gg.geom_smooth(method='lm')
        + gg.xlab('Variability explained Neighbourhood-Global')
        + gg.ylab('Significantly affected in overexpressions')
    )
)

In [None]:
# Threshold on the per-channel significance count: a channel is flagged below
# when its count is strictly greater than this value.
nsig=0

In [None]:
# Name of the boolean column that flags channels with more than `nsig`
# significant hits.
COL_ANYSIG = 'any_sign'

In [None]:
# Add the boolean flag: significance count strictly above `nsig`.
dat_fracsig = dat_fracsig.assign(
    **{COL_ANYSIG: lambda d: d[V.COL_ISSIG] > nsig}
)

In [None]:

# Table behind the figure: delta, significance count/flag and panel name per channel.
pltdat = (
    dat_delta
    .sort_values(V.COL_DELTA)
    .merge(dat_fracsig)
    .merge(
        bro.data.pannel[[V.COL_METAL, V.COL_GOODNAME]],
        left_on=V.COL_CHANNELNAME,
        right_on=V.COL_METAL,
    )
)

# Box plot of the delta split by the any-significant flag. Box plot outliers
# are made invisible because every channel is drawn anyway as a horizontally
# jittered point (fixed seed, so the figure is reproducible).
p = pltdat >> (
    gg.ggplot(gg.aes(x=COL_ANYSIG, y=V.COL_DELTA))
    + gg.geom_boxplot(outlier_alpha=0, color='grey')
    + gg.geom_jitter(width=0.1, height=0, alpha=1, random_state=10)
    + gg.ylab(r'$R_{localenv + globalenv}^2 - R_{globalenv}^2$')
    + gg.xlab('Significantly affected in\n neighbourhood')
    + gg.expand_limits(y=0)
    + gg.theme_bw()
    + gg.theme(text=gg.element_text(size=6), figure_size=(1.2, 1.9))
)
p

In [None]:
# Write the figure as PDF into the plot folder (`C.fol_plots` is already a Path).
gg.ggsave(p, C.fol_plots / 'oexp_vs_nb.pdf')

In [None]:
# Enlarged, labelled version of the figure for inspection.
# The labels reuse the jitter settings of the point layer in `p`
# (width=0.1, height=0, random_state=10) so that each name should land on its
# own point. The previous unseeded default `position_jitter()` displaced the
# labels independently of the points, including along the y axis.
(
    p
    + gg.geom_text(
        gg.aes(label=V.COL_GOODNAME),
        position=gg.position_jitter(width=0.1, height=0, random_state=10),
    )
    + gg.theme(text=gg.element_text(size=6), figure_size=(6, 6))
)

In [None]:
# Fraction of channels (among those with a delta) that carry the any-significant flag.
dat_delta.merge(dat_fracsig)[COL_ANYSIG].mean()

In [None]:
# Mean delta for flagged versus unflagged channels.
dat_delta.merge(dat_fracsig).groupby(COL_ANYSIG)[V.COL_DELTA].mean()

In [None]:

# Two-sample t-test on the delta: flagged versus unflagged channels.
p_ttest = (
    dat_delta
    .merge(dat_fracsig)
    .pipe(lambda d: sp.stats.ttest_ind(
        d.loc[d[COL_ANYSIG], V.COL_DELTA].values,
        d.loc[~d[COL_ANYSIG], V.COL_DELTA].values,
    ))
    .pvalue
)
p_ttest

In [None]:
# Kruskal-Wallis test on the delta: flagged versus unflagged channels.
dat_kruskal = dat_delta.merge(dat_fracsig)
flag_sig = dat_kruskal[COL_ANYSIG]
p_kruskal = sp.stats.kruskal(
    dat_kruskal.loc[flag_sig, V.COL_DELTA].values,
    dat_kruskal.loc[~flag_sig, V.COL_DELTA].values,
).pvalue
p_kruskal

In [None]:
# Pearson correlation between the delta and the per-channel significance count.
dat_corr = dat_delta.merge(dat_fracsig)
sp.stats.pearsonr(dat_corr[V.COL_DELTA], dat_corr[V.COL_ISSIG])

In [None]:
# Summary statistics of the delta, split by whether the significance count is above zero.
(
    dat_fracsig
    .assign(asig=lambda d: d[V.COL_ISSIG] > 0)
    .merge(dat_delta)
    .groupby('asig')[V.COL_DELTA]
    .describe()
)

In [None]:
# Full per-channel table (significance count/flag, delta, panel name), ascending by delta.
(
    dat_fracsig
    .merge(dat_delta)
    .sort_values(V.COL_DELTA)
    .merge(
        bro.data.pannel[[V.COL_METAL, V.COL_GOODNAME]],
        left_on=V.COL_CHANNELNAME,
        right_on=V.COL_METAL,
    )
)