## Notebook to compare the significant *cis* correlation results between days (da0, da25, da65) for a single modality

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, concat
from dask.dataframe import read_parquet
import numpy as np
from os.path import exists
from seaborn import relplot, scatterplot
import concurrent.futures
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# NOTE(review): the empty default is presumably overridden per run with a
# modality name (e.g. ATAC, METH, PDUI, RNAB, RNAS, SCRN) -- confirm how this
# value is injected; with '' the file names built from it will not resolve
modality = ''

In [None]:
# naming
cohort = 'foundin'

# directories (everything hangs off the single working directory)
wrk_dir = '/home/gibbsr/working/foundin/foundin_qtl'
tensorqtl_dir = wrk_dir + '/tensorqtl'
results_dir = wrk_dir + '/results'

# input files

# out files
results_file = '{}/{}_{}.cis.csv'.format(results_dir, cohort, modality)

# constants
days = ['da0', 'da25', 'da65']
# chromosome labels '1' .. '22'
autosomes = list(map(str, range(1, 23)))
alpha_value = 0.05
dpi_value = 50 #100
DEBUG = False

### load the results

In [None]:
# load the independent results for each day, keeping only the rows whose
# permutation p-value is below alpha_value
cis_df = None
day_frames = []
for day in days:
    cohort_set = f'{cohort}_{day}_{modality}'
    cis_file = f'{tensorqtl_dir}/{cohort_set}.cis.map.csv'
    if not exists(cis_file):
        # not every day has a results file for every modality; skip those
        continue
    this_df = read_csv(cis_file)
    # copy the filtered rows so adding the day column below writes to an
    # independent frame rather than a slice (avoids SettingWithCopyWarning)
    this_df = this_df.loc[this_df.pval_perm < alpha_value].copy()
    feature_cnt = this_df['phenotype_id'].nunique()
    print(f'day {day} shape is {this_df.shape} for {feature_cnt} features')
    this_df['day'] = day
    day_frames.append(this_df)
    cis_df = concat(day_frames)
    print(f'after day {day} total shape {cis_df.shape}')

# fail here with a clear message instead of letting later cells hit
# AttributeError on a None cis_df
if cis_df is None:
    raise FileNotFoundError(
        f'no cis.map.csv results found in {tensorqtl_dir} for '
        f'cohort {cohort!r}, modality {modality!r}, days {days}')

In [None]:
# size of the combined per-day results; row preview only when debugging
print(cis_df.shape)
if DEBUG:
    display(cis_df.sample(n=5))

#### make a key for the variant/pheno pair

In [None]:
# key each row by its phenotype/variant pair so the same pair can be matched
# across days
cis_df['cispair'] = cis_df['phenotype_id'] + ':' + cis_df['variant_id']
print(cis_df.shape)
# cap the preview size: sample() raises ValueError when asked for more rows
# than the frame holds
display(cis_df.sample(min(5, len(cis_df))))

In [None]:
# how often each pair occurs, then the distribution of those occurrence counts
pair_counts = cis_df['cispair'].value_counts()
print(pair_counts.value_counts())
# rows contributed by each day
print(cis_df['day'].value_counts())
# number of distinct pairs
print(len(pair_counts))

In [None]:
# same summaries restricted to rows whose B&H FDR is below alpha_value
bh_temp = cis_df[cis_df['bh_fdr'] < alpha_value]
bh_pair_counts = bh_temp['cispair'].value_counts()
print(bh_pair_counts.value_counts())
print(bh_temp['day'].value_counts())
print(len(bh_pair_counts))

#### what is the max p-value in the independent signals

In [None]:
# largest nominal p-value among the FDR-filtered rows; used further down as
# the per-day significance threshold
max_pvalue = bh_temp['pval_nominal'].max()
print(f'max cis top nominal pvalue {max_pvalue}')

# largest permutation p-value among the same rows (the label used to say
# "nominal" here as well, which mislabelled the printed value)
max_pval_perm = bh_temp['pval_perm'].max()
print(f'max cis top permutation pvalue {max_pval_perm}')

#### now that all the cispairs of interest are known get complete data for these

In [None]:
def load_missing_qtl_results(find_items, in_file, day):
    """Return the rows of one parquet results file that match the wanted pairs.

    Args:
        find_items: collection of 'phenotype_id:variant_id' keys to keep.
        in_file: path of the parquet file to read.
        day: value written to the 'day' column of the returned rows.

    Returns:
        DataFrame holding the matching rows, with 'cispair' and 'day' columns
        added.
    """
    all_pairs = read_parquet(in_file).compute()
    # build the same pair key that is used for the lookup values
    all_pairs['cispair'] = all_pairs['phenotype_id'] + ':' + all_pairs['variant_id']
    wanted = all_pairs['cispair'].isin(find_items)
    return all_pairs.loc[wanted].assign(day=day)

In [None]:
%%time

# look-up keys for the workers; de-duplicate once here instead of handing each
# task the full column with its repeats
pairs_of_interest = cis_df['cispair'].unique()

fs_list = []
with concurrent.futures.ThreadPoolExecutor() as tpe:
    for day in days:
        cohort_set = f'{cohort}_{day}'
        result_set = f'{cohort_set}_{modality}'
        for chrom in autosomes:
            this_result_file = f'{tensorqtl_dir}/{result_set}.cis_qtl_pairs.chr{chrom}.parquet'
            if exists(this_result_file):
                fs_list.append(tpe.submit(load_missing_qtl_results, pairs_of_interest,
                                          this_result_file, day))
# leaving the with-block has already waited for every task, so collect the
# results in submission order (day, then chromosome); this keeps the row order
# of the combined frame and the saved csv identical from run to run
lm_results = [future.result() for future in fs_list]

In [None]:
# stack the per-file frames returned by the workers into one table
results_df = concat(lm_results)
print(results_df.shape)
if DEBUG:
    display(results_df.sample(n=10))

In [None]:
# distribution of how many rows each pair has, then rows per day
rows_per_pair = results_df['cispair'].value_counts()
print(rows_per_pair.value_counts())
print(results_df['day'].value_counts())

In [None]:
# derived columns used by the plots and the per-day significance calls
results_df['log10_pvalue'] = -np.log10(results_df['pval_nominal'])
results_df['score'] = results_df['slope'].div(results_df['slope_se'])
results_df['score_abs'] = results_df['score'].abs()
# 1 where the nominal p-value is strictly below max_pvalue, otherwise 0
results_df['is_sig'] = (results_df['pval_nominal'] < max_pvalue).astype('int64')

In [None]:
# optional peek at the derived columns
if DEBUG:
    display(results_df.sample(n=5))

In [None]:
# number of rows flagged significant (1) versus not (0)
sig_counts = results_df['is_sig'].value_counts()
print(sig_counts)

### save the combined results

In [None]:
# write the combined table without the (non-unique) row index
results_df.to_csv(path_or_buf=results_file, index=False)

In [None]:
# number of significant rows per pair
temp = results_df.groupby('cispair')['is_sig'].sum()
# a few of the rows that were not called significant
display(results_df[results_df['is_sig'] == 0].head())
display(temp.value_counts())

In [None]:
# pairs that were not called significant in any row
never_sig = temp[temp == 0]
display(never_sig.head())
# guard the lookup: index[0] raises IndexError when no pair has a zero count
test_feature = never_sig.index[0] if len(never_sig) > 0 else None
print(f'\n test feature: {test_feature}')

In [None]:
# all rows for the example pair picked above
is_test_feature = results_df['cispair'] == test_feature
display(results_df[is_test_feature])

In [None]:
# distribution of the number of significant rows per pair
results_df.groupby('cispair')['is_sig'].sum().value_counts()

#### for the results that aren't in the largest-powered group, does any day stand out

In [None]:
# the day column holds string labels ('da0', 'da25', 'da65'), so comparing it
# with the integer 0 never excluded anything; compare with the label instead
# NOTE(review): assumes 'da0' is the group this section means to leave out -- confirm
results_df.loc[results_df['day'] != 'da0', 'day'].value_counts()

In [None]:
# the day column holds string labels ('da0', 'da25', 'da65'), so comparing it
# with the integer 0 never excluded anything; compare with the label instead
# NOTE(review): assumes 'da0' is the group this section means to leave out -- confirm
results_df.loc[results_df['day'] != 'da0'].groupby(['cispair'])['is_sig'].agg('sum').value_counts()

### plotting

#### annotate direction of effect for plotting

In [None]:
# label the sign of the effect for use as the plot hue; rows whose slope is
# not positive (including zero or missing) get the 'Decrease' label
# (label spelling fixed: it was 'Descrease' and showed up in every legend)
results_df['Direction'] = np.where(results_df['slope'] > 0,
                                   'Increase', 'Decrease')
# replace the repeated per-file row labels with a clean 0..n-1 index
results_df = results_df.reset_index(drop=True)
if DEBUG:
    display(results_df.sample(5))

#### do some quick plotting

In [None]:
# the line plots need the day as a number so the x axis is ordered; strip the
# 'da' prefix from the label and cast what is left
day_digits = results_df['day'].str.replace('da', '')
results_df['daynum'] = day_digits.astype('int32')

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# score (slope / slope_se) against day, one line per effect direction
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    relplot(x='daynum', y='score', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# slope against day, one line per effect direction
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    relplot(x='daynum', y='slope', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# slope standard error against day, one line per effect direction
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    relplot(x='daynum', y='slope_se', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# -log10 nominal p-value against day, one line per effect direction
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    relplot(x='daynum', y='log10_pvalue', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# -log10 nominal p-value against TSS distance, coloured by effect direction
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    relplot(x='tss_distance', y='log10_pvalue',hue='Direction',
            data=results_df, palette='Set1')
    # dashed line marks the significance threshold on the -log10 scale
    plt.axhline(-np.log10(max_pvalue), color='black', linestyle='--')
    plt.axhline(0, color='black')
    plt.axvline(0, color='black', zorder=0)

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# -log10 nominal p-value against TSS distance, coloured by day; the rows are
# shuffled so the draw order is not grouped by day
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    scatterplot(x='tss_distance', y='log10_pvalue', hue='day',
                data=results_df.sample(frac=1), palette='Set1')
    # dashed line marks the significance threshold on the -log10 scale
    plt.axhline(-np.log10(max_pvalue), color='black', linestyle='--')
    plt.axhline(0, color='black')
    plt.axvline(0, color='black', zorder=0)

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# score (slope / slope_se) against TSS distance, coloured by effect direction
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    relplot(x='tss_distance', y='score',hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# score (slope / slope_se) against TSS distance, coloured by day; the rows are
# shuffled so the draw order is not grouped by day
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    relplot(x='tss_distance', y='score',hue='day',
            data=results_df.sample(frac=1), palette='Set1')
    # NOTE(review): these dashed lines sit at +/- log10(max_pvalue) while the
    # y axis is the slope/slope_se score -- confirm the two are meant to share
    # a scale
    plt.axhline(-np.log10(max_pvalue), color='black', linestyle='--')
    plt.axhline(np.log10(max_pvalue), color='black', linestyle='--')
    plt.axhline(0, color='black')
    plt.axvline(0, color='black', zorder=0)

### plot scatter for two timepoints or ternary for three timepoints
ATAC, PDUI, RNAB, and RNAS all have three timepoints (da0, da25, and da65), whereas METH has only two timepoints (da0 and da65) and SCRN has only one timepoint (da65).

#### format data for scatter or ternary plots

In [None]:
# one frame per day, each ordered by the pair key; da25 is not built for METH
da0 = results_df[results_df['day'] == 'da0'].sort_values('cispair')
if modality != 'METH':
    da25 = results_df[results_df['day'] == 'da25'].sort_values('cispair')
da65 = results_df[results_df['day'] == 'da65'].sort_values('cispair')

In [None]:
# put the days side by side, one row per da0 pair (left joins keep every da0
# row). Overlapping columns get _x/_y suffixes in the first merge; in the
# three-day case the da65 columns then arrive unsuffixed
if modality == 'METH':
    all_df = da0.merge(da65, how='left', on='cispair')
else:
    da0_da25 = da0.merge(da25, how='left', on='cispair')
    all_df = da0_da25.merge(da65, how='left', on='cispair')
if DEBUG:
    display(all_df.head())

#### clean up the column names

In [None]:
# name the per-day columns after their day: absolute scores for the three-day
# case, signed scores for the two-day (METH) case
if modality == 'METH':
    day_column_names = {'score_x': 'da0', 'score_y': 'da65'}
else:
    day_column_names = {'score_abs_x': 'da0', 'score_abs_y': 'da25',
                        'score_abs': 'da65'}
all_df = all_df.rename(columns=day_column_names)

In [None]:
# pairs absent from a later day have no values after the left joins; set every
# missing cell to 0
all_df = all_df.fillna(0)

#### merge the counts for number days significant

In [None]:
# add up the per-day significance flags for each pair
days_detected = all_df['is_sig_x'] + all_df['is_sig_y']
if modality != 'METH':
    # the third day's flag column carries no suffix
    days_detected = days_detected + all_df['is_sig']
all_df['days_detected'] = days_detected
all_df['days_detected'].value_counts()

In [None]:
# NOTE(review): pairs with no per-day significant call are counted as 1 here;
# the reason is not visible in this notebook -- confirm this is intended
all_df['days_detected'] = all_df['days_detected'].replace(0, 1).astype('int')
all_df['days_detected'].value_counts()

In [None]:
if not modality == 'METH':
    # three timepoints: ternary plot of the per-day absolute scores, coloured
    # by the number of days the pair was detected
    import plotly.express as px
    fig = px.scatter_ternary(all_df, a='da0', b='da25', c='da65', color='days_detected')
    fig.show()
else:
    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
    # later releases dropped the old names; use whichever this install provides
    bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                    else 'seaborn-bright')
    # two timepoints: plain scatter of the da0 score against the da65 score
    with rc_context({'figure.figsize': (8, 8), 'figure.dpi': 100}):
        plt.style.use(bright_style)
        # no palette argument: there is no hue variable for it to colour
        figure = scatterplot(x='da0', y='da65',
                             data=all_df.sample(frac=1))
        figure.axhline(0, color='black', zorder=0)
        figure.axvline(0, color='black', zorder=0)

In [None]:
# occurrences of each pair in the merged table
all_df.loc[:, 'cispair'].value_counts()

In [None]:
# occurrences of each pair in the per-day independent results
cis_df.loc[:, 'cispair'].value_counts()

In [None]:
# pairs that occur once for every day that contributed results; compare with
# the number of days actually loaded rather than a fixed 3, which could never
# match for modalities that have fewer timepoints
n_days_loaded = cis_df['day'].nunique()
present_all = cis_df['cispair'].value_counts()
present_all = present_all[present_all == n_days_loaded]
present_all

In [None]:
# independent-result rows for the pairs selected in present_all
in_present_all = cis_df['cispair'].isin(present_all.index)
indep_present_all = cis_df[in_present_all]
print(indep_present_all.shape)
display(indep_present_all.head())

In [None]:
# rows, and distinct features, with tss_distance below 5
# NOTE(review): this one-sided test also keeps every negative distance --
# confirm whether an absolute-distance cut-off was intended
below_five = cis_df[cis_df['tss_distance'] < 5]
print(below_five.shape)
print(below_five['phenotype_id'].nunique())

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# density of TSS distances for the independent results
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    cis_df.tss_distance.plot.density()

In [None]:
!date