## Notebook to compare the significant top eQTL signals by cell-type

Note: only iDA, DA, and lNP are compared here, not all of the cell-types quantified:

- 'Immature Dopaminergic Neurons': 'iDA',
- 'Dopaminergic Neurons': 'DA',
- 'Proliferating Floor Plate Progenitors': 'PFPP',
- 'Early neuron Progenitor': 'eNP',
- 'Ependymal-like Cells': 'ElC',
- 'Late neuron Progenitor': 'lNP',
- 'Neuroepithelial-like Cells': 'NlC'

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, concat
from dask.dataframe import read_parquet
import numpy as np
from os.path import exists
from seaborn import relplot, scatterplot
import concurrent.futures
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# naming: these pieces are combined into the tensorQTL result file names
cohort = 'foundin'
day = 'da65'
modality = 'SCRN'

# directories
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'

# input files

# out files
results_file = f'{results_dir}/{cohort}_{modality}.cis.csv'

# constants
# cell-types compared in this notebook
cell_types = ['iDA', 'DA', 'lNP']
# chromosomes 1..22 as strings, used to build the per-chromosome parquet names
autosomes = list(map(str, range(1, 23)))
# threshold applied to pval_perm and bh_fdr
ALPHA = 0.05
dpi_value = 50 #100
# when True the cells display sample rows of the intermediate tables
DEBUG = False

### load the results

In [None]:
# Load the top cis-eQTL table for each cell-type, keep the rows passing the
# permutation p-value threshold, tag them with their cell-type, and combine them.
cis_frames = []
total_rows = 0
for cell_type in cell_types:
    cohort_set = f'{cohort}_{day}_{modality}-{cell_type}'
    cis_file = f'{tensorqtl_dir}/{cohort_set}.cis.map.csv'
    if not exists(cis_file):
        # make a skipped cell-type visible instead of dropping it silently
        print(f'WARNING: {cis_file} not found, skipping {cell_type}')
        continue
    this_df = read_csv(cis_file)
    # .copy() so the cell_type column below is added to an independent frame
    # rather than to a slice of the full table (avoids SettingWithCopyWarning)
    this_df = this_df.loc[this_df.pval_perm < ALPHA].copy()
    feature_cnt = this_df['phenotype_id'].nunique()
    print(f'{cell_type} shape is {this_df.shape} for {feature_cnt} features')
    this_df['cell_type'] = cell_type
    cis_frames.append(this_df)
    total_rows += this_df.shape[0]
    print(f'after {cell_type} total shape {(total_rows, this_df.shape[1])}')

if not cis_frames:
    # fail here with a clear message rather than later on a None table
    raise FileNotFoundError(f'no cis.map.csv results found in {tensorqtl_dir} '
                            f'for cell types {cell_types}')
# combine once, after the loop, instead of growing the table inside it
cis_df = concat(cis_frames)

In [None]:
# size of the combined significant-signal table; show a few rows when debugging
print(cis_df.shape)
if DEBUG:
    display(cis_df.sample(5))

#### make a key for the variant/pheno pair

In [None]:
# key every signal as '<phenotype_id>:<variant_id>' so the same feature/variant
# pair can be matched between cell-types
pair_key = cis_df['phenotype_id'] + ':' + cis_df['variant_id']
cis_df['cispair'] = pair_key
print(cis_df.shape)
display(cis_df.sample(5))

In [None]:
# how many cispairs occur once, twice, ... across the loaded cell-types
print(cis_df['cispair'].value_counts().value_counts())
# rows contributed by each cell-type
print(cis_df['cell_type'].value_counts())
# number of distinct feature/variant pairs
print(cis_df['cispair'].nunique())

In [None]:
# same summaries as for the full table, restricted to rows with bh_fdr below ALPHA
passes_bh = cis_df['bh_fdr'] < ALPHA
bh_temp = cis_df.loc[passes_bh]
bh_pairs = bh_temp['cispair']
print(bh_pairs.value_counts().value_counts())
print(bh_temp['cell_type'].value_counts())
print(bh_pairs.nunique())

#### what is the max p-value in the independent signals

In [None]:
# largest nominal p-value among the bh_fdr-passing top signals; reused further
# down as the nominal significance threshold (is_sig) and as a guide line in plots
max_pvalue = bh_temp['pval_nominal'].max()
print(f'max cis top nominal pvalue {max_pvalue}')

# largest permutation p-value among the same signals
# (the label previously said "nominal" for this value as well)
max_pval_perm = bh_temp['pval_perm'].max()
print(f'max cis top permutation pvalue {max_pval_perm}')

#### now that all the cispairs of interest are known get complete data for these

In [None]:
def load_missing_qtl_results(find_items, in_file, cell_type):
    """Return the rows of one QTL pairs parquet file that match the wanted cispairs.

    Args:
        find_items: collection of 'phenotype_id:variant_id' keys to keep
            (anything accepted by ``Series.isin``).
        in_file: path of the parquet file to read.
        cell_type: label stored in the ``cell_type`` column of the returned rows.

    Returns:
        A new DataFrame holding only the matching rows, with ``cispair`` and
        ``cell_type`` columns added.
    """
    # read lazily through dask, then materialise the whole file
    pairs_df = read_parquet(in_file).compute()
    pairs_df['cispair'] = pairs_df['phenotype_id'] + ':' + pairs_df['variant_id']
    wanted = pairs_df['cispair'].isin(find_items)
    # boolean indexing followed by assign yields an independent, labelled frame
    return pairs_df[wanted].assign(cell_type=cell_type)

In [None]:
%%time

# Scan every per-chromosome result file of every cell-type for the cispairs of
# interest, one process-pool job per file that exists.
print('searching results threaded')
fs_list = []
with concurrent.futures.ProcessPoolExecutor() as ppe:
    for cell_type in cell_types:
        cohort_set = f'{cohort}_{day}'
        result_set = f'{cohort_set}_{modality}-{cell_type}'
        chrom_files = (f'{tensorqtl_dir}/{result_set}.cis_qtl_pairs.chr{chrom}.parquet'
                       for chrom in autosomes)
        # only submit jobs for files that are actually present
        for this_result_file in filter(exists, chrom_files):
            fs_list.append(ppe.submit(load_missing_qtl_results, cis_df['cispair'],
                                      this_result_file, cell_type))
# leaving the with-block waits for the pool to finish, so every future is done here
lm_results = [future.result()
              for future in concurrent.futures.as_completed(fs_list)]

In [None]:
# combine the read results
results_df = concat(lm_results)
print(results_df.shape)
if DEBUG:
    display(results_df.sample(10))

In [None]:
# in how many cell-types was each cispair found in the full pair results
print(results_df['cispair'].value_counts().value_counts())
# rows retrieved per cell-type
print(results_df['cell_type'].value_counts())

In [None]:
# -log10 of the nominal p-value (the column name omits the minus sign)
results_df['log10_pvalue'] = -np.log10(results_df['pval_nominal'])
# effect size over its standard error, and its magnitude
results_df['score'] = results_df['slope'] / results_df['slope_se']
results_df['score_abs'] = results_df['score'].abs()
# 1 when the nominal p-value is at or below the largest nominal p-value seen
# among the bh_fdr-passing top signals, otherwise 0
results_df['is_sig'] = (results_df['pval_nominal'] <= max_pvalue).astype(int)

In [None]:
# peek at the derived columns when debugging
if DEBUG:
    display(results_df.sample(5))

In [None]:
# number of rows flagged significant (1) versus not (0)
print(results_df['is_sig'].value_counts())

### save the combined results

In [None]:
# written before the plotting-only columns (Direction, cellnum) are added further down
results_df.to_csv(results_file, index=False)

In [None]:
# number of cell-types in which each cispair is flagged significant
temp = results_df.groupby(['cispair'])['is_sig'].sum()
# a few of the rows that miss the nominal threshold
not_sig_rows = results_df[results_df['is_sig'] == 0]
display(not_sig_rows.head())
display(temp.value_counts())

In [None]:
# cispairs that are not flagged significant in any cell-type
never_sig = temp[temp == 0]
display(never_sig.head())
# take the first such pair as an example to inspect; fall back to None instead
# of raising IndexError when every pair is significant somewhere
test_feature = never_sig.index[0] if len(never_sig) > 0 else None
print(f'\n test feature: {test_feature}')

In [None]:
# all rows (one per cell-type where it was found) for the example pair
display(results_df.loc[results_df['cispair'] == test_feature])

In [None]:
# distribution of the number of cell-types in which a cispair is flagged significant
results_df.groupby(['cispair'])['is_sig'].agg('sum').value_counts()

#### for the signals that aren't in the largest powered group, does any cell-type stand out

In [None]:
# NOTE(review): cell_type holds string labels (assigned from cell_types, e.g. 'iDA'),
# so `!= 0` is always True and no rows are excluded here. To drop the largest
# powered group, compare against that group's cell-type label instead.
results_df.loc[results_df['cell_type'] != 0]['cell_type'].value_counts()

In [None]:
# NOTE(review): cell_type holds string labels, so `!= 0` is always True and this is
# the same per-cispair count over ALL rows; compare against the intended
# cell-type label to actually exclude the largest powered group.
results_df.loc[results_df['cell_type'] != 0].groupby(['cispair'])['is_sig'].agg('sum').value_counts()

### plotting

#### annotate direction of effect for plotting

In [None]:
# label the sign of the effect for use as the plot hue; slopes that are exactly
# zero or missing fall into the 'Decrease' branch of np.where
# (label spelling fixed from 'Descrease', which appeared in the plot legends)
results_df['Direction'] = np.where(results_df['slope'] > 0,
                                   'Increase', 'Decrease')
# the concatenated frames carry their own indices; replace them with a clean 0..n-1 index
results_df = results_df.reset_index(drop=True)
if DEBUG:
    display(results_df.sample(5))

#### do some quick plotting

In [None]:
# the seaborn line plots need a numeric x-axis, so give each cell-type an ordinal code
cell_type_codes = {'lNP': 0, 'iDA': 1, 'DA': 2}
results_df['cellnum'] = results_df['cell_type'].map(cell_type_codes)
if DEBUG:
    for column in ('cell_type', 'cellnum'):
        display(results_df[column].value_counts())

In [None]:
# score (slope / slope_se) across the ordered cell-types, split by effect direction
# NOTE(review): relplot is a figure-level seaborn function sized by its own
# height/aspect arguments, so the figure.figsize entry likely has no effect - confirm
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': dpi_value}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-bright')
    relplot(data=results_df, x='cellnum', y='score', hue='Direction',
            kind='line', palette='Set1')

In [None]:
# effect size (slope) across the ordered cell-types, split by effect direction
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': dpi_value}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-bright')
    relplot(data=results_df, x='cellnum', y='slope', hue='Direction',
            kind='line', palette='Set1')

In [None]:
# standard error of the slope across the ordered cell-types, split by effect direction
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': dpi_value}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-bright')
    relplot(data=results_df, x='cellnum', y='slope_se', hue='Direction',
            kind='line', palette='Set1')

In [None]:
# -log10 nominal p-value across the ordered cell-types, split by effect direction
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': dpi_value}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-bright')
    relplot(data=results_df, x='cellnum', y='log10_pvalue', hue='Direction',
            kind='line', palette='Set1')

In [None]:
# -log10 nominal p-value against start_distance, coloured by effect direction
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': dpi_value}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-bright')
    relplot(data=results_df, x='start_distance', y='log10_pvalue',
            hue='Direction', palette='Set1')
    # dashed guide at the nominal significance threshold, on the same -log10 scale
    sig_line = -np.log10(max_pvalue)
    plt.axhline(sig_line, color='black', linestyle='--')
    # solid axes through the origin
    plt.axhline(0, color='black')
    plt.axvline(0, color='black', zorder=0)

In [None]:
# -log10 nominal p-value against start_distance, coloured by cell-type
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': dpi_value}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-bright')
    # sample(frac=1) shuffles the rows (unseeded), which changes the drawing order
    shuffled_df = results_df.sample(frac=1)
    scatterplot(data=shuffled_df, x='start_distance', y='log10_pvalue',
                hue='cell_type', palette='Set1')
    # dashed guide at the nominal significance threshold, on the same -log10 scale
    sig_line = -np.log10(max_pvalue)
    plt.axhline(sig_line, color='black', linestyle='--')
    # solid axes through the origin
    plt.axhline(0, color='black')
    plt.axvline(0, color='black', zorder=0)

In [None]:
# score (slope / slope_se) against start_distance, coloured by effect direction
plot_rc = {'figure.figsize': (8, 8), 'figure.dpi': dpi_value}
with rc_context(plot_rc):
    plt.style.use('seaborn-v0_8-bright')
    relplot(data=results_df, x='start_distance', y='score',
            hue='Direction', palette='Set1')

In [None]:
# score (slope / slope_se) against start_distance, coloured by cell-type
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use('seaborn-v0_8-bright')
    # sample(frac=1) shuffles the rows (unseeded), which changes the drawing order
    relplot(x='start_distance', y='score', hue='cell_type',
            data=results_df.sample(frac=1), palette='Set1')
    # NOTE(review): the y-axis here is score (slope / slope_se), but these two guide
    # lines sit at +/- log10(max_pvalue), which is a -log10(p-value) quantity, so they
    # do not mark the significance threshold on this scale - confirm the intended
    # cut-off (e.g. the smallest score_abs among is_sig rows)
    plt.axhline(-np.log10(max_pvalue), color='black', linestyle='--')
    plt.axhline(np.log10(max_pvalue), color='black', linestyle='--')    
    # solid axes through the origin
    plt.axhline(0, color='black')
    plt.axvline(0, color='black', zorder=0)    

#### format data for scatter or ternary plots

In [None]:
# one cispair-sorted table per cell-type, ready to be merged side by side
lNP, iDA, DA = (
    results_df.loc[results_df.cell_type == label].sort_values(by=['cispair'])
    for label in ('lNP', 'iDA', 'DA')
)

In [None]:
# Line the three cell-types up on cispair, keeping every pair present in lNP.
# With pandas' default suffixes the shared columns come out as <col>_x for lNP and
# <col>_y for iDA, while the DA columns keep their plain names in the second merge.
all_df = (
    lNP.merge(iDA, how='left', left_on='cispair', right_on='cispair')
       .merge(DA, how='left', left_on='cispair', right_on='cispair')
)
if DEBUG:
    display(all_df.head())

#### clean up the column names

In [None]:
# name the absolute-score columns after the cell-type they came from
score_column_names = {
    'score_abs_x': 'lNP',
    'score_abs_y': 'iDA',
    'score_abs': 'DA',
}
all_df = all_df.rename(columns=score_column_names)

#### merge the counts for number cell_types significant

In [None]:
# Count, per cispair, the cell-types in which it is flagged significant.
# is_sig_x / is_sig_y / is_sig are the lNP / iDA / DA flags produced by the merges;
# the iDA and DA flags are NaN where the left-merge found no matching row.
# Fill each flag with 0 BEFORE summing: adding the raw columns turned the whole
# sum into NaN (later filled to 0) whenever a single cell-type was missing,
# which discarded the valid flags from the other cell-types.
sig_flag_columns = ['is_sig_x', 'is_sig_y', 'is_sig']
all_df['cells_detected'] = all_df[sig_flag_columns].fillna(0).sum(axis=1)
display(all_df.cells_detected.value_counts())

In [None]:
# NOTE(review): pairs significant in no cell-type are recoded from 0 to 1 here,
# so they share a colour with pairs significant in exactly one - confirm this is intended
never_detected = all_df['cells_detected'] == 0
all_df['cells_detected'] = all_df['cells_detected'].mask(never_detected, 1).astype('int')
all_df['cells_detected'].value_counts()

In [None]:
# ternary scatter of the absolute scores in the three cell-types (columns lNP, iDA, DA),
# coloured by the number of cell-types in which the pair is flagged significant
import plotly.express as px
fig = px.scatter_ternary(all_df, a='lNP', b='iDA', c='DA', color='cells_detected')
fig.show()    

In [None]:
# rows per cispair in the merged table
all_df['cispair'].value_counts()

In [None]:
# number of cell-types in which each cispair is a significant top signal
cis_df['cispair'].value_counts()

In [None]:
# cispairs that are a significant top signal in every compared cell-type;
# the required count follows cell_types instead of a hard-coded 3
pair_counts = cis_df['cispair'].value_counts()
present_all = pair_counts[pair_counts == len(cell_types)]
present_all

In [None]:
# rows of the significant top-signal table for the pairs shared by all cell-types
indep_present_all = cis_df.loc[cis_df['cispair'].isin(present_all.index)]
print(indep_present_all.shape)
display(indep_present_all.head())

In [None]:
# signals whose start_distance is below 5, and how many distinct features they cover
# NOTE(review): this is a signed comparison, so any negative distance is kept as
# well - confirm whether an absolute distance was intended
near_start = cis_df.loc[cis_df['start_distance'] < 5]
print(near_start.shape)
print(near_start['phenotype_id'].nunique())

In [None]:
# density of start_distance over the significant top signals
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use('seaborn-v0_8-bright')
    cis_df.start_distance.plot.density()

In [None]:
!date