## Notebook to visualize colocalization results as a heatmap

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, concat, pivot, read_pickle
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from seaborn import heatmap
from os.path import exists
from math import ceil
import numpy as np

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# naming: cohort and dx are embedded in the input and output file names below
cohort = 'foundin'
dx = 'PD'

# directories (all located under the working directory)
wrk_dir = '/home/jupyter/foundin_qtl'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'
quants_dir = f'{wrk_dir}/quants'
public_dir = f'{wrk_dir}/public'

# out file: the heatmap image written by the plotting cell
figure_file = f'{figures_dir}/{cohort}.colocalization.{dx}.png'
# in files: annotation tables used to map feature IDs to gene names
rnab_features_file = f'{public_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
pdui_features_file = f'{quants_dir}/{cohort}_PDUI.features.csv'

# variables
# when True, cells display() intermediate tables for inspection
DEBUG = False
# day labels; each day/modality pair forms one expected result file name
days = ['da0', 'da25', 'da65', 'daNA']
# earlier modality list kept for reference; it also included ATAC and METH
# modalities = ['ATAC', 'METH', 'PDUI', 'RNAB', 'RNAS', 'SCRN-DA', 'SCRN-ElC', 
#               'SCRN-eNP', 'SCRN-iDA', 'SCRN-lNP', 'SCRN-NlC', 'SCRN-PFPP']
modalities = ['PDUI', 'RNAB', 'RNAS', 'SCRN-DA', 'SCRN-ElC', 
              'SCRN-eNP', 'SCRN-iDA', 'SCRN-lNP', 'SCRN-NlC', 'SCRN-PFPP', 
              'Bryois-Astro', 'Bryois-Endo', 'Bryois-ExN', 'Bryois-InN', 
              'Bryois-Micro', 'Bryois-OPC', 'Bryois-Oligo', 'Bryois-Peri']
# resolution used both for the inline figure and for the saved image
dpi_value = 100
# only results with H4 strictly greater than this value are kept
min_h4 = 0.05

### for each day and modality load the colocalization results

In [None]:
# Load the colocalization results for every day/modality combination.
# Frames are collected in a list and concatenated once at the end, rather than
# growing a DataFrame inside the loop. Combinations that have no result file
# are reported and skipped.
loaded_frames = []
for day in days:
    for modality in modalities:
        in_file = f'{results_dir}/{cohort}_{day}_{modality}_{dx}.coloc.pp.csv'
        if not exists(in_file):
            print(f'{day} {modality}: no results file, skipping')
            continue
        # tag each result with the day and modality it came from
        this_df = read_csv(in_file).assign(day=day, modality=modality)
        print(f'{day} {modality}: loaded {this_df.shape[0]} results')
        loaded_frames.append(this_df)

# fail with a clear message instead of an AttributeError on None further down
if not loaded_frames:
    raise FileNotFoundError(
        f'no colocalization result files found in {results_dir} '
        f'for cohort {cohort} and dx {dx}')

# ignore_index gives the combined frame a unique row index
coloc_df = concat(loaded_frames, ignore_index=True)
print(f'\ntotal results loaded {coloc_df.shape[0]}')
if DEBUG:
    display(coloc_df.sample(min(5, coloc_df.shape[0])))
    display(coloc_df.day.value_counts())
    display(coloc_df.modality.value_counts())

### load needed feature information

#### load RNAB features
RNAB features use Ensembl IDs, which need to be mapped to gene names

In [None]:
%%time
# load the pickled annotation table; the following cells use its gene_id and
# gene_name columns to map RNAB feature IDs to gene names
# NOTE(review): unpickling can execute arbitrary code, only load a trusted file
annots_df = read_pickle(rnab_features_file)
print(f'annotations shape: {annots_df.shape}')

if DEBUG:
    display(annots_df.head())

##### dereference feature IDs to feature names

In [None]:
# Build the gene_id -> gene_name lookup for the features present in the results.
# Row filter, column selection and de-duplication are done in one non-inplace
# chain, so no inplace method is called on a frame derived from a slice of
# annots_df.
feature_annots = (
    annots_df
    .loc[annots_df.gene_id.isin(coloc_df.feature), ['gene_id', 'gene_name']]
    .drop_duplicates()
)
features_dict = feature_annots.set_index('gene_id')['gene_name'].to_dict()
if DEBUG:
    display(features_dict)

#### replace the RNAB features

In [None]:
# Swap feature IDs for gene names using features_dict; features with no entry
# in the dict are left unchanged.
# The result is assigned back to the column. Calling an inplace method on the
# `coloc_df.feature` accessor is chained assignment, which does not update
# coloc_df when pandas Copy-on-Write is active.
coloc_df['feature'] = coloc_df['feature'].replace(features_dict)
print(f'update df shape {coloc_df.shape}')
if DEBUG:
    display(coloc_df.loc[coloc_df.modality == 'RNAB'].head())

#### load PDUI features

In [None]:
%%time
# load the PDUI feature table; the following cells use its Loci and Gene
# columns to map PDUI feature IDs to gene names
# note: this rebinds annots_df, replacing the RNAB annotations loaded earlier
annots_df = read_csv(pdui_features_file)
print(f'annotations shape: {annots_df.shape}')

if DEBUG:
    display(annots_df.head())

##### dereference feature IDs to feature names

In [None]:
# Build the Loci -> Gene lookup for the PDUI features present in the results.
# An explicit copy is taken so the assignment below modifies an independent
# frame rather than a slice of annots_df.
feature_annots = annots_df.loc[annots_df.Loci.isin(coloc_df.feature),
                               ['Loci', 'Gene']].copy()
# some genes have multiple PDUI features; label those as 'Gene:Loci' so each
# feature keeps a distinct name
multi_features = feature_annots.duplicated(subset=['Gene'], keep=False)
feature_annots.loc[multi_features, 'Gene'] = feature_annots.Gene + ':' + feature_annots.Loci
features_dict = feature_annots.set_index('Loci')['Gene'].to_dict()
if DEBUG:
    display(features_dict)

#### replace the PDUI features

In [None]:
# Swap PDUI feature IDs for gene-based names using features_dict; features
# with no entry in the dict are left unchanged.
# The result is assigned back to the column. Calling an inplace method on the
# `coloc_df.feature` accessor is chained assignment, which does not update
# coloc_df when pandas Copy-on-Write is active.
coloc_df['feature'] = coloc_df['feature'].replace(features_dict)
print(f'update df shape {coloc_df.shape}')
if DEBUG:
    display(coloc_df.loc[coloc_df.modality == 'PDUI'].head())

### subset based on the minimum H4 variable

In [None]:
# keep only the rows whose H4 value is strictly above the configured minimum
coloc_df = coloc_df[coloc_df['H4'].gt(min_h4)]
print(f'results shape after filter on H4 {coloc_df.shape}')
if DEBUG:
    display(coloc_df.head())

### reshape the dataframe from long to wide

In [None]:
# Reshape the long results into a wide feature x (day, modality) matrix of H4.

# preferred row ordering: FOUNDIN DA neuron results first
sort_keys = [('da65', 'SCRN-DA'), ('da65', 'SCRN-iDA'),
             ('da65', 'RNAB'), ('da65', 'PDUI')]

# keep only the columns the pivot needs, ordered by modality then day
temp_df = (
    coloc_df[['feature', 'day', 'modality', 'H4']]
    .sort_values(by=['modality', 'day'])
)
wcoloc_df = (
    pivot(temp_df, index='feature', columns=['day', 'modality'], values='H4')
    # set precision
    .round(2)
    # replace all zeros with NaN values
    .replace(0, np.nan)
    # drop rows that are all null
    .dropna(how='all')
)
# a day/modality pair is absent from the columns when it had no result file or
# no result passed the H4 filter; sort only by the keys that are present so a
# missing one does not raise a KeyError
present_keys = [key for key in sort_keys if key in wcoloc_df.columns]
missing_keys = [key for key in sort_keys if key not in wcoloc_df.columns]
if missing_keys:
    print(f'sort columns not present, skipped: {missing_keys}')
if present_keys:
    wcoloc_df = wcoloc_df.sort_values(by=present_keys, ascending=False)
print(f'shape of wide reformated results {wcoloc_df.shape}')
if DEBUG:
    display(wcoloc_df)

### visualize the reformatted data as a heatmap

In [None]:

# Draw the wide H4 matrix as an annotated heatmap, save it to figure_file and
# show it inline.

# grow the figure height with the number of features (rows) being plotted
n_features = wcoloc_df.shape[0]
height = 9 + ceil(n_features / 6) if n_features > 9 else 9
print(dx, height)

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names; use whichever name this install has
style_name = next((name for name in ('seaborn-v0_8-bright', 'seaborn-bright')
                   if name in plt.style.available), None)

with rc_context({'figure.figsize': (11, height), 'figure.dpi': dpi_value}):
    if style_name is not None:
        plt.style.use(style_name)
    # the figure is created inside the context so it picks up the size and dpi
    fig, ax = plt.subplots()
    heatmap(wcoloc_df, annot=True, linecolor='grey',
            annot_kws={"fontsize":10}, linewidths=0.05, cmap='Blues', ax=ax)
    ax.set_title(f'Colocalization H4 for {dx} and QTL')
    fig.savefig(figure_file, dpi=dpi_value, bbox_inches='tight',
                transparent=True, pad_inches=1)
    plt.show()

In [None]:
!date