## Notebook to visualize omics QTL and colocalization with PD risk tracks for the FOUNDIN-PD QTL project

Start with a single figure of localized tracks for each region, including GWAS, gene annotations, 'risk' ATAC peaks, colocalization H4 probabilities for different omics features, and then QTL tracks. Also include ENCODE CREs, then try to incorporate the Hi-C data and SCAT co-accessibility.

using pyGenomeTracks to generate the figure
- [pyGenomeTracks github](https://github.com/deeptools/pyGenomeTracks)
- [pyGenomeTrack docs](https://pygenometracks.readthedocs.io/en/latest/index.html)

In [None]:
# print the wall-clock time when the run starts
!date

#### import libraries

In [None]:
from pandas import read_parquet, read_csv, DataFrame, read_hdf, concat
from math import ceil, floor
from numpy import log10
from IPython.display import Image
from os.path import exists

#### set notebook variables

In [None]:
# parameters
# NOTE(review): presumably overridden per run by a parameter-injection tool — confirm
# variant id of the GWAS index variant the locus figure is centered on
index_variant = 'rs13294100'
# day and modality are combined into set_name, which names the coloc/causals result files
day = 'daNA'
modality = 'DAn-meta'

In [None]:
# naming
cohort = 'foundin'
dx = 'PD'
# prefix shared by the colocalization and causals result files for this run
set_name = f'{cohort}_{day}_{modality}_{dx}'

# directories
# NOTE(review): absolute path is hardcoded — adjust when running outside this environment
wrk_dir = '/home/jupyter/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
results_dir = f'{wrk_dir}/results'
public_dir = f'{wrk_dir}/public'
figures_dir = f'{wrk_dir}/figures'

# in files
gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_23andme_buildGRCh38.tsv.gz'
index_variants_file = f'{public_dir}/nalls_pd_gwas/index_variants.list'
# gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_no23andme_buildGRCh38.tsv.gz'
gene_annot_file = f'{public_dir}/hg38.ncbiRefSeq.gtf.gz'
encode_bb_file = f'{public_dir}/encodeCcreCombined.bb'
coloc_file = f'{results_dir}/{set_name}.coloc.pp.csv'
# NOTE(review): 'casuals' looks like a misspelling of 'causals' — confirm it matches the file name on disk
causals_file = f'{results_dir}/{set_name}.casuals.pp.parquet'
ld_info_file = f'{public_dir}/ld_info/{cohort}_{dx}.ld_prime.csv'
risk_peaks_bed = f'{quants_dir}/{cohort}_da65_SCAT-DA_{dx}_risk_peaks.bed'

# out files
# all per-track intermediate files and the .ini config are written under figures/tracks
figure_file = f'{figures_dir}/{cohort}.{index_variant}.{dx}.locus.png'
config_file = f'{figures_dir}/tracks/{cohort}.{index_variant}.{dx}.locus.ini'
gwas_bedgraph_file = f'{figures_dir}/tracks/{dx}.gwas.{index_variant}.bedgraph'
encode_bed_file = f'{figures_dir}/tracks/{dx}.{index_variant}.encodeCcreCombined.bed'

# variables
# DEBUG gates the display()/print() inspection output throughout the notebook
DEBUG = False
# track config text; later cells append to it, so re-run from this cell to avoid duplicated tracks
config_content = ''
# newline held in a variable so it can be interpolated inside the f-string track templates
nl = '\n'
# not referenced by any active cell visible in this notebook
HIGH_LD = 0.8
# half-width (bp) of the initial window around the index variant
MAX_DIST = 1500000
# padding (bp) added on both sides of the outermost sub-threshold GWAS variants
FLANK_PAD = 5000
# GWAS p-value cutoff used to define the risk region edges
SUG_THRESHOLD = 1.00e-05
# co-accessibility scores must exceed this to be kept as arcs
MIN_CICERO = 0.2
# colocalization H4 must exceed this for a feature to get tracks
min_h4 = 0.2
days = ['da0', 'da25', 'da65']
day_colors = {'da0': 'black', 'da25': 'green', 'da65': 'blue'}

#### utility functions

In [None]:
# run a command line process with bash magic
# make this a small function so it can be the target function for parallelization if needed
def run_bash_cmd(this_cmd: str, verbose: bool=False):
    """Run a shell command through the IPython `!` shell escape.

    Args:
        this_cmd: the full command line to execute.
        verbose: when True, print the command before running it.
    """
    if verbose:
        print(this_cmd)
    # IPython-only syntax: the value of this_cmd is interpolated and run in a shell
    !{this_cmd}

### GWAS track

#### load the summary stats

In [None]:
%%time
# read the full tab-separated GWAS summary statistics table
gwas_df = read_csv(gwas_sum_stats_file, sep='\t')
print(gwas_df.shape)
if DEBUG:
    display(gwas_df.sample(5))

#### load the risk index variants

In [None]:
%%time
# read the list of risk index variants; the file is expected to have a 'variant' column
index_df = read_csv(index_variants_file)
print(index_df.shape)
# unique variant ids, used later to highlight index variants inside the plotted region
index_variants = list(index_df.variant.unique())
if DEBUG:
    display(index_df.head())
    print(index_variants)

#### set the actual edges based on statistically significant p-values for region centered on index variant

In [None]:
# locate the index variant in the summary stats so the region can be centered on it
risk_index = gwas_df.loc[gwas_df.variant_id == index_variant]
if risk_index.empty:
    # fail with an explicit message rather than an opaque IndexError from .values[0]
    raise ValueError(f'index variant {index_variant} not found in {gwas_sum_stats_file}')
chrom = risk_index.chromosome.values[0]
index_bp = risk_index.base_pair_location.values[0]
# initial window spans MAX_DIST on either side of the index variant
start_bp = index_bp - MAX_DIST
stop_bp = index_bp + MAX_DIST
print(f'initial region is chrom{chrom}:{start_bp}-{stop_bp}')

In [None]:
# keep only variants on the index chromosome that fall inside the initial window
in_window = gwas_df.base_pair_location.between(start_bp, stop_bp)
gwas_df = gwas_df.loc[(gwas_df.chromosome == chrom) & in_window]
# variants at or below the suggestive threshold define the edges of the risk region
risk_df = gwas_df.loc[gwas_df.p_value <= SUG_THRESHOLD]
risk_positions = risk_df.base_pair_location
# pad the outermost passing variants on both sides
start_bp = risk_positions.min() - FLANK_PAD
stop_bp = risk_positions.max() + FLANK_PAD
print(f'risk region is chrom{chrom}:{start_bp}-{stop_bp}')
print(f'risk region shape {risk_df.shape}')
if DEBUG:
    display(risk_df.head())

#### convert gwas df to bedgraph

In [None]:
# re-filter the summary stats to the narrower risk region edges
region_mask = ((gwas_df.chromosome == chrom) &
               gwas_df.base_pair_location.between(start_bp, stop_bp))
gwas_df = gwas_df.loc[region_mask]
# one-base intervals with -log10(p) as the signal and the variant id as the last column
gwas_bed = (
    gwas_df[['chromosome', 'base_pair_location', 'variant_id']]
    .rename(columns={'chromosome': 'chr', 'base_pair_location': 'position',
                     'variant_id': 'id'})
    .assign(end=lambda bed: bed['position'] + 1,
            signal=-log10(gwas_df.p_value))
)
gwas_bed = gwas_bed[['chr', 'position', 'end', 'signal', 'id']]
print(gwas_bed.shape)
if DEBUG:
    display(gwas_bed.sample(5))
# write the track file without header or index
gwas_bed.to_csv(gwas_bedgraph_file, index=False, header=False, sep='\t')

#### highlight index variants in the region

In [None]:
# subset the region's GWAS points to the known index variants so they can be overlaid separately
index_bed = gwas_bed.loc[gwas_bed.id.isin(index_variants)]
print(f'index variants included {index_bed.shape}')
index_bedgraph_file = f'{figures_dir}/tracks/{dx}.gwas.{index_variant}.indices.bedgraph'
index_bed.to_csv(index_bedgraph_file, index=False, header=False, sep='\t')
if DEBUG:
    display(index_bed)

#### format gwas track config string

In [None]:
# GWAS track sections: all region variants as black points, an overlaid horizontal line at 7.3,
# and the index variants from index_bedgraph_file overlaid as red points sharing the same y-axis;
# the y-axis maximum is the rounded-up largest -log10(p) in the region
this_content = f'[spacer]{nl}[GWAS]{nl}\
title = {dx} GWAS{nl}\
file = {gwas_bedgraph_file}{nl}\
file_type = bedgraph{nl}\
type = points:6{nl}\
color = black{nl}\
use_middle = true{nl}\
min_value = 0{nl}\
max_value = {ceil(gwas_bed.signal.max())}{nl}\
height = 3{nl}{nl}\
[hlines ovelayed]{nl}\
color = grey{nl}\
line_width = 1{nl}\
line_style = dotted{nl}\
y_values = 7.3{nl}\
file_type = hlines{nl}\
overlay_previous = share-y{nl}\
[GWAS]{nl}\
file = {index_bedgraph_file}{nl}\
file_type = bedgraph{nl}\
type = points:8{nl}\
color = red{nl}\
use_middle = true{nl}\
overlay_previous = share-y{nl}\
[x-axis]{nl}{nl}'

# append to the accumulated config; re-running this cell alone would add the tracks a second time
config_content = config_content + this_content

if DEBUG:
    print(this_content)

#### convert LD info to bedgraph file

In [None]:
# load the pairwise LD table and keep the rows where the index variant is SNP_A
ld_info_df = read_csv(ld_info_file)
print(f'shape of LD info {ld_info_df.shape}')
ld_info_df = ld_info_df.loc[ld_info_df.SNP_A == index_variant]
print(f'shape of LD info for risk indix variant is {ld_info_df.shape}')
# one-base intervals at each partner variant (SNP_B) carrying the DP value as the signal
ld_bed = (
    ld_info_df[['CHR_B', 'BP_B', 'SNP_B', 'DP']]
    .rename(columns={'CHR_B': 'chr', 'BP_B': 'position', 'SNP_B': 'id'})
    .assign(end=lambda bed: bed['position'] + 1)
)
# ld_bed.DP = ld_bed.DP * 100
ld_bed = ld_bed[['chr', 'position', 'end', 'DP', 'id']]
index_ld_bed_file = f'{figures_dir}/tracks/{dx}.gwas.{index_variant}.ld.bedgraph'
ld_bed.to_csv(index_ld_bed_file, index=False, header=False, sep='\t')
if DEBUG:
    display(ld_info_df.head())
    display(ld_bed.head())

#### format LD info track config string

In [None]:
# LD track section: points from the index-variant LD bedgraph on a fixed 0 to 1 axis
this_content = f'{nl}{nl}\
[LD]{nl}\
title = AMP-PD D\'{nl}\
file = {index_ld_bed_file}{nl}\
use_middle = true{nl}\
height = 1{nl}\
fontsize = 8{nl}\
file_type = bedgraph{nl}\
display = collapsed{nl}\
labels = false{nl}\
type = points:6{nl}\
color = black{nl}\
min_value = 0{nl}\
max_value = 1{nl}\
{nl}{nl}'

# append to the accumulated config
config_content = config_content + this_content

if DEBUG:
    print(this_content)

### format the genes annot config string

In [None]:
# gene annotation track section drawn directly from the GTF file
this_content = f'{nl}{nl}\
[genes]{nl}\
title = Genes{nl}\
file = {gene_annot_file}{nl}\
height = 2{nl}\
color = darkblue{nl}\
merge_transcripts = true{nl}\
prefered_name = gene_name{nl}\
fontsize = 12{nl}\
style = UCSC{nl}\
max_labels = 200{nl}\
file_type = gtf{nl}{nl}'

# append to the accumulated config
config_content = config_content + this_content

if DEBUG:
    print(this_content)

### eQTL colocalization track

#### load the colocalization feature probabilities

In [None]:
# per-feature colocalization results; only features with H4 above min_h4 get tracks
coloc_df = read_csv(coloc_file)
print(f'colocalization shape {coloc_df.shape}')
# subset based on min
coloc_df = coloc_df.loc[coloc_df.H4 > min_h4]
print(f'colocalization shape at min_h4 of {min_h4}: {coloc_df.shape}')
if DEBUG:
    display(coloc_df)

#### load the colocalization variant probabilities

In [None]:
def format_qtl_bedgraph(df: DataFrame, out_file: str, verbose: bool=True) -> float:
    """Write a QTL result table as a bedgraph-style track and return its peak signal.

    Args:
        df: QTL results indexed by variant id, with 'chromosome',
            'base_pair_location' and 'p_value_qtl' columns.
        out_file: path of the tab-separated track file to write (no header, no index).
        verbose: when True, display the first rows of the track table.

    Returns:
        The maximum -log10(p_value_qtl) written to the track.
    """
    bed_columns = ['chromosome', 'base_pair_location', 'end', 'signal', 'variant_id']
    # one-base interval per variant; signal is -log10 of the QTL p-value
    track = df[['chromosome', 'base_pair_location']].assign(
        variant_id=df.index.values,
        end=lambda bed: bed['base_pair_location'] + 1,
        signal=-log10(df.p_value_qtl),
    )
    track = track[bed_columns]
    print(track.shape)
    if verbose:
        display(track.head())
    track.to_csv(out_file, index=False, header=False, sep='\t')
    return track.signal.max()

def format_pip_bedgraph(df: DataFrame, feature: str, verbose: bool=True) -> None:
    """Write one bedgraph-style track per posterior-probability type for a feature.

    For each of 'risk', 'qtl' and 'h4' the '<type>_PiP' column of df is written as the
    signal of one-base intervals to
    '{figures_dir}/tracks/{dx}.{feature}_{type}_pip.{index_variant}.bedgraph'
    (figures_dir, dx and index_variant are read from the notebook globals).

    Args:
        df: causal-variant table indexed by variant id, with 'chromosome',
            'base_pair_location' and the three '<type>_PiP' columns.
        feature: feature name used in the output file names.
        verbose: when True, display the first rows of each track table.

    Returns:
        None; the tracks are written to disk.
    """
    bed_columns = ['chromosome', 'base_pair_location', 'end', 'signal', 'variant_id']
    # interval columns are shared by all three tracks, only the signal changes
    base_bed = df[['chromosome', 'base_pair_location']].copy()
    base_bed['variant_id'] = df.index.values
    base_bed['end'] = base_bed['base_pair_location'] + 1
    for pp_type in ['risk', 'qtl', 'h4']:
        # take the column as a 1-D array: avoids assigning a one-column DataFrame
        # and does not depend on index alignment (variant ids need not be unique)
        pip_bed = base_bed.assign(signal=df[f'{pp_type}_PiP'].values)[bed_columns]
        this_file = f'{figures_dir}/tracks/{dx}.{feature}_{pp_type}_pip.{index_variant}.bedgraph'
        pip_bed.to_csv(this_file, index=False, header=False, sep='\t')
        print(f'{pp_type} bed shape {pip_bed.shape}')
        print(f'{pp_type} bed max {pip_bed.signal.max()}')
        if verbose:
            display(pip_bed.head())

In [None]:
# running maximum of the -log10(p) signal over all QTL tracks, used for a shared y-axis limit
qtl_max = 0
causals_df = read_parquet(causals_file)
# rename PIPs for ease of use later
causals_df = causals_df.rename(columns={'PP_risk': 'risk_PiP',
                                        'PP_qtl': 'qtl_PiP',
                                        'h4_pp': 'h4_PiP'})
print(f'causals shape {causals_df.shape}')
index_subset = causals_df.loc[causals_df.index == index_variant]
print(f'index variant causals shape {index_subset.shape}')
# MarkerName starts with '<feature>:'; .str[0] (instead of expand=True) also works
# when the index variant has no rows in the causals table
index_features = index_subset.MarkerName.str.split(':').str[0].unique()
# only include those above min_H4; sorted so the track order is the same on every run
features = sorted(set(index_features) & set(coloc_df.feature))
for feature in features:
    print(feature)
    feature_causals_df = causals_df.loc[causals_df.MarkerName.str.startswith(f'{feature}:')]
    print(f'feature causals shape {feature_causals_df.shape}')
    qtl_out_file = f'{figures_dir}/tracks/{dx}.{feature}.qtl.{index_variant}.bedgraph'
    feature_max = format_qtl_bedgraph(feature_causals_df, qtl_out_file, verbose=DEBUG)
    format_pip_bedgraph(feature_causals_df, feature, verbose=DEBUG)
    if feature_max > qtl_max:
        qtl_max = feature_max
    if DEBUG:
        display(feature_causals_df.head())

#### format qtl tracks config string

In [None]:
# one QTL points track per colocalized feature, all sharing the same y-axis maximum
# (qtl_max + 1 rounded up), each with a solid horizontal line overlaid at y = 0
# NOTE(review): the extra-indented line after `title` looks intended as a second title line —
# confirm how pyGenomeTracks parses the indentation inside this template
for feature in features:
    qtl_bedgraph_file = f'{figures_dir}/tracks/{dx}.{feature}.qtl.{index_variant}.bedgraph'
    this_content = f'[eQTL]{nl}\
    title = {feature}{nl}\
        DAn meta-eQTL{nl}\
    file = {qtl_bedgraph_file}{nl}\
    file_type = bedgraph{nl}\
    type = points:8{nl}\
    color = black{nl}\
    use_middle = true{nl}\
    max_value = {ceil(qtl_max+1)}{nl}\
    height = 3{nl}\
    [hlines ovelayed]{nl}\
    color = black{nl}\
    line_width = 0.5{nl}\
    line_style = solid{nl}\
    y_values = 0{nl}\
    file_type = hlines{nl}\
    overlay_previous = share-y{nl}\
    {nl}{nl}'

    # disabled template option: min_value = 0

    # append to the accumulated config
    config_content = config_content + this_content

    if DEBUG:
        print(this_content)

#### format risk and qtl colocalization PiP tracks config string

In [None]:
# for feature in features:
#     this_file = f'{figures_dir}/tracks/{dx}.{feature}_risk_pip.{index_variant}.bedgraph'
#     this_content = f'[PiPs]{nl}\
#     title = {modality} PiPs{nl}\
#         {dx} (black){nl}\
#         {feature} (purple){nl}\
#         H4 (red){nl}\
#     file = {this_file}{nl}\
#     file_type = bedgraph{nl}\
#     type = points:8{nl}\
#     color = black{nl}\
#     use_middle = true{nl}\
#     min_value = 0{nl}\
#     max_value = 1.1{nl}\
#     height = 2{nl}{nl}'

#     if exists(this_file):
#         config_content = config_content + this_content

#     if DEBUG:
#         print(this_content)

#     this_file = f'{figures_dir}/tracks/{dx}.{feature}_qtl_pip.{index_variant}.bedgraph'    
#     this_content = f'[PiPs]{nl}\
#     file = {this_file}{nl}\
#     file_type = bedgraph{nl}\
#     type = points:8{nl}\
#     color = purple{nl}\
#     use_middle = true{nl}\
#     overlay_previous = share-y{nl}{nl}'

#     if exists(this_file):
#         config_content = config_content + this_content
    
#     this_file = f'{figures_dir}/tracks/{dx}.{feature}_h4_pip.{index_variant}.bedgraph'
#     this_content = f'[PiPs]{nl}\
#     file = {this_file}{nl}\
#     file_type = bedgraph{nl}\
#     type = points:8{nl}\
#     color = red{nl}\
#     use_middle = true{nl}\
#     overlay_previous = share-y{nl}\
#     [x-axis]{nl}{nl}'

#     if exists(this_file):
#         config_content = config_content + this_content

#     if DEBUG:
#         print(this_content)        

### format the SCAT peaks

#### format SCAT-DA peaks that include risk variants

In [None]:
# NOTE(review): read with the default comma separator although the file is named .bed —
# confirm the format written upstream
peaks_df = read_csv(risk_peaks_bed)
print(peaks_df.shape)
# keep peaks on the region's chromosome that lie fully inside the region
on_chrom = peaks_df.Chr == f'chr{chrom}'
inside_region = (peaks_df.Start >= start_bp) & (peaks_df.End <= stop_bp)
peaks_df = peaks_df.loc[on_chrom & inside_region]
# constant signal of 1 marks each peak; the feature id is carried as the last column
peaks_bed = peaks_df[['Chr', 'Start', 'End']].assign(signal=1, id=peaks_df.feature_id)
risk_peaks_bedgraph_file  = f'{figures_dir}/tracks/{dx}.risk_peaks.{index_variant}.bedgraph'
peaks_bed.to_csv(risk_peaks_bedgraph_file, index=False, header=False, sep='\t')
print(peaks_bed.shape)
if DEBUG:
    display(peaks_df.head())
    display(peaks_bed.head())

In [None]:
# thin track marking the risk peaks written to risk_peaks_bedgraph_file
this_content = f'[risk peaks]{nl}\
title = Risk Peaks{nl}\
file = {risk_peaks_bedgraph_file}{nl}\
color = grey{nl}\
max_value = 1{nl}\
min_value = 0{nl}\
show_data_range = false{nl}\
height = 0.5{nl}{nl}'

# append to the accumulated config
config_content = config_content + this_content

if DEBUG:
    print(this_content)

In [None]:
# this is from Frank's formatting and files
# three signal tracks read directly from precomputed '*.means.bedgraph' files in quants_dir,
# all drawn on the same fixed 0.0 to 0.70 axis
this_content = f'[Dopaminergic_Neurons]{nl}\
file = {quants_dir}/foundin_da65_SCAT-DA.means.bedgraph{nl}\
color = #FFD700{nl}\
height = 1.5{nl}\
title = SCAT-DA{nl}\
min_value = 0.0{nl}\
max_value = 0.70{nl}{nl}\
[Immature_Dopaminergic_Neurons]{nl}\
file = {quants_dir}/foundin_da65_SCAT-iDA.means.bedgraph{nl}\
color = #EE5C42{nl}\
height = 1.5{nl}\
title = SCAT-iDA{nl}\
min_value = 0.0{nl}\
max_value = 0.70{nl}{nl}\
[Late_neuron_Progenitor]{nl}\
file = {quants_dir}/foundin_da65_SCAT-lNP.means.bedgraph{nl}\
color = #2E8B57{nl}\
height = 1.5{nl}\
title = SCAT-LNP{nl}\
min_value = 0.0{nl}\
max_value = 0.70{nl}{nl}'

# append to the accumulated config
config_content = config_content + this_content

if DEBUG:
    print(this_content)

### SCAT co-accessibility

In [None]:
def format_cicero_arcs(cicero_df: DataFrame, chrom: str, verbose: bool=False) -> DataFrame:
    """Turn Cicero peak-pair connections into arc rows for the current risk region.

    Columns 0 and 1 of cicero_df hold the two peak names and column 2 the
    co-accessibility score. Region edges (start_bp, stop_bp) and the score
    cutoff (MIN_CICERO) are read from the notebook globals.

    Args:
        cicero_df: headerless connections table.
        chrom: chromosome label without the 'chr' prefix.
        verbose: accepted for call-site symmetry; not used.

    Returns:
        Rows with chrom1/start1/stop1/chrom2/start2/stop2/coaccess whose first
        peak lies inside the region and whose score exceeds MIN_CICERO.
    """
    # cheap prefix filter first so only this chromosome's peak names get split
    same_chrom_df = cicero_df.loc[cicero_df[0].str.startswith(f'chr{chrom}-')]
    first_peak = same_chrom_df[0].str.split(r':|-', expand=True)
    second_peak = same_chrom_df[1].str.split(r':|-', expand=True)
    cicero_score_df = concat([first_peak, second_peak, same_chrom_df[[2]]], axis=1)
    cicero_score_df.columns = ['chrom1', 'start1', 'stop1', 'chrom2',
                               'start2', 'stop2', 'coaccess']
    # coordinates come out of the split as strings
    coord_cols = ['start1', 'stop1', 'start2', 'stop2']
    cicero_score_df[coord_cols] = cicero_score_df[coord_cols].astype('int')
    print(cicero_score_df.shape)
    # convert coaccess df to arcs: first peak inside the region, score present and above cutoff
    first_in_region = ((cicero_score_df.chrom1 == f'chr{chrom}') &
                       (cicero_score_df.start1 >= start_bp) &
                       (cicero_score_df.stop1 <= stop_bp))
    passes_score = (cicero_score_df['coaccess'].notna() &
                    (cicero_score_df.coaccess > MIN_CICERO))
    cicero_arcs_df = cicero_score_df.loc[first_in_region & passes_score]
    print(cicero_arcs_df.shape)
    return cicero_arcs_df

#### load and format the Cicero results by cell-type

In [None]:
%%time
# here not including all but most relevant
# scat_cell_types = ['Dopaminergic_Neurons', 'Immature_Dopaminergic_Neurons', 'Late_neuron_Progenitor']
scat_cell_types = ['Dopaminergic_Neurons', 'Immature_Dopaminergic_Neurons']
# running maximum co-accessibility over cell types, used for a shared arc track maximum
max_cicero_value = 0
for cell_type in scat_cell_types:
    print(cell_type)
    cicero_in_file = f'{quants_dir}/SCAT.{cell_type}.cicero.conns.txt.gz'
    cicero_out_file = f'{figures_dir}/tracks/{dx}.{cell_type}.cicero.{index_variant}.arcs'
    # first line of the file is skipped and columns are addressed by position
    cicero_df = read_csv(cicero_in_file, sep='\t',skiprows=1, header=None)
    print(cicero_df.shape)
    cicero_arcs_df = format_cicero_arcs(cicero_df, chrom, verbose=DEBUG)
    this_max = cicero_arcs_df.coaccess.max()
    print(this_max)
    # when no arcs pass the filters this_max is NaN and the comparison leaves the maximum unchanged
    if this_max > max_cicero_value:
        max_cicero_value = this_max
    cicero_arcs_df.to_csv(cicero_out_file, index=False, header=False, sep='\t')
    if DEBUG:
        display(cicero_df.sample(5))

### format cicero SCAT track config string

In [None]:
# scat_colors = {'Dopaminergic_Neurons': '#FFD700', 'Immature_Dopaminergic_Neurons': '#EE5C42', 
#                'Late_neuron_Progenitor': '#2E8B57', 'Early_neuron_Progenitor': '#4682B4', 
#                'Proliferating_Floor_Plate_Progenitors': '#8B8B00'}
# color value and short label per cell type for the arc tracks
scat_colors = {'Dopaminergic_Neurons': 'Blues', 
               'Immature_Dopaminergic_Neurons': 'Reds', 
               'Late_neuron_Progenitor': 'Greens'}
cell_type_abbrvs = {'Dopaminergic_Neurons': 'DA', 
               'Immature_Dopaminergic_Neurons': 'iDA', 
               'Late_neuron_Progenitor': 'LNP'}
# one inverted arcs track per cell type; all share the rounded-up maximum co-accessibility
for cell_type in scat_cell_types:
    cicero_arcs_file = f'{figures_dir}/tracks/{dx}.{cell_type}.cicero.{index_variant}.arcs'
    # .get returns None for a cell type missing from the dicts, which would be written as 'None'
    this_color = scat_colors.get(cell_type)
    cell_abbrv = cell_type_abbrvs.get(cell_type)
    this_content = f'[Cierco]{nl}\
title = SCAT-{cell_abbrv}{nl}\
    co-accessibility{nl}\
file = {cicero_arcs_file}{nl}\
file_type = links{nl}\
color = {this_color}{nl}\
min_value = -0.2{nl}\
max_value = {ceil(max_cicero_value)}{nl}\
height = 4{nl}\
line_type = arcs{nl}\
line_width = 0.5{nl}\
orientation = inverted{nl}\
compact_arcs_level = 1{nl}{nl}'

    # append to the accumulated config
    config_content = config_content + this_content

    if DEBUG:
        print(this_content)

### bulk feature ~ DA cell fractions tracks

In [None]:
epi_types = ['ATAC', 'METH']
for epi_type in epi_types:
    print(epi_type)
    this_file = f'{results_dir}/{cohort}_{epi_type}_DopaminergicNeurons_lmm.csv'
    cell_fracs_df = read_csv(this_file, index_col=0)
    # z statistic of the model term; undefined values are plotted at 0
    cell_fracs_df['z'] = cell_fracs_df.coef/cell_fracs_df.stderr
    cell_fracs_df.z = cell_fracs_df.z.fillna(0)
    if epi_type == 'ATAC':
        # these peak names are position named so split and use
        peaks_info = cell_fracs_df.index.str.split('_', expand=True).to_frame()
        peaks_info.columns = ['chrom','start', 'end']
        cell_fracs_df['chrom'] = peaks_info.chrom.values
        cell_fracs_df['start'] = peaks_info.start.astype(int).values
        cell_fracs_df['end'] = peaks_info.end.astype(int).values
    elif epi_type == 'METH':
        # probe positions come from the annotation file, joined on the feature id
        features_file = f'{quants_dir}/EPIC_annotation_hg38.txt'
        features_df = read_csv(features_file, sep='\t', header=None)
        features_df.columns = ['chrom', 'start', 'end', 'feature']
        # widen each annotated interval by 5 bp at the end
        features_df.end = features_df.end + 5
        cell_fracs_df = cell_fracs_df.merge(features_df, how='inner', left_index=True, right_on='feature')
        cell_fracs_df = cell_fracs_df.set_index('feature')
    # chromosome format
    cell_fracs_df.chrom = cell_fracs_df.chrom.str.replace('chr', '')
    # format the bedgraph; copy so the added column is not written onto a slice of cell_fracs_df
    frac_bed = cell_fracs_df[['chrom', 'start', 'end', 'z']].copy()
    frac_bed['feature'] = cell_fracs_df.index.values
    # subset to region
    frac_bed = frac_bed.loc[(frac_bed.chrom == str(chrom)) &
                            (frac_bed.start >= start_bp) &
                            (frac_bed.end <= stop_bp)]
    this_out_file = f'{figures_dir}/tracks/{dx}.DA.{epi_type}.corr_cell_fracs.{index_variant}.bedgraph'
    frac_bed.to_csv(this_out_file, index=False, header=False, sep='\t')
    if DEBUG:
        display(cell_fracs_df.head())
        display(frac_bed.head())

##### format feature ~ cell fraction config string

In [None]:
# one z-score track per epigenomic type, each with overlaid horizontal lines at 0 (solid)
# and at +1.96 / -1.96 (dotted)
for epi_type in epi_types:
    fracs_corr_bedgraph_file = f'{figures_dir}/tracks/{dx}.DA.{epi_type}.corr_cell_fracs.{index_variant}.bedgraph'
    this_content = f'[DA {epi_type} cis_corr]{nl}\
    title = DA ~ {epi_type}{nl}\
    file = {fracs_corr_bedgraph_file}{nl}\
    file_type = bedgraph{nl}\
    color = black{nl}\
    height = 2{nl}\
    [hlines ovelayed]{nl}\
    color = black{nl}\
    line_width = 0.5{nl}\
    line_style = solid{nl}\
    y_values = 0{nl}\
    file_type = hlines{nl}\
    overlay_previous = share-y{nl}\
    [hlines ovelayed]{nl}\
    color = grey{nl}\
    line_width = 0.5{nl}\
    line_style = dotted{nl}\
    y_values = 1.96{nl}\
    file_type = hlines{nl}\
    overlay_previous = share-y{nl}\
    [hlines ovelayed]{nl}\
    color = grey{nl}\
    line_width = 0.5{nl}\
    line_style = dotted{nl}\
    y_values = -1.96{nl}\
    file_type = hlines{nl}\
    overlay_previous = share-y{nl}{nl}'

    # append to the accumulated config
    config_content = config_content + this_content

    if DEBUG:
        print(this_content)

### caQTL colocalization scores track

In [None]:
# consensus peak annotation; the file's own header row is replaced by the names used for merging
features_file = f'{quants_dir}/{cohort}_consensus_peaks.saf'
features_df = read_csv(features_file, sep='\t')
features_df.columns = ['feature', 'chrom', 'start', 'end', 'strand']
print(f'features annot shape {features_df.shape}')
if DEBUG:
    display(features_df.head())

In [None]:
def format_chst_qtl_bedgraph(features_df: DataFrame, day: str, chst_modality: str, 
                             verbose: bool=False):
    """Write the colocalization H4 values of one day/modality as a bedgraph-style track.

    Reads '{results_dir}/{cohort}_{day}_{chst_modality}_{dx}.coloc.pp.csv', attaches
    feature coordinates, keeps features fully inside the current risk region and writes
    chrom/start/end/H4/feature to
    '{figures_dir}/tracks/{dx}.{day}_{chst_modality}.qtl_h4.{index_variant}.bedgraph'.
    The region (chrom, start_bp, stop_bp) and the path parts are read from notebook globals.

    Args:
        features_df: feature annotation with 'feature', 'chrom', 'start', 'end' columns.
        day: differentiation day label used in the file names.
        chst_modality: modality label used in the file names.
        verbose: when True, display the first rows of the intermediate tables.
    """
    this_file = f'{results_dir}/{cohort}_{day}_{chst_modality}_{dx}.coloc.pp.csv'
    coloc_df = read_csv(this_file)
    print(f'{day} {chst_modality} coloc shape {coloc_df.shape}')
    # left merge keeps coloc rows without an annotation; they drop out at the region filter
    coloc_df = coloc_df.merge(features_df, how='left', on='feature')
    print(f'annotated {day} {chst_modality} coloc shape {coloc_df.shape}')
    # format the chrom
    coloc_df.chrom = coloc_df.chrom.str.replace('chr', '')
    # subset to region; chrom labels are strings here while the notebook-level chrom comes
    # from the GWAS table and may be numeric, so compare as strings
    coloc_df = coloc_df.loc[(coloc_df.chrom == str(chrom)) & 
                            (coloc_df.start >= start_bp) & 
                            (coloc_df.end <= stop_bp)]
    print(f'risk region {day} {chst_modality} coloc shape {coloc_df.shape}')
    qtl_bed = coloc_df[['chrom', 'start', 'end', 'H4', 'feature']]
    bedgraph_file = f'{figures_dir}/tracks/{dx}.{day}_{chst_modality}.qtl_h4.{index_variant}.bedgraph'
    qtl_bed.to_csv(bedgraph_file, index=False, header=False, sep='\t')
    # honor the verbose argument (callers pass verbose=DEBUG)
    if verbose:
        display(coloc_df.head())
        display(qtl_bed.head())

In [None]:
chst_modality = 'ATAC'
# distinct loop name so the notebook-level `day` parameter is not overwritten
for this_day in days:
    print(this_day)
    format_chst_qtl_bedgraph(features_df, this_day, chst_modality, verbose=DEBUG)

#### format caQTL colocalization H4 track config string

In [None]:
# for day in days:
#     day_color = day_colors.get(day)
#     bedgraph_file = f'{figures_dir}/tracks/{dx}.{day}_{chst_modality}.qtl_h4.{index_variant}.bedgraph'
#     this_content = f'[{day} {chst_modality}]{nl}\
#     file = {bedgraph_file}{nl}\
#     title = {day} caQTL H4{nl}\
#     file_type = bedgraph{nl}\
#     color = {day_color}{nl}\
#     height = 2{nl}\
#     min_value = 0.0{nl}\
#     max_value = 1{nl}\
#     [hlines ovelayed]{nl}\
#     color = black{nl}\
#     line_width = 0.5{nl}\
#     line_style = solid{nl}\
#     y_values = 0{nl}\
#     file_type = hlines{nl}\
#     overlay_previous = share-y{nl}\
#     {nl}{nl}'

#     config_content = config_content + this_content

#     if DEBUG:
#         print(this_content)

### mQTL colocalization scores track

In [None]:
# methylation probe annotation (headerless); note this replaces the ATAC features_df loaded earlier
features_file = f'{quants_dir}/EPIC_annotation_hg38.txt'
features_df = read_csv(features_file, sep='\t', header=None)
features_df.columns = ['chrom', 'start', 'end', 'feature']
print(f'features annot shape {features_df.shape}')
if DEBUG:
    display(features_df.head())

In [None]:
chst_modality = 'METH'
# distinct loop name so the notebook-level `day` parameter is not overwritten
for this_day in days:
    print(this_day)
    # NOTE(review): da25 is skipped, presumably because no METH results exist for it — confirm
    if this_day == 'da25':
        continue
    format_chst_qtl_bedgraph(features_df, this_day, chst_modality, verbose=DEBUG)

#### format mQTL colocalization H4 tracks config string

In [None]:
# for day in days:
#     if day == 'da25':
#         continue
#     day_color = day_colors.get(day)
#     bedgraph_file = f'{figures_dir}/tracks/{dx}.{day}_{chst_modality}.qtl_h4.{index_variant}.bedgraph'
#     this_content = f'[{day} {chst_modality}]{nl}\
#     file = {bedgraph_file}{nl}\
#     title = {day} mQTL H4{nl}\
#     file_type = bedgraph{nl}\
#     color = {day_color}{nl}\
#     height = 2{nl}\
#     min_value = 0.0{nl}\
#     max_value = 1{nl}\
#     [hlines ovelayed]{nl}\
#     color = black{nl}\
#     line_width = 0.5{nl}\
#     line_style = solid{nl}\
#     y_values = 0{nl}\
#     file_type = hlines{nl}\
#     overlay_previous = share-y{nl}\
#     {nl}{nl}'

#     config_content = config_content + this_content

#     if DEBUG:
#         print(this_content)

### Bryois CNS cell-type eQTL track

In [None]:
# file-name cell type -> short label used in track file names and the color lookup
cns_names = {'Astrocytes': 'Bryois-Astro', 'Endothelial.cells': 'Bryois-Endo', 
               'Excitatory.neurons': 'Bryois-ExN', 'Inhibitory.neurons': 'Bryois-InN', 
               'Microglia': 'Bryois-Micro', 'OPCs...COPs': 'Bryois-OPC', 
               'Oligodendrocytes': 'Bryois-Oligo', 'Pericytes': 'Bryois-Peri'}

# variant positions, renamed to the column names format_qtl_bedgraph expects
vars_df = read_csv(f'{public_dir}/bryois_brain_eqtl/snp_pos.txt', sep='\t')
vars_df = vars_df.rename(columns={'SNP': 'variant_id', 'chr': 'chromosome', 'pos_hg38': 'base_pair_location'})

for cell_type, cell_name in cns_names.items():
    print(cell_type)
    cns_file = f'{public_dir}/bryois_brain_eqtl/{cell_type}.{chrom}.gz'
    # raw string: '\s' is not a valid escape sequence in a plain string literal
    cell_qtl_df = read_csv(cns_file, sep=r'\s+', header=None)
    cell_qtl_df.columns = ['gene_info', 'variant_id', 'tss_distance', 'p_value_qtl', 'slope']
    # gene_info is '<gene_name>_<gene_id>'; split on the first underscore only
    temp_df = cell_qtl_df.gene_info.str.split('_', n=1, expand=True)
    temp_df.columns = ['gene_name', 'gene_id']
    cell_qtl_df['gene_id'] = temp_df.gene_id
    cell_qtl_df['gene_name'] = temp_df.gene_name
    # attach variant positions
    cell_qtl_df = cell_qtl_df.merge(vars_df, how='left', on='variant_id')
    cell_qtl_df.chromosome = cell_qtl_df.chromosome.str.replace('chr', '')
    cell_qtl_df = cell_qtl_df.set_index('variant_id')
    print(f'cell qtl shape {cell_qtl_df.shape} for {cell_name}')
    # use same features from foundin eQTL
    for feature in features:
        print(feature)
        feature_qtl_df = cell_qtl_df.loc[cell_qtl_df.gene_name == feature]
        print(f'feature qtl shape {feature_qtl_df.shape}')
        qtl_out_file = f'{figures_dir}/tracks/{dx}.{feature}.{cell_name}.qtl.{index_variant}.bedgraph'
        feature_max = format_qtl_bedgraph(feature_qtl_df, qtl_out_file, verbose=DEBUG)
        # keep the shared y-axis maximum up to date across all QTL sources
        if feature_max > qtl_max:
            qtl_max = feature_max
        if DEBUG:
            display(feature_qtl_df.head())

#### format Bryois CNS qtl track(s) config string

In [None]:
# point color per Bryois cell-type label
cns_colors = {'Bryois-Astro': 'blue', 'Bryois-Endo': 'purple', 
              'Bryois-ExN': 'violet', 'Bryois-InN': 'red', 
              'Bryois-Micro': 'green', 'Bryois-OPC': 'orange', 
              'Bryois-Oligo': 'black', 'Bryois-Peri': 'grey'}
# per feature: the Astrocytes track carries the title, axis limits and height; every other
# cell type is overlaid on it sharing the y-axis
for feature in features:
    for cell_type, cell_name in cns_names.items():
        print(cell_type, cell_name)
        cell_color = cns_colors.get(cell_name)
        this_file = f'{figures_dir}/tracks/{dx}.{feature}.{cell_name}.qtl.{index_variant}.bedgraph'
        # specify first cell-type Astro, then loop the rest as overlay
        if cell_type == 'Astrocytes':
            this_content = f'[CNS eQTL]{nl}\
            title = Bryois CNS{nl}\
                {feature} eQTL{nl}\
                Astro (blue){nl}\
                Endo (purple){nl}\
                ExN (violet){nl}\
                InN (red){nl}\
                Micro (green){nl}\
                OPC (orange){nl}\
                Oligo (black){nl}\
                Peri (grey){nl}\
            file = {this_file}{nl}\
            file_type = bedgraph{nl}\
            type = points:8{nl}\
            color = {cell_color}{nl}\
            use_middle = true{nl}\
            min_value = 0{nl}\
            max_value = {ceil(qtl_max+1)}{nl}\
            height = 8{nl}\
            [hlines ovelayed]{nl}\
            color = black{nl}\
            line_width = 0.5{nl}\
            line_style = solid{nl}\
            y_values = 0{nl}\
            file_type = hlines{nl}\
            overlay_previous = share-y{nl}\
            {nl}{nl}'
        else:
            this_content = f'[CNS eQTL]{nl}\
            file = {this_file}{nl}\
            file_type = bedgraph{nl}\
            type = points:8{nl}\
            color = {cell_color}{nl}\
            use_middle = true{nl}\
            overlay_previous = share-y{nl}\
            [hlines ovelayed]{nl}\
            color = black{nl}\
            line_width = 0.5{nl}\
            line_style = solid{nl}\
            y_values = 0{nl}\
            file_type = hlines{nl}\
            overlay_previous = share-y{nl}\
            {nl}{nl}'

        # append to the accumulated config
        config_content = config_content + this_content
        if DEBUG:
            print(this_content)

### MetaBrain regions eQTL track

In [None]:
# preview load of a single MetaBrain region; the next cell reloads every region in a loop
region = 'basalganglia-EUR-30PCs'
region_file = f'{public_dir}/metabrain/2021-07-23-{region}-chr{chrom}.txt.gz'
# raw string: '\s' is not a valid escape sequence in a plain string literal
region_qtl_df = read_csv(region_file, sep=r'\s+')
print(f'{region} qtl shape {region_qtl_df.shape}')
if DEBUG:
    display(region_qtl_df.head())

In [None]:
# MetaBrain file-name region -> short label used in track file names and the color lookup
region_names_dict = {
    'basalganglia-EUR-30PCs': 'basalganglia', 
    'cerebellum-EUR-60PCs': 'cerebellum', 
    'cortex-EUR-80PCs': 'cortex', 'hippocampus-EUR-30PCs': 'hippocampus', 
    'spinalcord-EUR-20PCs': 'spinalcord'
}

for region_in, region_out in region_names_dict.items():
    # log the region being processed (not the stale cell_type left over from the Bryois loop)
    print(region_in)
    region_file = f'{public_dir}/metabrain/2021-07-23-{region_in}-chr{chrom}.txt.gz'
    # raw string: '\s' is not a valid escape sequence in a plain string literal
    region_qtl_df = read_csv(region_file, sep=r'\s+')
    # split the SNP col to get variant dbSNP ID
    temp_df = region_qtl_df.SNP.str.split(':', expand=True)
    temp_df.columns = ['chrom', 'pos', 'variant_id', 'alleles']
    region_qtl_df['variant_id'] = temp_df.variant_id
    # rename to the column names format_qtl_bedgraph expects
    region_qtl_df = region_qtl_df.rename(columns={'SNPChr': 'chromosome', 
                                                  'SNPPos': 'base_pair_location', 
                                                  'MetaP': 'p_value_qtl'})
    region_qtl_df = region_qtl_df.set_index('variant_id')
    print(f'region qtl shape {region_qtl_df.shape} for {region_out}')
    # use same features from foundin eQTL
    for feature in features:
        print(feature)
        feature_qtl_df = region_qtl_df.loc[region_qtl_df.GeneSymbol == feature]
        print(f'feature qtl shape {feature_qtl_df.shape}')
        qtl_out_file = f'{figures_dir}/tracks/{dx}.{feature}.metabrain-{region_out}.qtl.{index_variant}.bedgraph'
        feature_max = format_qtl_bedgraph(feature_qtl_df, qtl_out_file, verbose=DEBUG)
        # keep the shared y-axis maximum up to date across all QTL sources
        if feature_max > qtl_max:
            qtl_max = feature_max
        if DEBUG:
            display(feature_qtl_df.head())

#### format MetaBrain region qtl track(s) config string

In [None]:
# point color per MetaBrain region label
# note: this rebinds cns_colors, replacing the Bryois cell-type colors defined in an earlier cell
cns_colors = {'basalganglia': 'blue', 'cerebellum': 'purple', 
              'cortex': 'red', 'hippocampus': 'green', 
              'spinalcord': 'orange'}
# per feature: the basalganglia track carries the title, axis limits and height; every other
# region is overlaid on it sharing the y-axis
for feature in features:
    for region_in, region_out in region_names_dict.items():
        print(region_in, region_out)
        cell_color = cns_colors.get(region_out)
        this_file = f'{figures_dir}/tracks/{dx}.{feature}.metabrain-{region_out}.qtl.{index_variant}.bedgraph'
        if region_out == 'basalganglia':
            this_content = f'[region eQTL]{nl}\
            title = MetaBrain{nl}\
                {feature} eQTL{nl}\
                basalganglia (blue){nl}\
                cerebellum (purple){nl}\
                cortex (red){nl}\
                hippocampus (green){nl}\
                spinalcord (orange){nl}\
            file = {this_file}{nl}\
            file_type = bedgraph{nl}\
            type = points:8{nl}\
            color = {cell_color}{nl}\
            use_middle = true{nl}\
            min_value = 0{nl}\
            max_value = {ceil(qtl_max+1)}{nl}\
            height = 8{nl}\
            [hlines ovelayed]{nl}\
            color = black{nl}\
            line_width = 0.5{nl}\
            line_style = solid{nl}\
            y_values = 0{nl}\
            file_type = hlines{nl}\
            overlay_previous = share-y{nl}\
            {nl}{nl}'
        else:
            this_content = f'[region eQTL]{nl}\
            file = {this_file}{nl}\
            file_type = bedgraph{nl}\
            type = points:8{nl}\
            color = {cell_color}{nl}\
            use_middle = true{nl}\
            overlay_previous = share-y{nl}\
            [hlines ovelayed]{nl}\
            color = black{nl}\
            line_width = 0.5{nl}\
            line_style = solid{nl}\
            y_values = 0{nl}\
            file_type = hlines{nl}\
            overlay_previous = share-y{nl}\
            {nl}{nl}'

        # append to the accumulated config
        config_content = config_content + this_content
        if DEBUG:
            print(this_content)

### create the Encode CRE bed

In [None]:
# extract the ENCODE cCRE records for the risk region from the bigBed file into a bed file
# (requires the bigBedToBed executable to be available on the PATH)
this_cmd = f'bigBedToBed {encode_bb_file} -chrom=chr{chrom} -start={start_bp} \
-end={stop_bp} {encode_bed_file}'
run_bash_cmd(this_cmd, DEBUG)

### format the HiC data tracks

In [None]:
# # this is from Frank's formatting and files
# this_content = f'[HiC regions da0]{nl}\
# file = {figures_dir}/tracks/hic_da0_regions.bed{nl}\
# height = 0.25{nl}\
# title = da0 Regions{nl}\
# file_type = bed{nl}\
# display = collapsed{nl}\
# color = #00AFBB{nl}\
# labels = false{nl}{nl}\
# [HiC da0]{nl}\
# file = {figures_dir}/tracks/hic_da0.bedpe{nl}\
# title = da0 Loops{nl}\
# height = 2{nl}\
# line_width = 2{nl}\
# links_type = arcs{nl}\
# color = #00AFBB{nl}\
# line_style = solid{nl}\
# file_type = links{nl}\
# orientation = inverted{nl}\
# use_middle = true{nl}{nl}\
# [HiC regions da65]{nl}\
# file = {figures_dir}/tracks/hic_da65_regions.bed{nl}\
# height = 0.25{nl}\
# title = da65 Regions{nl}\
# file_type = bed{nl}\
# display = collapsed{nl}\
# color = #FC4E07{nl}\
# labels = false{nl}{nl}\
# [HiC da65]{nl}\
# file = {figures_dir}/tracks/hic_da65.bedpe{nl}\
# title = da65 Loops{nl}\
# height = 2{nl}\
# line_width = 2{nl}\
# links_type = arcs{nl}\
# color = #FC4E07{nl}\
# line_style = solid{nl}\
# file_type = links{nl}\
# orientation = inverted{nl}\
# use_middle = true{nl}{nl}'

# config_content = config_content + this_content

# if DEBUG:
#     print(this_content)

### add encode track to config

In [None]:
# ENCODE CRE track from the extracted bed file, followed by a closing x-axis section
this_content = f'{nl}{nl}\
[CRE]{nl}\
title = ENCODE CRE{nl}\
file = {encode_bed_file}{nl}\
use_middle = true{nl}\
height = 2{nl}\
fontsize = 8{nl}\
file_type = bed{nl}\
color = Blues{nl}\
display = collapsed{nl}\
labels = false{nl}\
show_data_range = false{nl}\
[x-axis]{nl}{nl}'

# append to the accumulated config
config_content = config_content + this_content

if DEBUG:
    print(this_content)

### save the config file content

In [None]:
# write the accumulated track sections to the .ini file, overwriting any previous version
with open(config_file, 'w') as f:
    f.write(config_content)

### generate the figure

In [None]:
# render the risk region with pyGenomeTracks using the config file written above;
# the region string adds the 'chr' prefix to the chromosome label
this_cmd = f'pyGenomeTracks --tracks {config_file} \
--region chr{chrom}:{start_bp}-{stop_bp} --dpi 200 \
--title "{dx} {index_variant}" \
--outFileName {figure_file}'

run_bash_cmd(this_cmd, DEBUG)

### view the generated figure

In [None]:
# display the rendered PNG inline as the cell output
Image(figure_file)

In [None]:
# print the wall-clock time when the run finishes
!date