## Notebook for calculating the Parkinson's disease (PD) genetic risk score (GRS)
Assumes the genotype plink bfile set is already available (like the one formatted for running tensorQTL).

In [None]:
# print the current date/time at the start of the run
!date

#### import libraries

In [None]:
from pandas import read_csv, read_parquet
import concurrent.futures
import matplotlib.pyplot as plt
from seaborn import boxenplot, stripplot
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# naming: cohort and genotype release labels used to build the file names below
cohort = 'foundin'
version = 'amppdv1'
# combined "<cohort>.<version>" label
cohort_version = f'{cohort}.{version}'

# directories, all rooted at wrk_dir
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
geno_dir = f'{wrk_dir}/genotypes'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'

# input files
# prefix of the plink bfile set handed to plink2 --bfile
bfile_prefix_path = f'{geno_dir}/{cohort}.{version}.bfile'
# if agreement in place use summary stats that include 23andMe data
gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_23andme_buildGRCh38.tsv.gz'
# gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_no23andme_buildGRCh38.tsv.gz'
# list of GWAS index variants (read as a table with a `variant` column)
index_variants_file = f'{public_dir}/nalls_pd_gwas/index_variants.list'  
# sample file that supplies the IID and DX columns merged onto the scores
psam_file = f'{geno_dir}/{cohort}.{version}.chr1.psam'
# colocalization results; only read when limit_to_coloc is True
coloc_file = f'{wrk_dir}/results/{cohort}_daNA_DAn-meta_PD.casuals.pp.parquet'

# output files
# variant ID / effect allele / beta table passed to plink2 --score
score_file = f'{geno_dir}/PD_risk_betas.txt'
# plink2 --out prefix; the .log, .sscore and .sscore.vars files are read back from it
grs_file = f'{geno_dir}/{cohort}_grs'
# final per-sample table (IID, DX, GRS) holding the scaled score
grs_scaled_file = f'{info_dir}/{cohort}_grs_scaled.csv'

# constants
# when True, cells display extra previews of the intermediate tables
DEBUG = False
# figure resolution used by the plotting cells
dpi_value = 100
# variants to exclude from the GRS; leave as an empty list to keep every index variant
# drop_variants = ['rs76763715', 'rs34637584']
drop_variants = []
# when True, keep only the index variants that appear in the coloc results
limit_to_coloc = True

#### utility functions

In [None]:
# function to run bash command
def run_bash_cmd(this_cmd):
    """Run a single shell command through IPython's `!` shell escape.

    Args:
        this_cmd: the command line to execute, as a string.

    Nothing is returned.
    """
    # `!{...}` is IPython syntax: the value of this_cmd is substituted into the shell line
    !{this_cmd}

# run a list of bash commands concurrently, one pool task per command
def run_bash_cmds_parallel(cmd_list):
    """Submit every command in cmd_list to a process pool and wait for them all.

    Each command is handed to run_bash_cmd as a separate task. The futures
    returned by submit() are not inspected, so nothing is returned and an
    exception raised inside a task is not re-raised here.

    Args:
        cmd_list: iterable of shell command strings.
    """
    executor = concurrent.futures.ProcessPoolExecutor()
    try:
        for shell_cmd in cmd_list:
            executor.submit(run_bash_cmd, shell_cmd)
    finally:
        # what leaving a `with` block does: block until all submitted work is done
        executor.shutdown(wait=True)

### create score file from independent risk signals for calculating GRS

#### load GWAS results

In [None]:
%%time
gwas_stats_df = read_csv(gwas_sum_stats_file, sep='\t')
print(gwas_stats_df.shape)

if DEBUG:
    display(gwas_stats_df.sample(5))

#### read the GWAS index variants

In [None]:
%%time
variants_oi_df = read_csv(index_variants_file)
variants_oi_df = variants_oi_df.loc[~variants_oi_df.variant.isin(drop_variants)]
print(variants_oi_df.shape)
index_variants = list(variants_oi_df.variant.unique())
if DEBUG:
    display(variants_oi_df.head())
    print(index_variants)

### if specified, limit the index variants to those present in the colocalization analysis of DAn-meta
where the H4 probability was at least 50%

In [None]:
if limit_to_coloc:
    coloc_df = read_parquet(coloc_file)
    print(f'coloc results shape {coloc_df.shape}')
    # keep only the coloc rows whose index label is one of the index variants
    is_index_variant = coloc_df.index.isin(index_variants)
    coloc_df = coloc_df.loc[is_index_variant]
    print(f'coloc subset shape {coloc_df.shape}')
    # from here on the index variant list is whatever survived the coloc subset
    index_variants = list(coloc_df.index.values)
    print(f'subset index_variants length is {len(index_variants)}')
    if DEBUG:
        display(coloc_df.head())
        print(index_variants)

#### subset index variant stats

In [None]:
# pull the GWAS summary rows for the selected index variants
is_selected = gwas_stats_df.variant_id.isin(index_variants)
index_stats_df = gwas_stats_df.loc[is_selected]
print(index_stats_df.shape)
if DEBUG:
    display(index_stats_df.head())

#### format and save score file for Plink input

In [None]:
# the three columns handed to plink2 --score: variant ID, effect allele, weight
score_columns = ['variant_id', 'effect_allele', 'beta']
score_df = index_stats_df[score_columns]
# tab-delimited, with neither a header row nor the index column
score_df.to_csv(score_file, sep='\t', index=False, header=False)
print(f'score file shape is {score_df.shape}')
if DEBUG:
    display(score_df.head())

### calculate the GRS for samples

use the plink bfiles that were prepped for the <i>cis</i>-QTL analysis using tensorQTL

In [None]:
# Score every sample with plink2. `1 2 3` are the score-file columns holding
# the variant ID, effect allele and beta; `list-variants` writes the IDs of the
# variants that were actually used next to the scores.
# The score file is written without a header row, so the `header` modifier must
# NOT be passed: it makes plink2 skip the first line, which would silently drop
# the first index variant from the GRS.
this_cmd = (f'plink2 --bfile {bfile_prefix_path} --score {score_file} 1 2 3 '
            f'list-variants --silent --out {grs_file}')

print(this_cmd)
run_bash_cmd(this_cmd)

In [None]:
# show the last lines of the log written by the scoring run
log_path = f'{grs_file}.log'
this_cmd = f'tail -n 15 {log_path}'
run_bash_cmd(this_cmd)

#### see if any variants weren't used in scoring

In [None]:
# list of variants used in scoring: read as a single headerless column
scored_variants_file = f'{grs_file}.sscore.vars'
scored_vars_df = read_csv(scored_variants_file, header=None)
scored_vars_df.columns = ['variant']
print(scored_vars_df.shape)

# index variants that were requested but are absent from the scored list
scored_ids = set(scored_vars_df['variant'])
missing_variants = set(index_variants).difference(scored_ids)

# show the GWAS rows of whichever variants were left out
was_not_scored = gwas_stats_df['variant_id'].isin(missing_variants)
display(gwas_stats_df.loc[was_not_scored])

### load covariates files

In [None]:
# whitespace-delimited sample file; the separator is a raw string so that `\s`
# is not parsed by Python as an (invalid) string escape sequence
psam_df = read_csv(psam_file, sep=r'\s+')
print(psam_df.shape)
# the ID column may be headed `#IID` (the score file cell renames the same
# header); normalise it so the IID accesses below work either way — this is a
# no-op when the column is already called IID
psam_df = psam_df.rename(columns={'#IID': 'IID'})
# rename from AMP-PD prefix PP- to FOUNDIN-PD prefix PPMI
# regex=False: 'PP-' is a literal substring, not a pattern
psam_df.IID = psam_df.IID.str.replace('PP-', 'PPMI', regex=False)
if DEBUG:
    display(psam_df.sample(5))

### load grs scores, take a look at score by DX

In [None]:
# load grs scores, take a look at score by DX
scored_samples_file = f'{grs_file}.sscore'
# raw string so that `\s` is not parsed by Python as an (invalid) string escape
scores_df = read_csv(scored_samples_file, sep=r'\s+')
print(scores_df.shape)
# give the score and sample ID columns the names used in the rest of the notebook
scores_df = scores_df.rename(columns={'SCORE1_AVG': 'GRS', '#IID': 'IID'})
scores_df = scores_df[['IID', 'GRS']]
# attach the sample info; the inner join keeps only samples present in both tables
scores_df = scores_df.merge(psam_df, how='inner', on='IID')
# keep a single row per sample
scores_df = scores_df.drop_duplicates(subset=['IID'], keep='first')
print(scores_df.shape)
if DEBUG:
    display(scores_df.head())

In [None]:
# number of scored samples in each DX group
scores_df['DX'].value_counts()

#### plot raw GRS scores

In [None]:
# column that splits the samples along the x axis
# grouping = 'study_arm'
grouping = 'DX'

# raw GRS per group: boxen plot with the individual samples overlaid
figure_settings = {'figure.figsize': (9, 9), 'figure.dpi': dpi_value}
with rc_context(figure_settings):
    plt.style.use('seaborn-v0_8-bright')
    boxenplot(data=scores_df, x=grouping, y='GRS', color='Purple',
              width_method='exponential', k_depth='trustworthy')

    grsplt = stripplot(data=scores_df, x=grouping, y='GRS', color='darkgrey',
                       alpha=0.75, jitter=True)
    # re-apply the current tick labels, rotated
    loc, labels = plt.xticks()
    grsplt.set_xticklabels(labels, rotation=60)
    plt.title('GRS by Group', fontsize='large')
    plt.xlabel('Group')
    plt.ylabel('raw GRS')
    plt.show()

#### standardize the GRS sample scores

In [None]:
# rescale the GRS sample scores and re-plot
# NOTE: despite the zGRS column name this is not a z-score; the raw scores are
# robust-scaled over the 25-75 quantile range and the result is then min-max scaled
robust_scaled = RobustScaler(quantile_range=(25, 75)).fit_transform(scores_df[['GRS']])
scores_df['zGRS'] = MinMaxScaler().fit_transform(robust_scaled)
if DEBUG:
    display(scores_df.head())

# same boxen + strip plot as for the raw scores, now on the scaled column
figure_settings = {'figure.figsize': (9, 9), 'figure.dpi': dpi_value}
with rc_context(figure_settings):
    plt.style.use('seaborn-v0_8-bright')
    boxenplot(data=scores_df, x=grouping, y='zGRS', color='Purple',
              width_method='exponential', k_depth='trustworthy')

    grsplt = stripplot(data=scores_df, x=grouping, y='zGRS', color='darkgrey',
                       alpha=0.75, jitter=True)
    # re-apply the current tick labels, rotated
    loc, labels = plt.xticks()
    grsplt.set_xticklabels(labels, rotation=60)
    plt.title('GRS by Group', fontsize='large')
    plt.xlabel('Group')
    plt.ylabel('Scaled GRS')
    plt.subplots_adjust(wspace=0.3)
    plt.show()

#### save the scaled GRS to file

In [None]:
# Write the scaled score under the GRS column name. The output frame is built
# from a renamed copy instead of overwriting scores_df['GRS'], so the raw score
# stays intact and the earlier "raw GRS" cells can be re-run without plotting
# scaled values by mistake. The file written is unchanged: IID, DX, GRS.
scaled_out_df = scores_df[['IID', 'DX', 'zGRS']].rename(columns={'zGRS': 'GRS'})
scaled_out_df.to_csv(grs_scaled_file, index=False)

In [None]:
# print the current date/time at the end of the run
!date