## Notebook to prep FOUNDIN-PD modality for a specific day
will output a scaled and covariate adjusted file for the day and modality specified; notebook expects the split quant by day notebook has already been run

- this notebook still largely duplicates the steps and code included in the split quants by day notebook

In [None]:
# print the current date/time via the shell at the start of the run
!date

#### import libraries

In [None]:
from pandas import read_csv, read_hdf, DataFrame, read_pickle
import nb_util_funcs as nuf
from random import sample
from seaborn import distplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

import warnings
# NOTE: this silences every warning raised in the kernel, deprecation
# warnings from the libraries used below included
warnings.filterwarnings('ignore')

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# inline figures are emitted in the 'retina' format
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# left empty here; presumably injected at run time (e.g. by papermill) - TODO confirm
# modality: assay label; the cells below handle 'ATAC', 'METH', 'PDUI*',
# 'RNAB', 'SCRN*' and 'CIRC'
modality = ''
# day: day label, used to build the input/output file names
day = ''

In [None]:
# naming
cohort = 'foundin'
# prefix shared by every per-day, per-modality input and output file
set_name = f'{cohort}_{day}_{modality}'

# directories
wrk_dir = '/home/jupyter/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'

# in files
quants_file = f'{quants_dir}/{set_name}.hdf5'
covariates_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'
# the feature annotation source depends on the modality
if modality == 'ATAC':
    features_file = f'{quants_dir}/{cohort}_consensus_peaks.saf'
elif modality == 'METH':
    features_file = f'{quants_dir}/EPIC_annotation_hg38.txt'
elif modality.startswith('PDUI'):
    features_file = f'{quants_dir}/{cohort}_{modality}.features.csv'
elif modality == 'RNAB' or modality.startswith('SCRN'):
    features_file = f'{public_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
elif modality == 'CIRC':
    features_file = f'{quants_dir}/circRNA_genomicRegionList.tsv'
else:
    # fail fast: without a features file the annotation loading further down
    # cannot run, and would otherwise only fail after the quants are loaded
    raise ValueError(f"unsupported modality '{modality}'; expected one of "
                     'ATAC, METH, PDUI*, RNAB, SCRN* or CIRC')

# out files, all prefixed with the day/modality specific set name
umap_covs_file = f'{info_dir}/{set_name}.umap.covs.csv'
scaled_file = f'{quants_dir}/{set_name}.scaled.hdf5'
adj_quants_file = f'{quants_dir}/{set_name}.scaled.adj.hdf5'
tnsrqtl_pheno_file = f'{quants_dir}/{set_name}.scaled.adj.bed.gz'

# constants
# METH uses a stricter detection threshold than the other modalities
min_detection_rate = 0.75 if modality == 'METH' else 0.25
# covariate used to size the points in the PCA/UMAP plots
size_covariate = ('EstimatedNumberofCells'
                  if modality.startswith('SCRN') or modality in ['PDUI-DA', 'PDUI-iDA']
                  else 'DAn')
DEBUG = False
low_var_quartile = '75%'
dpi_value = 50

# identifier columns in the sample info that are not covariates
other_id_columns = [
    'sampleid', 'cdi', 'PPMI_ID', 'DZNE_Barcode', 'DZNE_ID',
    'participant_id', 'wgsid', 'PATNO', 'Barcode_LNG',
    'Barcode_DZNE', 'Alternate MRN', 'IID', 'FID',
]
# additional sample info columns (data split flag and gene-id named columns)
# that are excluded from the covariates
exclude_addl_info_cols = [
    'data_split',
    'ENSG00000188906.15',
    'ENSG00000131979.18',
    'ENSG00000129003.17',
    'ENSG00000069329.17',
    'ENSG00000177628.15',
    'ENSG00000158828.7',
    'ENSG00000145335.15',
    'ENSG00000164535.14',
    'ENSG00000165092.12',
    'ENSG00000147133.15',
    'ENSG00000155961.4',
]
# to match geno's use PPMI3966 Batch3
replace_id_dict = {'PPMI3966B3': 'PPMI3966'}

### load input data

#### load the quantified features matrix

In [None]:
%%time
# read the day/modality quantified matrix; later cells treat rows as assay ids
# (samples) and columns as features
quants_df = read_hdf(quants_file)
print(quants_df.shape)

if DEBUG:
    display(quants_df.head())

#### split name index to find info

In [None]:
# assay ids are split on '_' into three parts: assay, sample id and day
split_ids = quants_df.index.str.split('_', expand=True)
id_parts = split_ids.to_frame()
id_parts.columns = ['assay', 'sampleid', 'day']

# keep the original, unsplit id next to its parts
id_parts = id_parts.assign(assayid=quants_df.index.values)
print(id_parts.shape)
if DEBUG:
    display(id_parts.sample(5))

#### get counts by day

In [None]:
# number of assayed samples per day label parsed from the assay ids
id_parts['day'].value_counts()

#### capture the assayid to sampleid map for formatting phenotypes for use with wgs genotypes later

In [None]:
# map each assay id to its sample id; take an explicit copy so the replace
# below edits id_map itself and never a slice tied to id_parts
id_map = id_parts[['sampleid', 'assayid']].copy()
# rename sample ids that differ from the genotype ids
id_map['sampleid'] = id_map['sampleid'].replace(replace_id_dict)

#### load covariates files

In [None]:
# sample info, indexed by the first column of the file
covs_df = read_csv(covariates_file, index_col=0)
print(covs_df.shape)
# drop any duplicated indices, keeping the first occurrence of each
is_repeated_index = covs_df.index.duplicated(keep='first')
covs_df = covs_df.loc[~is_repeated_index]
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.sample(5))

In [None]:
# assay ids present in the quants but absent from the covariates index;
# anything listed here is probably a name format issue
set(quants_df.index).difference(covs_df.index)

In [None]:
# sample ids parsed from the assay ids that have no covariates entry
set(id_parts['sampleid']).difference(covs_df['sampleid'])

#### for further analysis remove the ID columns

In [None]:
print(covs_df.shape)
# columns to remove: identifier columns plus the additional excluded info
cols_to_drop = set(other_id_columns) | set(exclude_addl_info_cols)
# select with an ordered list: pandas no longer accepts a set as a column
# indexer, and a list keeps the original column order stable between runs
cols_to_keep = [col for col in covs_df.columns if col not in cols_to_drop]
covs_df = covs_df[cols_to_keep]
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### load feature annotations

In [None]:
%%time
if modality == 'ATAC':
    features_df = read_csv(features_file, sep='\t')
    features_df.columns = ['feature', 'chrom', 'start', 'end', 'strand']
elif modality == 'METH':
    features_df = read_csv(features_file, sep='\t', header=None)
    features_df.columns = ['Chr', 'start', 'end', 'feature']
elif modality.startswith('PDUI'):
    features_df = read_csv(features_file)
    features_df = features_df.rename(columns={'Loci': 'feature'})
elif modality == 'CIRC':
    features_df = read_csv(features_file, sep='\t')
    features_df = features_df.rename(columns={'circRNA_ID': 'feature', 
                                              'chr': 'chrom', 
                                              'circRNA_start': 'start', 
                                              'circRNA_end': 'end'})    
elif modality == 'RNAB' or modality.startswith('SCRN'):
    features_df = read_pickle(features_file)
    # features_df.columns = ['feature', 'chrom', 'start', 'end', 'strand']
    # drop the ont and tag columns
    discard_cols = features_df.columns[(features_df.columns.str.startswith('ont')) |
                                       (features_df.columns.str.startswith('tag')) | 
                                       (features_df.columns.str.startswith('havana_')) |                                       
                                       (features_df.columns.str.startswith('gene_alias')) | 
                                       (features_df.columns.str.startswith('transcript_alias'))]
    features_df = features_df.drop(columns=discard_cols)
    # subset to just 'gene' features
    features_df = features_df.loc[features_df.feature == 'gene']
    # now drop existing feature col so we can use that name
    features_df = features_df.drop(columns=['feature'])
    if modality == 'RNAB':
        features_df = features_df.rename(columns={'seqname': 'chrom', 'gene_id': 'feature'})    
    elif modality.startswith('SCRN'):
        features_df = features_df.rename(columns={'seqname': 'chrom', 'gene_name': 'feature'})
    
print(f'features shape {features_df.shape}')
if DEBUG:
    display(features_df.head())

#### find IDs for features on sex chromosomes, for dropping later

In [None]:
# the METH annotations name the chromosome column 'Chr', the others 'chrom'
chrom_col = 'chrom' if 'chrom' in features_df.columns else 'Chr'
# unique ids of the features located on chrX or chrY
on_sex_chrom = features_df[chrom_col].isin(['chrX', 'chrY'])
sex_chr_feature_ids = features_df.loc[on_sex_chrom]['feature'].unique()
print(len(sex_chr_feature_ids))

### check expected sex of samples

In [None]:
#Vawter MP, Evans S, Choudary P et al. Gender-specific gene expression in 
#post-mortem human brain: localization to sex chromosomes. 
#Neuropsychopharmacology 2004;29:373–84.
sex_genes = ['XIST','RPS4Y1','RPS4Y2','KDM5D','UTY','DDX3Y','USP9Y']

# pick the features used to check sample sex for the modality
if modality == 'ATAC':
    sex_specific_features = ['chrX_73852329_73852963', 'chrY_2841364_2842239', 
                             'chrY_19744015_19745452', 'chrY_13478234_13480597', 
                             'chrY_12904296_12906267', 'chrY_12661424_12663659']
elif modality == 'METH':
    # every annotated feature on the sex chromosomes
    sex_specific_features = features_df.loc[features_df['Chr']
                                          .isin(['chrX', 'chrY'])]['feature'].unique()
elif modality.startswith('PDUI'):
    sex_features = features_df.loc[features_df.Gene.isin(sex_genes)]
    sex_specific_features = sex_features.feature.to_list()
elif modality == 'RNAB':
    sex_features = features_df.loc[features_df.gene_name.isin(sex_genes)]
    sex_specific_features = sex_features.feature.to_list()
elif modality == 'CIRC':
    sex_specific_features = ['chrX:73852031|73852204', 'chrX:73826115|73837503', 
                             'chrY:2845670|2854771', 'chrY:2865189|2866886', 
                             'chrY:2854744|2865182', 'chrY:2890914|2891101', 
                             'chrY:2847677|2847984', 'chrY:2854733|2865176', 
                             'chrY:19739528|19741857', 'chrY:13251002|13369349', 
                             'chrY:13323555|13378010', 'chrY:13369256|13400051', 
                             'chrY:13393859|13450820', 'chrY:13410993|13470229', 
                             'chrY:12909360|12913062', 'chrY:12912963|12914649', 
                             'chrY:12912963|12914982', 'chrY:12909363|12913062', 
                             'chrY:12909360|12914649', 'chrY:12707760|12709543', 
                             'chrY:12709279|12709543', 'chrY:12716791|12841133', 
                             'chrY:12735612|12739629', 'chrY:12738157|12758642']
elif modality.startswith('SCRN'):
    sex_features = features_df.loc[features_df.feature.isin(sex_genes)]
    sex_specific_features = sex_features.feature.to_list()
else:
    # fall back to the gene names themselves
    sex_specific_features = list(sex_genes)

# keep the sex features that were quantified, in the quants column order;
# a set intersection would order them differently from run to run
is_sex_feature = quants_df.columns.isin(sex_specific_features)
sex_features_present = quants_df.columns[is_sex_feature].unique().tolist()
print(f'found {len(sex_features_present)} sex features: \n{sex_features_present}')
quants_sex_df = quants_df[sex_features_present].copy()
print(f'sex features matrix shape {quants_sex_df.shape}')

In [None]:
%%time
# embed the samples using only the sex features; the helper's output is
# plotted with covariate columns, so presumably covs_df is merged in - confirm
# in nb_util_funcs
sex_umap_df = nuf.generate_umap_covs_df(quants_sex_df, covs_df)
# colour by recorded sex and style by batch to spot mislabeled samples
nuf.plot_umap_clusters(sex_umap_df, hue_cov='sex', style_cov='Batch')

### calculate, plot detection rates and subset well detected features

In [None]:
%%time
# per-feature and per-sample rates from the raw quants; the exact definition
# lives in nb_util_funcs and takes the modality into account - confirm there
trait_miss_rates, sample_miss_rates = nuf.calculate_detection_rates(quants_df, modality)
nuf.plot_missing_rates(trait_miss_rates, sample_miss_rates)
# features flagged against the min_detection_rate set in the constants cell
bad_call_rate_features = nuf.bad_callrate_features(trait_miss_rates, min_detection_rate)
# quants restricted to the features that were not flagged (presumably; confirm)
quants_wd_df = nuf.subset_well_detected_features(quants_df, bad_call_rate_features)

### standardize the dataset using transform

In [None]:
%%time
# standardize the well detected features; a later plot labels the result
# 'quantile transformed', the transform itself is defined in nb_util_funcs
traits_scaled_df = nuf.scale_dataframe(quants_wd_df)

In [None]:
# check transformation for random feature
# NOTE(review): the 'before' frame is the unfiltered quants_df while the
# 'after' frame only holds well detected features - confirm the helper picks
# a feature that is present in both
nuf.plot_trnsfrm_effect_example(quants_df, traits_scaled_df,
                                bf_label=modality, 
                                af_label='quantile transformed')

### save scaled, well detected data for the specified day

In [None]:
# persist the scaled, well detected quants to the day/modality scaled hdf5 file
nuf.write_df_to_hdf(traits_scaled_df, scaled_file)

### generate covariates from variance

#### exclude low variance features from covariate generation

In [None]:
# drop low variance features before generating covariates; the cut-off is the
# variance quartile label set in low_var_quartile ('75%'); how the helper
# applies it is defined in nb_util_funcs
quants_var_df = nuf.exclude_low_var_features(traits_scaled_df, 
                                             quartile_to_drop=low_var_quartile)

In [None]:
# features used for covariate generation: the retained high variance
# features minus the sex specific ones. Kept as an ordered list because it is
# used as a DataFrame column indexer, which pandas no longer accepts as a set,
# and so the column order is the same on every run
is_sex_feature = quants_var_df.columns.isin(sex_specific_features)
variance_features = quants_var_df.columns[~is_sex_feature].unique().tolist()
print(len(variance_features))

### take a look at the data

#### not going to use PCs but take a look at PCA anyway

In [None]:
# select the variance features with a boolean column mask: this works whether
# variance_features is a set or a list (pandas rejects set indexers) and keeps
# the frame's own column order
is_variance_feature = quants_var_df.columns.isin(variance_features)
pcs_df = nuf.generate_pca(quants_var_df.loc[:, is_variance_feature],
                          plot_variance=True)
print(pcs_df.shape)
if DEBUG:
    display(pcs_df)

In [None]:
%%time
# attach the known covariates to the PCs, matching rows on the sample index
pcs_df = pcs_df.merge(covs_df, how='left', left_index=True, right_index=True)
# since just checking the PCs and not using just run ppscore on 1st three
covs_target_list = ['PC_0', 'PC_1', 'PC_2']
# covariates the helper reports for the target PCs (selection rule is in
# nb_util_funcs), then plotted as a score matrix
covs_to_check = nuf.pps_predict_targets(pcs_df, covs_target_list)
nuf.plot_ppscore_matrix(pcs_df, covs_to_check, covs_target_list)

In [None]:
# first two PCs, coloured by batch and sized by the modality's size covariate
nuf.plot_pca_pair(pcs_df, 'PC_0', 'PC_1', hue_cov='Batch', size_cov=size_covariate)

#### generate unknown covariates and see if known covariates are a source of variation

In [None]:
%%time
umap_df = nuf.generate_umap_covs_df(quants_var_df[variance_features], covs_df)
covs_target_list = ['x_umap', 'y_umap']
covs_to_check = nuf.pps_predict_targets(umap_df, covs_target_list)
nuf.plot_ppscore_matrix(umap_df, covs_to_check, covs_target_list)
if len(covs_to_check) > 0:
    umap_dums_covs_df = nuf.dummy_covs_as_needed(umap_df[(set(covs_to_check) | 
                                                      set(covs_target_list))])
    nuf.plot_correlation_heatmap(umap_dums_covs_df)
    nuf.plot_correlation_heatmap(umap_dums_covs_df, covs_target_list)

#### plot umap with known covariates of interest

In [None]:
# UMAP of the variance features, coloured by batch and sized by the
# modality's size covariate
nuf.plot_umap_clusters(umap_df, hue_cov='Batch', size_cov=size_covariate)

### keep created covars and save them

In [None]:
# standardize the covariates
# only the two UMAP components are kept as the generated covariates
umap_covs_df = nuf.scale_dataframe(umap_df[covs_target_list])
# now save the covariates
umap_covs_df.to_csv(umap_covs_file)

### adjust the scaled data by the covariates

In [None]:
# check whether both frames have the same samples in the same order;
# if not, restrict both to the samples they share
indices_match = traits_scaled_df.index.equals(umap_covs_df.index)
if not indices_match:
    print('indices are not equal re-index')
    shared_indices = traits_scaled_df.index.intersection(umap_covs_df.index)
    umap_covs_df = umap_covs_df.loc[shared_indices]
    traits_scaled_df = traits_scaled_df.loc[shared_indices]

# shown as the cell output: should be True at this point
traits_scaled_df.index.equals(umap_covs_df.index)

In [None]:
%%time

# adjust the scaled features by the generated UMAP covariates; the helper
# returns the residuals plus a per-feature table with a 'score' column that
# the following cells filter on (model details are in nb_util_funcs)
residuals_df, cov_scores_df = nuf.covariate_residuals(traits_scaled_df, umap_covs_df)

#take a peek at the data
print(f'residuals shape {residuals_df.shape}')
print(f'scores shape {cov_scores_df.shape}')
if DEBUG:
    display(cov_scores_df.head())
    display(residuals_df.head())

In [None]:
# get a summary of the covariates model scores
print(cov_scores_df.describe())
# look at the distribution of covariate model scores, 
# ie get a sense any feature driven by covariates
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    # matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'
    # and 3.8 removed the old names; use whichever this install provides and
    # skip styling otherwise so the figure size/dpi set above are kept
    style_name = next((name for name in ('seaborn-v0_8-bright', 'seaborn-bright')
                       if name in plt.style.available), None)
    if style_name is not None:
        plt.style.use(style_name)
    # NOTE(review): distplot is deprecated in recent seaborn releases; the
    # call is kept as-is so the figure is unchanged
    distplot(cov_scores_df['score'])
    plt.show()

#### remove any features that had more than 75% score

In [None]:
# features whose covariate model score is above 0.75 are removed
drop_features = cov_scores_df[cov_scores_df.score > 0.75].index.values
# keep the remaining features in their existing column order; a set
# difference would order the saved columns differently from run to run
is_dropped = residuals_df.columns.isin(drop_features)
keep_features = residuals_df.columns[~is_dropped].unique().tolist()
residuals_df = residuals_df[keep_features]
cov_scores_df = cov_scores_df.loc[cov_scores_df.index.isin(keep_features)]
print(len(drop_features))
print(len(keep_features))
print(residuals_df.shape)
print(cov_scores_df.shape)

### save scaled and covariate adjusted data

In [None]:
%%time 

# persist the scaled and covariate adjusted quants to the adjusted hdf5 file
nuf.write_df_to_hdf(residuals_df, adj_quants_file)

#### take a look at the scaled and covariate adjusted data

In [None]:
# compare scaled versus covariate adjusted values; no feature id is passed
# here, so the helper chooses the feature itself (see nb_util_funcs)
nuf.plot_trnsfrm_effect_example(traits_scaled_df, residuals_df)

In [None]:
# find feature with largest score
# Series.max() skips NaN scores; the builtin max() can return NaN depending
# on where a NaN sits, which would leave nothing to select
top_score = cov_scores_df['score'].max()
large_adj_trait = cov_scores_df.loc[cov_scores_df['score'] == top_score]
print(large_adj_trait)

if large_adj_trait.empty:
    # nothing scored (e.g. every feature was dropped); skip the spot check
    print('no scored features available to spot check')
else:
    large_adj_traid_id = large_adj_trait.index.values[0]
    # spot check same feature with largest adjustment effect
    nuf.plot_trnsfrm_effect_example(traits_scaled_df, residuals_df, large_adj_traid_id)

#### what are the post scaled and covariate adjusted umap variables correlated with

In [None]:
%%time
# re-embed using the adjusted residuals; this overwrites umap_df,
# covs_target_list and covs_to_check from the pre-adjustment cells
umap_df = nuf.generate_umap_covs_df(residuals_df, covs_df)
covs_target_list = ['x_umap', 'y_umap']
# see which known covariates are still reported after adjustment
covs_to_check = nuf.pps_predict_targets(umap_df, covs_target_list)
nuf.plot_ppscore_matrix(umap_df, covs_to_check, covs_target_list)

In [None]:
# UMAP of the adjusted data, coloured by batch and sized by the modality's
# size covariate
nuf.plot_umap_clusters(umap_df, hue_cov='Batch', size_cov=size_covariate)

### since switching to tensorQTL can just use one large transcriptome pheno bed instead of per chrom pheno

In [None]:
%%time

# get feature annots for present features
feature_present_df = features_df.loc[features_df['feature'].isin(residuals_df.columns)]
print(f'features present shape {feature_present_df.shape}')
# tensorQTL pheno bed is rows = features and columns = samples
# where first four columns are chr, start, end, phenotype_id, then sample1 ... sampleN

# create dict for renaming columns (samples) from assayid to geno_id
sample_col_dict = id_map.set_index('assayid').to_dict()['sampleid']

# transpose the residuals df from sample x feature to feature x sample
tresiduals_df = residuals_df.transpose()

# modify annots
if modality == 'METH':
    feature_present_df = feature_present_df.rename(columns={'Chr': 'chr'})
else:
    feature_present_df = feature_present_df.rename(columns={'chrom': 'chr'})
feature_present_df['end'] = feature_present_df['start'] + 1
print(f'features presnt shape {feature_present_df.shape}')
feature_present_df = feature_present_df.drop_duplicates(subset=['feature'], 
                                                        keep='first', 
                                                        ignore_index=True)
feature_present_df = feature_present_df.set_index('feature', drop=False)
feature_present_df = feature_present_df.reindex(tresiduals_df.index)

# insert the feature annots
tresiduals_df.insert( 0, column='chr', value=feature_present_df['chr'])
tresiduals_df.insert( 1, column='start', value=feature_present_df['start'])
tresiduals_df.insert( 2, column='end', value=feature_present_df['end'])
# METH, PDUI or RNAB
tresiduals_df.insert( 3, column='phenotype_id', value=feature_present_df['feature'])

# if there are any features that were in quants but not feature annots
# remove these with missing positions
tresiduals_df = tresiduals_df.loc[~tresiduals_df['chr'].isna()]
# make the positions ints instead of floats
tresiduals_df['start'] = tresiduals_df['start'].astype('int64')
tresiduals_df['end'] = tresiduals_df['end'].astype('int64')

# now rename sample ids in columns
tresiduals_df = tresiduals_df.rename(columns=sample_col_dict)

tresiduals_df.to_csv(tnsrqtl_pheno_file, index=False, sep='\t', compression='gzip')

In [None]:
# print the current date/time via the shell at the end of the run
!date