## Notebook to prep FOUNDIN-PD modalities for analyses
Outputs a scaled and covariate-adjusted file for the full dataset, i.e. across all differentiation days.
Also outputs the unprepped data split by differentiation day.

In [None]:
# log the current date and time via the shell
!date

#### import libraries

In [None]:
from pandas import read_csv, read_pickle, DataFrame
from dask.dataframe import read_csv as dd_read_csv
import nb_util_funcs as nuf
# from numpy import cumsum
import concurrent.futures
from random import sample
from seaborn import distplot
import matplotlib.pyplot as plt
import statsmodels.stats.multitest as smm
from scipy.stats import f_oneway
from matplotlib.pyplot import rc_context

import warnings
# silence every warning for the rest of the session
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and statistics libraries used below
warnings.filterwarnings('ignore')

# draw figures inline in the notebook output
%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# render inline figures in the 'retina' (high resolution) format
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# modality selects the data type to prep; later cells branch on the values
# 'ATAC', 'PDUI', 'METH', 'RNAB', 'CIRC' and 'RNAS'
# NOTE(review): the empty default is presumably overridden by a parameter
# injection tool (e.g. papermill) before the run -- confirm
modality = ''

In [None]:
# naming
cohort = 'foundin'
set_name = f'{cohort}_daALL_{modality}'

# directories
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'

# in files
quants_file = f'{quants_dir}/{set_name}.csv'
# the feature annotation source differs for every supported modality
features_file_by_modality = {
    'ATAC': f'{quants_dir}/{cohort}_consensus_peaks.saf',
    'PDUI': f'{quants_dir}/{set_name}.features.csv',
    'METH': f'{quants_dir}/EPIC_annotation_hg38.txt',
    'RNAB': f'{public_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl',
    'CIRC': f'{quants_dir}/circRNA_genomicRegionList.tsv',
    'RNAS': f'{quants_dir}/{cohort}_{modality}_features.csv',
}
# fail here, with a readable message, rather than leaving features_file
# undefined (or stale from an earlier run in the same kernel)
if modality not in features_file_by_modality:
    raise ValueError(f'unsupported modality {modality!r}; expected one of '
                     f'{sorted(features_file_by_modality)}')
features_file = features_file_by_modality[modality]
covariates_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'

# out files
all_quants_file = f'{quants_dir}/{set_name}.hdf5'
var_covs_file = f'{info_dir}/{set_name}.variance.covs.csv'
scaled_file = f'{quants_dir}/{set_name}.scaled.hdf5'
adj_quants_file = f'{quants_dir}/{set_name}.scaled.adj.hdf5'

# constants
# METH and ATAC tolerate a higher per-feature missing rate than the rest
max_missing_rate = 0.75 if modality in ('METH', 'ATAC') else 0.25
min_ppscore = 0.05
min_pearson = 0.22
DEBUG = False
low_var_quartile = '25%'
dpi_value = 50

# covariate columns that only carry identifiers; removed before analysis
other_id_columns = ['sampleid', 'cdi', 'PPMI_ID', 'DZNE_Barcode', 'DZNE_ID', 
                    'participant_id', 'wgsid', 'PATNO', 'Barcode_LNG', 
                    'Barcode_DZNE', 'Alternate MRN', 'IID', 'FID']
# additional covariate columns removed together with the ID columns
exclude_addl_info_cols = ['data_split', 'ENSG00000188906.15', 'ENSG00000131979.18',
                          'ENSG00000129003.17', 'ENSG00000069329.17', 
                          'ENSG00000177628.15', 'ENSG00000158828.7',
                          'ENSG00000145335.15', 'ENSG00000164535.14', 
                          'ENSG00000165092.12', 'ENSG00000147133.15',
                          'ENSG00000155961.4']
# to match geno's use PPMI3966 Batch3
replace_id_dict = {'PPMI3966B3': 'PPMI3966'}

if DEBUG:
    print(f'quants_file = {quants_file}')
    print(f'covariates_file = {covariates_file}')
    print(f'features_file = {features_file}')
    print(f'all_quants_file = {all_quants_file}')
    print(f'var_covs_file = {var_covs_file}')
    print(f'scaled_file = {scaled_file}')
    print(f'adj_quants_file = {adj_quants_file}')

### load input data

#### load the quantified features matrix and save as hdf5

In [None]:
%%time
# read the all-days quantified feature matrix; the first CSV column becomes
# the index
quants_df = read_csv(quants_file, index_col=0)
print(f'shape of input matrix {quants_df.shape}')
# now save the quant matrix in faster file type
nuf.write_df_to_hdf(quants_df, all_quants_file)

if DEBUG:
    display(quants_df.head())

#### split name index to find info

In [None]:
# break every assay ID on '_' into its three parts: assay, sample and day
id_parts = (quants_df.index
            .str.split('_', expand=True)
            .to_frame())
id_parts.columns = ['assay', 'sampleid', 'day']
# rebuild the full assay ID so rows can be matched back to quants_df.index
id_parts['assayid'] = id_parts['assay'].str.cat(
    [id_parts['sampleid'], id_parts['day']], sep='_')
print(id_parts.shape)
if DEBUG:
    display(id_parts.sample(5))

#### get counts by day

In [None]:
# number of assays per day label (the third part of the assay ID)
days = id_parts['day'].value_counts()
display(days)

#### load covariates files

In [None]:
covs_df = read_csv(covariates_file, index_col=0)
print(covs_df.shape)
# where an index value repeats, retain only the first row carrying it
covs_df = covs_df.loc[~covs_df.index.duplicated(keep='first')]
print(covs_df.shape)
if DEBUG:
    display(covs_df.sample(5))

In [None]:
# assay IDs in the quantified matrix that have no covariates row; anything
# listed here is probably a name format issue
set(quants_df.index).difference(covs_df.index)

In [None]:
# sample IDs parsed from the assay IDs that are absent from the covariates
# 'sampleid' column
set(id_parts['sampleid']).difference(covs_df['sampleid'])

#### for further analysis remove the ID columns

In [None]:
print(covs_df.shape)
# columns that are identifiers or otherwise excluded from the analysis
cols_to_drop = set(other_id_columns) | set(exclude_addl_info_cols)
# walk the existing columns instead of building the list from a set
# difference: set iteration order can change between kernel sessions, which
# made the covariate column order (and every plot built on it) unstable
cols_to_keep = [col for col in covs_df.columns if col not in cols_to_drop]
covs_df = covs_df[cols_to_keep]
print(covs_df.shape)

#### load feature annotations

In [None]:
%%time
if modality == 'ATAC':
    features_df = read_csv(features_file, sep='\t')
    features_df.columns = ['feature', 'chrom', 'start', 'end', 'strand']
elif modality == 'METH':
    features_df = read_csv(features_file, sep='\t', header=None)
    features_df.columns = ['chrom', 'start', 'end', 'feature']
elif modality == 'PDUI':
    features_df = read_csv(features_file)
    features_df = features_df.rename(columns={'Loci': 'feature'})
elif modality == 'CIRC':
    features_df = read_csv(features_file, sep='\t')
    features_df = features_df.rename(columns={'circRNA_ID': 'feature', 
                                              'chr': 'chrom', 
                                              'circRNA_start': 'start', 
                                              'circRNA_end': 'end'})
elif modality == 'RNAB':
    features_df = read_pickle(features_file)
    # features_df.columns = ['feature', 'chrom', 'start', 'end', 'strand']
    # drop the ont and tag columns
    discard_cols = features_df.columns[(features_df.columns.str.startswith('ont')) |
                                       (features_df.columns.str.startswith('tag')) | 
                                       (features_df.columns.str.startswith('havana_')) |                                       
                                       (features_df.columns.str.startswith('gene_alias')) | 
                                       (features_df.columns.str.startswith('transcript_alias'))]
    features_df = features_df.drop(columns=discard_cols)
    # subset to just 'gene' features
    features_df = features_df.loc[features_df.feature == 'gene']
    # now drop existing feature col so we can use that name
    features_df = features_df.drop(columns=['feature'])
    features_df = features_df.rename(columns={'seqname': 'chrom', 'gene_id': 'feature'})
elif modality == 'RNAS':
    features_df = read_csv(features_file)
    
print(features_df.shape)
if DEBUG:
    display(features_df.head())

### save the quantified features matrix split by day

In [None]:
%%time
# run the saves in parallel    
with concurrent.futures.ThreadPoolExecutor() as tpe:
    for day in days.index:
        day_df = id_parts.loc[id_parts['day'] == day]
        this_quant_df = quants_df[quants_df.index.isin(day_df['assayid'])]
        print(f'{cohort} {day} {this_quant_df.shape}')
        cohort_quant_filename = f'{quants_dir}/{cohort}_{day}_{modality}.hdf5'
        tpe.submit(nuf.write_df_to_hdf, this_quant_df, cohort_quant_filename)    

#### find IDs for features on sex chromosomes, for dropping later

In [None]:
# flag the annotation rows located on chrX or chrY, then collect the
# distinct feature IDs of those rows
on_sex_chrom = features_df['chrom'].isin(['chrX', 'chrY'])
sex_chr_feature_ids = features_df.loc[on_sex_chrom, 'feature'].unique()
print(len(sex_chr_feature_ids))

### check expected sex of samples

In [None]:
#Vawter MP, Evans S, Choudary P et al. Gender-specific gene expression in 
#post-mortem human brain: localization to sex chromosomes. 
#Neuropsychopharmacology 2004;29:373–84.
sex_genes = ['XIST','RPS4Y1','RPS4Y2','KDM5D','UTY','DDX3Y','USP9Y']

# choose the feature IDs used to check sample sex for this modality
if modality == 'ATAC':
    sex_specific_features = ['chrX_73852329_73852963', 'chrY_2841364_2842239', 
                             'chrY_19744015_19745452', 'chrY_13478234_13480597', 
                             'chrY_12904296_12906267', 'chrY_12661424_12663659']
elif modality in ('METH', 'RNAS'):
    # every feature annotated on chrX/chrY, as already collected in
    # sex_chr_feature_ids
    sex_specific_features = list(sex_chr_feature_ids)
elif modality == 'PDUI':
    sex_features = features_df.loc[features_df.Gene.isin(sex_genes)]
    sex_specific_features = sex_features.feature.to_list()
elif modality == 'RNAB':
    sex_features = features_df.loc[features_df.gene_name.isin(sex_genes)]
    sex_specific_features = sex_features.feature.to_list()
elif modality == 'CIRC':
    sex_specific_features = ['chrX:73852031|73852204', 'chrX:73826115|73837503', 
                             'chrY:2845670|2854771', 'chrY:2865189|2866886', 
                             'chrY:2854744|2865182', 'chrY:2890914|2891101', 
                             'chrY:2847677|2847984', 'chrY:2854733|2865176', 
                             'chrY:19739528|19741857', 'chrY:13251002|13369349', 
                             'chrY:13323555|13378010', 'chrY:13369256|13400051', 
                             'chrY:13393859|13450820', 'chrY:13410993|13470229', 
                             'chrY:12909360|12913062', 'chrY:12912963|12914649', 
                             'chrY:12912963|12914982', 'chrY:12909363|12913062', 
                             'chrY:12909360|12914649', 'chrY:12707760|12709543', 
                             'chrY:12709279|12709543', 'chrY:12716791|12841133', 
                             'chrY:12735612|12739629', 'chrY:12738157|12758642']
else:
    # fall back to the gene names themselves
    sex_specific_features = list(sex_genes)

# walk the quant columns rather than intersecting two sets, so the selected
# features keep the matrix column order on every run (set iteration order is
# not stable between kernel sessions)
sex_feature_lookup = set(sex_specific_features)
sex_features_present = [col for col in quants_df.columns if col in sex_feature_lookup]
print(f'found {len(sex_features_present)} sex features: \n{sex_features_present}')
quants_sex_df = quants_df[sex_features_present].copy()
print(f'sex features matrix shape {quants_sex_df.shape}')

In [None]:
%%time
# fit a 2 component PCA model on the sex-feature matrix; only the second
# returned value is kept (presumably the per-sample component values, given
# the PCA_0/PCA_1 columns plotted below -- confirm in nuf)
_,sex_pca_df,_,_ = nuf.generate_selected_model(2, quants_sex_df, 'PCA')
print(f'shape of sex_pca_df is {sex_pca_df.shape}')
# left-join the covariates on the index so the first two components can be
# plotted with 'sex' as hue and 'Batch' as marker style
nuf.plot_pair(sex_pca_df.merge(covs_df, how='left', 
                               left_index=True, right_index=True),
              'PCA_0', 'PCA_1', hue_cov='sex', style_cov='Batch')
if DEBUG:
    display(sex_pca_df.head())

#### visualize the sex-specific features in 2D with MDE and UMAP

In [None]:
%%time
# one 2D embedding per method of the sex-feature matrix, with 'sex' as hue
# and 'day' as marker style taken from the covariates
nuf.show_2d_embed(quants_sex_df, covs_df, type='MDE', hue='sex', style='day')
nuf.show_2d_embed(quants_sex_df, covs_df, type='UMAP', hue='sex', style='day')

### calculate, plot detection rates and subset well detected features

In [None]:
%%time
# missing rates per feature (trait) and per sample for this modality
trait_miss_rates, sample_miss_rates = nuf.calculate_detection_rates(quants_df, modality)
nuf.plot_missing_rates(trait_miss_rates, sample_miss_rates)
# features failing the max_missing_rate threshold
# NOTE(review): the exact comparison (> vs >=) lives in nuf -- confirm there
bad_call_rate_features = nuf.bad_callrate_features(trait_miss_rates, max_missing_rate)
# quants_wd_df: the quant matrix restricted to the well detected features
quants_wd_df = nuf.subset_well_detected_features(quants_df, bad_call_rate_features)

### standardize the dataset using transform

In [None]:
%%time
# scale the well detected features using the helper's default settings
# NOTE(review): presumably this includes a quantile transform (the covariates
# are later scaled with with_qt=False and the plot below is labelled
# 'quantile transformed') -- confirm in nuf
traits_scaled_df = nuf.scale_dataframe(quants_wd_df)

In [None]:
# check transformation for random feature
# NOTE(review): the 'before' frame is quants_df, which still contains the
# poorly detected features that were removed before scaling -- confirm the
# helper only picks a feature present in both frames
nuf.plot_trnsfrm_effect_example(quants_df, traits_scaled_df,
                                bf_label=modality, 
                                af_label='quantile transformed')

### save scaled, well detected data for all days

In [None]:
# persist the scaled, well detected matrix covering all days
nuf.write_df_to_hdf(traits_scaled_df, scaled_file)

### generate covariates from variance

#### take a look at variance in data, assuming it is mostly driven by d0 -> d65, i.e. iPSC -> differentiating neurons

#### exclude low variance features from covariate generation

In [None]:
# remove the lowest variance features from the scaled matrix; the quartile to
# drop is named by low_var_quartile ('25%')
quants_var_df = nuf.exclude_low_var_features(traits_scaled_df, 
                                             quartile_to_drop=low_var_quartile)

In [None]:
# features used for variance modelling: everything left in quants_var_df
# that is not on a sex chromosome. Filtering the existing columns (instead of
# list(set - set)) keeps the matrix column order, so the PCA input is laid
# out identically on every run; set iteration order is not stable between
# kernel sessions
sex_chr_lookup = set(sex_chr_feature_ids)
variance_features = [feat for feat in quants_var_df.columns
                     if feat not in sex_chr_lookup]
print(len(variance_features))

### use PCA to model unknown covariates, ie global variance covariates

#### model PCA accuracy with different numbers of components

In [None]:
%%time
max_count = int(min(quants_var_df[variance_features].shape[0], quants_var_df[variance_features].shape[1])/2)
print(f'max count is {max_count}')

r2_values, rmse_values = nuf.iterate_model_component_counts(max_count, quants_var_df[variance_features], 'PCA')

#### use max curvature of accuracy to select number of components to use

In [None]:
# component count suggested by the max curvature point of each accuracy curve
knee_rmse = nuf.component_from_max_curve(rmse_values, 'RMSE')
knee_r2 = nuf.component_from_max_curve(r2_values, 'R2')
# use the smaller of the two suggestions
num_comp = min(knee_rmse, knee_r2)
print(num_comp)

#### regenerate the PCA model with the selected number of components

In [None]:
# refit PCA with the selected component count; keep the fitted model and the
# component frame, ignore the other two returned values
pca_mdl,pca_df,_,_ = nuf.generate_selected_model(num_comp, quants_var_df[variance_features], 'PCA')
print(f'shape of pca_df is {pca_df.shape}')
# plot the first two components with the covariates left-joined on the index,
# 'day' as hue and 'Batch' as marker style
nuf.plot_pair(pca_df.merge(covs_df, how='left', left_index=True, right_index=True), 
              'PCA_0', 'PCA_1', hue_cov='day', style_cov='Batch')
print(pca_mdl.explained_variance_ratio_)
if DEBUG:
    display(pca_df.head())

#### Visualize the quantification variation covariates, the PCA components, in 2D with MDE and UMAP

In [None]:
%%time
# 2D embeddings of the PCA components, with 'day' as hue and 'Batch' as
# marker style taken from the covariates
nuf.show_2d_embed(pca_df, covs_df, type='MDE', hue='day', style='Batch')
nuf.show_2d_embed(pca_df, covs_df, type='UMAP', hue='day', style='Batch')

#### do quick anova by day to identify features that may be changed with cell differentiation 

This is done because differentiation should be the largest source of variation, so we identify which features change with differentiation and exclude them to get around it.

In [None]:
# split samples by group (day)
feats_by_day = {}
for day in days.index:
    day_df = id_parts.loc[id_parts['day'] == day]
    this_quant_df = traits_scaled_df[traits_scaled_df.index.isin(day_df['assayid'])]
    feats_by_day[day] = this_quant_df
    print(f'{cohort} {day} {this_quant_df.shape}')

# days compared by the ANOVA; METH only compares da0 against da65
# NOTE(review): presumably because METH has no da25 assays -- confirm
anova_days = ['da0', 'da65'] if modality == 'METH' else ['da0', 'da25', 'da65']
# an absent day used to reach f_oneway as None (from dict.get) and fail with
# an unrelated looking error; report it explicitly instead
missing_days = [day for day in anova_days if day not in feats_by_day]
if missing_days:
    raise ValueError(f'cannot run the by-day ANOVA, no samples found for: {missing_days}')

# calculate one-way ANOVA for the groups; the test runs per feature column
fvalues, pvalues = f_oneway(*[feats_by_day[day] for day in anova_days])

# make df from results
anova_results_df = DataFrame(data={'fvalues': fvalues, 'pvalues': pvalues}, 
                                index=traits_scaled_df.columns)
# apply a B&H FDR to pvalues; features without a p-value are treated as
# not significant (p = 1)
anova_results_df['bh_fdr'] = smm.fdrcorrection(anova_results_df.pvalues.fillna(1))[1]

print(anova_results_df.shape)
if DEBUG:
    display(anova_results_df.head())

In [None]:
# shape of the ANOVA results restricted to features with B&H FDR below 0.05;
# the first entry is the number of such features
anova_results_df.loc[anova_results_df['bh_fdr'] < 0.05].shape

#### determine final set of features to use for variance detection
excluding bottom 25% variance features, sex features, tissue elevated features, and cell differentiation features

In [None]:
# features without evidence of change across days. '>=' makes this the exact
# complement of the 'bh_fdr < 0.05' changed set; with '>' a feature sitting
# exactly on 0.05 belonged to neither group
no_change_features = anova_results_df.loc[anova_results_df['bh_fdr'] >= 0.05].index.values
print(len(no_change_features))

# keep features that are unchanged, survived the low variance filter and are
# not on a sex chromosome. Filtering quants_var_df's columns keeps the matrix
# column order; the earlier list(set & set - set) had no stable order between
# kernel sessions
no_change_lookup = set(no_change_features)
sex_chr_lookup = set(sex_chr_feature_ids)
no_change_variance_features = [feat for feat in quants_var_df.columns
                               if feat in no_change_lookup
                               and feat not in sex_chr_lookup]
print(len(no_change_variance_features))

### remodel with new variance feature set

In [None]:
%%time
max_count = int(min(quants_var_df[no_change_variance_features].shape[0], 
                    quants_var_df[no_change_variance_features].shape[1])/2)
print(f'max count is {max_count}')

r2_values, rmse_values = nuf.iterate_model_component_counts(max_count, 
                                                            quants_var_df[no_change_variance_features], 
                                                            'PCA')

#### use max curvature of accuracy to select number of components to use

In [None]:
# component count suggested by the max curvature point of each accuracy curve
knee_rmse = nuf.component_from_max_curve(rmse_values, 'RMSE')
knee_r2 = nuf.component_from_max_curve(r2_values, 'R2')
# use the smaller of the two suggestions
num_comp = min(knee_rmse, knee_r2)
print(num_comp)

#### regenerate the PCA model with the selected number of components

In [None]:
# refit PCA on the no-change variance features with the selected component
# count; pca_df from this fit is what gets saved as the variance covariates
pca_mdl,pca_df,_,_ = nuf.generate_selected_model(num_comp, quants_var_df[no_change_variance_features], 'PCA')
print(f'shape of pca_df is {pca_df.shape}')
print(pca_mdl.explained_variance_ratio_)
if DEBUG:
    display(pca_df.head())

In [None]:
%%time
# attach the sample covariates to the components (left join on the index)
pcs_df = pca_df.merge(covs_df, how='left', left_index=True, right_index=True)
if DEBUG:
    display(pcs_df.head())
# the components are only being checked here, not used as targets later;
# ppscore is run against every PCA component column
covs_target_list = pca_df.columns.to_list()
# covariates reported by the helper for the component targets
# NOTE(review): the selection rule lives in nuf -- confirm there
covs_to_check = nuf.pps_predict_targets(pcs_df, covs_target_list)
nuf.plot_ppscore_matrix(pcs_df, covs_to_check, covs_target_list)

In [None]:
# first two components, with 'day' as hue and 'Batch' as marker style
nuf.plot_pair(pcs_df, 'PCA_0', 'PCA_1', hue_cov='day', style_cov='Batch')

In [None]:
if len(covs_to_check) > 0:
    # covariates first, then the components, each name once. dict.fromkeys
    # de-duplicates while keeping this order; the earlier list(set | set) gave
    # a heatmap whose row/column order could differ between kernel sessions
    heatmap_cols = list(dict.fromkeys([*covs_to_check, *covs_target_list]))
    pcs_dums_covs_df = nuf.dummy_covs_as_needed(pcs_df[heatmap_cols])
    nuf.plot_correlation_heatmap(pcs_dums_covs_df)

#### Visualize the quantification variation covariates, the PCA components, in 2D with MDE and UMAP

In [None]:
%%time
# 2D embeddings of the refit PCA components, with 'day' as hue and 'Batch'
# as marker style taken from the covariates
nuf.show_2d_embed(pca_df, covs_df, type='MDE', hue='day', style='Batch')
nuf.show_2d_embed(pca_df, covs_df, type='UMAP', hue='day', style='Batch')

#### save created variance covars

In [None]:
# standardize the covariates; with_qt=False switches off the helper's 'qt'
# step (presumably the quantile transform -- confirm in nuf)
var_covs_df = nuf.scale_dataframe(pca_df, with_qt=False)
# now save the covariates
var_covs_df.to_csv(var_covs_file)

### adjust the scaled data by the covariates

In [None]:
# the scaled traits and the variance covariates must share one row index
indices_match = traits_scaled_df.index.equals(var_covs_df.index)
if not indices_match:
    print('indices are not equal re-index')
    # restrict both frames to the row labels they have in common
    shared_indices = traits_scaled_df.index.intersection(var_covs_df.index)
    traits_scaled_df = traits_scaled_df.loc[shared_indices, :]
    var_covs_df = var_covs_df.loc[shared_indices, :]

# cell output: whether the two indices agree after the step above
traits_scaled_df.index.equals(var_covs_df.index)

In [None]:
%%time

# adjust the scaled traits by the variance covariates; returns the residuals
# and a frame of per-feature covariate model scores (it has a 'score' column
# that later cells read)
residuals_df, cov_scores_df = nuf.covariate_residuals(traits_scaled_df, var_covs_df)

#take a peek at the data
print(f'residuals shape {residuals_df.shape}')
print(f'scores shape {cov_scores_df.shape}')
if DEBUG:
    display(cov_scores_df.head())
    display(residuals_df.head())

In [None]:
# get a summary of the covariates model scores
print(cov_scores_df.describe())
# matplotlib 3.6 renamed its bundled seaborn styles with a 'seaborn-v0_8-'
# prefix and later releases dropped the old names; use whichever exists
bright_style = ('seaborn-bright' if 'seaborn-bright' in plt.style.available
                else 'seaborn-v0_8-bright')
# look at the distribution of covariate model scores, 
# ie get a sense any feature driven by covariates
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    distplot(cov_scores_df['score'])
    plt.show()

#### how many features have a covariate model score above 0.75

In [None]:
# features whose covariate model score is above 0.75, and the remainder
drop_features = cov_scores_df.loc[cov_scores_df['score'] > 0.75].index.values
keep_features = list(set(residuals_df.columns).difference(drop_features))
# report both counts followed by the shapes of the two frames
for summary_value in (len(drop_features), len(keep_features),
                      residuals_df.shape, cov_scores_df.shape):
    print(summary_value)

### save scaled and covariate adjusted data

In [None]:
%%time 

# persist the scaled and covariate adjusted residuals covering all days
nuf.write_df_to_hdf(residuals_df, adj_quants_file)

#### take a look at the scaled and covariate adjusted data

In [None]:
# compare the scaled values before and after covariate adjustment; no
# feature ID is passed, so the helper chooses which feature to show
nuf.plot_trnsfrm_effect_example(traits_scaled_df, residuals_df)

In [None]:
# find feature with largest score
# Series.max() skips missing scores; the builtin max() can return NaN when a
# NaN is present, which matched no row and made the lookup below fail
top_score = cov_scores_df['score'].max()
large_adj_trait = cov_scores_df.loc[cov_scores_df['score'] == top_score]
print(large_adj_trait)
# on a tie, the first of the top scoring features is used
large_adj_traid_id = large_adj_trait.index.values[0]

# spot check same feature with largest adjustment effect
nuf.plot_trnsfrm_effect_example(traits_scaled_df, residuals_df, large_adj_traid_id)

#### what are the post scaled and covariate adjusted latent variables correlated with

In [None]:
%%time
max_count = int(min(residuals_df[no_change_variance_features].shape[0], 
                    residuals_df[no_change_variance_features].shape[1])/2)
print(f'max count is {max_count}')

r2_values, rmse_values = nuf.iterate_model_component_counts(max_count, 
                                                            residuals_df[no_change_variance_features], 
                                                            'PCA')

#### use max curvature of accuracy to select number of components to use

In [None]:
# component count suggested by the max curvature point of each accuracy curve
knee_rmse = nuf.component_from_max_curve(rmse_values, 'RMSE')
knee_r2 = nuf.component_from_max_curve(r2_values, 'R2')
# num_comp = max(knee_rmse, knee_r2)
# use the smaller of the two suggestions
num_comp = min(knee_rmse, knee_r2)
print(num_comp)

#### regenerate the PCA model with the selected number of components

In [None]:
# refit PCA on the covariate adjusted residuals (no-change variance features
# only) with the selected component count; this overwrites the earlier
# pca_mdl and pca_df
pca_mdl,pca_df,_,_ = nuf.generate_selected_model(num_comp, 
                                                 residuals_df[no_change_variance_features], 
                                                 'PCA')
print(f'shape of pca_df is {pca_df.shape}')
print(pca_mdl.explained_variance_ratio_)
if DEBUG:
    display(pca_df.head())

In [None]:
%%time
# attach the sample covariates to the post-adjustment components (left join
# on the index)
pcs_df = pca_df.merge(covs_df, how='left', left_index=True, right_index=True)
# the components are only being checked here, not used as targets later;
# ppscore is run against every PCA component column
covs_target_list = pca_df.columns.to_list()
# covariates reported by the helper for the component targets
# NOTE(review): the selection rule lives in nuf -- confirm there
covs_to_check = nuf.pps_predict_targets(pcs_df, covs_target_list)
nuf.plot_ppscore_matrix(pcs_df, covs_to_check, covs_target_list)

In [None]:
# first two post-adjustment components, with 'day' as hue and 'Batch' as
# marker style
nuf.plot_pair(pcs_df, 'PCA_0', 'PCA_1', hue_cov='day', style_cov='Batch')

In [None]:
if len(covs_to_check) > 0:
    # covariates first, then the components, each name once. dict.fromkeys
    # de-duplicates while keeping this order; the earlier list(set | set) gave
    # a heatmap whose row/column order could differ between kernel sessions
    heatmap_cols = list(dict.fromkeys([*covs_to_check, *covs_target_list]))
    dums_covs_df = nuf.dummy_covs_as_needed(pcs_df[heatmap_cols])
    nuf.plot_correlation_heatmap(dums_covs_df)

#### Visualize the quantification variation covariates, the PCA components, in 2D with MDE and UMAP

In [None]:
%%time
# 2D embeddings of the post-adjustment components: first with 'Batch' as
# marker style, then with the 'DopaminergicNeurons' covariate as marker size;
# 'day' is the hue in all four
nuf.show_2d_embed(pca_df, covs_df, type='MDE', hue='day', style='Batch')
nuf.show_2d_embed(pca_df, covs_df, type='UMAP', hue='day', style='Batch')
nuf.show_2d_embed(pca_df, covs_df, type='MDE', hue='day', size='DopaminergicNeurons')
nuf.show_2d_embed(pca_df, covs_df, type='UMAP', hue='day', size='DopaminergicNeurons')

In [None]:
# log the current date and time via the shell
!date