#### Notebook to split FOUNDIN-PD SCRN quants by celltype
It will also output a scaled and covariate-adjusted file for the full dataset, i.e. across all defined cell types.

In [None]:
# shell out to print the wall-clock time at the start of the run
!date

#### import libraries

In [None]:
from pandas import read_csv, read_pickle, DataFrame
import nb_util_funcs as nuf
import concurrent.futures
from random import sample
from seaborn import distplot
import matplotlib.pyplot as plt
import statsmodels.stats.multitest as smm
from scipy.stats import f_oneway
from matplotlib.pyplot import rc_context

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# --- naming ---
cohort = 'foundin'
modality = 'SCRN'
day = 'da65'
set_name = f'{cohort}_daALL_{modality}'

# --- directories (all rooted at wrk_dir) ---
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'

# --- input files ---
quants_local_file = f'{quants_dir}/{modality}.avgnormbroad.csv'
features_file = f'{public_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
covariates_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'

# --- output files ---
all_quants_file = f'{quants_dir}/{set_name}.hdf5'
var_covs_file = f'{info_dir}/{set_name}.variance.covs.csv'
# NOTE(review): this extension is spelled 'hd5f' while the other outputs use
# 'hdf5'; looks like a typo but is left as-is in case other notebooks read
# this exact path
scaled_file = f'{quants_dir}/{set_name}.scaled.hd5f'
adj_quants_file = f'{quants_dir}/{set_name}.scaled.adj.hdf5'

# --- thresholds and switches ---
max_missing_rate = 0.965
min_ppscore = 0.05
min_pearson = 0.22
low_var_quartile = '25%'
DEBUG = True
dpi_value = 50

# identifier columns that are stripped from the covariates before modelling
other_id_columns = [
    'sampleid', 'cdi', 'PPMI_ID', 'DZNE_Barcode', 'DZNE_ID',
    'participant_id', 'wgsid', 'PATNO', 'Barcode_LNG',
    'Barcode_DZNE', 'Alternate MRN', 'IID', 'FID', 'fullassayid',
]

# long cell-type label -> abbreviation used in assay ids and file names
cell_abbrvs = {
    'Immature Dopaminergic Neurons': 'iDA',
    'Dopaminergic Neurons': 'DA',
    'Proliferating Floor Plate Progenitors': 'PFPP',
    'Early neuron Progenitor': 'eNP',
    'Ependymal-like Cells': 'ElC',
    'Late neuron Progenitor': 'lNP',
    'Neuroepithelial-like Cells': 'NlC',
}

if DEBUG:
    # echo every resolved path, one 'name = value' line each
    print('\n'.join(f'{label} = {path}' for label, path in (
        ('quants_local_file', quants_local_file),
        ('covariates_file', covariates_file),
        ('features_file', features_file),
        ('all_quants_file', all_quants_file),
        ('var_covs_file', var_covs_file),
        ('scaled_file', scaled_file),
        ('adj_quants_file', adj_quants_file),
    )))

### load the quantified features matrix

In [None]:
%%time
quants_df = read_csv(quants_local_file, sep='\t')
quants_df = quants_df.transpose()
print(f'shape of input matrix {quants_df.shape}')

if DEBUG:
    display(quants_df.head())

#### split name index to find info

In [None]:
# index labels are split on ':' into an assay id and a cell type
id_parts = quants_df.index.str.split(':', expand=True).to_frame()
id_parts.columns = ['assayid', 'cell_type']
# the assay id is in turn split on '_' into four fields
id2_parts = id_parts['assayid'].str.split('_', expand=True)
id2_parts.columns = ['assay', 'sampleid', 'cdi', 'day']
# keep the untouched label, then copy the four parsed fields across
id_parts = id_parts.assign(fullassayid=quants_df.index,
                           **{field: id2_parts[field]
                              for field in ['assay', 'sampleid', 'cdi', 'day']})
print(f'shape of id parts {id_parts.shape}')
if DEBUG:
    display(id_parts.head())

In [None]:
if DEBUG:
    # spot check the rows belonging to one sample-id prefix
    display(id_parts[id_parts['sampleid'].str.startswith('PPMI3966')])

#### what are the cell type counts and day counts (should only be day 65)

In [None]:
# tally rows per cell type, then per day
print(id_parts['cell_type'].value_counts(),
      id_parts['day'].value_counts(),
      sep='\n')

#### replace cell-type name with abbreviation

In [None]:
# Swap the long cell-type labels for their abbreviations.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `id_parts.cell_type` is chained assignment, which pandas may apply to a
# temporary copy (and does under Copy-on-Write), leaving id_parts unchanged.
id_parts['cell_type'] = id_parts['cell_type'].replace(cell_abbrvs)
print(f'shape of id parts {id_parts.shape}')
print(id_parts['cell_type'].value_counts())
if DEBUG:
    display(id_parts.head())

#### now assign assay ID consistent with other modalities

In [None]:
# rebuild the assay id as '<modality>-<cell type>_<sampleid>_<day>'
id_parts['assayid'] = (f'{modality}-' + id_parts['cell_type']
                       + '_' + id_parts['sampleid']
                       + '_' + id_parts['day'])
if DEBUG:
    display(id_parts.head())

In [None]:
if DEBUG:
    # same spot check, now showing the rebuilt assay ids
    display(id_parts[id_parts['sampleid'].str.startswith('PPMI3966')])

#### capture the sampleid to assayid mapping, for formatting phenotypes for use with wgs genotypes later

In [None]:
# two-column lookup: sample id alongside the rebuilt assay id
id_map = id_parts.loc[:, ['sampleid', 'assayid']]

In [None]:
if DEBUG:
    # spot check the lookup for one sample-id prefix
    display(id_map[id_map['sampleid'].str.startswith('PPMI3966')])

In [None]:
if DEBUG:
    # rows per sample id, then rows per assay id
    print(id_map['sampleid'].value_counts(),
          id_map['assayid'].value_counts(),
          sep='\n')

#### replace the quants matrix index with the corrected ID

In [None]:
# replace the raw 'assay:cell type' labels with the rebuilt assay ids
quants_df.index = id_parts.assayid
# Index.set_names returns a new Index rather than renaming in place, so the
# result has to be assigned back for the name to take effect
quants_df.index = quants_df.index.set_names('assayid')
if DEBUG:
    display(quants_df.head())

In [None]:
if DEBUG:
    # any count above 1 here means an assay id appears more than once in the index
    print(quants_df.index.value_counts())

### save the quant matrix in faster file type

In [None]:
%%time
# write the re-indexed quantification matrix to all_quants_file via the project helper
nuf.write_df_to_hdf(quants_df, all_quants_file)

### load covariates files

In [None]:
# first CSV column becomes the index
covs_df = read_csv(covariates_file, index_col=0)
print(f'covariates shape {covs_df.shape}')
# where an index label repeats, only its first row is kept
covs_df = covs_df.loc[~covs_df.index.duplicated()]
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.sample(5))

In [None]:
# sample ids present in the quants but absent from the covariates' PPMI_ID
# column; anything listed here is probably a name-format mismatch
set(id_parts['sampleid']).difference(covs_df['PPMI_ID'])

#### for merging known covariates with umaps will need to add cell labelled assay ids into covariates dataframe

In [None]:
# Discard the MultiIndex left over from the label split. drop=True removes it
# directly, so this step no longer depends on the auto-generated
# 'level_0'/'level_1' column names and does not fail if the cell is re-run.
id_parts = id_parts.reset_index(drop=True)
# attach the per-cell-type assay ids to each sample's covariates
covs_df = covs_df.merge(id_parts, left_on='PPMI_ID', right_on='sampleid')
# index by assay id (the column is kept as well) and keep the first row per id
covs_df.index = covs_df['assayid']
covs_df = covs_df[~covs_df.index.duplicated(keep='first')]
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

In [None]:
if DEBUG:
    # row tallies by sample, by assay id and by cell type
    display(covs_df['sampleid'].value_counts(),
            covs_df['assayid'].value_counts(),
            covs_df['cell_type'].value_counts())

### load feature annotations

In [None]:
%%time
features_df = read_pickle(features_file)
# drop the ont and tag columns
discard_cols = features_df.columns[(features_df.columns.str.startswith('ont')) |
                                   (features_df.columns.str.startswith('tag')) | 
                                   (features_df.columns.str.startswith('havana_')) |                                       
                                   (features_df.columns.str.startswith('gene_alias')) | 
                                   (features_df.columns.str.startswith('transcript_alias'))]
features_df.drop(columns=discard_cols, inplace=True)
# subset to just 'gene' features
features_df = features_df.loc[features_df.feature == 'gene']
# now drop existing feature col so we can use that name
features_df.drop(columns=['feature'], inplace=True)    
features_df.rename(columns={'seqname': 'chrom', 'gene_id': 'feature'}, inplace=True)
print(f'features shape {features_df.shape}')
if DEBUG:
    display(features_df.head())

#### since single-cell features are typically gene names instead of geneIDs see if missing from feature annots

In [None]:
# quantified column names with no matching gene_name in the annotations
genes_missing_in_annots = set(quants_df.columns).difference(features_df['gene_name'])
print(len(genes_missing_in_annots))
# a short set is printed whole, otherwise only 20 of its members are shown
if len(genes_missing_in_annots) < 20:
    print(genes_missing_in_annots)
else:
    print(list(genes_missing_in_annots)[:20])

#### get counts by cell-type

In [None]:
# rows per cell type; cell_types.index is reused by later cells
cell_types = id_parts['cell_type'].value_counts()
for label, n_rows in cell_types.items():
    print(f'{label} {n_rows}')

### save the quantified features matrix and info split by cell-type

In [None]:
%%time
# run the saves in parallel    
with concurrent.futures.ThreadPoolExecutor() as tpe:
    for cell_type in cell_types.index:
        # get id info for samples of this cell type
        cell_df = id_parts.loc[id_parts.cell_type == cell_type]
        # get quantified features for samples of this cell type
        this_quant_df = quants_df[quants_df.index.isin(cell_df.assayid)]
        print(f'{cohort} {cell_type} quants {this_quant_df.shape}')
        # now save these cell type quantified features
        cohort_quant_filename = f'{quants_dir}/{cohort}_{day}_{modality}-{cell_type}.hdf5'
        tpe.submit(nuf.write_df_to_hdf, this_quant_df, cohort_quant_filename) 
        this_covs_df = covs_df[covs_df.index.isin(cell_df.assayid)]
        print(f'{cohort} {cell_type} info {this_covs_df.shape}')
        cohort_covs_filename = f'{info_dir}/{cohort}_{modality}-{cell_type}_sample_info.csv'
        this_covs_df.to_csv(cohort_covs_filename)

#### for further analysis remove the ID columns

In [None]:
print(f'covariates shape {covs_df.shape}')
# Remove the identifier columns. drop() keeps the remaining columns in their
# existing order, whereas selecting via list(set(...)) reorders them
# differently from run to run; errors='ignore' skips any listed name that is
# not present, as the set difference did.
covs_df = covs_df.drop(columns=other_id_columns, errors='ignore')
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### find IDs for features on sex chromosomes, for dropping later

In [None]:
# unique gene names annotated on chrX or chrY
sex_chr_feature_ids = features_df.loc[features_df['chrom'].isin(['chrX', 'chrY']),
                                      'gene_name'].unique()
print(len(sex_chr_feature_ids))

### check expected sex of samples

In [None]:
#Vawter MP, Evans S, Choudary P et al. Gender-specific gene expression in 
#post-mortem human brain: localization to sex chromosomes. 
#Neuropsychopharmacology 2004;29:373–84.

sex_specific_features = ['XIST','RPS4Y1','RPS4Y2','KDM5D','UTY','DDX3Y','USP9Y']
# Keep the marker genes that were actually quantified, in the order declared
# above. A set intersection would return them in an order that changes from
# run to run, which also changes the column order fed to the PCA below.
sex_features_present = [feature for feature in sex_specific_features
                        if feature in quants_df.columns]
print(f'found {len(sex_features_present)} sex features: \n{sex_features_present}')
quants_sex_df = quants_df[sex_features_present].copy()
print(f'sex features matrix shape {quants_sex_df.shape}')

In [None]:
%%time
_,sex_pca_df,_,_ = nuf.generate_selected_model(2, quants_sex_df, 'PCA')
print(f'shape of sex_pca_df is {sex_pca_df.shape}')
nuf.plot_pair(sex_pca_df.merge(covs_df, how='left', 
                               left_index=True, right_index=True),
              'PCA_0', 'PCA_1', hue_cov='sex', style_cov='cell_type')
nuf.plot_pair(sex_pca_df.merge(covs_df, how='left', 
                               left_index=True, right_index=True),
              'PCA_0', 'PCA_1', hue_cov='sex', style_cov='Batch')
if DEBUG:
    display(sex_pca_df.head())

### calculate, plot detection rates and subset well detected features

In [None]:
%%time
# NOTE(review): what each nuf helper does is inferred from its name — confirm in nb_util_funcs
# per-feature (trait) and per-sample missing rates, then a plot of both
trait_miss_rates, sample_miss_rates = nuf.calculate_detection_rates(quants_df, modality)
nuf.plot_missing_rates(trait_miss_rates, sample_miss_rates)
# features flagged against max_missing_rate are removed to give quants_wd_df
bad_call_rate_features = nuf.bad_callrate_features(trait_miss_rates, max_missing_rate)
quants_wd_df = nuf.subset_well_detected_features(quants_df, bad_call_rate_features)

### scale the full dataset using quantile transform and minmax scaler

In [None]:
%%time
# scale the well-detected features; traits_scaled_df is what gets saved,
# modelled and covariate-adjusted in the cells below
traits_scaled_df = nuf.scale_dataframe(quants_wd_df)

In [None]:
# before/after comparison of the scaling
# NOTE(review): the 'before' frame is quants_df, which still contains the
# features removed as poorly detected — confirm the helper only picks a
# feature that exists in both frames
nuf.plot_trnsfrm_effect_example(quants_df, traits_scaled_df,
                                bf_label=modality, 
                                af_label='quantile transformed and scaled')

### save scaled, well detected data for all days

In [None]:
# write the scaled matrix to scaled_file via the project helper
nuf.write_df_to_hdf(traits_scaled_df, scaled_file)

### generate covariates for variance

#### take a look at variance in data, assuming mostly driven by cell-type

#### exclude low variance features from covariate generation

In [None]:
# presumably drops the features whose variance falls in the quartile named by
# low_var_quartile ('25%') — TODO confirm against nb_util_funcs
quants_var_df = nuf.exclude_low_var_features(traits_scaled_df, 
                                             quartile_to_drop=low_var_quartile)

In [None]:
# Features used for the first variance model: everything that survived the
# low-variance filter except the sex-specific marker genes. Sorted so that the
# column order handed to the PCA is the same on every run (plain set iteration
# order changes between kernel sessions).
variance_features = sorted(set(quants_var_df.columns) - set(sex_specific_features))
print(len(variance_features))

### use PCA to model unknown covariates, ie global variance covariates

#### model PCA accuracy with different numbers of components

In [None]:
%%time
max_count = int(min(quants_var_df[variance_features].shape[0], quants_var_df[variance_features].shape[1])/2)
print(f'max count is {max_count}')

r2_values, rmse_values = nuf.iterate_model_component_counts(max_count, quants_var_df[variance_features], 'PCA')

#### use max curvature of accuracy to select number of components to use

In [None]:
# one candidate component count from each accuracy curve
knee_rmse = nuf.component_from_max_curve(rmse_values, 'RMSE')
knee_r2 = nuf.component_from_max_curve(r2_values, 'R2')
# use the smaller of the two candidates
num_comp = min(knee_rmse, knee_r2)
print(num_comp)

#### regenerate the PCA model with the selected number of components

In [None]:
# refit with the selected component count; only the model and the component
# frame (first two return values) are kept
pca_mdl,pca_df,_,_ = nuf.generate_selected_model(num_comp, quants_var_df[variance_features], 'PCA')
print(f'shape of pca_df is {pca_df.shape}')
# first two components, annotated with the covariates joined on the index
nuf.plot_pair(pca_df.merge(covs_df, how='left', left_index=True, right_index=True), 
              'PCA_0', 'PCA_1', hue_cov='cell_type', style_cov='Batch')
print(pca_mdl.explained_variance_ratio_)
if DEBUG:
    display(pca_df.head())

#### Visualize the quantification variation covariates, the PCA components, in 2D with MDE

In [None]:
%%time
# 2D embedding of the components together with the covariates; the plotting
# cell that follows reads its 'LD_1' and 'LD_2' columns
embd_df = nuf.generate_2d_embed_df(pca_df, covs_df)
print(f'embd_df shape is {embd_df.shape}')
if DEBUG:
    display(embd_df.head())

In [None]:
# embedding coloured by cell type, marker style by batch
nuf.plot_pair(embd_df, 'LD_1', 'LD_2', hue_cov='cell_type', style_cov='Batch')

#### do quick anova by cell-type to identify features that change with cell differentiation 

this is since we know differentiation should be the largest source of variation, so figure out which features to exclude to get around it

In [None]:
# split samples by group (cell type)
feats_by_cell = {}
for cell_type in cell_types.index:
    # get id info for samples of this cell type
    cell_df = id_parts.loc[id_parts['cell_type'] == cell_type]
    # get quantified features for samples of this cell type
    this_quant_df = traits_scaled_df[traits_scaled_df.index.isin(cell_df['assayid'])]
    feats_by_cell[cell_type] = this_quant_df
    print(f'{cohort} {cell_type} {this_quant_df.shape}')

# One-way ANOVA per feature across the groups. Every group collected above is
# passed, rather than seven hard-coded abbreviations looked up with .get(),
# which would hand f_oneway a None if a cell type were missing or renamed.
fvalues, pvalues = f_oneway(*feats_by_cell.values())

# make df from results
anova_results_df = DataFrame(data={'fvalues': fvalues, 'pvalues': pvalues}, 
                                index=traits_scaled_df.columns)
# Apply a B&H FDR to the p-values. Features with an undefined test (NaN
# p-value) are left out of the correction and keep a NaN bh_fdr; a NaN fed
# into the step-up adjustment would otherwise propagate to the other features.
tested = anova_results_df['pvalues'].notna()
anova_results_df['bh_fdr'] = float('nan')
if tested.any():
    anova_results_df.loc[tested, 'bh_fdr'] = smm.fdrcorrection(
        anova_results_df.loc[tested, 'pvalues'])[1]

print(anova_results_df.shape)
if DEBUG:
    display(anova_results_df.head())

In [None]:
# shape of the subset of features with BH-adjusted p-value below 0.05
anova_results_df.loc[anova_results_df['bh_fdr'] < 0.05].shape

#### determine final set of features to use for variance detection
excluding bottom variance features, sex features, tissue elevated features 

cannot use the cell difference genes from anova as that is pretty much all of them

In [None]:
# features whose BH-adjusted ANOVA p-value is above 0.05
no_change_features = anova_results_df.loc[anova_results_df['bh_fdr'] > 0.05].index.values
print(len(no_change_features))

# Of those, keep the ones that passed the low-variance filter and are not on a
# sex chromosome. Sorted so that the column order used by the models below is
# the same on every run (plain set iteration order changes between sessions).
no_change_variance_features = sorted(
    (set(no_change_features) & set(quants_var_df.columns)) - set(sex_chr_feature_ids))
print(len(no_change_variance_features))

### remodel with new variance feature set

In [None]:
%%time
max_count = int(min(quants_var_df[no_change_variance_features].shape[0], 
                    quants_var_df[no_change_variance_features].shape[1])/2)
print(f'max count is {max_count}')

r2_values, rmse_values = nuf.iterate_model_component_counts(max_count, 
                                                            quants_var_df[no_change_variance_features], 
                                                            'PCA')

#### use max curvature of accuracy to select number of components to use

In [None]:
# one candidate component count from each accuracy curve
knee_rmse = nuf.component_from_max_curve(rmse_values, 'RMSE')
knee_r2 = nuf.component_from_max_curve(r2_values, 'R2')
# use the smaller of the two candidates
num_comp = min(knee_rmse, knee_r2)
print(num_comp)

#### regenerate the PCA model with the selected number of components

In [None]:
# refit on the reduced feature set; pca_df from this cell is what is later
# standardized and saved as the variance covariates
pca_mdl,pca_df,_,_ = nuf.generate_selected_model(num_comp, quants_var_df[no_change_variance_features], 'PCA')
print(f'shape of pca_df is {pca_df.shape}')
# first two components, annotated with the covariates joined on the index
nuf.plot_pair(pca_df.merge(covs_df, how='left', left_index=True, right_index=True), 
              'PCA_0', 'PCA_1', hue_cov='cell_type', style_cov='Batch')
print(pca_mdl.explained_variance_ratio_)
if DEBUG:
    display(pca_df.head())

In [None]:
%%time
# components joined to the known covariates on the index
pcs_df = pca_df.merge(covs_df, how='left', left_index=True, right_index=True)
if DEBUG:
    display(pcs_df.head())
# the targets are all of the PCA component columns
covs_target_list = pca_df.columns.to_list()
# presumably returns the covariates found predictive of those targets — TODO confirm
covs_to_check = nuf.pps_predict_targets(pcs_df, covs_target_list)
nuf.plot_ppscore_matrix(pcs_df, covs_to_check, covs_target_list)

In [None]:
# first two components, coloured by cell type, marker style by batch
nuf.plot_pair(pcs_df, 'PCA_0', 'PCA_1', hue_cov='cell_type', style_cov='Batch')

In [None]:
# only draw the heatmap when at least one covariate was selected
if len(covs_to_check) > 0:
    # union of the selected covariates and the component columns
    pcs_dums_covs_df = nuf.dummy_covs_as_needed(pcs_df[list(set(covs_to_check) | set(covs_target_list))])
    nuf.plot_correlation_heatmap(pcs_dums_covs_df)

#### Visualize the quantification variation covariates, the PCA components, in 2D with MDE

In [None]:
%%time
# 2D embedding of the refitted components together with the covariates; the
# plotting cells that follow read its 'LD_1' and 'LD_2' columns
embd_df = nuf.generate_2d_embed_df(pca_df, covs_df)
print(f'embd_df shape is {embd_df.shape}')
if DEBUG:
    display(embd_df.head())

In [None]:
# embedding coloured by cell type, marker style by batch
nuf.plot_pair(embd_df, 'LD_1', 'LD_2', hue_cov='cell_type', style_cov='Batch')

In [None]:
# embedding coloured by cell type, marker size by the EstimatedNumberofCells covariate
nuf.plot_pair(embd_df, 'LD_1', 'LD_2', hue_cov='cell_type', size_cov='EstimatedNumberofCells')

In [None]:
# embedding coloured by cell type, marker size by the MeanReadsperCell covariate
nuf.plot_pair(embd_df, 'LD_1', 'LD_2', hue_cov='cell_type', size_cov='MeanReadsperCell')

#### keep created covars and save them

In [None]:
# standardize the covariates
# with_qt=False presumably skips the quantile-transform step of the scaler — TODO confirm
var_covs_df = nuf.scale_dataframe(pca_df, with_qt=False)
# now save the covariates
var_covs_df.to_csv(var_covs_file)

#### adjust the scaled data by the covariates

In [None]:
# check to see in df's have same indices
if not traits_scaled_df.index.equals(var_covs_df.index):
    print('indices are not equal re-index')
    shared_indices = traits_scaled_df.index.intersection(var_covs_df.index)
    traits_scaled_df = traits_scaled_df.loc[shared_indices,]
    var_covs_df = var_covs_df.loc[shared_indices,]    
    
traits_scaled_df.index.equals(var_covs_df.index)   

In [None]:
%%time

# returns the adjusted matrix plus a per-feature frame of covariate model
# scores (a 'score' column is read from it in later cells)
residuals_df, cov_scores_df = nuf.covariate_residuals(traits_scaled_df, var_covs_df)

#take a peek at the data
print(f'residuals shape {residuals_df.shape}')
print(f'scores shape {cov_scores_df.shape}')
if DEBUG:
    display(cov_scores_df.head())
    display(residuals_df.head())

In [None]:
# get a summary of the covariates model scores
print(cov_scores_df.describe())
# look at the distribution of covariate model scores, 
# ie get a sense any feature driven by covariates
with rc_context({'figure.figsize': (8, 8), 'figure.dpi': dpi_value}):
    # matplotlib renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
    # deprecated the old names in 3.6; use whichever name this install has
    bright_style = ('seaborn-v0_8-bright'
                    if 'seaborn-v0_8-bright' in plt.style.available
                    else 'seaborn-bright')
    plt.style.use(bright_style)
    # NOTE(review): seaborn has deprecated distplot; switching to histplot or
    # displot would need a change to the import cell
    distplot(cov_scores_df['score'])
    plt.show()

#### how many features have more than 75% score

In [None]:
# features whose covariate model score is above 0.75, and the complement
drop_features = cov_scores_df.loc[cov_scores_df['score'] > 0.75].index.values
keep_features = list(set(residuals_df.columns).difference(drop_features))
# the two counts, followed by the two frame shapes
print(len(drop_features), len(keep_features),
      residuals_df.shape, cov_scores_df.shape, sep='\n')

### save scaled and covariate adjusted data

In [None]:
%%time 

# write the full residuals matrix (all features) to adj_quants_file
nuf.write_df_to_hdf(residuals_df, adj_quants_file)

#### take a look at the scaled and covariate adjusted data

In [None]:
# before/after comparison of the covariate adjustment
nuf.plot_trnsfrm_effect_example(traits_scaled_df, residuals_df)

In [None]:
# Find the feature(s) with the largest covariate model score.
# Series.max() skips NaN; the builtin max() compares element by element, so a
# NaN score could make it return the wrong value, leaving the selection empty
# and the [0] lookup below failing.
large_adj_trait = cov_scores_df.loc[cov_scores_df['score'] == cov_scores_df['score'].max()]
print(large_adj_trait)
# on a tie the first matching feature is used
large_adj_traid_id = large_adj_trait.index.values[0]

# spot check same feature with largest adjustment effect
nuf.plot_trnsfrm_effect_example(traits_scaled_df, residuals_df, large_adj_traid_id)

#### what are the post scaled and covariate adjusted latent variables correlated with

In [None]:
%%time
max_count = int(min(residuals_df[no_change_variance_features].shape[0], 
                    residuals_df[no_change_variance_features].shape[1])/2)
print(f'max count is {max_count}')

r2_values, rmse_values = nuf.iterate_model_component_counts(max_count, 
                                                            residuals_df[no_change_variance_features], 
                                                            'PCA')

#### use max curvature of accuracy to select number of components to use

In [None]:
# one candidate component count from each accuracy curve
knee_rmse = nuf.component_from_max_curve(rmse_values, 'RMSE')
knee_r2 = nuf.component_from_max_curve(r2_values, 'R2')
# alternative kept for reference: take the larger candidate instead
# num_comp = max(knee_rmse, knee_r2)
# use the smaller of the two candidates
num_comp = min(knee_rmse, knee_r2)
print(num_comp)

#### regenerate the PCA model with the selected number of components

In [None]:
# refit on the covariate-adjusted data; note this rebinds pca_df and pca_mdl
pca_mdl,pca_df,_,_ = nuf.generate_selected_model(num_comp, 
                                                 residuals_df[no_change_variance_features], 
                                                 'PCA')
print(f'shape of pca_df is {pca_df.shape}')
# first two components, annotated with the covariates joined on the index
nuf.plot_pair(pca_df.merge(covs_df, how='left', left_index=True, right_index=True), 
          'PCA_0', 'PCA_1', hue_cov='cell_type', style_cov='Batch')
print(pca_mdl.explained_variance_ratio_)
if DEBUG:
    display(pca_df.head())

In [None]:
%%time
# post-adjustment components joined to the known covariates on the index
pcs_df = pca_df.merge(covs_df, how='left', left_index=True, right_index=True)
# the targets are all of the PCA component columns
covs_target_list = pca_df.columns.to_list()
# presumably returns the covariates found predictive of those targets — TODO confirm
covs_to_check = nuf.pps_predict_targets(pcs_df, covs_target_list)
nuf.plot_ppscore_matrix(pcs_df, covs_to_check, covs_target_list)

In [None]:
# first two post-adjustment components, coloured by cell type, marker style by batch
nuf.plot_pair(pcs_df, 'PCA_0', 'PCA_1', hue_cov='cell_type', style_cov='Batch')

In [None]:
# only draw the heatmap when at least one covariate was selected
if len(covs_to_check) > 0:
    # union of the selected covariates and the component columns
    dums_covs_df = nuf.dummy_covs_as_needed(pcs_df[list(set(covs_to_check) | 
                                                        set(covs_target_list))])
    nuf.plot_correlation_heatmap(dums_covs_df)

#### Visualize the quantification variation covariates, the PCA components, in 2D with MDE

In [None]:
%%time
# 2D embedding of the post-adjustment components together with the covariates;
# the plotting cell that follows reads its 'LD_1' and 'LD_2' columns
embd_df = nuf.generate_2d_embed_df(pca_df, covs_df)
print(f'embd_df shape is {embd_df.shape}')
if DEBUG:
    display(embd_df.head())

In [None]:
# embedding coloured by cell type: first with marker style by batch,
# then with marker size by the EstimatedNumberofCells covariate
nuf.plot_pair(embd_df, 'LD_1', 'LD_2', hue_cov='cell_type', style_cov='Batch')
nuf.plot_pair(embd_df, 'LD_1', 'LD_2', hue_cov='cell_type', size_cov='EstimatedNumberofCells')

In [None]:
# shell out to print the wall-clock time at the end of the run
!date