## Notebook to prep quantified features data
detection, sex check, normalization, and covariates

In [None]:
!date

#### import libraries and notebook variables

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
import random
from umap import UMAP
import ppscore as pps
from sklearn.linear_model import LinearRegression
import scipy.stats as stats
import concurrent.futures

import warnings
warnings.filterwarnings('ignore')

In [None]:
# parameter variables
# cohort, day and cell_type are combined into the build name used in file names
cohort = 'foundin'
cell_type = 'DA'
day = 'da65'
# quantification type; part of the input matrix and sample info file names
quant_type = 'scrn'

In [None]:
# naming
# build name shared by the input matrix and every output file of this notebook
cohort_build = f'{cohort}.{day}.{cell_type}'

# directories
wrk_dir = f'/home/jupyter/sceqtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'

# input files
src_quants_matrix = f'{quants_dir}/{cohort_build}.{quant_type}.hdf5'
covariates_file = f'{info_dir}/{cohort}_{quant_type}_sample_info.csv'
features_file = f'{quants_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
# gene file of genes to exclude from variance detection
# using Human Protein Atlas 'elevated' gene set for brain
tissue_genes_file = f'{quants_dir}/tissue_category_rna_brain_Tissue.tsv'

# output files
umap_covs_file = f'{info_dir}/{cohort_build}.umap.covs.csv'
scaled_quants_file = f'{quants_dir}/{cohort_build}.scaled.hdf5'
adj_quants_file = f'{quants_dir}/{cohort_build}.scaled.adj.hdf5'
tnsrqtl_pheno_file = f'{quants_dir}/{cohort_build}.scaled.adj.bed.gz'

# constant values
# features with a missing rate above this are treated as poorly detected
max_missing_rate = 0.25
# PPScore values at or below this are filtered out of the PPScore tables and heatmap
min_ppscore = 0.05
# absolute Pearson correlations at or below this are hidden in the correlation heatmap
min_pearson = 0.22

# batch/clone labelled ids that are collapsed to their subject id
repeated_id_dict = {'PPMI3966B1': 'PPMI3966', 'PPMI3966B2': 'PPMI3966', 
                    'PPMI3966B3': 'PPMI3966', 'PPMI3966B5': 'PPMI3966',
                    'PPMI3966B5E6': 'PPMI3966', 'PPMI3966B5E8': 'PPMI3966'}
# to match geno's use PPMI3966 Batch3
replace_id_dict = {'PPMI3966B3': 'PPMI3966'}

### notebook functions

#### functions for detection rates calculations and plotting

In [None]:
def calculate_detection_rates(this_df, quant_type, round_percision=1, 
                              min_quant_value=None):
    """Compute per-feature and per-sample missingness rates.

    A value counts as missing when, after rounding to `round_percision`
    decimals, it is less than or equal to `min_quant_value`. When no minimum
    is supplied, the smallest rounded value in the frame is used.

    Args:
        this_df: samples x features DataFrame of quantified values.
        quant_type: label used only in the printed message.
        round_percision: number of decimals used when rounding for the comparison.
        min_quant_value: detection floor; derived from the data when None.

    Returns:
        Tuple of (feature missing rates, sample missing rates), both Series
        rounded to two decimals.
    """
    rounded_df = this_df.round(round_percision)
    floor_value = rounded_df.min().min() if min_quant_value is None else min_quant_value
    print(f'minimun {quant_type} value is {floor_value}')

    # undetected values are set to zero, then zeros are counted along each axis
    zeroed_df = this_df.mask(rounded_df <= floor_value, 0)
    is_missing = zeroed_df.isin({0})
    n_samples, n_features = zeroed_df.shape
    feature_rates = (is_missing.sum(0) / n_samples).round(2)
    sample_rates = (is_missing.sum(1) / n_features).round(2)

    print(f'{len(feature_rates)} features with mean missing rate = {feature_rates.mean()}')
    print(f'{len(sample_rates)} samples with mean missing rate = {sample_rates.mean()}')
    return feature_rates, sample_rates

def plot_missing_rates(feature_rates, sample_rates):
    """Plot the distributions of feature and sample missingness rates side by side.

    Args:
        feature_rates: Series of per-feature missing rates.
        sample_rates: Series of per-sample missing rates.
    """
    sns.set()
    fig = plt.figure(figsize=(12, 12))
    panels = (('Features missingness rates', feature_rates),
              ('Samples missingness rates', sample_rates))
    # the two panels occupy the first row of a 2 x 2 grid
    for position, (panel_title, rates) in enumerate(panels, start=1):
        axis = fig.add_subplot(2, 2, position)
        sns.distplot(rates.values, ax=axis)
        axis.set_title(panel_title)
    plt.show()
    
def bad_callrate_features(features_missing_rates, max_missing_rate):
    """Return the features whose missing rate is above the allowed maximum.

    Args:
        features_missing_rates: Series of missing rates indexed by feature.
        max_missing_rate: features strictly above this rate are returned.

    Returns:
        The subset of `features_missing_rates` exceeding `max_missing_rate`.
    """
    exceeds_limit = features_missing_rates > max_missing_rate
    flagged = features_missing_rates[exceeds_limit]
    bad_fraction = flagged.shape[0]/features_missing_rates.shape[0]
    print(f'features with bad call rates shape {flagged.shape}, '
          f'fraction of features with bad rates {bad_fraction}')
    return flagged

def subset_well_detected_features(this_df, bad_call_rates):
    """Drop the poorly detected features from a quantification frame.

    Columns are kept in their original order so the result is the same from
    one kernel session to the next (iterating a set of strings is not).

    Args:
        this_df: samples x features DataFrame.
        bad_call_rates: Series indexed by the feature ids to remove.

    Returns:
        DataFrame holding only the columns of `this_df` that are not in the
        index of `bad_call_rates`.
    """
    is_well_detected = ~this_df.columns.isin(bad_call_rates.index)
    this_wd_df = this_df.loc[:, is_well_detected]
    print(f'shape of well detected quants {this_wd_df.shape}')
    return this_wd_df

#### functions to generate and visualize known and unknown covariates using UMAP and PPScore

In [None]:
def plot_umap_clusters(umap_df, hue_cov=None, style_cov=None, size_cov=None):
    """Scatter plot of the UMAP coordinates with optional covariate highlights.

    Args:
        umap_df: DataFrame with 'x_umap' and 'y_umap' columns plus any
            covariate columns named by the other arguments.
        hue_cov: column used for point colour, or None.
        style_cov: column used for marker style, or None.
        size_cov: column used for marker size, or None.
    """
    sns.set()
    _, axis = plt.subplots(figsize=(12, 12))
    sns.scatterplot(x='x_umap', y='y_umap', hue=hue_cov, style=style_cov,
                    size=size_cov, data=umap_df, ax=axis)
    axis.set_xlabel('x-umap')
    axis.set_ylabel('y-umap')
    # place the legend outside the axes, anchored at the upper right corner
    axis.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, prop={'size': 10})
    plt.show()

def generate_umap_covs_df(this_df, other_covs_df=None, 
                             rnd_digits=3, merge_input=False):
    """Run UMAP over all columns of a frame and return the embedding as a frame.

    Args:
        this_df: samples x features DataFrame to embed.
        other_covs_df: optional covariates left-joined onto the embedding by index.
        rnd_digits: decimals kept for the embedding coordinates.
        merge_input: when True the input columns are joined onto the embedding.

    Returns:
        DataFrame indexed like `this_df` with 'x_umap' and 'y_umap' columns,
        plus any merged columns.
    """
    # fixed random_state so the embedding is repeatable for identical input
    embedding = UMAP(random_state=42).fit_transform(this_df)
    coords_df = pd.DataFrame(embedding, columns=['x_umap', 'y_umap'],
                             index=this_df.index)
    result_df = coords_df.round(rnd_digits)
    if merge_input:
        result_df = result_df.merge(this_df, left_index=True, right_index=True)
    if other_covs_df is not None:
        result_df = result_df.merge(other_covs_df, how='left',
                                    left_index=True, right_index=True)
    print(f'The dimensions of the umap df and the traits are {result_df.shape}')
    return result_df

def pps_predict_targets(this_df, target_list):
    """Use PPScore to find columns that may predict each target column.

    For every target the predictors table is filtered to rows whose ppscore
    is above the notebook-level `min_ppscore`, displayed, and the predictor
    names are collected.

    Args:
        this_df: DataFrame holding the targets and the candidate predictors.
        target_list: names of the target columns to score against.

    Returns:
        List of predictor column names; a name appears once per target it
        passed the threshold for.
    """
    covs_to_check = []
    for target in target_list:
        print(target)
        scored_df = pps.predictors(this_df, target)
        # keep only predictors scoring above the minimum PPScore
        passing_df = scored_df.loc[scored_df['ppscore'] > min_ppscore]
        display(passing_df)
        covs_to_check += passing_df['x'].tolist()

    print(f'found {len(covs_to_check)} covariates that may preditct target covariates')    
    return covs_to_check

def plot_ppscore_matrix(this_df, covs_to_check, cov_targets):
    """Plot a heatmap of pairwise PPScores for the selected covariates.

    Only scores above the notebook-level `min_ppscore` are shown.

    Args:
        this_df: DataFrame containing all of the named columns.
        covs_to_check: covariate column names to include.
        cov_targets: target column names to include.
    """
    # ordered, de-duplicated column list; pandas does not accept a set as an
    # indexer and a list keeps the column order stable between runs
    selected_covs = list(dict.fromkeys([*covs_to_check, *cov_targets]))
    matrix_df = pps.matrix(this_df[selected_covs])
    # copy so the rounding below does not assign into a filtered view
    matrix_df = matrix_df.loc[matrix_df['ppscore'] > min_ppscore].copy()
    print(matrix_df.shape)

    matrix_df['ppscore'] = matrix_df['ppscore'].round(2)
    plot_matrix_df = matrix_df[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
    print(plot_matrix_df.shape)

    # figsize is (width, height): width follows the columns, height the rows
    n_rows, n_cols = plot_matrix_df.shape
    plt.figure(figsize=(n_cols, n_rows))
    sns.heatmap(plot_matrix_df, vmin=0, vmax=1, cmap='Blues', linewidths=0.05, 
                annot=True, annot_kws={'fontsize':12})
    plt.title('PPScore heatmap')
    plt.show()
    
def plot_correlation_heatmap(this_df, covs_list : list=None):
    """Plot a heatmap of Pearson correlations between covariates.

    Only correlations whose absolute value is above the notebook-level
    `min_pearson` are drawn.

    Args:
        this_df: DataFrame of covariates; non-numeric columns are ignored.
        covs_list: optional list of columns to restrict the heatmap to. When
            given, rows are further limited to those correlated with
            'x_umap' or 'y_umap' above `min_pearson`, so both of those
            columns must be in the list.
    """
    sns.set()
    # correlate numeric/boolean columns only; newer pandas raises when asked
    # to correlate object columns rather than silently skipping them
    numeric_df = this_df.select_dtypes(include=['number', 'bool'])
    cor = numeric_df.corr(method='pearson')
    cor = cor.dropna(how='all')
    modified_title = ''
    if covs_list is not None:
        limited_cor = cor[covs_list]
        cor = limited_cor.loc[(limited_cor['x_umap'].abs() > min_pearson) | 
                              (limited_cor['y_umap'].abs() > min_pearson)]
        modified_title = 'limited'
    print(cor.shape)
    # one inch per column/row with a 12 inch minimum; the height must follow
    # the number of rows, not the number of columns
    fig_width = max(cor.shape[1], 12)
    fig_height = max(cor.shape[0], 12)
    plt.figure(figsize=(fig_width, fig_height))        
    sns.heatmap(cor[(cor > min_pearson) | (cor < -min_pearson)], annot=True, 
                annot_kws={"fontsize":10}, linewidths=0.05, cmap='Blues')    
    plt.title(f'Pearson heatmap of PPScore covariates {modified_title}')
    plt.show()

def dummy_covs_as_needed(this_df):
    """One-hot encode the object-typed columns and append them to the frame.

    The input is not modified. The original columns, including the
    categorical ones, are kept alongside the new indicator columns.

    Args:
        this_df: DataFrame of covariates.

    Returns:
        Copy of `this_df` with the one-hot encoded columns merged on by index.
    """
    covs_copy = this_df.copy()
    categorical_part = covs_copy.select_dtypes(include=['object'])
    print(f'categoricals shape {categorical_part.shape}')
    encoded_part = pd.get_dummies(categorical_part)
    print(f'one-hot encoded categoricals shape {encoded_part.shape}')

    combined_df = covs_copy.merge(encoded_part, how='inner',
                                  left_index=True, right_index=True)
    print(f'new covs df shape {combined_df.shape}')
    return combined_df

#### visualization functions

In [None]:
def plot_trnsfrm_effect_example(before_df, after_df, feature_id=None,
                                bf_label='quantile transformed', 
                                af_label='quantile transformed and covariate adjusted'):
    """Plot one feature before and after a transform.

    Draws the distribution of the feature in each frame, then a scatter of
    the before values against the after values.

    Args:
        before_df: DataFrame holding the feature prior to the transform.
        after_df: DataFrame holding the feature after the transform.
        feature_id: column to plot; a random column of `after_df` when None.
        bf_label: label describing the before data.
        af_label: label describing the after data.
    """
    # if no feature ID provided get a random one
    if feature_id is None:
        feature_id = random.sample(list(after_df.columns), 1)[0]

    for frame, label in ((before_df, bf_label), (after_df, af_label)):
        sns.distplot(frame[feature_id])
        plt.title(f'{feature_id} {label}')
        plt.show()

    sns.scatterplot(x=before_df[feature_id], y=after_df[feature_id])
    plt.title(f'{feature_id}')
    plt.xlabel(f'{bf_label}')
    plt.ylabel(f'{af_label}')

#### analysis functions

In [None]:
def scale_dataframe(this_df : pd.DataFrame):
    """Quantile transform then min-max scale every column of a DataFrame.

    Each column is mapped to a normal output distribution with
    QuantileTransformer and the result is rescaled with MinMaxScaler.

    Args:
        this_df: DataFrame to scale.

    Returns:
        DataFrame of scaled values with the same index and columns as the input.
    """
    gaussianized = QuantileTransformer(output_distribution='normal').fit_transform(this_df)
    rescaled = MinMaxScaler().fit_transform(gaussianized)
    return pd.DataFrame(data=rescaled, columns=this_df.columns,
                        index=this_df.index)

def exclude_low_var_features(this_df: pd.DataFrame, quartile_to_drop: str ='25%', 
                             known_feature_to_drop=None):
    """Drop low variance features, and optionally a known set of features.

    Features whose variance is at or below the chosen quantile of all feature
    variances are removed. Kept columns retain their original order.

    Args:
        this_df: samples x features DataFrame.
        quartile_to_drop: label of the variance quantile used as the cut-off,
            as named in the output of `Series.describe()` ('25%', '50%' or '75%').
        known_feature_to_drop: optional iterable of feature ids to remove as well.

    Returns:
        DataFrame with only the retained feature columns.

    Raises:
        KeyError: if `quartile_to_drop` is not a label in the describe() summary.
    """
    quants_vars = this_df.var() 
    vars_summary = quants_vars.describe()
    print(vars_summary)
    # drop features within the requested lower quantile of variance
    min_variance = vars_summary[quartile_to_drop]
    keep = quants_vars[quants_vars > min_variance]
    drop_ids = set() if known_feature_to_drop is None else set(known_feature_to_drop)
    # a list (not a set) is required as a column indexer and keeps order stable
    keep_ids = [feature for feature in keep.index if feature not in drop_ids]
    quants_wd_var_df = this_df[keep_ids]
    print(f'shape of the features to keep {keep.shape}')
    print(f'shape of input features df {this_df.shape}')
    print(f'shape of variance features df {quants_wd_var_df.shape}')
    return quants_wd_var_df

def covariate_residuals(traits_df, covars_df):
    """Regress each trait on the covariates and return the scaled residuals.

    A linear model is fitted per trait column; the residuals are z-scored
    and the full residual matrix is then min-max scaled.

    Args:
        traits_df: samples x traits DataFrame.
        covars_df: samples x covariates DataFrame, rows in the same order as
            `traits_df`.

    Returns:
        Tuple of (residuals DataFrame with the index and columns of
        `traits_df`, DataFrame of per-trait model scores in a 'score' column
        indexed by 'featureID').
    """
    regressor = LinearRegression(n_jobs=16)
    adjusted_df = traits_df.copy()
    score_by_trait = {}

    for trait_id in traits_df:
        observed = traits_df[trait_id]
        fitted = regressor.fit(covars_df, observed)
        score_by_trait[trait_id] = fitted.score(covars_df, observed)
        # standardized residual = z-score of observed minus predicted
        adjusted_df[trait_id] = stats.zscore(observed - fitted.predict(covars_df))

    # scale the residuals
    scaled_values = MinMaxScaler().fit_transform(adjusted_df)
    adjusted_df = pd.DataFrame(data=scaled_values, columns=traits_df.columns,
                               index=traits_df.index)

    # grab the covariates model scores
    scores_df = pd.DataFrame.from_dict(score_by_trait, columns=['score'],
                                       orient='index').round(3)
    scores_df.index.name = 'featureID'
    return adjusted_df, scores_df


#### input output functions

In [None]:
def write_df_to_hdf(this_df, file_name, key='quants', mode='w'):
    """Write a DataFrame to an HDF5 file.

    Args:
        this_df: DataFrame to save.
        file_name: path of the HDF5 file to write.
        key: identifier of the group within the file.
        mode: file open mode passed to `DataFrame.to_hdf`.
    """
    this_df.to_hdf(path_or_buf=file_name, key=key, mode=mode)

#### load covariates files

In [None]:
# read the sample covariates, using the first column of the file as the index
covs_df = pd.read_csv(covariates_file, index_col=0)
# drop any duplicated indices
# shapes are printed before and after so the number of dropped rows is visible
print(covs_df.shape)
covs_df = covs_df[~covs_df.index.duplicated(keep='first')]
print(covs_df.shape)
# display(covs_df.sample(5))

#### load the feature annotations

In [None]:
%%time
# load the pickled feature annotation table
# NOTE: unpickling can execute arbitrary code, only load annotation files from a trusted source
features_df = pd.read_pickle(features_file)
print(features_df.shape)
# display(features_df.head())

#### load the quantified features matrix

In [None]:
%%time
# load the quantified features matrix; later cells use its index as assay ids
# and its columns as feature ids
# NOTE(review): index_col is not a documented pd.read_hdf argument -- confirm it is needed
quants_df = pd.read_hdf(src_quants_matrix, index_col=0)
print(quants_df.shape)
# display(quants_df.head())

#### split assayid into meta data bits

In [None]:
# split each assay id on '_' into its four metadata fields
id_parts = quants_df.index.str.split('_', expand=True).to_frame()
id_parts.columns = ['assay', 'sampleid', 'cdi', 'day']
# keep the full assay id alongside its parts (assigned positionally)
id_parts['assayid'] = quants_df.index
print(id_parts.shape)
# display(id_parts.sample(5))
# collapse repeated batch/clone sample ids to their subject id; assign the
# result back instead of calling replace(inplace=True) on a column selection,
# which is chained assignment and does not reliably update the frame
id_parts['sampleid'] = id_parts['sampleid'].replace(repeated_id_dict)

#### capture the assayid to wgsid for formatting phenotypes for use with wgs genotypes later

In [None]:
# assay id to sample id map used later to rename phenotype columns;
# take an explicit copy so the replace below does not act on a slice of id_parts
id_map = id_parts[['sampleid', 'assayid']].copy()
# NOTE(review): id_parts['sampleid'] has already had repeated_id_dict applied,
# which contains the replace_id_dict key, so this replace looks like a no-op
# and every repeated assay maps to the same sample id -- confirm intent
id_map['sampleid'] = id_map['sampleid'].replace(replace_id_dict)

#### check to see if missing covariate info

In [None]:
# sample ids present in the quantified data but absent from the covariates PPMI_ID column;
# an empty set means every quantified sample has covariate info
set(id_parts['sampleid']) - set(covs_df['PPMI_ID'])

#### for merging known covariates with umaps will need to add cell labelled assay ids into covariates dataframe

In [None]:
# discard the MultiIndex left over from the assay id split, keeping only the
# named metadata columns
id_parts = id_parts.reset_index(drop=True)
# attach the assay level ids to each subject's covariates
covs_df = covs_df.merge(id_parts, left_on='PPMI_ID', right_on='sampleid')
# index the covariates by assay id so they line up with the quants matrix
covs_df.index = covs_df['assayid']
covs_df = covs_df[~covs_df.index.duplicated(keep='first')]
print(covs_df.shape)
# display(covs_df.head())

In [None]:
# check for any unexpected samples; ie probably name frmt issue
# (assay ids in the quants matrix that have no row in the merged covariates)
set(quants_df.index) - set(covs_df.index)

#### check expected sex of samples

In [None]:
#Vawter MP, Evans S, Choudary P et al. Gender-specific gene expression in 
#post-mortem human brain: localization to sex chromosomes. 
#Neuropsychopharmacology 2004;29:373–84.

sex_specific_genes = ['XIST','RPS4Y1','RPS4Y2','KDM5D','UTY','DDX3Y','USP9Y']
# keep the marker genes that were quantified, in the fixed order listed above;
# a set intersection would give a column order that changes between kernel
# sessions and so a different input to the UMAP embedding
sex_genes_present = [gene for gene in sex_specific_genes if gene in quants_df.columns]
quants_sex_df = quants_df[sex_genes_present]
print(quants_sex_df.shape)

In [None]:
%%time
# embed the samples using only the sex marker genes, merge on the covariates
# and colour the points by the recorded sex
sex_umap_df = generate_umap_covs_df(quants_sex_df, covs_df)
plot_umap_clusters(sex_umap_df, hue_cov='sex')

In [None]:
# count Female samples with x_umap below 0 and Male samples with x_umap above 0
# NOTE(review): the 0 cut-off, and which sex falls on which side, depend on the
# embedding plotted above -- confirm against that plot before relying on these counts
print(sex_umap_df.loc[(sex_umap_df['x_umap'] < 0) & (sex_umap_df['sex'] == 'Female')].shape)
print(sex_umap_df.loc[(sex_umap_df['x_umap'] > 0) & (sex_umap_df['sex'] == 'Male')].shape)

#### double check the age range to make sure no young subjects

In [None]:
# summary statistics of age at baseline; the min shows whether any young subjects are present
covs_df['age_at_baseline'].describe()

#### calculate, plot detection rates and subset well detected features

In [None]:
%%time
# per-feature and per-sample missing rates, using the data-derived minimum as the detection floor
trait_miss_rates, sample_miss_rates = calculate_detection_rates(quants_df, quant_type)
plot_missing_rates(trait_miss_rates, sample_miss_rates)
# features missing in more than max_missing_rate of samples are dropped
bad_call_rate_features = bad_callrate_features(trait_miss_rates, max_missing_rate)
quants_wd_df = subset_well_detected_features(quants_df, bad_call_rate_features)

#### standardize the full dataset using quantile transform

In [None]:
%%time
# quantile transform and min-max scale the well detected features
traits_scaled_df = scale_dataframe(quants_wd_df)

In [None]:
# compare the raw and scaled values of one feature; no feature id is passed
# so a random column of the scaled frame is shown
plot_trnsfrm_effect_example(quants_df, traits_scaled_df,
                            bf_label=quant_type, 
                            af_label='quantile transformed & scaled')

#### save quantile standardized, well detected data for all days

In [None]:
# write the scaled, well detected features under the default 'quants' key
write_df_to_hdf(traits_scaled_df, scaled_quants_file)

#### exclude low variance features and tissue elevated features from covariate generation

In [None]:
# drop the low variance features using the function's default settings
quants_var_df = exclude_low_var_features(traits_scaled_df)

In [None]:
# load the tissue elevated gene set (tab separated)
tissue_features_df = pd.read_csv(tissue_genes_file, sep='\t')
print(tissue_features_df.shape)
# display(tissue_features_df.head())

# features used for covariate generation: the variance filtered features minus
# the sex marker genes and the tissue elevated genes. Built as an ordered list
# because pandas does not accept a set as a column indexer and a list keeps the
# feature order stable between runs
excluded_features = set(sex_specific_genes) | set(tissue_features_df['Gene'])
variance_features = [feature for feature in quants_var_df.columns
                     if feature not in excluded_features]
print(len(variance_features))

### take a look at the data

#### generate unknown covariates and see if known covariates are a source of variation

In [None]:
%%time
# embed the samples on the selected variance features; a boolean column mask is
# used because pandas does not accept a set as a column indexer, and it keeps
# the frame's own column order
umap_df = generate_umap_covs_df(
    quants_var_df.loc[:, quants_var_df.columns.isin(variance_features)], covs_df)
covs_target_list = ['x_umap', 'y_umap']
# find known covariates that may predict the umap coordinates
covs_to_check = pps_predict_targets(umap_df, covs_target_list)
plot_ppscore_matrix(umap_df, covs_to_check, covs_target_list)
# ordered, de-duplicated list of the candidate covariates plus the targets
covs_of_interest = list(dict.fromkeys([*covs_to_check, *covs_target_list]))
umap_dums_covs_df = dummy_covs_as_needed(umap_df[covs_of_interest])
plot_correlation_heatmap(umap_dums_covs_df)
plot_correlation_heatmap(umap_dums_covs_df, covs_target_list)

#### plot umap with known covariates of interest

In [None]:
# colour the embedding by Batch and size the points by EstimatedNumberofCells
plot_umap_clusters(umap_df, hue_cov='Batch', size_cov='EstimatedNumberofCells')

In [None]:
# colour the embedding by RECRUITMENT_CAT and size the points by TotalGenesDetected
plot_umap_clusters(umap_df, hue_cov='RECRUITMENT_CAT', size_cov='TotalGenesDetected')

#### keep created covars and save them

In [None]:
# standardize the covariates
# (only the two umap coordinates are kept as the generated covariates)
umap_covs_df = scale_dataframe(umap_df[covs_target_list])
# now save the covariates
umap_covs_df.to_csv(umap_covs_file)

#### covariate adjust the normalized data by the covariates

In [None]:
# check to see if the df's have the same indices
if not traits_scaled_df.index.equals(umap_covs_df.index):
    print('indices are not equal re-index')
    # reindex returns a new frame, so the result has to be assigned back for
    # the covariates to actually be re-aligned to the traits
    umap_covs_df = umap_covs_df.reindex(traits_scaled_df.index)

# displays True when the covariates are aligned with the traits
traits_scaled_df.index.equals(umap_covs_df.index)

In [None]:
%%time

# regress the umap covariates out of every scaled trait; returns the adjusted
# traits and the per-trait covariate model scores
residuals_df, cov_scores_df = covariate_residuals(traits_scaled_df, umap_covs_df)

#take a peek at the data
print(residuals_df.shape)
print(cov_scores_df.shape)

# print(cov_scores_df.head())

In [None]:
# get a summary of the covariates model scores
print(cov_scores_df.describe())
# look at the distribution of covariate model scores, 
# ie get a sense any feature driven by covariates
# (a higher score means more of that feature is explained by the covariates)
sns.set()
plt.figure(figsize=(6,6))
sns.distplot(cov_scores_df['score'])
plt.show()

#### save quantile normalized and covariate adjusted data

In [None]:
%%time 

# write the covariate adjusted traits under the 'quants' key, overwriting any existing file
residuals_df.to_hdf(adj_quants_file, key='quants', mode='w')

#### take a look at the normalized and covariate adjusted data

In [None]:
# compare one randomly chosen feature before and after covariate adjustment
plot_trnsfrm_effect_example(traits_scaled_df, residuals_df)

In [None]:
# find feature with largest score
# (all rows tied for the maximum are kept; only the first is plotted below)
large_adj_trait = cov_scores_df.loc[cov_scores_df['score'] == max(cov_scores_df['score'])]
print(large_adj_trait)
large_adj_traid_id = large_adj_trait.index.values[0]

# spot check same feature with largest adjustment effect
plot_trnsfrm_effect_example(traits_scaled_df, residuals_df, large_adj_traid_id)

#### what are the post normalization and covariate adjusted umap variables correlated with

In [None]:
%%time
# re-embed using the covariate adjusted traits (this replaces the earlier umap_df)
# and re-check which known covariates may predict the new umap coordinates
umap_df = generate_umap_covs_df(residuals_df, covs_df)
covs_to_check = pps_predict_targets(umap_df, covs_target_list)
plot_ppscore_matrix(umap_df, covs_to_check, covs_target_list)
# umap_dums_covs_df = dummy_covs_as_needed(umap_df[(set(covs_to_check) | 
#                                                   set(covs_target_list))])
# plot_correlation_heatmap(umap_dums_covs_df)
# plot_correlation_heatmap(umap_dums_covs_df, covs_target_list)

In [None]:
# post-adjustment embedding coloured by Batch and sized by EstimatedNumberofCells
plot_umap_clusters(umap_df, hue_cov='Batch', size_cov='EstimatedNumberofCells')

In [None]:
# post-adjustment embedding coloured by RECRUITMENT_CAT and sized by TotalGenesDetected
plot_umap_clusters(umap_df, hue_cov='RECRUITMENT_CAT', size_cov='TotalGenesDetected')

#### since switching to tensorQTL can just use one large transcriptome pheno bed instead of per chrom pheno

In [None]:
%%time

# get feature annots for present features
feature_present_df = features_df.loc[features_df['gene_name'].isin(residuals_df.columns)]
# tensorQTL pheno bed is rows = features and columns = samples
# where first four columns are chr, start, end, phenotype_id, then sample1 ... sampleN

# create dict for renaming columns (samples) from assayid to geno_id
# NOTE(review): if several assay ids map to the same sample id the renamed bed
# will contain duplicate sample columns -- confirm this is intended
sample_col_dict = id_map.set_index('assayid').to_dict()['sampleid']

# transpose the residuals df from sample x feature to feature x sample
tresiduals_df = residuals_df.transpose()

# modify annots
feature_present_df = feature_present_df[['seqname', 'start', 'end', 'gene_name', 'strand']].copy()
feature_present_df = feature_present_df.rename(columns={'seqname': 'chr', 'start': 'fstart', 
                                                        'end': 'fend'})
# for tensorQTL 'end' column is TSS so set appropriately
# (feature start on the '+' strand, feature end otherwise)
feature_present_df['end'] = np.where(feature_present_df['strand'] == '+',  
                                     feature_present_df['fstart'], 
                                     feature_present_df['fend'])
feature_present_df['start'] = feature_present_df['end'] - 1
# there is a feature per transcript, so can be multiple entries per feature, so just keep longest
feature_present_df['length'] = feature_present_df['fend'] - feature_present_df['fstart']
feature_present_df = feature_present_df.sort_values(by=['gene_name', 'length'], 
                                                    ascending=False)
print(feature_present_df.shape)
feature_present_df = feature_present_df.drop_duplicates(subset=['gene_name'], keep='first', 
                                                        ignore_index=True)
feature_present_df = feature_present_df.set_index('gene_name', drop=False)
# align the annotations to the feature order of the transposed residuals
feature_present_df = feature_present_df.reindex(tresiduals_df.index)

# insert the feature annots
tresiduals_df.insert( 0, column='chr', value=feature_present_df['chr'])
tresiduals_df.insert( 1, column='start', value=feature_present_df['start'])
tresiduals_df.insert( 2, column='end', value=feature_present_df['end'])
tresiduals_df.insert( 3, column='phenotype_id', value=feature_present_df['gene_name'])

# if there are any genes that were in quants but not feature annots
# remove these with missing positions; copy so the column assignments below
# act on an independent frame rather than a filtered slice
tresiduals_df = tresiduals_df.loc[~tresiduals_df['chr'].isna()].copy()
# make the positions ints instead of floats
tresiduals_df['start'] = tresiduals_df['start'].astype('int64')
tresiduals_df['end'] = tresiduals_df['end'].astype('int64')

# now rename sample ids in columns
tresiduals_df = tresiduals_df.rename(columns=sample_col_dict)

# NOTE(review): rows are written in feature order, not sorted by chr/start --
# confirm whether the downstream tensorQTL version needs a position sorted bed
tresiduals_df.to_csv(tnsrqtl_pheno_file, index=False, sep='\t', compression='gzip')