## Notebook for performing QTS analysis for PD GRS and modality

In [None]:
# shell escape: print the current date/time as the notebook starts
!date

#### import libraries

In [None]:
from pandas import read_csv, DataFrame, read_hdf, get_dummies, concat, Series
import numpy as np
import statsmodels.stats.multitest as smm
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from seaborn import regplot, scatterplot
from sklearn.preprocessing import MinMaxScaler
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# both are empty strings as written; they feed set_name and the input/output
# file names built in the next cell, so they need real values before a run
# (presumably injected by a notebook parameterization tool — TODO confirm)
modality = ''
day = ''

In [None]:
# naming
cohort = 'foundin'
# prefix shared by the quantification input file and the results output file
set_name = f'{cohort}_{day}_{modality}'

# directories
wrk_dir = '/home/jupyter/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
qts_dir = f'{wrk_dir}/qts'

# in files
# per-sample covariates for this modality
covs_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'
# genetic risk scores (file name indicates they are already scaled)
grs_file = f'{info_dir}/{cohort}_grs_scaled.csv'
# quantified features for this day/modality
quants_file = f'{quants_dir}/{set_name}.scaled.adj.hdf5'

# out files
# regression results table written near the end of the notebook
qts_file = f'{qts_dir}/{set_name}.qts.csv'

# constants
# when True, cells display previews of intermediate tables
DEBUG = False
# figure dpi used when plotting
dpi_value = 50
# covariate columns modeled alongside GRS; trimmed further down depending on
# the modality (nonDA vs EstimatedNumberofCells)
covs_columns_to_use = ['female', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 
                       'nonDA', 'Batch_2', 'Batch_3', 'Batch_4', 'Batch_5', 
                       'EstimatedNumberofCells']

#### analysis functions

In [None]:
# functions to run the linear regression
def reg_model(y, x, term):
    """Fit an ordinary least squares model of ``y`` on ``x`` plus an intercept.

    Args:
        y: response values handed to ``sm.OLS``.
        x: predictors; a constant column is added with ``sm.add_constant``.
        term: label of the predictor whose statistics are reported.

    Returns:
        list: ``[coef, stderr, adjusted r2, number of fitted params, p-value]``
        where coef, stderr and p-value are those of ``term``.
    """
    design = sm.add_constant(x)
    fitted = sm.OLS(y, design).fit()
    n_params = fitted.params.shape[0]
    return [fitted.params[term], fitted.bse[term], fitted.rsquared_adj,
            n_params, fitted.pvalues[term]]

# compute B&H FDR for given p-values
def compute_fdr(pvalues):
    """Return the FDR-adjusted p-values produced by ``smm.fdrcorrection``.

    Args:
        pvalues: the raw p-values to correct.

    Returns:
        The second element of the ``fdrcorrection`` result, i.e. the adjusted
        p-values (the first element, the reject flags, is discarded).
    """
    _rejected, adjusted = smm.fdrcorrection(pvalues)
    return adjusted

def grsresgression(cov_df, traits_df, dep_term='', extra_dep_terms=None) -> DataFrame:
    """Regress every column of ``traits_df`` on ``dep_term`` and tabulate the fits.

    Each trait column is passed to ``reg_model`` as the response, with
    ``dep_term`` and any ``extra_dep_terms`` columns of ``cov_df`` as predictors.
    Rows of ``cov_df`` and ``traits_df`` are expected to already be in the same
    order (assumption — verify against the caller).

    Args:
        cov_df: table holding ``dep_term`` and the extra covariate columns.
        traits_df: table with one column per trait to test.
        dep_term: column of ``cov_df`` whose coefficient is reported.
        extra_dep_terms: optional iterable of additional ``cov_df`` column
            names to include in the model; ``None`` fits ``dep_term`` alone.

    Returns:
        DataFrame: one row per trait with columns ``coef``, ``stderr``,
        ``r2_adj``, ``term_cnt``, ``p-value`` and ``bh_fdr``.
    """
    print(f'dependent term is {dep_term}')
    model_terms = [dep_term]
    if extra_dep_terms is not None:
        # extend() accepts any iterable of names (list, tuple, Index, ...)
        model_terms.extend(extra_dep_terms)
        print(f'additional covariate terms {extra_dep_terms}')
    this_cov_df = cov_df[model_terms]
    # reg_model returns a positional list; label the positions before transposing
    stat_names = {0: 'coef', 1: 'stderr', 2: 'r2_adj', 3: 'term_cnt', 4: 'p-value'}
    lm_results = traits_df.apply(lambda trait: reg_model(trait, this_cov_df, dep_term),
                                 result_type='expand').rename(index=stat_names).T
    # apply B&H FDR corrections to results; missing p-values are counted as 1
    lm_results['bh_fdr'] = compute_fdr(lm_results['p-value'].fillna(1))
    return lm_results

In [None]:
#plot the QTS
def plotqts(trait_id, study_name, score_df, traits_df):
    """Plot one trait against the GRS with a regression line, coloured by DX.

    Args:
        trait_id: column of ``traits_df`` to put on the x axis.
        study_name: label used in the figure title.
        score_df: per-sample table that must contain ``GRS`` and ``DX`` columns.
        traits_df: per-sample table of traits, indexed like ``score_df``.
    """
    # keep only the scored samples that have traits, ordered like traits_df
    this_scores_df = score_df.loc[score_df.index.isin(traits_df.index)]
    this_scores_df = this_scores_df.reindex(traits_df.index)

    temp = traits_df.merge(this_scores_df, left_index=True, right_index=True)

    # matplotlib 3.6 renamed 'seaborn-bright' to 'seaborn-v0_8-bright' and later
    # releases dropped the old name; use whichever this installation provides
    style_name = next((name for name in ('seaborn-v0_8-bright', 'seaborn-bright')
                       if name in plt.style.available), None)

    with rc_context({'figure.figsize': (9, 9), 'figure.dpi': dpi_value}):
        if style_name is not None:
            plt.style.use(style_name)
        regplot(x=trait_id, y='GRS', data=temp, ci=95)
        scatterplot(x=trait_id, y='GRS', data=temp, hue='DX')
        plt.xlabel('Trait')
        plt.ylabel('GRS')
        plt.title(f'{trait_id} in {study_name}')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0, prop={'size': 10})
        plt.show()

### load input data

#### load the known sample covariates

In [None]:
# read the sample covariates; the first csv column becomes the index
covs_df = read_csv(covs_file, index_col=0)
print(covs_df.shape)
# an index label may occur more than once; retain only its first row
is_repeat = covs_df.index.duplicated(keep='first')
covs_df = covs_df.loc[~is_repeat]
if DEBUG:
    display(covs_df.head())

#### load and add GRS

In [None]:
# read the genetic risk score table (default integer index; the IID column is
# used as the join key in the next cell)
grs_df = read_csv(grs_file)
print(grs_df.shape)
if DEBUG:
    display(grs_df.head())

In [None]:
# (an earlier variant of this join used wgsid rather than sampleid as the left key)
# turn the covariate index into a regular column, attach the GRS rows whose IID
# equals sampleid (inner join: unmatched samples are dropped), then index the
# result by assayid
covs_with_grs = covs_df.reset_index().merge(grs_df, how='inner',
                                            left_on='sampleid', right_on='IID')
covs_df = covs_with_grs.set_index('assayid')
print(covs_df.shape)
if DEBUG:
    display(covs_df.head())

#### load the quantified modality

In [None]:
%%time
# load the quantified features table for this day/modality
# NOTE(review): index_col is a read_csv option and is not a documented read_hdf
# parameter — confirm whether it has any effect here or can be dropped
quants_df = read_hdf(quants_file, index_col=0)
print(quants_df.shape)
if DEBUG:
    display(quants_df.head())

In [None]:
# quantified samples with no matching covariate row (shown as the cell output)
set(quants_df.index) - set(covs_df.index)

### prep the covariate terms for modeling

#### create a binarized covariate for sex

In [None]:
# 1 where the recorded sex is exactly 'Female', 0 for every other value
is_female = covs_df.sex == 'Female'
covs_df['female'] = is_female.astype('int64')
# show the raw and the binarized counts side by side as a sanity check
display(covs_df.sex.value_counts())
display(covs_df.female.value_counts())

#### create a combined non-DA neuron fraction covariate, or use single-cell info
for SCRN (and PDUI-DA / PDUI-iDA) modalities, don't include the non-DA covariate term; instead use the estimated number of cells

In [None]:
# covariates that only apply to the single-cell derived modalities
sc_covs = ['EstimatedNumberofCells']
if modality.startswith('SCRN') or modality in ['PDUI-DA', 'PDUI-iDA']:
    # single-cell branch: drop the nonDA term and use min-max scaled cell counts.
    # The list is rebuilt rather than edited with .remove() so that re-running
    # this cell does not raise ValueError on an already-removed term.
    covs_columns_to_use = [term for term in covs_columns_to_use if term != 'nonDA']
    scaled_covs = MinMaxScaler().fit_transform(covs_df[sc_covs])
    # only DataFrame is imported from pandas in this notebook (there is no `pd`)
    scaled_df = DataFrame(data=scaled_covs, columns=sc_covs, index=covs_df.index)
    for sc_cov in sc_covs:
        covs_df[sc_cov] = scaled_df[sc_cov]
        if DEBUG:
            print(sc_cov)
            display(covs_df[sc_cov].describe())
else:
    # other modalities: model the non-DA fraction and drop the single-cell terms
    covs_df['nonDA'] = 1 - covs_df.DAn
    covs_columns_to_use = [term for term in covs_columns_to_use if term not in sc_covs]
    if DEBUG:
        display(covs_df.nonDA.describe())
if DEBUG:
    display(covs_df.head())

#### one-hot encode the Batch covariate

In [None]:
# one indicator column per Batch level, with the first level dropped as reference.
# The integer dtype is requested explicitly: pandas >= 2.0 returns boolean
# dummies by default, and mixing bool with float covariates yields an
# object-dtype design matrix that statsmodels refuses to fit.
onehot_batch = get_dummies(covs_df.Batch, drop_first=True, dtype='uint8')
# should have the same index
print(f'indices are equal: {covs_df.index.equals(onehot_batch.index)}')
covs_df = concat([covs_df, onehot_batch], axis=1)
print(f'new covariates shape: {covs_df.shape}')
if DEBUG:
    display(onehot_batch.sample(5))
    display(covs_df.sample(5))

#### drop samples where terms are missing
sometimes estimated cell-fraction is missing

In [None]:
# every term that enters the model must be present for a sample to be kept
required_terms = ['GRS'] + covs_columns_to_use
covs_df = covs_df.dropna(subset=required_terms)
print(f'non-missing covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### verify quants and covs have same indices

In [None]:
def ensure_matched_indices(df1: DataFrame, df2: DataFrame) -> 'tuple[DataFrame, DataFrame]':
    """Restrict two frames to their shared index labels, in the same order.

    The inputs themselves are not modified; the aligned subsets are returned
    and the caller must rebind them.

    Args:
        df1 (pandas.DataFrame): first table.
        df2 (pandas.DataFrame): second table.

    Returns:
        tuple: ``(df1, df2)`` each selected by the intersection of both indices.
    """
    print('reindexing')
    shared_indices = df1.index.intersection(df2.index)
    df1 = df1.loc[shared_indices]
    df2 = df2.loc[shared_indices]
    # tally of position-wise index agreement; all True means the rows line up
    temp = df1.index.values == df2.index.values
    display(Series(temp).value_counts())
    return df1, df2

In [None]:
# keep only samples present in both tables, in the same row order
quants_df, covs_df = ensure_matched_indices(quants_df, covs_df)

### run the QTS (feature ~ GRS regression)

In [None]:
%%time
# regress every column of quants_df on GRS plus the selected covariates
grs_results = grsresgression(covs_df, quants_df, dep_term='GRS', extra_dep_terms=covs_columns_to_use)

In [None]:
# optional peek at ten random result rows
if DEBUG:
    display(grs_results.sample(10))

### save the results files

In [None]:
# write the full results table (traits as the index column) to the qts csv
grs_results.to_csv(qts_file)

In [None]:
# shape of all results, then shape of the subset with B&H FDR <= 0.05
print(grs_results.shape)
print(grs_results.loc[grs_results['bh_fdr'] <= 0.05].shape)

In [None]:
# summary statistics of each result column (shown as the cell output)
grs_results.describe()

In [None]:
# list the traits with B&H FDR <= 0.05
display(grs_results.loc[grs_results['bh_fdr'] <= 0.05])

### plot the most significant results

In [None]:
# pick the trait with the smallest p-value. idxmin() skips missing p-values and
# returns the first label holding the minimum, whereas the builtin min() can
# return NaN when NaNs are present, leaving nothing to select.
this_trait = grs_results['p-value'].idxmin()
print(grs_results.loc[this_trait])
plotqts(this_trait, cohort.upper(),  covs_df, quants_df)

In [None]:
# shell escape: print the current date/time as the notebook finishes
!date