## Notebook to do longitudinal analysis of quantified features over differentiation day
- features to be considered, probably should do in separate notebooks
    - feature ~ day + (1| subject)
    
based on running linear mixed effects models in python examples:
- [Comparing R lmer to Statsmodels MixedLM](https://nbviewer.jupyter.org/urls/umich.box.com/shared/static/6tfc1e0q6jincsv5pgfa.ipynb)
- [Linear Mixed Effects Models](https://www.statsmodels.org/devel/examples/notebooks/generated/mixed_lm_example.html)
- [statsmodel mixedlm from formula](https://www.statsmodels.org/dev/generated/statsmodels.regression.mixed_linear_model.MixedLM.from_formula.html)

In [None]:
# show the current date/time from the shell
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.multitest as smm
import seaborn as sns
import matplotlib.pyplot as plt
import concurrent.futures
from random import sample
import warnings
warnings.filterwarnings(action='once')

In [None]:
# parameters
cohort = 'foundin'
days = ['da0', 'da25', 'da65']
modality = 'ATAC'

# directories
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/caqtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
results_dir = f'{wrk_dir}/results'

# in files
known_covs_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'
umaps_file = f'{info_dir}/{cohort}.{modality}.umap.covs.csv'
quants_file = f'{quants_dir}/{cohort}.{modality}.scaled.adj.hdf5'

# out files

# constants
# os.sched_getaffinity is not available on every platform; where it is missing
# fall back to os.cpu_count(), which may return None, hence the "or 1"
if hasattr(os, 'sched_getaffinity'):
    cpu_count = len(os.sched_getaffinity(0))
else:
    cpu_count = os.cpu_count() or 1
# index renames applied to the known covariates when modality is 'RNAB'
covs_index_assay_id_to_replace = {'RNAB_PPMI3422_0683_da65_v1': 'RNAB_PPMI3422_1260_da65_v1',
'RNAB_PPMI3448_3236_da65_v1': 'RNAB_PPMI3448_2397_da65_v1',
'RNAB_PPMI3451_2397_da65_v1': 'RNAB_PPMI3451_3236_da65_v1',
'RNAB_PPMI3664_6647_da65_v1': 'RNAB_PPMI3664_2833_da65_v1',
'RNAB_PPMI3665_7215_da65_v1': 'RNAB_PPMI3665_4484_da65_v1',
'RNAB_PPMI3953_2833_da65_v1': 'RNAB_PPMI3953_6647_da65_v1',
'RNAB_PPMI4101_4484_da65_v2': 'RNAB_PPMI4101_7215_da65_v2',
'RNAB_PPMI4106_2056_da65_v1': 'RNAB_PPMI4106_0494_da65_v1',
'RNAB_PPMI54991_1260_da65_v1': 'RNAB_PPMI54991_0683_da65_v1'}

#### analysis functions

In [None]:
def mixed_model(formula, df, group_name, term):
    """Fit a linear mixed effects model and summarise one fitted term.

    Args:
        formula: model formula passed to ``sm.MixedLM.from_formula``.
        df: data frame holding the formula variables and the grouping column.
        group_name: column of ``df`` supplied as the ``groups`` argument.
        term: name of the fitted parameter to report.

    Returns:
        list: ``[coefficient, standard error, number of fitted parameters,
        p-value]`` where all but the parameter count refer to ``term``.
    """
    fitted = sm.MixedLM.from_formula(formula, df, groups=df[group_name]).fit()
    coef = fitted.params[term]
    stderr = fitted.bse[term]
    param_count = fitted.params.shape[0]
    pvalue = fitted.pvalues[term]
    return [coef, stderr, param_count, pvalue]

def frmt_run_mm(dep_var, indep_var, df, group_name, hard_covs=False):
    """Build the model formula for one feature and run the mixed model on it.

    Args:
        dep_var: feature (column) name used as the dependent variable; it is
            wrapped in ``Q("...")`` so arbitrary column names can be used.
        indep_var: independent variable whose fitted term is reported.
        df: data frame passed through to ``mixed_model``.
        group_name: grouping column name passed through to ``mixed_model``.
        hard_covs: when True, add the fixed covariates ``C(Batch)``,
            ``ProliferatingFloorPlateProgenitors``, ``x_umap`` and ``y_umap``.

    Returns:
        list: ``[dep_var]`` followed by the four values from ``mixed_model``;
        the four values are NaN when the model run raised an exception.
    """
    # NOTE: this changes the warnings filter for the whole process running it
    warnings.filterwarnings('ignore')
    this_formula = f'Q("{dep_var}") ~ {indep_var}'
    if hard_covs:
        this_formula += (' + C(Batch) + ProliferatingFloorPlateProgenitors'
                         ' + x_umap + y_umap')
    try:
        results = mixed_model(this_formula, df, group_name, indep_var)
    except Exception:
        # a failed fit is recorded as missing rather than aborting the batch;
        # KeyboardInterrupt/SystemExit are deliberately not swallowed here
        results = [np.nan] * 4
    return [dep_var] + results

#### load the known sample covariates

In [None]:
# read the known sample covariates, using the first csv column as the index
known_covs_df = pd.read_csv(known_covs_file, index_col=0)
print(known_covs_df.shape)
# only the RNAB modality has its index ids remapped
if modality == 'RNAB':
    known_covs_df = known_covs_df.rename(index=covs_index_assay_id_to_replace)
# keep the first row for any index value that occurs more than once
is_repeat = known_covs_df.index.duplicated(keep='first')
known_covs_df = known_covs_df[~is_repeat]
# display(known_covs_df.head())

In [None]:
# read the UMAP covariates file, using the first csv column as the index
umaps_df = pd.read_csv(umaps_file, index_col=0)
print(umaps_df.shape)
# display(umaps_df.head())

In [None]:
# combine UMAP and known covariates, keeping only index values present in both
covs_df = pd.merge(umaps_df, known_covs_df, how='inner',
                   left_index=True, right_index=True)
print(covs_df.shape)
# display(covs_df.head())

#### add numeric day column (daynum) parsed from the differentiation day label

In [None]:
# strip the 'da' text from the day label and store the remainder as an int32
covs_df['daynum'] = covs_df['day'].str.replace('da','').astype('int32')

In [None]:
# number of rows at each numeric day value
covs_df['daynum'].value_counts()

#### load the quantification matrix

In [None]:
%%time
quants_df = pd.read_hdf(quants_file, index_col=0)
print(quants_df.shape)
# display(quants_df.head())

#### merge the quantified features with the sample covariates

In [None]:
# attach the covariates to the quantified features; rows are matched on the
# index and only index values present in both frames are kept
data_df = pd.merge(quants_df, covs_df, how='inner',
                   left_index=True, right_index=True)
print(data_df.shape)

#### split the features into batches

In [None]:
# the feature names are the columns of the quantification matrix
features = quants_df.columns.values
print(len(features))
# split the features into cpu_count batches
features_partioned = np.array_split(features, cpu_count)
print(len(features_partioned))
print(len(features_partioned[0]))

#### run just the mixed effects model

In [None]:
def run_mm_batch(feat_list, ind_var, df, group_name, extra_terms=False):
    """Run ``frmt_run_mm`` for every feature in a batch.

    Args:
        feat_list: iterable of feature names to model, one model per feature.
        ind_var: independent variable passed to ``frmt_run_mm``.
        df: data frame passed to ``frmt_run_mm``.
        group_name: grouping column name passed to ``frmt_run_mm``.
        extra_terms: forwarded as the ``hard_covs`` argument of ``frmt_run_mm``.

    Returns:
        list: one ``frmt_run_mm`` result per feature, in ``feat_list`` order.
    """
    return [frmt_run_mm(feat_name, ind_var, df, group_name, extra_terms)
            for feat_name in feat_list]

#### run the simpler model

In [None]:
%%time
indep_var = 'daynum'
grouping = 'sampleid'

fs_list = []
lm_results = []
# I run ProcessPoolExecutor from jupyter on Ubuntu, not working on CentOS
# some threads out there that ProcessPoolExecutor won't work from juypter
# but that looks like OS based and switching process being done via spawn
# instead of fork, but CentOS uses fork
# so switching to ThreadPoolExecutor just to make is work
with concurrent.futures.ProcessPoolExecutor() as ppe:
# with concurrent.futures.ThreadPoolExecutor() as ppe:
    for batch_features in features_partioned:
        fs_list.append(ppe.submit(run_mm_batch, batch_features, indep_var, data_df, grouping))
for future in concurrent.futures.as_completed(fs_list):
    lm_results.append(future.result())

In [None]:
# flatten the per-batch lists into a single list of result rows
results = []
for batch_rows in lm_results:
    results.extend(batch_rows)

In [None]:
# one row per feature, columns in the order frmt_run_mm returns its values
result_columns = ['feature', 'coef', 'stderr', 'term_cnt', 'p-value']
results_df = pd.DataFrame(data=results, columns=result_columns)
print(results_df.shape)
display(results_df.head())

#### calc FDR

In [None]:
# if p-value is NaN (e.g. the model run raised and NaNs were recorded), set value to 1
# assign the filled column back to the frame: fillna(inplace=True) on a column
# selection acts on an intermediate object and, depending on the pandas
# version, warns or leaves results_df unchanged
results_df['p-value'] = results_df['p-value'].fillna(1)
results_df.info()

In [None]:
# multiple testing correction of the per-feature p-values
alpha = 0.05
method = 'fdr_bh'
raw_pvalues = np.array(results_df['p-value'])
test_adjust = smm.multipletests(raw_pvalues, alpha=alpha, method=method)
# the second item returned by multipletests holds the corrected p-values
results_df['bh_fdr'] = test_adjust[1]

passes_fdr = results_df['bh_fdr'] < 0.05
print(results_df.loc[passes_fdr].shape)
display(results_df.head())

In [None]:
# features passing the FDR threshold, then the rows with the smallest and
# largest p-value among them
sig_df = results_df.loc[results_df['bh_fdr'] < 0.05]
sig_pvalues = sig_df['p-value']
# Series.min()/max() give NaN on an empty selection (so nothing is displayed)
# where the builtin min()/max() would raise ValueError
display(sig_df.loc[sig_pvalues == sig_pvalues.min()])
display(sig_df.loc[sig_pvalues == sig_pvalues.max()])

#### save the simple mixed model results

In [None]:
# write only the rows in sig_df (bh_fdr < 0.05) for the simple model
out_file = f'{results_dir}/{cohort}.quants_time_mm.csv'
sig_df.to_csv(out_file, index=False)

#### do some example plotting

In [None]:
# pick the random example from the quantified features only; data_df.columns
# also contains the merged covariate columns, which are not features to plot
test_feature = sample(list(features), 1)[0]
sns.lmplot(x='daynum', y=test_feature, hue='case_control_other_latest', 
           data=data_df, palette='Set1')

In [None]:
# pick the random example from the quantified features only; data_df.columns
# also contains the merged covariate columns, which are not features to plot
test_feature = sample(list(features), 1)[0]
sns.lmplot(x='daynum', y=test_feature, hue='case_control_other_latest', 
           data=data_df, palette='Set1')

### now re-run everything including Batch, ProliferatingFloorPlateProgenitors and UMAP covariates in the mixed model by subject

#### run the model with extra covariates

In [None]:
# summarise the covariate columns of the merged frame (dtypes and non-null counts)
print(data_df.shape)
data_df[covs_df.columns].info()

In [None]:
# extra covariates are 'Batch', 'ProliferatingFloorPlateProgenitors', some are null remove those rows
data_df = data_df.loc[(~data_df['Batch'].isna()) & 
                      (~data_df['ProliferatingFloorPlateProgenitors'].isna())]
print(data_df.shape)

In [None]:
# # test a single model run
# dep_var = 'chr4_128600128_128602552'
# indep_var = 'daynum'
# group_name = 'sampleid'
# this_formula = f'Q("{dep_var}") ~ {indep_var} + Batch + \
# ProliferatingFloorPlateProgenitors + x_umap + y_umap'        
# results = mixed_model(this_formula, data_df, group_name, indep_var)
# print(results)

# model = sm.MixedLM.from_formula(this_formula, data_df, groups=data_df[group_name])
# result = model.fit()
# print(result.summary())

In [None]:
%%time
indep_var = 'daynum'
grouping = 'sampleid'

fs_list = []
lm_results = []
with concurrent.futures.ProcessPoolExecutor() as ppe:
# with concurrent.futures.ThreadPoolExecutor() as ppe:
    for batch_features in features_partioned:
        fs_list.append(ppe.submit(run_mm_batch, batch_features, indep_var, data_df, grouping, True))
for future in concurrent.futures.as_completed(fs_list):
    lm_results.append(future.result())

In [None]:
# flatten the per-batch lists into a single list of result rows
results = []
for batch_rows in lm_results:
    results.extend(batch_rows)

In [None]:
# one row per feature, columns in the order frmt_run_mm returns its values
result_columns = ['feature', 'coef', 'stderr', 'term_cnt', 'p-value']
results_df = pd.DataFrame(data=results, columns=result_columns)
print(results_df.shape)
display(results_df.head())

#### calc FDR

In [None]:
# if p-value is NaN (e.g. the model run raised and NaNs were recorded), set value to 1
# assign the filled column back to the frame: fillna(inplace=True) on a column
# selection acts on an intermediate object and, depending on the pandas
# version, warns or leaves results_df unchanged
results_df['p-value'] = results_df['p-value'].fillna(1)
results_df.info()

In [None]:
# multiple testing correction of the per-feature p-values
alpha = 0.05
method = 'fdr_bh'
raw_pvalues = np.array(results_df['p-value'])
test_adjust = smm.multipletests(raw_pvalues, alpha=alpha, method=method)
# the second item returned by multipletests holds the corrected p-values
results_df['bh_fdr'] = test_adjust[1]

passes_fdr = results_df['bh_fdr'] < 0.05
print(results_df.loc[passes_fdr].shape)
display(results_df.head())

#### inspect and save the covariate-adjusted mixed model results

In [None]:
# features passing the FDR threshold, then the rows with the smallest and
# largest p-value among them
sig_df = results_df.loc[results_df['bh_fdr'] < 0.05]
sig_pvalues = sig_df['p-value']
# Series.min()/max() give NaN on an empty selection (so nothing is displayed)
# where the builtin min()/max() would raise ValueError
display(sig_df.loc[sig_pvalues == sig_pvalues.min()])
display(sig_df.loc[sig_pvalues == sig_pvalues.max()])

In [None]:
# write only the rows in sig_df (bh_fdr < 0.05) for the model with extra covariates
out_file = f'{results_dir}/{cohort}.quants_time_cov_mm.csv'
sig_df.to_csv(out_file, index=False)

#### do some example plotting

In [None]:
# pick the random example from the quantified features only; data_df.columns
# also contains the merged covariate columns, which are not features to plot
test_feature = sample(list(features), 1)[0]
sns.lmplot(x='daynum', y=test_feature, hue='Batch', 
           data=data_df, palette='Set1')

In [None]:
# pick the random example from the quantified features only; data_df.columns
# also contains the merged covariate columns, which are not features to plot
test_feature = sample(list(features), 1)[0]
sns.lmplot(x='daynum', y=test_feature, hue='Batch', 
           data=data_df, palette='Set1')

#### check the simple mixed model results against the mixed model results also adjusted for Batch, ProliferatingFloorPlateProgenitors and UMAP covariates

In [None]:
# reload both saved result files and keep the rows below the FDR cutoff
mm_file = f'{results_dir}/{cohort}.quants_time_mm.csv'
mm_cov_file =  f'{results_dir}/{cohort}.quants_time_cov_mm.csv'

# simple model
mm_results_df = pd.read_csv(mm_file)
mm_results_df = mm_results_df[mm_results_df['bh_fdr'] < 0.05]
print(mm_results_df.shape)

# model with the extra covariates
mm_cov_results_df = pd.read_csv(mm_cov_file)
mm_cov_results_df = mm_cov_results_df[mm_cov_results_df['bh_fdr'] < 0.05]
print(mm_cov_results_df.shape)

In [None]:
# feature sets from each model, built once and then compared
mm_features = set(mm_results_df['feature'])
mm_cov_features = set(mm_cov_results_df['feature'])

# features found by both models
in_both = mm_features.intersection(mm_cov_features)
print(len(in_both))
# features found only by the simple model
in_just_mm = mm_features.difference(mm_cov_features)
print(len(in_just_mm))
# features found only by the model with extra covariates
in_just_mm_cov = mm_cov_features.difference(mm_features)
print(len(in_just_mm_cov))

In [None]:
both_file = f'{results_dir}/{cohort}.quants_time_mm_and_mmcov.csv'
just_mm_file = f'{results_dir}/{cohort}.quants_time_mm_only.csv'
just_mmcov_file = f'{results_dir}/{cohort}.quants_time_mmcov_only.csv'

# each output is (source results, feature set to keep, destination file);
# shared and simple-model-only rows come from the simple model results
comparison_outputs = (
    (mm_results_df, in_both, both_file),
    (mm_results_df, in_just_mm, just_mm_file),
    (mm_cov_results_df, in_just_mm_cov, just_mmcov_file),
)
for source_df, keep_features, out_path in comparison_outputs:
    source_df.loc[source_df['feature'].isin(keep_features)].to_csv(out_path, index=False)

In [None]:
# first rows of the filtered simple model results
mm_results_df.head()

In [None]:
# first rows of the filtered results from the model with extra covariates
mm_cov_results_df.head()

In [None]:
# pair up the two result sets per feature; the remaining shared column names
# get the _x (simple model) and _y (extra covariates model) suffixes
both_df = pd.merge(mm_results_df, mm_cov_results_df, how='inner', on='feature')
print(both_df.shape)

In [None]:
# negative log10 of the p-value, and the coefficient scaled by its standard error
results_df['log10_pvalue'] = -np.log10(results_df['p-value'])
results_df['score'] = results_df['coef'].div(results_df['stderr'])

In [None]:
# coef_x / coef_y are the merge-suffixed coefficient columns of both_df
sns.scatterplot(x='coef_x', y='coef_y', data=both_df)

In [None]:
# first rows of the merged comparison frame
display(both_df.head())

In [None]:
# keep the features whose coefficient has the same sign in both models
both_same_dir = both_df.loc[((both_df['coef_x'] > 0) & (both_df['coef_y'] > 0)) | 
                            ((both_df['coef_x'] < 0) & (both_df['coef_y'] < 0))]
print(both_same_dir.shape)
same_dir_pvalues = both_same_dir['p-value_y']
# Series.min()/max() give NaN on an empty selection (so nothing is displayed)
# where the builtin min()/max() would raise ValueError
display(both_same_dir.loc[same_dir_pvalues == same_dir_pvalues.min()])
display(both_same_dir.loc[same_dir_pvalues == same_dir_pvalues.max()])

#### do some example plotting

In [None]:
# pick the random example from the quantified features only; data_df.columns
# also contains the merged covariate columns, which are not features to plot
test_feature = sample(list(features), 1)[0]
sns.lmplot(x='daynum', y=test_feature, hue='Batch', 
           data=data_df, palette='Set1')

In [None]:
# pick the random example from the quantified features only; data_df.columns
# also contains the merged covariate columns, which are not features to plot
test_feature = sample(list(features), 1)[0]
sns.lmplot(x='daynum', y=test_feature, hue='Batch', 
           data=data_df, palette='Set1')