## Notebook to do linear mixed effects model analysis of quantified features with repeated measures
- features to be considered, probably should do in separate notebooks
    - feature ~ day + (1| subject)
    - feature ~ DAn fraction + (1|subject)
    
based on running linear mixed effects models in python examples:
- [Comparing R lmer to Statsmodels MixedLM](https://nbviewer.jupyter.org/urls/umich.box.com/shared/static/6tfc1e0q6jincsv5pgfa.ipynb)
- [Linear Mixed Effects Models](https://www.statsmodels.org/devel/examples/notebooks/generated/mixed_lm_example.html)
- [statsmodel mixedlm from formula](https://www.statsmodels.org/dev/generated/statsmodels.regression.mixed_linear_model.MixedLM.from_formula.html)

In [None]:
# print the current date and time via the shell
!date

#### import libraries

In [None]:
from pandas import read_csv, read_hdf, DataFrame
import numpy as np
from os import sched_getaffinity
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.multitest as smm
from seaborn import lmplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import concurrent.futures
from random import sample
import warnings
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# the values set here are empty; example settings are shown in the trailing comments
days = [] # e.g. ['da0', 'da25', 'da65']
modality = ''
exogenous = '' # e.g. 'daynum' or 'DopaminergicNeurons'

In [None]:
# naming
cohort = 'foundin'

# directories
wrk_dir = '/home/jupyter/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
results_dir = f'{wrk_dir}/results'

# in files
# sample covariates table, read with read_csv below
info_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'
# NOTE(review): umaps_file is not referenced anywhere else in the visible notebook
umaps_file = f'{info_dir}/{cohort}.{modality}.umap.covs.csv'
# quantified feature matrix, read with read_hdf below
quants_file = f'{quants_dir}/{cohort}_daALL_{modality}.scaled.adj.hdf5'

# out files
# per-feature model results are written here as CSV
out_file = f'{results_dir}/{cohort}_{modality}_{exogenous}_lmm.csv'

# variables
# covariate columns that must be non-missing for a sample to be kept
other_terms = ['sex', 'Batch', 'PC1', 'PC2', 'PC3']
# the same covariates written as formula terms, appended to every model formula
MDL_OTHER_TERMS = '+ C(sex) + C(Batch) + PC1 + PC2 + PC3'
# when True, cells display extra intermediate tables
DEBUG = False
# when True, only a random subset of test_feature_cnt features is analyzed
TESTING = False
test_feature_cnt = 250
# figure dpi used by plot_regression
dpi_value = 50
# number of CPUs in this process's scheduling affinity set
cpu_count = len(sched_getaffinity(0))

#### analysis functions

In [None]:
def mixed_model(formula: str, df: DataFrame, group_name: str, term: str) -> list:
    """Fit a mixed linear model and report the statistics of one fitted term.

    Args:
        formula: model formula handed to ``MixedLM.from_formula``.
        df: data frame holding the columns the formula refers to.
        group_name: column of ``df`` whose values are used as the groups.
        term: name of the fitted parameter to report.

    Returns:
        ``[coefficient, standard error, number of fitted parameters, p-value]``
        where the coefficient, standard error and p-value are those of ``term``.
    """
    fitted = sm.MixedLM.from_formula(formula, df, groups=df[group_name]).fit()
    coefficient = fitted.params[term]
    std_error = fitted.bse[term]
    param_count = fitted.params.shape[0]
    p_value = fitted.pvalues[term]
    return [coefficient, std_error, param_count, p_value]

def frmt_run_mm(endo: str, exo: str, df: DataFrame, group_name: str) -> list:
    """Build the model formula for one feature, fit it, and return a result row.

    The formula is ``Q("<endo>") ~ <exo>`` followed by the shared covariate
    terms in ``MDL_OTHER_TERMS``.

    Args:
        endo: name of the dependent (quantified feature) column in ``df``.
        exo: name of the independent variable; also the term that is reported.
        df: data frame holding the feature, the independent variable, the
            covariates and the grouping column.
        group_name: column of ``df`` whose values are used as the groups.

    Returns:
        ``[endo, coefficient, standard error, term count, p-value]``; the four
        statistics are all NaN when the model could not be fit.
    """
    this_formula = f'Q("{endo}") ~ {exo} {MDL_OTHER_TERMS}'
    try:
        results = mixed_model(this_formula, df, group_name, exo)
    except Exception:
        # best effort: a feature whose model fails is reported as NaNs so the
        # remaining features still run; Exception (rather than a bare except)
        # lets KeyboardInterrupt and SystemExit propagate
        results = [np.nan, np.nan, np.nan, np.nan]
    return [endo] + results

def compute_fdr(pvalues):
    """Return the B&H FDR adjusted values for the given p-values.

    Only the second element returned by ``smm.fdrcorrection`` is kept.
    """
    _, adjusted_pvalues = smm.fdrcorrection(pvalues)
    return adjusted_pvalues

#### plotting functions

In [None]:
def plot_regression(df: DataFrame, endo_feature, exo_feature: str=None,
                    hue_feature: str='Batch'):
    """Draw a seaborn ``lmplot`` of two columns of ``df``, split by a hue column.

    Args:
        df: data frame holding the columns to plot.
        endo_feature: column plotted on the x-axis.
        exo_feature: column plotted on the y-axis; when None a column is
            picked at random from the columns of ``df`` other than
            ``endo_feature`` and ``hue_feature``.
        hue_feature: column used for the ``hue`` grouping.
    """
    if exo_feature is None:
        # no y-axis column given, so pick one at random; random.sample needs a
        # sequence (sets are rejected from Python 3.11 on), so build a list
        candidates = [col for col in df.columns
                      if col not in (endo_feature, hue_feature)]
        exo_feature = sample(candidates, 1)[0]
    # matplotlib 3.6 renamed its bundled seaborn styles with a 'seaborn-v0_8-'
    # prefix; fall back to the old name on older matplotlib versions
    style_name = 'seaborn-v0_8-bright'
    if style_name not in plt.style.available:
        style_name = 'seaborn-bright'
    with rc_context({'figure.figsize': (9, 9), 'figure.dpi': dpi_value}):
        plt.style.use(style_name)
        lmplot(x=endo_feature, y=exo_feature, hue=hue_feature, data=df)

### generate the IDs of repeated samples to be excluded

In [None]:
repeated_samples_to_exclude = []

# for every requested day, build the IDs of the repeated PPMI3966 samples
for day in days:
    print(day)
    day_excludes = [
        f'{modality}_PPMI3966{replicate}_{day}'
        for replicate in ('B1v1', 'B1v2', 'B1v3', 'B1v4', 'B1v5', 'B1v6',
                          'B1v7', 'B1v8', 'B1v9', 'B2v1', 'B2v2', 'B5v1')
    ]
    repeated_samples_to_exclude.extend(day_excludes)

print(repeated_samples_to_exclude)

### load input data

#### load the known sample covariates

In [None]:
# read the sample info table, using its first column as the index
covs_df = read_csv(info_file, index_col=0)
print(f'covariates shape {covs_df.shape}')
# where an index value occurs more than once keep only its first row
covs_df = covs_df[~covs_df.index.duplicated(keep='first')]
print(f'post duplicate index drop covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### drop info for the repeated samples to be excluded

this will also cause them to be dropped from quantified features via inner merge

In [None]:
# keep only rows whose index is not in the list of repeated sample IDs
covs_df = covs_df[~covs_df.index.isin(repeated_samples_to_exclude)]
print(f'after repeated sample drop covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### subset to include only days specified

In [None]:
# keep only rows whose 'day' value is one of the requested days
covs_df = covs_df.loc[covs_df.day.isin(days)]
print(f'shape of covariates after subset on specified days {covs_df.shape}')
# show how many samples remain per day
display(covs_df.day.value_counts())

#### if the exogenous variable specified is daynum, add a day number column derived from the day value

In [None]:
if exogenous == 'daynum':
    # remove 'da' from the day label and store the rest as an integer (e.g. 'da25' -> 25)
    covs_df['daynum'] = covs_df['day'].str.replace('da','').astype('int32')
    display(covs_df['daynum'].value_counts())

#### drop any samples that are missing one of the covariate terms to be used in modeling

In [None]:
if DEBUG:
    display(covs_df[[exogenous] + other_terms].info())
# indices of the samples with no missing value in the exogenous column or the other model terms
keep_indices = covs_df[[exogenous] + other_terms].dropna().index
# restrict the covariates to those complete samples
covs_df = covs_df.loc[covs_df.index.intersection(keep_indices)]
print(f'after dropping samples with missing data, covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### load the quantification matrix

In [None]:
%%time
# read the quantified feature matrix from the HDF5 file
quants_df = read_hdf(quants_file)
print(f'quantified data shape {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

##### if testing code subset to specified count

In [None]:
if TESTING:
    import random
    # fixed seed so the same feature subset is drawn on every run
    random.seed(42)
    # draw test_feature_cnt column names without replacement
    feature_subset = random.sample(list(quants_df.columns.values), test_feature_cnt)
    # keep only the sampled feature columns
    quants_df = quants_df[feature_subset]
    print(f'shape of quants for testing {quants_df.shape}')
    if DEBUG:
        display(quants_df.head())        

#### merge quantified features with sample information covariate terms

In [None]:
# join features and covariates on their indices; the inner join keeps only
# samples present in both tables
data_df = quants_df.merge(covs_df, how='inner', 
                          left_index=True, right_index=True)
print(f'shape of merged quants and covariates {data_df.shape}')
if DEBUG:
    display(data_df.sample(5))

#### split the features into batches

In [None]:
# the feature names are the columns of the quantification matrix
features = quants_df.columns.values
print(f'number of features to analyze {len(features)}')
# split the features into one partition per available CPU; cpu_count is
# already an integer and array_split expects an integer number of sections,
# so it is passed directly instead of as the float returned by np.floor
features_partioned = np.array_split(features, cpu_count)
print(f'number of feature partions {len(features_partioned)}')
print(f'number of features in the 1st partition {len(features_partioned[0])}')

#### run the mixed effects models

In [None]:
def run_mm_batch(feat_list, ind_var, df, group_name):
    """Run ``frmt_run_mm`` for every feature in ``feat_list``.

    Args:
        feat_list: iterable of feature (dependent variable) names.
        ind_var: independent variable passed to each model.
        df: data frame passed to each model.
        group_name: grouping column passed to each model.

    Returns:
        List with one ``frmt_run_mm`` result row per feature, in input order.
    """
    return [frmt_run_mm(feat_name, ind_var, df, group_name)
            for feat_name in feat_list]

In [None]:
%%time
# column of data_df used as the grouping for the mixed models
grouping = 'sampleid'

fs_list = []
lm_results = []
# ProcessPoolExecutor works for me from Jupyter on Ubuntu but not on CentOS.
# There are reports that ProcessPoolExecutor won't work from Jupyter; that
# looks OS dependent, related to processes being started via spawn instead
# of fork, although CentOS uses fork.
# The ThreadPoolExecutor line below is kept, commented out, as the fallback
# to use just to make it work where the process pool fails.
# NOTE(review): the original note said the cell was switched to
# ThreadPoolExecutor, but the active line uses ProcessPoolExecutor.
with concurrent.futures.ProcessPoolExecutor() as ppe:
# with concurrent.futures.ThreadPoolExecutor() as ppe:
    # submit one task per feature partition
    for batch_features in features_partioned:
        fs_list.append(ppe.submit(run_mm_batch, batch_features, exogenous, data_df, grouping))
# collect each partition's list of result rows
for future in concurrent.futures.as_completed(fs_list):
    lm_results.append(future.result())

In [None]:
# flatten the list of per-partition result lists into one list of result rows
results = [item for sublist in lm_results for item in sublist]

In [None]:
# one row per feature; column order matches the row layout returned by frmt_run_mm
results_df = DataFrame(data=results, columns=['feature', 'coef', 'stderr', 'term_cnt', 'p-value'])
print(f'shape of results {results_df.shape}')
if DEBUG:
    display(results_df.head())

##### if debugging double check a result with full summary info

In [None]:
%%time
if DEBUG:
    endo = results_df.feature[1] # check 2nd result

    # same formula layout as used in frmt_run_mm
    this_formula = f'Q("{endo}") ~ {exogenous} {MDL_OTHER_TERMS}'        

    # re-fit this one feature directly so the full summary can be inspected
    model = sm.MixedLM.from_formula(this_formula, data_df, groups=data_df[grouping])
    result = model.fit()

    # same four values that mixed_model returns, for comparison with results_df
    print([result.params[exogenous], result.bse[exogenous], 
           result.params.shape[0], result.pvalues[exogenous]])
    print(result.summary())

#### calc FDR

In [None]:
# missing p-values (features whose model fit failed) are set to 1 before the
# correction, so those features are still included in the FDR computation
results_df['bh_fdr'] = compute_fdr(results_df['p-value'].fillna(1))

# shape of the subset of results with FDR below 0.05
print(results_df.loc[results_df['bh_fdr'] < 0.05].shape)
if DEBUG:
    display(results_df.head())

In [None]:
# results with FDR below 0.05
sig_df = results_df.loc[results_df['bh_fdr'] < 0.05]
if sig_df.shape[0] > 0:
    # show the rows with the smallest and the largest p-value among them
    display(sig_df.loc[sig_df['p-value'] == min(sig_df['p-value'])])
    display(sig_df.loc[sig_df['p-value'] == max(sig_df['p-value'])])

#### save the simple mixed model results

In [None]:
# write the per-feature results, including the FDR column, without the row index
results_df.to_csv(out_file, index=False)

#### do some example plotting

In [None]:
# only the x-axis column (the exogenous variable) is passed; the y-axis column
# is left unset so plot_regression picks a random column for each call
plot_regression(data_df, exogenous)
plot_regression(data_df, exogenous)

In [None]:
# as above, with a random y-axis column per call, but split by the 'GROUP' column
plot_regression(data_df, exogenous, hue_feature='GROUP')
plot_regression(data_df, exogenous, hue_feature='GROUP')

In [None]:
# print the current date and time via the shell
!date