## Notebook that pulls together known subject and sample covariates, cleans them up as necessary, and writes them to a single file

In [None]:
# print the current date/time (shell `date`) into the notebook output
!date

#### import libraries

In [None]:
import pandas as pd
import ppscore as pps
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

#### set notebook variables

In [None]:
# parameter variables
cohort = 'foundin'
modality = 'ATAC'

# directories
wrk_dir = f'/home/jupyter/{cohort}/eqtl'
info_dir = f'{wrk_dir}/sample_info'

# input files
subject_info_file = f'{info_dir}/amppd_demographicsPlus_2019_v1release_1015.csv'
cell_info_file = f'{info_dir}/cell_metadata.csv'
genos_pca_file = f'{info_dir}/foundin.freeze9.pca.eigenvec'
cell_fracs_file = f'{info_dir}/rnab_cell_fracs_scaden.csv'
# the assay metrics file differs per modality; fail here with a clear message
# rather than with a NameError in a later cell when the modality is unsupported
assay_metrics_names = {'RNAB': 'foundin_rnab_seqqc_metrics.txt',
                       'ATAC': 'foundin_atac_metrics.csv',
                       'SCRN': 'COVARIATES_BATCH.txt'}
if modality not in assay_metrics_names:
    raise ValueError(f'unsupported modality {modality!r}, expected one of '
                     f'{sorted(assay_metrics_names)}')
assay_metrics_file = f'{info_dir}/{assay_metrics_names[modality]}'
subj_overview_file = f'{info_dir}/Expanded_overview_of_included_PPMI_samples_overview.csv'
subj_grs_file = f'{info_dir}/Expanded_overview_of_included_PPMI_samples_GRS.csv'

# output files
assay_covs_files = f'{info_dir}/foundin_{modality}_sample_info.csv'

# constants
# a covariate missing in more than this fraction of samples is dropped
max_cov_missing_rate = 0.5
# sample IDs of the repeated sample mapped back to the single subject ID
repeated_id_dict = {'PPMI3966B1': 'PPMI3966', 'PPMI3966B2': 'PPMI3966', 
                    'PPMI3966B3': 'PPMI3966', 'PPMI3966B5': 'PPMI3966'}

# assay IDs as found in the metrics file (key) and the ID to use instead (value)
covs_index_assay_id_to_replace = {f'{modality}_PPMI3422_0683_da65_v1': f'{modality}_PPMI3422_1260_da65_v1',
                                  f'{modality}_PPMI3448_3236_da65_v1': f'{modality}_PPMI3448_2397_da65_v1',
                                  f'{modality}_PPMI3451_2397_da65_v1': f'{modality}_PPMI3451_3236_da65_v1',
                                  f'{modality}_PPMI3664_6647_da65_v1': f'{modality}_PPMI3664_2833_da65_v1',
                                  f'{modality}_PPMI3665_7215_da65_v1': f'{modality}_PPMI3665_4484_da65_v1',
                                  f'{modality}_PPMI3953_2833_da65_v1': f'{modality}_PPMI3953_6647_da65_v1',
                                  f'{modality}_PPMI4101_4484_da65_v2': f'{modality}_PPMI4101_7215_da65_v2',
                                  f'{modality}_PPMI4106_2056_da65_v1': f'{modality}_PPMI4106_0494_da65_v1',
                                  f'{modality}_PPMI54991_1260_da65_v1': f'{modality}_PPMI54991_0683_da65_v1'}

#### load cell line info

In [None]:
# read the cell line metadata and report its dimensions
cell_info = pd.read_csv(cell_info_file)
print(cell_info.shape)
# prefix the ID with 'PPMI' so it can be matched to the 'PPMI...' subject IDs
cell_info = cell_info.assign(PPMI_ID='PPMI' + cell_info['PPMI_ID'].astype(str))
# display(cell_info.sample(10))

#### load subject info file, from AMP-PD, and merge with cell info

In [None]:
# read the AMP-PD subject info; keep the original participant ID as 'wgsid'
# and rewrite participant_id from the 'PP-' form to the 'PPMI' form
subj_info = pd.read_csv(subject_info_file)
amppd_ids = subj_info['participant_id']
subj_info = subj_info.assign(wgsid=amppd_ids,
                             participant_id=amppd_ids.str.replace('PP-', 'PPMI'))
print(subj_info.shape)
# display(subj_info.head())

# left-join the subject info onto the cell line info by subject ID
info_df = cell_info.merge(subj_info, how='left',
                          left_on='PPMI_ID', right_on='participant_id')
print(info_df.shape)
# display(info_df.head())

#### load the FOUNDIN subject overview files and merge

In [None]:
# read the subject overview table and build a 'PPMI'-prefixed ID from PATNO
overview_df = pd.read_csv(subj_overview_file)
overview_df = overview_df.assign(PPMI_ID='PPMI' + overview_df['PATNO'].astype(str))
print(overview_df.shape)
# display(overview_df.head())

In [None]:
# read the GRS table and derive the subject ID by turning 'PPMISI' in IID into 'PPMI'
grs_df = pd.read_csv(subj_grs_file)
grs_df = grs_df.assign(PPMI_ID=grs_df['IID'].str.replace('PPMISI', 'PPMI'))
print(grs_df.shape)
# display(grs_df.head())

In [None]:
# these originated from same file, think they have duplicated columns
print(set(overview_df.columns) & set(grs_df.columns))
# DataFrame.info() prints its report and returns None, so call it directly;
# wrapping it in display() only adds a stray 'None' to the output
overview_df.info()
grs_df.info()

In [None]:
# columns to remove from the GRS table before it is merged with the overview table
cols_to_drop = ['RECRUITMENT_CAT', 'exclude', 'DESCRP_CAT', 'IID', 'PHENO', 'NOTE']
# reassign rather than drop in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError
grs_df = grs_df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# couple of the categoricals look like they have decent amount of missing
check_these_columns = ['DESCRP_CAT', 'mutation', 'Relatives', 'exclude']

for this_col in check_these_columns:
    print(this_col)
    # dropna=False so the number of missing values is listed with the categories
    print(overview_df[this_col].value_counts(dropna=False))

In [None]:
# so those missing can be filled with None category instead of missing
# assign the filled columns back to the frame: an in-place fillna on a selected
# column may act on a temporary copy and leave overview_df unchanged
overview_df[check_these_columns] = overview_df[check_these_columns].fillna('None')

In [None]:
# left-join the GRS columns onto the overview table by subject ID
overview_df = pd.merge(overview_df, grs_df, how='left', on='PPMI_ID')
print(overview_df.shape)
# display(overview_df.sample(5))

In [None]:
# left-join the combined overview/GRS table onto the main info table by subject ID
info_df = pd.merge(info_df, overview_df, how='left', on='PPMI_ID')
print(info_df.shape)
# display(info_df.sample(5))

#### load and merge in the genetics PCs

In [None]:
# whitespace-delimited eigenvec file, indexed by its second column;
# raw string so the regex '\s+' is not parsed as an invalid string escape
genetic_components_df = pd.read_csv(genos_pca_file, sep=r'\s+', index_col=1)
genetic_components_df = genetic_components_df.drop(columns=['#FID'])
print(genetic_components_df.shape)

# merge genetics PCs with other info, matching the PCs index to 'wgsid'
info_df = info_df.merge(genetic_components_df, how='left', left_on='wgsid', right_index=True)
print(info_df.shape)
# display(info_df.head())

#### load the sample assays metrics info

In [None]:
# load the sample QC info
# NOTE(review): a tab separator is used for every modality, including the ATAC
# metrics file whose name ends in .csv — confirm that file is tab-delimited
metrics_info = pd.read_csv(assay_metrics_file, sep='\t', index_col=0)
if modality == 'SCRN':
    # the SCRN table is transposed after loading and reduced to these metrics
    cols_to_keep = ['Estimated.Number.of.Cells', 'Mean.Reads.per.Cell', 
                    'Total.Genes.Detected', 'Median.UMI.Counts.per.Cell']
    metrics_info = metrics_info.transpose()[cols_to_keep]
print(metrics_info.shape)
# display(metrics_info.head())

In [None]:
# replace assay IDs in the metrics index using the configured replacement map
metrics_info = metrics_info.rename(index=covs_index_assay_id_to_replace)

#### split sample name index into constituent bits

In [None]:
# split each '_'-delimited assay ID from the metrics index into named parts
col_names = ['assay', 'sampleid', 'cdi', 'day', 'version']
id_parts = metrics_info.index.str.split('_', expand=True).to_frame(index=False, name=col_names)
id_parts['assayid'] = metrics_info.index
print(id_parts.shape)
# display(id_parts.sample(5))
# fix sampleid for repeated sample
# assign the result back: an in-place replace on a selected column may act on
# a temporary copy and leave id_parts unchanged
id_parts['sampleid'] = id_parts['sampleid'].replace(repeated_id_dict)

#### get counts by day

In [None]:
# number of assay IDs for each value of the 'day' part
day_labels = id_parts['day']
day_labels.value_counts()

#### merge the split assay IDs bits onto the other info

In [None]:
# attach the subject/cell info to every assay ID, keep the first row per
# assay ID, and index the result by assay ID
info_df = (
    id_parts
    .merge(info_df, how='left', left_on='sampleid', right_on='PPMI_ID')
    .drop_duplicates(subset=['assayid'])
    .set_index('assayid', drop=True)
)
print(info_df.shape)
# display(info_df.head())

#### now merge the assay QC/metrics with rest of info by assay's sample id

In [None]:
# left-join the assay metrics onto the info table, index (assay ID) to index
info_df = pd.merge(info_df, metrics_info, how='left',
                   left_index=True, right_index=True)
print(info_df.shape)
# display(info_df.head())

#### load the cell fractions and merge with other info

In [None]:
# read the cell fraction estimates, indexed by the first column
cfracs_df = pd.read_csv(cell_fracs_file, index_col=0)
print(cfracs_df.shape)

# left-join the cell fractions onto the info table, index to index
info_df = pd.merge(info_df, cfracs_df, how='left',
                   left_index=True, right_index=True)
print(info_df.shape)
# display(info_df.head())

#### check the dtypes and fix as necessary

In [None]:
# list every column with its dtype and non-null count; verbose=True forces the
# per-column listing even for wide frames ('display.max_rows' does not control
# info()), and info() is called directly because it prints and returns None
# display(info_df.dtypes)
info_df.info(verbose=True)

#### get rid of the columns that have a single value or a lot of missingness

In [None]:
# collect columns that carry no usable information:
#  - more than max_cov_missing_rate of the values missing
#  - a single unique value
#  - object column where every row has a different value (ID-like)
cols_to_drop = []
n_samples = info_df.shape[0]
for this_col in info_df.columns:
    try:
        col_values = info_df[this_col]
        percent_miss = col_values.isna().sum() / n_samples
        if percent_miss > max_cov_missing_rate:
            drop_col = True
        else:
            total_unique = len(col_values.unique())
            drop_col = total_unique == 1 or (total_unique == n_samples
                                             and col_values.dtype == 'object')
    except Exception as err:
        # a column that cannot be evaluated is still dropped, but say why
        # instead of hiding the error; unlike a bare except this does not
        # catch KeyboardInterrupt/SystemExit
        print(f'could not evaluate column {this_col!r}, dropping it: {err!r}')
        drop_col = True

    if drop_col:
        cols_to_drop.append(this_col)


print(cols_to_drop)

In [None]:
# tally of the values in the '_y'-suffixed ethnicity column
ethnicity_values = info_df['ethnicity_y']
ethnicity_values.value_counts()

In [None]:
# fix those assay metrics colnames that have preceding spaces or other
# awkward characters
if modality == 'RNAB':
    cols_new_names = {' Proper Pairs': 'ProperPairs', ' Assigned': 'Assigned', 
                      'M Assigned': 'MAssigned', ' Aligned': 'Aligned', 
                      'M Aligned': 'MAligned', ' Aligned.1': 'Aligned.1', 
                      'M Aligned.1': 'MAligned.1', ' Dups': 'Dups', ' GC': 'GC',
                      'M Seqs': 'MSeqs'}
elif modality == 'SCRN':
    cols_new_names = {'Estimated.Number.of.Cells': 'EstimatedNumberofCells',
                      'Mean.Reads.per.Cell': 'MeanReadsperCell',
                      'Total.Genes.Detected': 'TotalGenesDetected',
                      'Median.UMI.Counts.per.Cell': 'MedianUMICountsperCell'}
else:
    # no renaming needed for the other modalities
    cols_new_names = {}
info_df.rename(columns=cols_new_names, inplace=True)
# cols_to_drop was collected before this rename; translate its entries to the
# new names so the later drop does not raise a KeyError on a renamed column
cols_to_drop = [cols_new_names.get(col, col) for col in cols_to_drop]

In [None]:
# rows with no WGS ID; these cannot have genetic PCs merged in
no_wgs_mask = info_df['wgsid'].isna()
info_df.loc[no_wgs_mask]

In [None]:
# which samples are missing the modality's check column: the estimated cell
# count for SCRN, the DopaminergicNeurons value otherwise
check_col = 'EstimatedNumberofCells' if modality == 'SCRN' else 'DopaminergicNeurons'
missing_rows = info_df.loc[info_df[check_col].isna()]
print(missing_rows.shape)
display(missing_rows)
print(missing_rows.index)

#### for later cell-type specific analyses combine DopaminergicNeurons and ImmatureDopaminergicNeurons into DAn

In [None]:
if modality != 'SCRN':
    # DAn is the sum of the two dopaminergic neuron columns
    dan_parts = ['DopaminergicNeurons', 'ImmatureDopaminergicNeurons']
    info_df['DAn'] = info_df[dan_parts[0]] + info_df[dan_parts[1]]
    print(info_df[['DAn'] + dan_parts].describe())
# display(info_df.sample(5))

#### if those columns look useless drop them

In [None]:
# remove the columns collected in cols_to_drop and report the new dimensions
info_df = info_df.drop(columns=cols_to_drop)
print(info_df.shape)

#### save the complete covariates file

In [None]:
# write the assembled covariates table (index included) to the output csv
info_df.to_csv(path_or_buf=assay_covs_files)

#### take a look to see how correlated or predictive covariates are and visualize

#### use Predictive Power Score to see what is associated with or predictive of cell fractions

In [None]:
def report_predictors(data_df, target):
    """Display the PPS predictors of `target` that have a non-zero score.

    Args:
        data_df: DataFrame holding `target` and the candidate predictor columns.
        target: name of the column to predict.

    Returns:
        list of the predictor column names whose ppscore is greater than zero.
    """
    predictors_df = pps.predictors(data_df, target)
    # drop anything that has ppscore of zero
    predictors_df = predictors_df.loc[predictors_df['ppscore'] > 0]
    display(predictors_df)
    return list(predictors_df['x'].values)


# only cell fraction columns still present in info_df can be scored; some may
# have been removed together with the other uninformative columns
present_cell_types = [cell for cell in cfracs_df.columns if cell in info_df.columns]
absent_cell_types = [cell for cell in cfracs_df.columns if cell not in info_df.columns]
if absent_cell_types:
    print(f'cell fractions not in info_df, skipped: {absent_cell_types}')

cfrac_covs = []
for cell_type in present_cell_types:
    print(cell_type)
    # remove the other cell types so they are not used as predictors
    other_cells = [cell for cell in present_cell_types if cell != cell_type]
    cfrac_covs.extend(report_predictors(info_df.drop(columns=other_cells), cell_type))

print(cfrac_covs)

# check the other cell type related covariates (TH_SCRN, MAP2_SCRN) for their predictors
temp_other_covs = ['TH_SCRN', 'MAP2_SCRN']
for this_cov in temp_other_covs:
    print(this_cov)
    if this_cov not in info_df.columns:
        print(f'{this_cov} not in info_df, skipped')
        continue
    report_predictors(info_df, this_cov)


In [None]:
# cell fractions plus their predictors; keep only columns still in info_df and
# sort them so the selection does not depend on set iteration order
pps_cols = sorted((set(cfrac_covs) | set(cfracs_df.columns)) & set(info_df.columns))
matrix_df = pps.matrix(info_df[pps_cols])
# copy the filtered rows so the rounding below modifies an independent frame
# (avoids SettingWithCopyWarning)
matrix_df = matrix_df.loc[matrix_df['ppscore'] > 0].copy()
print(matrix_df.shape)

matrix_df['ppscore'] = matrix_df['ppscore'].round(2)
# reshape to a y-by-x grid of scores for plotting
plot_matrix_df = matrix_df[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
print(plot_matrix_df.shape)
display(plot_matrix_df)

In [None]:
# annotated heatmap of the predictive power scores on a fixed 0-1 colour scale
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    plot_matrix_df,
    vmin=0,
    vmax=1,
    cmap="Blues",
    linewidths=0.05,
    annot=True,
    annot_kws={"fontsize": 12},
    ax=ax,
)
plt.show()

In [None]:
# cell fractions plus their predictors; keep only columns still in info_df and
# sort them so the column order (and the heatmap built from it) is reproducible
cfrac_related_cols = sorted((set(cfrac_covs) | set(cfracs_df.columns)) & set(info_df.columns))
temp_df = info_df[cfrac_related_cols]
# one-hot encode the object (categorical) columns
cats_df = temp_df.select_dtypes(include=['object'])
print(cats_df.shape)
dums_df = pd.get_dummies(cats_df)
print(dums_df.shape)

# put the indicator columns next to the original columns, matching on the index
covs_df = temp_df.merge(dums_df, how='inner', left_index=True, right_index=True)
print(covs_df.shape)

In [None]:
sns.set()
# only correlations stronger than this (in absolute value) are drawn
min_abs_cor = 0.22
# correlate the numeric/boolean columns only: covs_df still holds the original
# object columns, which DataFrame.corr rejects in pandas >= 2.0 instead of
# silently skipping them
cor = covs_df.select_dtypes(include=['number', 'bool']).corr(method='pearson')
cor = cor.dropna(how='all')
print(cor.shape)
plt.figure(figsize=(16,16))
sns.heatmap(cor[(cor > min_abs_cor) | (cor < -min_abs_cor)], annot=True,
            annot_kws={"fontsize":10}, linewidths=0.05)
plt.show()

In [None]:
# a lot of warnings can be generated related to number of members and n_splits=4,
# so suppress them only while the matrix is computed; leaving the context
# restores whatever warning filters were active before
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    matrix_df = pps.matrix(info_df)
# copy the filtered rows so later edits work on an independent frame
matrix_df = matrix_df.loc[matrix_df['ppscore'] > 0].copy()
print(matrix_df.shape)

# show up to 100 rows for this display only; the option reverts afterwards
with pd.option_context('display.max_rows', 100):
    display(matrix_df)

In [None]:
# round the scores on a new frame: matrix_df is a filtered slice, and assigning
# a column onto it directly triggers SettingWithCopyWarning
matrix_df = matrix_df.assign(ppscore=matrix_df['ppscore'].round(2))
# reshape to a y-by-x grid of scores
plot_matrix_df = matrix_df[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
print(plot_matrix_df.shape)
display(plot_matrix_df)

In [None]:
# disabled: heatmap of plot_matrix_df (the predictive power scores pivoted above)
# plt.figure(figsize=(24,20)) 
# sns.heatmap(plot_matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.05, 
#             annot=True, annot_kws={"fontsize":10})
# plt.show()

In [None]:
# names of the columns that are stored with object dtype
object_cols = info_df.select_dtypes(include=['object']).columns
object_cols

In [None]:
# cats_df = info_df.select_dtypes(include=['object'])
# categorical covariates to one-hot encode
wanted_cat_cols = ['day', 'version', 'Batch', 'Culture_Media_iPSC', 'Growth_iPSC',
                   'Spontaneous_differentiation', 'Differentiation_Start',
                   'visit_name', 'sex', 'ethnicity_x', 'race',
                   'education_level_years', 'diagnosis_at_baseline', 
                   'diagnosis_latest', 'case_control_other_at_baseline', 
                   'case_control_other_latest', 'study_arm', 'prodromal_category', 
                   'Recruitment', 'RECRUIT', 'DX_INIT', 'DIAG', 'RECRUITMENT_CAT', 
                   'IMAGING_CAT', 'ENROLL_CAT', 'DESCRP_CAT', 'pheno',
                   'mutation', 'Relatives', 'GROUP']
# some of these may have been removed with the uninformative columns, and
# selecting a missing label raises a KeyError, so use only those still present
missing_cat_cols = [col for col in wanted_cat_cols if col not in info_df.columns]
if missing_cat_cols:
    print(f'categorical columns not in info_df, skipped: {missing_cat_cols}')
cats_df = info_df[[col for col in wanted_cat_cols if col in info_df.columns]]
print(cats_df.shape)
dums_df = pd.get_dummies(cats_df)
print(dums_df.shape)

# put the indicator columns next to the original columns, matching on the index
covs_df = info_df.merge(dums_df, how='inner', left_index=True, right_index=True)
print(covs_df.shape)

In [None]:
sns.set()
# only correlations stronger than this (in absolute value) are drawn
min_abs_cor = 0.22
# correlate the numeric/boolean columns only: covs_df still holds the original
# object columns, which DataFrame.corr rejects in pandas >= 2.0 instead of
# silently skipping them
cor = covs_df.select_dtypes(include=['number', 'bool']).corr(method='pearson')
cor = cor.dropna(how='all')
print(cor.shape)
plt.figure(figsize=(24,20))
sns.heatmap(cor[(cor > min_abs_cor) | (cor < -min_abs_cor)], annot=True,
            annot_kws={"fontsize":10}, linewidths=0.05)
# sns.heatmap(cor[(cor > 0.1) | (cor < -0.1)], annot=True, annot_kws={"fontsize":12}, \
#             linewidths=0.05)
plt.show()