In [1]:
import sys
sys.path.insert(0,'..')
from paths import *
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import warnings
from tqdm import tqdm

from IPython.display import clear_output


In [42]:
def glm_model(data_, analyte_, sample_, remove_outlier):
    """Fit a Gamma GLM (log link) of one analyte on ARMSS category and covariates.

    The design matrix contains an intercept, ``AGE``, ``BMI``,
    ``Gender_label``, ``armms_category``, ``diet_label`` and drop-first dummy
    columns for ``site``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Must contain every column listed in the notebook-level
        ``selected_metadata_columns`` plus the ``analyte_`` column.
    analyte_ : hashable
        Column label of the analyte used as the response variable.
    sample_ : str
        Sample label; it is only copied into the returned dictionary.
    remove_outlier : bool
        When true, rows are first filtered with ``remove_outliers``.
        NOTE(review): ``remove_outliers`` is not defined in the visible part
        of the notebook (presumably it comes from ``from paths import *``) -
        confirm before calling with ``remove_outlier=True``.

    Returns
    -------
    dict
        ``analyte``, ``sample``, ``patient_sample_count`` (rows used),
        ``model`` (the fitted results object, or ``None`` when fitting
        failed) and ``error`` (``repr`` of the exception that made fitting
        fail, or ``None``).
    """
    data_analyte = data_[selected_metadata_columns + [analyte_]]
    if remove_outlier:
        data_analyte = remove_outliers(data_analyte, analyte_)
    data_analyte = data_analyte[[analyte_, "armms_category", "diet_label", "AGE", "BMI", "Gender_label", "site"]]

    X = sm.add_constant(data_analyte[['AGE', 'BMI', 'Gender_label', "armms_category", "diet_label"]])
    # dtype=float keeps the design matrix purely numeric: recent pandas
    # versions return boolean dummies by default, and a mixed-dtype frame can
    # be rejected by statsmodels.
    site_dummies = pd.get_dummies(data_analyte['site'], prefix='site', drop_first=True, dtype=float)
    X = pd.concat([X, site_dummies], axis=1)

    mdf = None
    fit_error = None
    try:
        model = sm.GLM(data_analyte[analyte_], X, family=sm.families.Gamma(link=sm.families.links.log()))
        mdf = model.fit()
    except Exception as exc:
        # Best effort: one analyte that cannot be fitted must not abort the
        # whole scan, but the reason is kept so failures can be inspected.
        fit_error = repr(exc)

    out_dict = {}
    out_dict["analyte"] = analyte_
    out_dict["sample"] = sample_
    out_dict["patient_sample_count"] = data_analyte.shape[0]
    out_dict["model"] = mdf
    out_dict["error"] = fit_error
    return out_dict


## Load selected patients based on ARMSS score

In [33]:
# Rows from the "lower" file are tagged armms_category = 0 and rows from the
# "higher" file armms_category = 1, then both groups are stacked.
armss_data_lower = pd.read_csv(
    '../../wetlab/data/patient_selection/patients_with_armss_data_lower.csv'
).assign(armms_category=0)
armss_data_higher = pd.read_csv(
    '../../wetlab/data/patient_selection/patients_with_armss_data_higher.csv'
).assign(armms_category=1)
diet_info = pd.read_csv('../../wetlab/data/patient_selection/iMSMS Diet Restrictions.csv')

armss_data_combined = pd.concat(
    [armss_data_lower, armss_data_higher],
    ignore_index=True,
)


## Loading Metabolomics Data

In [34]:
sample = "feces"

# Pick the workbook that matches the sample type.
if sample == "serum":
    filename = GLOBAL_SERUM_DATA_FILENAME
else:
    filename = GLOBAL_STOOL_DATA_FILENAME

file_path = os.path.join(DATA_ROOT_PATH, filename)

sheet_name = ["Chemical Annotation", "Sample Meta Data", "Log Transformed Data"]

# A list of sheet names makes read_excel parse the workbook once and return a
# dict keyed by sheet name, instead of re-opening the file for every sheet.
workbook_sheets = pd.read_excel(file_path, engine='openpyxl', sheet_name=sheet_name)
analyte_metadata = workbook_sheets[sheet_name[0]]
patient_metadata = workbook_sheets[sheet_name[1]]
data = workbook_sheets[sheet_name[2]]

data.head()


Unnamed: 0,PARENT_SAMPLE_NAME,30,35,49,50,54,55,62,71,72,...,999926020,999926058,999926062,999926092,999926093,999926094,999926099,999926101,999926103,999926108
0,UCSF-07758,0.402111,0.023685,0.1679,-0.992358,0.00021,-2.537045,0.081437,-1.68074,-0.362659,...,-1.000187,-0.194828,-2.431172,-1.092901,0.041653,-1.069907,-2.59171,-1.889138,-2.48589,-2.018001
1,UCSF-07759,2.328813,0.166186,1.689764,-0.094343,-0.196467,2.583854,2.291841,-1.68074,-0.349492,...,-0.01466,0.54468,-2.431172,1.013481,0.718372,-0.137027,-2.59171,-1.889138,-2.48589,-2.018001
2,UCSF-07760,-1.738102,0.465597,-4.169979,-1.197209,-0.110756,-2.537045,-1.401667,-1.68074,-1.803316,...,0.360661,0.422735,-2.431172,0.2901,1.623652,2.407763,-2.59171,-1.889138,-2.48589,-2.018001
3,UCSF-07761,-0.443539,-0.375407,-1.888223,-2.276426,-0.147195,-2.537045,-3.918197,-1.68074,-1.803316,...,-2.854939,-1.007445,-0.459153,-3.343635,-0.546764,-0.51983,-2.59171,-1.889138,-2.48589,-2.018001
4,UCSF-07762,0.341254,0.340142,-1.589614,-0.125123,1.206768,-2.537045,0.211002,1.15264,-1.803316,...,-0.723285,1.980702,-2.431172,1.517017,3.135178,3.720208,1.565557,-1.889138,0.688708,-2.018001


In [35]:
# site_code = first three characters of CLIENT_SAMPLE_ID. The vectorised .str
# accessor leaves a missing ID as NaN instead of raising inside a lambda.
patient_metadata["site_code"] = patient_metadata["CLIENT_SAMPLE_ID"].str[:3]


## Analyte selection

In [36]:
spoke_map_path = os.path.join(os.path.dirname(OUTPUT_PATH), "global_metabolomics_compound_spoke_map.csv")
global_metabolomics_compound_spoke_map = pd.read_csv(spoke_map_path)

# Every column of the data sheet except the sample identifier is an analyte.
analyte_columns = data.columns.tolist()
analyte_columns.remove("PARENT_SAMPLE_NAME")

# Keep only the analytes whose CHEM_ID also appears in the SPOKE compound map.
chem_id_in_data = global_metabolomics_compound_spoke_map["CHEM_ID"].isin(analyte_columns)
analyte_columns_selected = global_metabolomics_compound_spoke_map.loc[chem_id_in_data, "CHEM_ID"].unique()

data_with_analyte_columns_selected = data[["PARENT_SAMPLE_NAME", *analyte_columns_selected]]


## Merging patient metadata with analyte concentration data

In [37]:
selected_metadata_columns = ["PARENT_SAMPLE_NAME", "CLIENT_IDENTIFIER", "GROUP_NAME", "AGE", "BMI", "GENDER", "CLIENT_SAMPLE_ID", "CLIENT_MATRIX", "TREATMENT", "SAMPLE_AMOUNT_UNITS"]

# .copy() makes the subset an independent frame, so the column assignments
# below do not trigger SettingWithCopyWarning.
patient_metadata_selected_columns = patient_metadata[selected_metadata_columns].copy()

# house = first three + last four characters of CLIENT_SAMPLE_ID;
# site  = first three characters of CLIENT_SAMPLE_ID.
client_sample_id = patient_metadata_selected_columns['CLIENT_SAMPLE_ID']
patient_metadata_selected_columns['house'] = client_sample_id.str[:3] + client_sample_id.str[-4:]
patient_metadata_selected_columns['site'] = client_sample_id.str[:3]

# Integer-encode GENDER for use as a model covariate.
le = LabelEncoder()
patient_metadata_selected_columns['Gender_label'] = le.fit_transform(patient_metadata_selected_columns['GENDER'])

data_with_patient_metadata = pd.merge(data_with_analyte_columns_selected, patient_metadata_selected_columns, on="PARENT_SAMPLE_NAME")

# The list is rebuilt at the top of this cell, so extending it here is safe
# when the cell is re-run.
selected_metadata_columns.extend(['Gender_label', 'site'])

clear_output()

data_with_patient_metadata.head()



Unnamed: 0,PARENT_SAMPLE_NAME,35,50,62,93,98,111,112,144,158,...,AGE,BMI,GENDER,CLIENT_SAMPLE_ID,CLIENT_MATRIX,TREATMENT,SAMPLE_AMOUNT_UNITS,house,site,Gender_label
0,UCSF-07758,0.023685,-0.992358,0.081437,1.225531,1.43921,-1.336249,-0.044492,0.906045,-2.241044,...,32.0,18.426534,F,71802-0027,feces,Control,ug,7180027,718,0
1,UCSF-07759,0.166186,-0.094343,2.291841,0.394435,1.246446,-0.067657,1.480448,0.891433,0.079749,...,44.0,19.84375,F,71702-0164,feces,Control,ug,7170164,717,0
2,UCSF-07760,0.465597,-1.197209,-1.401667,1.095159,1.556548,-1.543398,0.210838,0.647214,-1.386518,...,34.0,23.685228,F,71602-0131,feces,Control,ug,7160131,716,0
3,UCSF-07761,-0.375407,-2.276426,-3.918197,-1.385966,-1.021095,-1.543398,-0.053911,-0.691927,-2.241044,...,65.0,33.388778,F,71702-0214,feces,Control,ug,7170214,717,0
4,UCSF-07762,0.340142,-0.125123,0.211002,1.791118,1.316333,0.284801,1.346525,2.621563,-2.241044,...,29.0,28.133657,F,71802-0168,feces,Control,ug,7180168,718,0


## Merging with ARMSS score data

In [38]:
# Inner-join the ARMSS table on the client sample ID; samples without a
# matching 'Record.ID' are dropped by the merge.
data_with_patient_metadata_and_armss_score = pd.merge(
    data_with_patient_metadata,
    armss_data_combined,
    left_on='CLIENT_SAMPLE_ID',
    right_on='Record.ID',
).drop(['Record.ID'], axis=1)

# Collapse the free-form diet answers into two groups. Answers that are not
# listed here map to NaN.
diet_map = {'No special dietary needs' : 'Non-Vegetarian',
 'Gluten Free,Vegetarian' : 'Vegetarian',
 'Other' : 'Non-Vegetarian',
 'Gluten Free,No special dietary needs' : 'Non-Vegetarian',
 'Gluten Free' : 'Non-Vegetarian',
 'Pescetarian' : 'Non-Vegetarian',
 'Lactose Intolerance,Gluten Free' : 'Non-Vegetarian',
 'Gluten Free,Pescetarian' : 'Non-Vegetarian',
 'Lactose Intolerance,Pescetarian' : 'Non-Vegetarian',
 'Vegetarian' : 'Vegetarian'
}

# Join the diet questionnaire, drop the columns that are not needed and add
# the collapsed diet label, all as one new frame (no in-place mutation).
data_with_patient_metadata_and_armss_score_with_diet = (
    pd.merge(
        data_with_patient_metadata_and_armss_score,
        diet_info,
        left_on='CLIENT_SAMPLE_ID',
        right_on='Record ID',
    )
    .drop(['Record ID'], axis=1)
    .drop(['lARMSS', 'lMSSS', 'uGMSSS', 'oGMSSS', 'If OTHER, please describe:', 'Data Access Group', 'Event Name'], axis=1)
    .assign(mapped_diet=lambda merged: merged['Do you have any special dietary needs/restrictions?'].map(diet_map))
)

# Guarded so that re-running this cell does not add a duplicate entry, which
# would make later column selections return duplicated columns.
if 'armms_category' not in selected_metadata_columns:
    selected_metadata_columns.append('armms_category')

data_with_patient_metadata_and_armss_score_with_diet


Unnamed: 0.1,PARENT_SAMPLE_NAME,35,50,62,93,98,111,112,144,158,...,Unnamed: 0,ageatedss,dd,edss,gARMSS,Gender,Age,armms_category,Do you have any special dietary needs/restrictions?,mapped_diet
0,UCSF-07787,-0.226544,-1.122074,-3.918197,-0.392817,-3.727361,-1.543398,-0.603480,-0.280889,-2.241044,...,153,47,12,6.0,7.69,M,47.0,1,No special dietary needs,Non-Vegetarian
1,UCSF-07790,0.178154,-0.592624,-3.918197,0.041937,1.422294,-1.543398,0.777008,-0.725859,-2.241044,...,53,30,3,0.0,0.88,F,29.0,0,No special dietary needs,Non-Vegetarian
2,UCSF-07792,-0.558773,-1.058159,-3.918197,-0.701228,-1.373445,-1.543398,1.210952,-0.552114,-2.241044,...,74,55,27,6.0,6.72,F,54.0,1,No special dietary needs,Non-Vegetarian
3,UCSF-07793,-0.458945,-1.936099,0.136307,-0.316882,1.106109,-1.543398,-0.085426,0.046193,-2.241044,...,18,74,30,8.0,8.85,F,73.0,1,No special dietary needs,Non-Vegetarian
4,UCSF-07827,0.002786,-0.721220,-2.116110,-0.374604,1.025983,-1.543398,1.333489,-0.287259,-2.241044,...,158,62,21,6.5,6.97,F,61.0,1,No special dietary needs,Non-Vegetarian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,UCSF-08752,-0.106155,-0.602941,0.395569,-0.411303,-1.639334,-1.543398,0.047739,0.639264,-2.241044,...,35,64,3,1.0,0.30,M,63.0,0,No special dietary needs,Non-Vegetarian
96,UCSF-08753,-0.571121,-1.527935,-1.388333,-0.715152,0.888845,0.931591,0.043707,0.050014,-2.241044,...,112,38,6,0.0,0.56,F,37.0,0,No special dietary needs,Non-Vegetarian
97,UCSF-08754,-2.130696,-3.084500,-3.918197,-2.229978,1.455566,-0.119768,2.458673,-0.965316,-2.241044,...,190,54,17,0.0,0.17,F,54.0,0,Pescetarian,Non-Vegetarian
98,UCSF-08756,-0.044102,-0.727946,-3.918197,-1.015157,-1.069690,-1.543398,-0.368464,-0.733178,-2.241044,...,71,42,12,0.0,0.43,F,42.0,0,No special dietary needs,Non-Vegetarian


## Reversing the log transformation of the analyte concentrations

In [39]:
# Work on a copy so the log-scale frame stays available unchanged.
data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed = (
    data_with_patient_metadata_and_armss_score_with_diet.copy()
)

# Exponentiate all selected analyte columns in one vectorised step.
selected_analyte_labels = list(analyte_columns_selected)
data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed[selected_analyte_labels] = np.exp(
    data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed[selected_analyte_labels].to_numpy()
)

clear_output()

    

In [40]:
# house = first three + last four characters of CLIENT_SAMPLE_ID;
# site  = first three characters of CLIENT_SAMPLE_ID.
data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed.loc[:, 'house'] = (
    data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed['CLIENT_SAMPLE_ID'].str[:3]
    + data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed['CLIENT_SAMPLE_ID'].str[-4:]
)
data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed.loc[:, 'site'] = (
    data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed.loc[:, 'CLIENT_SAMPLE_ID'].str[:3]
)

# Integer-encode the diet group and gender for use as model covariates.
# fit_transform refits the encoder each time, so after this cell `le` holds
# the GENDER classes.
le = LabelEncoder()
data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed.loc[:, 'diet_label'] = le.fit_transform(
    data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed['mapped_diet']
)
data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed.loc[:, 'Gender_label'] = le.fit_transform(
    data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed['GENDER']
)

# A NaN in ANY column of a row marks that row's house for exclusion; every
# row sharing an excluded house is then removed.
rows_with_nan = data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed.isna().any(axis=1)
house_to_exclude = data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed.loc[rows_with_nan, 'house'].values

data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed_nan_removed = (
    data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed[
        ~data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed["house"].isin(house_to_exclude)
    ]
)

# Guarded so that re-running this cell does not add a duplicate entry, which
# would make later column selections return duplicated columns.
if 'diet_label' not in selected_metadata_columns:
    selected_metadata_columns.append('diet_label')



In [43]:


# Fit one GLM per selected analyte on the NaN-free data (no outlier removal);
# tqdm shows progress and the output area is cleared once the scan is done.
model_dict_list = [
    glm_model(
        data_with_patient_metadata_and_armss_score_with_diet_reverse_log_transformed_nan_removed,
        analyte_id,
        sample,
        remove_outlier=False,
    )
    for analyte_id in tqdm(analyte_columns_selected)
]
clear_output()


In [44]:
# `analyte_metadata` (the "Chemical Annotation" sheet) is already loaded by
# the data-loading cell earlier in the notebook, so the workbook is not read
# a second time here.

# CHEM_ID -> CHEMICAL_NAME lookup; the first annotation row wins when a
# CHEM_ID occurs more than once.
chem_id_to_name = (
    analyte_metadata
    .drop_duplicates(subset="CHEM_ID")
    .set_index("CHEM_ID")["CHEMICAL_NAME"]
)

model_summary_columns = ["analyte_name", "armss_coeff", "pvalue", "CI", "model_converged_flag"]
model_summary_records = []
for model in model_dict_list:
    # .get returns None for an analyte without an annotation row instead of
    # aborting the whole summary.
    analyte_name = chem_id_to_name.get(model["analyte"])
    try:
        disease_coeff = model["model"].params["armms_category"]
        disease_coeff_pvalue = model["model"].pvalues["armms_category"]
        disease_coeff_CI = tuple(model["model"].conf_int().loc['armms_category'])
        model_converged_flag = model["model"].converged
    except Exception:
        # model["model"] is None when fitting failed. NaN (not None) keeps
        # the coefficient and p-value columns numeric.
        disease_coeff = np.nan
        disease_coeff_pvalue = np.nan
        disease_coeff_CI = None
        model_converged_flag = None
    model_summary_records.append((analyte_name, disease_coeff, disease_coeff_pvalue, disease_coeff_CI, model_converged_flag))

# One frame built from all records; wrapped in a list because the next cell
# concatenates `model_summary_list_of_df`.
model_summary_list_of_df = [pd.DataFrame(model_summary_records, columns=model_summary_columns)]



In [45]:
# Stack the per-analyte summaries into one table and tag every row as coming
# from the untargeted panel.
model_summary = (
    pd.concat(model_summary_list_of_df, ignore_index=True)
    .assign(analyte_type='untargeted')
)
model_summary

# model_summary.to_csv(f'../../wetlab/data/patient_selection/untarget_compounds_model_results_using_{sample}_sample.csv', index=False)


Unnamed: 0,analyte_name,armss_coeff,pvalue,CI,model_converged_flag,analyte_type
0,S-1-pyrroline-5-carboxylate,-0.373942,0.007540,"(-0.6482352852130147, -0.09964918899371839)",True,untargeted
1,spermidine,-1.431302,0.000224,"(-2.191558167831003, -0.6710451706877621)",True,untargeted
2,"12,13-DiHOME",-0.116175,0.769655,"(-0.8937619312731077, 0.6614124094329487)",True,untargeted
3,alpha-ketoglutarate,-0.453137,0.060468,"(-0.9262098227440383, 0.01993588586481304)",True,untargeted
4,kynurenate,-0.242231,0.472589,"(-0.9032123282852283, 0.41874994073156235)",True,untargeted
...,...,...,...,...,...,...
672,fructosyllysine,-0.686439,0.051341,"(-1.3768755372210923, 0.003997054900956032)",True,untargeted
673,glycodeoxycholate 3-sulfate,,,,,untargeted
674,ascorbic acid 2-sulfate,-0.633844,0.259484,"(-1.7355653228140606, 0.4678763779746685)",False,untargeted
675,linoleoyl-linoleoyl-glycerol (18:2/18:2) [1]*,0.745471,0.085316,"(-0.10368790279243445, 1.5946290389633095)",False,untargeted


In [46]:
# Show only converged models whose ARMSS coefficient has p < 0.05.
model_summary.loc[model_summary["pvalue"].lt(0.05) & model_summary["model_converged_flag"].eq(True)]


Unnamed: 0,analyte_name,armss_coeff,pvalue,CI,model_converged_flag,analyte_type
0,S-1-pyrroline-5-carboxylate,-0.373942,0.007540,"(-0.6482352852130147, -0.09964918899371839)",True,untargeted
1,spermidine,-1.431302,0.000224,"(-2.191558167831003, -0.6710451706877621)",True,untargeted
9,hypoxanthine,-0.569320,0.002912,"(-0.944162164103471, -0.19447743869575468)",True,untargeted
15,N-formylmethionine,-0.498027,0.011131,"(-0.88254308837124, -0.11351118717531528)",True,untargeted
19,arginine,-1.428861,0.000477,"(-2.230500016844318, -0.6272213416205278)",True,untargeted
...,...,...,...,...,...,...
639,gamma-glutamylthreonine,-0.703861,0.000171,"(-1.0708983177735074, -0.33682406784547747)",True,untargeted
640,nicotinate ribonucleoside,-0.659542,0.033445,"(-1.2673758081640485, -0.051708294109598096)",True,untargeted
652,1-oleoyl-GPI (18:1),-1.124761,0.013734,"(-2.019389708934347, -0.23013171728950388)",True,untargeted
658,"5alpha-androstan-3beta,17beta-diol disulfate",1.509486,0.001719,"(0.5657717313988471, 2.4531997964937657)",True,untargeted


In [105]:
# Results of the targeted-panel models for the same sample type, read from a
# CSV file.
targeted_results_path = f'../../wetlab/data/patient_selection/target_compounds_model_results_using_{sample}_sample.csv'
targeted_model_summary = pd.read_csv(targeted_results_path)
targeted_model_summary


Unnamed: 0,analyte_name,armss_coeff,pvalue,CI,model_converged_flag,analyte_type
0,2-Methylbutyric acid,0.398923,0.002354,"(0.14185098822643344, 0.6559946405415751)",True,targeted
1,Acetic acid,-0.258359,0.038999,"(-0.5036718307760064, -0.013045430213021209)",True,targeted
2,Butyric acid,-0.240478,0.214687,"(-0.6203428781540883, 0.13938652396638335)",True,targeted
3,Hexanoic acid,0.099496,0.790894,"(-0.6359660090439759, 0.8349571918373156)",True,targeted
4,Isobutyric acid,0.31281,0.014908,"(0.060984522964036614, 0.5646360419188723)",True,targeted
5,Isovaleric acid,0.4353,0.000802,"(0.18077220179846631, 0.68982771295882)",True,targeted
6,Propionic acid,-0.251199,0.054443,"(-0.5071871931299579, 0.00478959487613867)",True,targeted
7,Valeric acid,0.086141,0.541148,"(-0.19014766518020654, 0.3624304115825985)",True,targeted


In [106]:
# Combine targeted and untargeted results, keep converged models with
# p < 0.05, and write them to disk.
total_model_summary = (
    pd.concat([targeted_model_summary, model_summary], ignore_index=True)
    .loc[lambda summary: (summary.pvalue < 0.05) & (summary.model_converged_flag == True)]
)
total_model_summary.to_csv(
    f'../../wetlab/data/patient_selection/total_significant_compounds_model_results_using_{sample}_sample.csv',
    index=False,
)


In [47]:
# Untargeted results only: converged models with p < 0.05 (these models
# include the site dummies), written to disk.
model_summary_with_site_variable = model_summary.loc[
    lambda summary: (summary.pvalue < 0.05) & (summary.model_converged_flag == True)
]
model_summary_with_site_variable.to_csv(
    f'../../wetlab/data/patient_selection/untargeted_significant_compounds_model_results_with_site_variable_using_{sample}_sample.csv',
    index=False,
)

