In [1]:
import sys
sys.path.insert(0,'..')
from paths import *
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import warnings
from tqdm import tqdm

from IPython.display import clear_output


  from pandas.core import (


In [2]:
def remove_outliers(data_, analyte, k=3):
    """Drop Tukey-fence outliers of one analyte, then keep only complete household pairs.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Must contain the column named by ``analyte`` and a ``"house"`` column.
    analyte : hashable
        Name of the column whose values are screened for outliers.
    k : float, default 3
        Fence multiplier. Rows whose analyte value lies outside
        ``[Q1 - k * IQR, Q3 + k * IQR]`` are removed. The default reproduces
        the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data_`` that lie inside the fences and whose ``"house"``
        value still occurs exactly twice after the outlier filter. A household
        that loses one member to the outlier filter is dropped entirely.
    """
    analyte_values = data_[analyte]
    # Quartiles and inter-quartile range of the analyte
    q1 = analyte_values.quantile(0.25)
    q3 = analyte_values.quantile(0.75)
    iqr = q3 - q1
    # Tukey's fences
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    # between() is inclusive on both ends by default, i.e. lower <= value <= upper
    non_outliers = data_[analyte_values.between(lower_fence, upper_fence)]
    # Keep only the households that still have exactly two rows
    house_counts = non_outliers.groupby("house").size()
    paired_houses = house_counts[house_counts == 2].index
    return non_outliers[non_outliers["house"].isin(paired_houses)]
    
def _build_glm_inputs(data_analyte, analyte_):
    """Return ``(model_frame, X)``: the modelling columns and the numeric design matrix."""
    model_columns = [analyte_, "microbial_abundance", "Disease_label", "AGE", "BMI", "Gender_label", "site", "house"]
    # .copy() so the Gender_label cast below writes to an independent frame
    # rather than to a slice of the caller's data.
    model_frame = data_analyte[model_columns].copy()
    model_frame['Gender_label'] = model_frame['Gender_label'].astype(int)
    # Site and household enter the model as dummy-coded fixed effects
    # (first level dropped as the reference category).
    site_dummies = pd.get_dummies(model_frame['site'], prefix='site', drop_first=True)
    house_dummies = pd.get_dummies(model_frame['house'], prefix='house', drop_first=True)
    X = sm.add_constant(model_frame[['AGE', 'BMI', 'Gender_label', 'Disease_label', 'microbial_abundance']])
    X = pd.concat([X, site_dummies, house_dummies], axis=1)
    # Boolean dummy columns are cast to integers before fitting
    X = X.astype({col: int for col in X.select_dtypes(include=['bool']).columns})
    X = X.apply(pd.to_numeric)
    return model_frame, X


def _fit_gamma_glm(model_frame, X, analyte_):
    """Fit a Gamma GLM with log link of ``analyte_`` on the design matrix ``X``."""
    # NOTE(review): recent statsmodels releases appear to deprecate the lowercase
    # `links.log` alias in favour of `links.Log` - confirm against the installed version.
    model = sm.GLM(model_frame[analyte_], X, family=sm.families.Gamma(link=sm.families.links.log()))
    return model.fit()


def glm_model(data_, analyte_, sample_, remove_outlier):
    """Fit a Gamma/log-link GLM of one analyte on microbial abundance plus covariates.

    Reads the module-level list ``selected_metadata_columns`` to choose the
    metadata columns taken from ``data_``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Must contain every column in ``selected_metadata_columns`` and ``analyte_``.
    analyte_ : hashable
        Name of the response column.
    sample_ : object
        Recorded verbatim in the result; not used in the fit.
    remove_outlier : bool
        If true, ``remove_outliers`` is applied before the first fit.

    Returns
    -------
    dict
        ``analyte``, ``sample``, ``patient_sample_count`` (rows of the data used
        in the last fit attempt), ``model`` (fitted results, or ``None`` if
        every attempt failed) and ``fit_error`` (``repr`` of the last
        exception, or ``None`` on success).

    Notes
    -----
    If the first fit raises and outliers had not been removed yet, the fit is
    retried once on outlier-trimmed data. Errors while preparing the first
    attempt's inputs are not caught and propagate to the caller.
    """
    data_analyte = data_[selected_metadata_columns + [analyte_]]
    if remove_outlier:
        data_analyte = remove_outliers(data_analyte, analyte_)
    model_frame, X = _build_glm_inputs(data_analyte, analyte_)
    sample_count = model_frame.shape[0]
    fit_error = None
    try:
        mdf = _fit_gamma_glm(model_frame, X, analyte_)
    except Exception as first_error:
        mdf = None
        fit_error = first_error
        # Retrying is only meaningful when the first attempt used untrimmed data;
        # otherwise the retry would refit exactly the same inputs.
        if not remove_outlier:
            try:
                trimmed = remove_outliers(data_[selected_metadata_columns + [analyte_]], analyte_)
                sample_count = trimmed.shape[0]
                model_frame, X = _build_glm_inputs(trimmed, analyte_)
                mdf = _fit_gamma_glm(model_frame, X, analyte_)
                fit_error = None
            except Exception as retry_error:
                mdf = None
                fit_error = retry_error
    out_dict = {}
    out_dict["analyte"] = analyte_
    out_dict["sample"] = sample_
    out_dict["patient_sample_count"] = sample_count
    out_dict["model"] = mdf
    # Keep the reason for a failed fit instead of discarding it silently
    out_dict["fit_error"] = None if fit_error is None else repr(fit_error)
    return out_dict


In [22]:
# Preview the species-level abundance row for Faecalibaculum rodentium.
# The sheet is loaded into its own variable here because `microbiome_data` is only
# created by a later cell, so referencing it at this point fails on a fresh kernel.
species_abundance_preview = pd.read_excel(os.path.join(DATA_ROOT_PATH, 'john_hopkins/1-s2.0-S0092867422011151-mmc6.xlsx'), engine='openpyxl', sheet_name='species')

species_abundance_preview[species_abundance_preview['Taxa']=='Faecalibaculum rodentium']


Unnamed: 0,Taxa,71401-0001,71401-0002,71401-0003,71401-0004,71401-0005,71401-0008,71401-0009,71401-0010,71401-0011,...,76502-0005,76502-0007,76502-0008,76502-0020,76502-0021,76502-0036,76502-0038,76502-0039,76502-0040,76502-0043
755,Faecalibaculum rodentium,0.0,8.22196e-07,4e-06,2e-06,8.080508e-07,0.0,4.471102e-07,2e-06,0.0,...,0.0,0.0,0.0,0.0,0.0,1e-05,0.0,0.0,5e-06,0.0


In [4]:
# Analysis configuration, read by the cells below.

# Taxon whose abundance row is selected: compared against the 'Taxa' column of the
# abundance sheet named by `taxa`.
# NOTE(review): 'Faecalibaculum' appears under the genus options listed below while
# `taxa` is 'species' - confirm this pairing is intended.
microbiome = 'Faecalibaculum'
# Sheet of the abundance workbook that is loaded
taxa = 'species'

'''Options are:
genus:
Anaerostipes, Bacteroides, Dubosiella, Bifidobacterium, Lactobacillus, Faecalibaculum

species:
Dubosiella newyorkensis, Bacteroides thetaiotaomicron, Bifidobacterium pseudolongum, Faecalibaculum rodentium

Anaerostipes, Bacteroides, ASF356, Dubosiella, Negativibacillus, Bifidobacterium, 
Defluviitaleaceae_UCG-011, Lactobacillus, Candidatus Saccharimonas, and Faecalibaculum (genus level)

On the species level ILA supplementation had an effect on Dubosiella newyorkensis 
(human homolog = Clostridium innocuum), Bacteroides thetaiotaomicron, 
Bifidobacterium pseudolongum and Faecalibaculum rodentium (human homolog = Holdemanella biformis)
'''

# Value of 'disease_course_control' used to filter the phenotype sheet;
# the special value 'full' disables that filter.
category = 'PMS'

# 'serum' selects the serum metabolomics workbook; any other value selects the stool workbook.
sample = 'serum'


In [5]:
# Severity scores and diet restrictions for the cohort.
severity_data = pd.read_csv('../../wetlab/data/patient_selection/armss_score_full_cohort.csv')
diet_info = pd.read_csv('../../wetlab/data/patient_selection/iMSMS Diet Restrictions.csv')
# Abundance table for the taxonomic level chosen in `taxa` (one row per taxon).
microbiome_data = pd.read_excel(os.path.join(DATA_ROOT_PATH, 'john_hopkins/1-s2.0-S0092867422011151-mmc6.xlsx'), engine='openpyxl', sheet_name=taxa)

# The phenotype sheet is needed for every category; only the row filter differs,
# so it is read once and filtered afterwards.
ms_category_data = pd.read_excel(os.path.join(DATA_ROOT_PATH, 'john_hopkins/1-s2.0-S0092867422011151-mmc2.xlsx'), engine='openpyxl', sheet_name='Sample phenotype')
if category != 'full':
    ms_category_data = ms_category_data[ms_category_data['disease_course_control'] == category]

microbiome_data_selected = microbiome_data[microbiome_data['Taxa']==microbiome]
if microbiome_data_selected.empty:
    # Fail with a readable message instead of an IndexError from iloc[0] below
    raise ValueError(f"Taxon {microbiome!r} was not found in the 'Taxa' column of sheet {taxa!r}")

# Reshape the single taxon row into long form: one row per sample id.
# Column 0 is skipped; the remaining column labels are used as CLIENT_SAMPLE_ID values.
values = microbiome_data_selected.iloc[0, 1:].values
columns = microbiome_data_selected.columns[1:]
microbiome_data_selected = pd.DataFrame({'microbial_abundance': values, 'CLIENT_SAMPLE_ID': columns})


In [6]:


# Pick the metabolomics workbook for the configured sample type;
# any value other than "serum" selects the stool file.
if sample == "serum":
    filename = GLOBAL_SERUM_DATA_FILENAME
else:
    filename = GLOBAL_STOOL_DATA_FILENAME

file_path = os.path.join(DATA_ROOT_PATH, filename)

sheet_name = ["Chemical Annotation", "Sample Meta Data", "Log Transformed Data"]

# Passing the list of sheet names returns a dict of DataFrames keyed by sheet name,
# so the workbook is read with a single call instead of three.
workbook_sheets = pd.read_excel(file_path, engine='openpyxl', sheet_name=sheet_name)
analyte_metadata = workbook_sheets[sheet_name[0]]
patient_metadata = workbook_sheets[sheet_name[1]]
data = workbook_sheets[sheet_name[2]]

# Site code = first three characters of the sample id
patient_metadata.loc[:, "site_code"] = patient_metadata["CLIENT_SAMPLE_ID"].str[:3]



In [7]:
# Compound map CSV, located in the parent directory of OUTPUT_PATH.
compound_map_path = os.path.join(os.path.dirname(OUTPUT_PATH), "global_metabolomics_compound_spoke_map.csv")
global_metabolomics_compound_spoke_map = pd.read_csv(compound_map_path)

# Every column of `data` except the sample identifier is treated as an analyte column.
analyte_columns = list(data.columns)
analyte_columns.remove("PARENT_SAMPLE_NAME")

# Unique CHEM_IDs of the compound map that are also columns of `data`.
chem_id_is_measured = global_metabolomics_compound_spoke_map["CHEM_ID"].isin(analyte_columns)
analyte_columns_selected = global_metabolomics_compound_spoke_map.loc[chem_id_is_measured, "CHEM_ID"].unique()

data_with_analyte_columns_selected = data[["PARENT_SAMPLE_NAME", *analyte_columns_selected]]


In [8]:
selected_metadata_columns = ["PARENT_SAMPLE_NAME", "CLIENT_IDENTIFIER", "GROUP_NAME", "AGE", "BMI", "GENDER", "CLIENT_SAMPLE_ID", "CLIENT_MATRIX", "TREATMENT", "SAMPLE_AMOUNT_UNITS"]

# .assign() builds a new frame, so the derived columns are not written into a slice
# of `patient_metadata` (which is what raised a SettingWithCopyWarning before).
#   house = first three + last four characters of the sample id
#   site  = first three characters of the sample id
patient_metadata_selected_columns = patient_metadata[selected_metadata_columns].assign(
    house=lambda frame: frame['CLIENT_SAMPLE_ID'].str[:3] + frame['CLIENT_SAMPLE_ID'].str[-4:],
    site=lambda frame: frame['CLIENT_SAMPLE_ID'].str[:3],
)

data_with_patient_metadata = pd.merge(data_with_analyte_columns_selected, patient_metadata_selected_columns, on="PARENT_SAMPLE_NAME")

# From here on the two derived columns count as metadata columns as well.
selected_metadata_columns = selected_metadata_columns + ['site', 'house']

data_with_patient_metadata.head()


Unnamed: 0,PARENT_SAMPLE_NAME,35,50,62,93,98,111,112,117,144,...,GROUP_NAME,AGE,BMI,GENDER,CLIENT_SAMPLE_ID,CLIENT_MATRIX,TREATMENT,SAMPLE_AMOUNT_UNITS,house,site
0,UCSF-06758,0.589314,-0.396834,-0.313341,0.85142,-0.739952,-0.544856,0.874154,-0.240343,0.562101,...,MS,52.0,19.42297,F,71701-0004,serum,Treated,uL,7170004,717
1,UCSF-06759,0.049593,0.13942,-0.15988,0.238199,-0.249942,-0.490267,-0.172927,-1.654431,-0.540335,...,MS,51.0,25.50175,F,71601-0150,serum,Off,uL,7160150,716
2,UCSF-06760,-0.416035,-1.045957,-0.740068,0.530813,-0.135617,-0.911855,-0.459714,-1.654431,-0.672326,...,Control,52.0,34.970939,M,71602-0150,serum,Control,uL,7160150,716
3,UCSF-06761,-0.386523,0.151742,0.580034,0.368174,0.307989,0.387318,0.176586,-0.504245,0.127822,...,MS,56.0,26.544467,M,71601-0164,serum,Treated,uL,7160164,716
4,UCSF-06762,-0.352323,-0.064386,0.691759,0.359623,-0.057788,-0.368919,0.57149,0.070276,-0.425296,...,Control,53.0,31.17614,F,71602-0164,serum,Control,uL,7160164,716


In [9]:
def _chem_id_for_name_prefix(annotation, name_prefix):
    """Return the CHEM_ID of the first row whose CHEMICAL_NAME starts with ``name_prefix``.

    Rows with a missing CHEMICAL_NAME are treated as non-matching. Raises
    ``ValueError`` when no row matches.
    """
    name_matches = annotation['CHEMICAL_NAME'].str.startswith(name_prefix, na=False)
    chem_ids = annotation.loc[name_matches, 'CHEM_ID']
    if chem_ids.empty:
        raise ValueError(f"No CHEMICAL_NAME starts with {name_prefix!r}")
    return chem_ids.values[0]


ila_chem_id = _chem_id_for_name_prefix(analyte_metadata, 'indolelactate')
iaa_chem_id = _chem_id_for_name_prefix(analyte_metadata, 'indoleacetate')



In [10]:
# Columns carried forward: the sample id, the two indole analytes, and the sample metadata.
ila_iaa_columns = [
    'CLIENT_SAMPLE_ID', ila_chem_id, iaa_chem_id,
    'GENDER', 'BMI', 'AGE', 'site', 'house',
    'PARENT_SAMPLE_NAME', 'CLIENT_IDENTIFIER', 'GROUP_NAME',
    'CLIENT_MATRIX', 'TREATMENT', 'SAMPLE_AMOUNT_UNITS',
]
ila_iaa_imsms_data = data_with_patient_metadata[ila_iaa_columns]


In [11]:
# Attach the abundance of the selected taxon to each sample, give the two analyte
# columns readable names, and add their difference as `exp_ratio`.
microbiome_and_imsms_data = (
    ila_iaa_imsms_data
    .merge(microbiome_data_selected, on='CLIENT_SAMPLE_ID')
    .rename(columns={ila_chem_id: 'exp_ILA', iaa_chem_id: 'exp_IAA'})
    .assign(exp_ratio=lambda merged: merged.exp_ILA - merged.exp_IAA)
)


In [12]:
# Columns that are exponentiated below (this rebinds `analyte_columns_selected`).
analyte_columns_selected = ['exp_ILA', 'exp_IAA', 'exp_ratio']

# Work on a copy so `microbiome_and_imsms_data` keeps its original values.
# NOTE(review): np.exp undoes a natural-log transform - confirm the log base of the source sheet.
microbiome_and_imsms_data_reverse_log_transformed = microbiome_and_imsms_data.copy()
for analyte in analyte_columns_selected:
    analyte_concentration = microbiome_and_imsms_data_reverse_log_transformed[analyte].values
    exponentiated = np.exp(analyte_concentration)
    microbiome_and_imsms_data_reverse_log_transformed.loc[:, analyte] = exponentiated

clear_output()

    

In [13]:
# Household id = first three + last four characters of the sample id; site = first three characters.
microbiome_and_imsms_data_reverse_log_transformed.loc[:, 'house'] = (microbiome_and_imsms_data_reverse_log_transformed['CLIENT_SAMPLE_ID'].str[:3] + microbiome_and_imsms_data_reverse_log_transformed['CLIENT_SAMPLE_ID'].str[-4:])
microbiome_and_imsms_data_reverse_log_transformed.loc[:, 'site'] = microbiome_and_imsms_data_reverse_log_transformed.loc[:, 'CLIENT_SAMPLE_ID'].str[:3]

# Integer-encode gender and disease group (the same encoder object is refitted for each column).
le = LabelEncoder()

microbiome_and_imsms_data_reverse_log_transformed.loc[:, 'Gender_label'] = le.fit_transform(microbiome_and_imsms_data_reverse_log_transformed['GENDER'])
microbiome_and_imsms_data_reverse_log_transformed.loc[:, 'Disease_label'] = le.fit_transform(microbiome_and_imsms_data_reverse_log_transformed['GROUP_NAME'])

# Drop every household that has a missing value in any column of any of its rows.
house_to_exclude = microbiome_and_imsms_data_reverse_log_transformed[microbiome_and_imsms_data_reverse_log_transformed.isna().any(axis=1)].house.values

microbiome_and_imsms_data_reverse_log_transformed = microbiome_and_imsms_data_reverse_log_transformed[~microbiome_and_imsms_data_reverse_log_transformed["house"].isin(house_to_exclude)]

# Register the model columns only once: re-running this cell must not append duplicates,
# because glm_model selects columns by this list and a repeated name would select the
# same column twice.
for metadata_column in ('microbial_abundance', 'Gender_label', 'Disease_label'):
    if metadata_column not in selected_metadata_columns:
        selected_metadata_columns.append(metadata_column)

# Keep only households present in the phenotype rows chosen for `category`.
# NOTE(review): assumes str(household) is formatted exactly like `house` - confirm.
microbiome_and_imsms_data_reverse_log_transformed = microbiome_and_imsms_data_reverse_log_transformed[microbiome_and_imsms_data_reverse_log_transformed.house.isin(ms_category_data.household.astype(str))]


In [14]:
# Fit one GLM per analyte column (no outlier removal) and collect the result dicts.
model_dict_list = [
    glm_model(
        microbiome_and_imsms_data_reverse_log_transformed,
        analyte_column_selected,
        sample,
        remove_outlier=False,
    )
    for analyte_column_selected in tqdm(analyte_columns_selected)
]
clear_output()


In [15]:
# Collect, per fitted model, the coefficient of `microbial_abundance` together with its
# p-value, confidence interval, convergence flag and sample count.
# The workbook is not re-read here: nothing in this cell uses the annotation sheet.
model_summary_columns = ["analyte_name", "microbial_abundance_coeff", "pvalue", "CI", "model_converged_flag", "number_of_samples"]

model_summary_list_of_df = []
for model in model_dict_list:
    fitted_model = model["model"]
    abundance_coeff = abundance_pvalue = abundance_ci = model_converged_flag = None
    # A failed fit is stored as None; its statistics stay None.
    if fitted_model is not None:
        try:
            abundance_coeff = fitted_model.params['microbial_abundance']
            abundance_pvalue = fitted_model.pvalues['microbial_abundance']
            abundance_ci = tuple(fitted_model.conf_int().loc['microbial_abundance'])
            model_converged_flag = fitted_model.converged
        except Exception:
            # Best effort: if any statistic cannot be extracted, report none of them
            abundance_coeff = abundance_pvalue = abundance_ci = model_converged_flag = None
    summary_row = (model["analyte"], abundance_coeff, abundance_pvalue, abundance_ci, model_converged_flag, model['patient_sample_count'])
    model_summary_list_of_df.append(pd.DataFrame([summary_row], columns=model_summary_columns))



In [16]:
# Stack the one-row summaries into a single table and tag every row as 'untargeted'.
model_summary = (
    pd.concat(model_summary_list_of_df, ignore_index=True)
    .assign(analyte_type='untargeted')
)
model_summary

Unnamed: 0,analyte_name,microbial_abundance_coeff,pvalue,CI,model_converged_flag,number_of_samples,analyte_type
0,exp_ILA,-114.671973,0.755598,"(-836.7317523808333, 607.3878058)",True,180,untargeted
1,exp_IAA,565.579735,0.26246,"(-423.6411038264397, 1554.8005741756479)",True,180,untargeted
2,exp_ratio,-674.538378,0.272699,"(-1879.8533505999903, 530.7765952666537)",True,180,untargeted


In [17]:
# Number of rows that went into the first model (the first entry of analyte_columns_selected).
model_dict_list[0]['patient_sample_count']


180