In [1]:
import sys
sys.path.insert(0,'..')
from paths import *
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import warnings
from tqdm import tqdm

from IPython.display import clear_output


  from pandas.core import (


In [2]:
def glm_model(data_, analyte_, sample_, remove_outlier):
    """Fit a Gamma GLM (log link) of one analyte on severity score and covariates.

    The design matrix holds a constant, ``AGE``, ``BMI``, ``Gender_label``, the
    column named by the module-level ``severity_score``, and dummy-coded
    ``site`` (first level dropped).

    Relies on the notebook globals ``selected_metadata_columns`` and
    ``severity_score`` being defined before the call.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Must contain every column in ``selected_metadata_columns`` plus
        ``analyte_``.
    analyte_ : str
        Name of the response column.
    sample_ : str
        Sample label; only stored in the returned dict.
    remove_outlier : bool
        If true, ``remove_outliers(data_analyte, analyte_)`` is applied first.
        NOTE(review): ``remove_outliers`` is not defined in this notebook;
        presumably it comes from ``from paths import *`` -- confirm.

    Returns
    -------
    dict
        Keys ``analyte``, ``sample``, ``patient_sample_count``, ``model`` (the
        fitted results object, or ``None`` when fitting failed) and
        ``fit_error`` (``repr`` of the exception, or ``None``).
    """
    data_analyte = data_[selected_metadata_columns + [analyte_]]
    if remove_outlier:
        data_analyte = remove_outliers(data_analyte, analyte_)
    # Work on an explicit copy: the dtype cast below must not write into a
    # view of the caller's frame (SettingWithCopyWarning / silent no-op).
    data_analyte = data_analyte[
        [analyte_, severity_score, "AGE", "BMI", "Gender_label", "site"]
    ].copy()
    data_analyte["Gender_label"] = data_analyte["Gender_label"].astype(int)

    site_dummies = pd.get_dummies(data_analyte["site"], prefix="site", drop_first=True)
    X = sm.add_constant(data_analyte[["AGE", "BMI", "Gender_label", severity_score]])
    X = pd.concat([X, site_dummies], axis=1)
    # get_dummies may return bool columns; the model needs numeric input.
    X = X.astype({col: int for col in X.select_dtypes(include=["bool"]).columns})
    X = X.apply(pd.to_numeric)

    fit_error = None
    try:
        model = sm.GLM(
            data_analyte[analyte_],
            X,
            family=sm.families.Gamma(link=sm.families.links.log()),
        )
        mdf = model.fit()
    except Exception as err:
        # Best effort: keep the failed analyte in the results, but do not hide
        # why it failed (and let KeyboardInterrupt propagate).
        fit_error = repr(err)
        warnings.warn(f"GLM fit failed for analyte {analyte_!r}: {fit_error}")
        mdf = None

    out_dict = {
        "analyte": analyte_,
        "sample": sample_,
        "patient_sample_count": data_analyte.shape[0],
        "model": mdf,
        "fit_error": fit_error,
    }
    return out_dict


In [3]:
# Name of the severity-score column used as the predictor of interest in the GLMs
# and as the column whose missing values are dropped from the severity table.
severity_score = 'gARMSS'
# Options: edss, gARMSS, lARMSS, lMSSS, uGMSSS, oGMSSS

# Disease-course filter: 'full' keeps every row whose `disease_course_control`
# does not contain 'Control'; any other value keeps only rows equal to it.
category = 'PMS'

# Sample matrix: 'serum' loads the serum workbook; any other value loads the stool workbook.
sample = 'serum'


In [4]:
# Alternative source kept for reference:
# severity_data = pd.read_excel('../../wetlab/data/patient_selection/ms_base_epic_ids/iMSMS_EPIC.xlsx', engine='openpyxl')
severity_data = pd.read_csv('../../wetlab/data/patient_selection/armss_score_full_cohort.csv')
diet_info = pd.read_csv('../../wetlab/data/patient_selection/iMSMS Diet Restrictions.csv')

# The phenotype sheet and the merge are needed whatever `category` is, so do
# them once instead of duplicating them in both branches.
ms_category_data = pd.read_excel(
    os.path.join(DATA_ROOT_PATH, 'john_hopkins/1-s2.0-S0092867422011151-mmc2.xlsx'),
    engine='openpyxl',
    sheet_name='Sample phenotype',
)
severity_data = pd.merge(
    severity_data,
    ms_category_data[['iMSMS_ID', 'disease_course_control']],
    left_on='Record.ID',
    right_on='iMSMS_ID',
).drop('iMSMS_ID', axis=1)

if category != 'full':
    # Keep a single disease course.
    cohort_mask = severity_data['disease_course_control'] == category
else:
    # Keep every participant that is not a control.
    cohort_mask = ~severity_data['disease_course_control'].str.contains('Control')

# Reassign rather than dropna(inplace=True) on a filtered slice, which
# triggers SettingWithCopyWarning and makes the cell harder to re-run.
severity_data = severity_data[cohort_mask].dropna(subset=[severity_score])


## Load untargeted metabolomics data

In [5]:


# Pick the workbook that matches the configured sample matrix.
if sample == "serum":
    filename = GLOBAL_SERUM_DATA_FILENAME
else:
    filename = GLOBAL_STOOL_DATA_FILENAME

file_path = os.path.join(DATA_ROOT_PATH, filename)

sheet_name = ["Chemical Annotation", "Sample Meta Data", "Log Transformed Data"]

# Passing a list of sheet names parses the workbook once and returns a dict
# keyed by sheet name, instead of re-opening the file for every sheet.
workbook_sheets = pd.read_excel(file_path, engine='openpyxl', sheet_name=sheet_name)
analyte_metadata = workbook_sheets[sheet_name[0]]
patient_metadata = workbook_sheets[sheet_name[1]]
data = workbook_sheets[sheet_name[2]]

# Site code is the first three characters of the client sample id; the
# vectorised .str accessor also tolerates missing ids.
patient_metadata["site_code"] = patient_metadata["CLIENT_SAMPLE_ID"].str[:3]



## Analyte selection

In [6]:
# Compound -> SPOKE mapping table, stored next to the output directory.
spoke_map_path = os.path.join(os.path.dirname(OUTPUT_PATH), "global_metabolomics_compound_spoke_map.csv")
global_metabolomics_compound_spoke_map = pd.read_csv(spoke_map_path)

# Every column of the concentration table except the sample identifier.
analyte_columns = list(data.columns)
analyte_columns.remove("PARENT_SAMPLE_NAME")

# Keep only the analytes that also appear in the mapping table.
is_measured = global_metabolomics_compound_spoke_map.CHEM_ID.isin(analyte_columns)
analyte_columns_selected = global_metabolomics_compound_spoke_map[is_measured]["CHEM_ID"].unique()

data_with_analyte_columns_selected = data[["PARENT_SAMPLE_NAME"] + list(analyte_columns_selected)]


## Merging patient metadata with analyte concentration data

In [7]:
selected_metadata_columns = ["PARENT_SAMPLE_NAME", "CLIENT_IDENTIFIER", "GROUP_NAME", "AGE", "BMI", "GENDER", "CLIENT_SAMPLE_ID", "CLIENT_MATRIX", "TREATMENT", "SAMPLE_AMOUNT_UNITS"]

# Build the derived columns with .assign so they land on a new frame rather
# than on a slice of `patient_metadata`; the slice assignment raised
# SettingWithCopyWarning, which clear_output() was only hiding.
client_sample_id = patient_metadata['CLIENT_SAMPLE_ID']
patient_metadata_selected_columns = patient_metadata[selected_metadata_columns].assign(
    # house = first three characters + last four characters of the sample id
    house=client_sample_id.str[:3] + client_sample_id.str[-4:],
    # site = first three characters of the sample id
    site=client_sample_id.str[:3],
)

data_with_patient_metadata = pd.merge(data_with_analyte_columns_selected, patient_metadata_selected_columns, on="PARENT_SAMPLE_NAME")

# glm_model() reads this list, so 'site' must be part of it.
selected_metadata_columns.append('site')

data_with_patient_metadata.head()
# Optional single-site restriction (disabled):
# data_with_patient_metadata = data_with_patient_metadata[data_with_patient_metadata['CLIENT_SAMPLE_ID'].str.startswith('714')]


Unnamed: 0,PARENT_SAMPLE_NAME,35,50,62,93,98,111,112,117,144,...,GROUP_NAME,AGE,BMI,GENDER,CLIENT_SAMPLE_ID,CLIENT_MATRIX,TREATMENT,SAMPLE_AMOUNT_UNITS,house,site
0,UCSF-06758,0.589314,-0.396834,-0.313341,0.85142,-0.739952,-0.544856,0.874154,-0.240343,0.562101,...,MS,52.0,19.42297,F,71701-0004,serum,Treated,uL,7170004,717
1,UCSF-06759,0.049593,0.13942,-0.15988,0.238199,-0.249942,-0.490267,-0.172927,-1.654431,-0.540335,...,MS,51.0,25.50175,F,71601-0150,serum,Off,uL,7160150,716
2,UCSF-06760,-0.416035,-1.045957,-0.740068,0.530813,-0.135617,-0.911855,-0.459714,-1.654431,-0.672326,...,Control,52.0,34.970939,M,71602-0150,serum,Control,uL,7160150,716
3,UCSF-06761,-0.386523,0.151742,0.580034,0.368174,0.307989,0.387318,0.176586,-0.504245,0.127822,...,MS,56.0,26.544467,M,71601-0164,serum,Treated,uL,7160164,716
4,UCSF-06762,-0.352323,-0.064386,0.691759,0.359623,-0.057788,-0.368919,0.57149,0.070276,-0.425296,...,Control,53.0,31.17614,F,71602-0164,serum,Control,uL,7160164,716


## Extracting chemical IDs for indolelactate (ILA) and indoleacetate (IAA)

In [8]:
def _first_chem_id_with_prefix(annotation, name_prefix):
    """Return the CHEM_ID of the first row whose CHEMICAL_NAME starts with ``name_prefix``.

    Rows with a missing CHEMICAL_NAME are treated as non-matching. Raises
    ``ValueError`` with the offending prefix when nothing matches, instead of
    an opaque ``IndexError`` from ``.values[0]``.
    """
    matches = annotation[annotation['CHEMICAL_NAME'].str.startswith(name_prefix, na=False)]
    if matches.empty:
        raise ValueError(f"No analyte whose CHEMICAL_NAME starts with {name_prefix!r}")
    return matches.CHEM_ID.values[0]


ila_chem_id = _first_chem_id_with_prefix(analyte_metadata, 'indolelactate')
iaa_chem_id = _first_chem_id_with_prefix(analyte_metadata, 'indoleacetate')



## Extracting ILA and IAA iMSMS data

In [9]:
# Keep the two analytes of interest plus the metadata columns used downstream.
ila_iaa_columns = [
    'CLIENT_SAMPLE_ID', ila_chem_id, iaa_chem_id,
    'GENDER', 'BMI', 'AGE', 'site', 'house',
    'PARENT_SAMPLE_NAME', 'CLIENT_IDENTIFIER', 'GROUP_NAME',
    'CLIENT_MATRIX', 'TREATMENT', 'SAMPLE_AMOUNT_UNITS',
]
ila_iaa_imsms_data = data_with_patient_metadata[ila_iaa_columns]



## Merging severity scores with iMSMS data (ILA and IAA)

In [10]:
# Join analyte values to severity scores on the sample / record id, give the
# two analytes readable names, and add their difference as `exp_ratio`.
severity_and_imsms_data = (
    pd.merge(ila_iaa_imsms_data, severity_data, left_on='CLIENT_SAMPLE_ID', right_on='Record.ID')
    .rename(columns={ila_chem_id: 'exp_ILA', iaa_chem_id: 'exp_IAA'})
    .assign(exp_ratio=lambda merged: merged.exp_ILA - merged.exp_IAA)
)


In [11]:
# Exponentiate the three analyte columns on a copy, leaving the merged frame untouched.
severity_and_imsms_data_reverse_log_transformed = severity_and_imsms_data.copy()
analyte_columns_selected = ['exp_ILA', 'exp_IAA', 'exp_ratio']
for analyte in analyte_columns_selected:
    severity_and_imsms_data_reverse_log_transformed[analyte] = np.exp(
        severity_and_imsms_data_reverse_log_transformed[analyte].to_numpy()
    )

clear_output()

    

In [12]:
# severity_and_imsms_data_reverse_log_transformed.loc[:, 'house'] = (severity_and_imsms_data_reverse_log_transformed['CLIENT_SAMPLE_ID'].str[:3] + severity_and_imsms_data_reverse_log_transformed['CLIENT_SAMPLE_ID'].str[-4:])
# site = first three characters of the client sample id
severity_and_imsms_data_reverse_log_transformed.loc[:, 'site'] = severity_and_imsms_data_reverse_log_transformed.loc[:, 'CLIENT_SAMPLE_ID'].str[:3]

# Integer-encode GENDER so it can enter the GLM as a numeric covariate.
le = LabelEncoder()

severity_and_imsms_data_reverse_log_transformed.loc[:, 'Gender_label'] = le.fit_transform(severity_and_imsms_data_reverse_log_transformed['GENDER'])

# house_to_exclude = severity_and_imsms_data_reverse_log_transformed[severity_and_imsms_data_reverse_log_transformed.isna().any(axis=1)].house.values

# severity_and_imsms_data_reverse_log_transformed = severity_and_imsms_data_reverse_log_transformed[~severity_and_imsms_data_reverse_log_transformed["house"].isin(house_to_exclude)]

# glm_model() selects `selected_metadata_columns + [analyte]`. Append only what
# is missing so re-running this cell cannot add duplicate names (duplicates
# would produce duplicate columns inside glm_model).
for extra_column in (severity_score, 'Gender_label'):
    if extra_column not in selected_metadata_columns:
        selected_metadata_columns.append(extra_column)



In [13]:
# Rows with missing values are intentionally not dropped here:
# severity_and_imsms_data_reverse_log_transformed.dropna(inplace=True)

# One GLM result dict per analyte column, outliers kept.
model_dict_list = [
    glm_model(
        severity_and_imsms_data_reverse_log_transformed,
        analyte_column_selected,
        sample,
        remove_outlier=False,
    )
    for analyte_column_selected in tqdm(analyte_columns_selected)
]
clear_output()


In [14]:
# The "Chemical Annotation" sheet is already loaded as `analyte_metadata` by the
# data-loading cell and is not used below, so the workbook is not re-read here.

summary_columns = ["analyte_name", "armss_coeff", "pvalue", "CI", "model_converged_flag"]

# One single-row frame per fitted model; they are concatenated in the next cell.
model_summary_list_of_df = []
for model in model_dict_list:
    analyte_name = model["analyte"]
    fitted = model["model"]

    # Defaults describe a model that could not be fitted or summarised.
    disease_coeff = None
    disease_coeff_pvalue = None
    disease_coeff_CI = None
    model_converged_flag = None

    if fitted is not None:
        try:
            # Compute everything first so a failure leaves all four fields None.
            extracted = (
                fitted.params[severity_score],
                fitted.pvalues[severity_score],
                tuple(fitted.conf_int().loc[severity_score]),
                fitted.converged,
            )
        except Exception as err:
            # Keep the row, but report the reason instead of hiding it.
            warnings.warn(f"Could not summarise model for {analyte_name!r}: {err!r}")
        else:
            disease_coeff, disease_coeff_pvalue, disease_coeff_CI, model_converged_flag = extracted

    model_summary_list = [
        (analyte_name, disease_coeff, disease_coeff_pvalue, disease_coeff_CI, model_converged_flag)
    ]
    model_summary_list_of_df.append(pd.DataFrame(model_summary_list, columns=summary_columns))



In [15]:
# Stack the per-analyte rows and tag them as coming from the untargeted panel.
model_summary = pd.concat(model_summary_list_of_df, ignore_index=True).assign(
    analyte_type='untargeted'
)
model_summary

Unnamed: 0,analyte_name,armss_coeff,pvalue,CI,model_converged_flag,analyte_type
0,exp_ILA,-0.004672,0.760648,"(-0.034731368015825485, 0.025387336401644936)",True,untargeted
1,exp_IAA,-0.004346,0.868702,"(-0.05586961734472374, 0.04717835154847901)",True,untargeted
2,exp_ratio,0.005784,0.814538,"(-0.04254412527363042, 0.05411236039125523)",True,untargeted


In [16]:
# (rows, columns) of the frame that was passed to glm_model() for every analyte.
severity_and_imsms_data_reverse_log_transformed.shape

(87, 27)