In [1]:
import sys
sys.path.insert(0,'..')
from paths import *
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import joblib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import multiprocessing as mp

from IPython.display import clear_output


In [2]:
# Number of worker processes made available to the multiprocessing pool
# that get_model_parallel creates.
NCORES = 8


In [3]:
def glm_model(data_, analyte_, sample_):
    """Fit a Gamma GLM (log link) of one analyte's `Result` on the ARMSS category.

    The response is regressed on `armms_category`, adjusted for `Age`, `BMI`,
    label-encoded `Gender` and `mapped_diet`, and site indicator columns, where
    the site is the first three characters of `Client Sample ID`.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Table with one row per measurement. The columns read here are
        `Analyte`, `Result`, `Age`, `BMI`, `Gender`, `mapped_diet`,
        `armms_category` and `Client Sample ID`.
    analyte_ : str
        Value of the `Analyte` column selecting the rows to model.
    sample_ : str
        Sample label; it is only copied into the returned dictionary.

    Returns
    -------
    dict
        Keys: `analyte`, `sample`, `patient_sample_count` (number of rows
        used in the fit) and `model` (the fitted statsmodels result).

    Raises
    ------
    ValueError
        If `data_` has no rows for `analyte_`.
    """
    # Take an explicit copy so the column assignments below neither touch the
    # caller's frame nor trigger pandas' SettingWithCopyWarning.
    data_analyte = data_.loc[data_["Analyte"] == analyte_].copy()
    if data_analyte.empty:
        raise ValueError(f"No rows found for analyte {analyte_!r}")

    # A fresh encoder per column; codes follow the sorted unique labels.
    data_analyte['diet_label'] = LabelEncoder().fit_transform(data_analyte['mapped_diet'])
    data_analyte['Gender_label'] = LabelEncoder().fit_transform(data_analyte['Gender'])
    data_analyte['site'] = data_analyte['Client Sample ID'].str[:3]
    data_analyte = data_analyte[["Result", "armms_category", "diet_label", "Age", "BMI", "Gender_label", "site"]]

    X = sm.add_constant(data_analyte[['Age', 'BMI', 'Gender_label', 'armms_category', 'diet_label']])
    # dtype=float keeps the design matrix purely numeric: recent pandas
    # versions return bool dummies, which would turn X into an object array.
    site_dummies = pd.get_dummies(data_analyte['site'], prefix='site', drop_first=True, dtype=float)
    X = pd.concat([X, site_dummies], axis=1)

    # `links.Log` is the non-deprecated spelling of the log link.
    model = sm.GLM(data_analyte['Result'], X, family=sm.families.Gamma(link=sm.families.links.Log()))
    mdf = model.fit()

    return {
        "analyte": analyte_,
        "sample": sample_,
        "patient_sample_count": data_analyte.shape[0],
        "model": mdf,
    }

def get_model_parallel(data_, analytes_, sample_):
    """Fit `glm_model` for every analyte using a pool of worker processes.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Measurement table forwarded unchanged to every `glm_model` call.
    analytes_ : iterable of str
        Analyte names; one model is fitted per element.
    sample_ : str
        Sample label forwarded unchanged to every `glm_model` call.

    Returns
    -------
    list of dict
        The `glm_model` results, in the same order as `analytes_`.
    """
    arg_list = [(data_, analyte, sample_) for analyte in analytes_]
    if not arg_list:
        # Nothing to fit: avoid starting worker processes at all.
        return []
    # The context manager guarantees the workers are cleaned up even when a
    # fit raises inside starmap; never start more workers than tasks.
    with mp.Pool(min(NCORES, len(arg_list))) as pool:
        return pool.starmap(glm_model, arg_list)


In [4]:
# Load the two pre-selected patient groups and the diet questionnaire answers.
armss_data_lower = pd.read_csv('../../wetlab/data/patient_selection/patients_with_armss_data_lower.csv')
armss_data_higher = pd.read_csv('../../wetlab/data/patient_selection/patients_with_armss_data_higher.csv')
diet_info = pd.read_csv('../../wetlab/data/patient_selection/iMSMS Diet Restrictions.csv')

# Binary group label used later as a model covariate: 0 = "lower" file, 1 = "higher" file.
for category, frame in enumerate((armss_data_lower, armss_data_higher)):
    frame.loc[:, 'armms_category'] = category

# Stack both groups and keep only the identifier, the score and the group label.
armss_data_combined = pd.concat(
    [armss_data_lower, armss_data_higher],
    ignore_index=True,
).loc[:, ['Record.ID', 'gARMSS', 'armms_category']]

armss_data_combined.shape


(100, 3)

In [5]:
# armss_data_demographics = pd.read_csv('../../wetlab/data/patient_selection/clinical_data_with_ARMSS_score.csv')

# armss_data_demographics.dropna(subset='gARMSS', inplace=True)

# armss_data_sorted = armss_data_demographics.sort_values(by='gARMSS')
# armss_data_sorted = armss_data_sorted[armss_data_sorted['Record.ID'].str.startswith('714')]

# N = 12
# armss_data_lower = armss_data_sorted.head(N)
# armss_data_higher = armss_data_sorted.tail(N)
# armss_data_lower.loc[:,'armms_category'] = 0
# armss_data_higher.loc[:,'armms_category'] = 1

# # armss_data_higher

# armss_data_combined = pd.concat([armss_data_lower, armss_data_higher], ignore_index=True)
# armss_data_combined = armss_data_combined[['Record.ID', 'gARMSS', 'armms_category']]

# # armss_data_combined = armss_data_combined[armss_data_combined['Record.ID'].str.startswith('714')]

# clear_output()
# armss_data_combined.shape



In [6]:
# Only measurements whose "Client Matrix" equals this value are analysed.
sample = "feces"

# Input locations. SHORT_CHAIN_FATTY_ACID_DATA_FILENAME, DATA_ROOT_PATH and
# OUTPUT_PATH are not defined in this notebook - presumably they come from the
# `paths` star import; confirm.
filename = SHORT_CHAIN_FATTY_ACID_DATA_FILENAME
file_path = os.path.join(DATA_ROOT_PATH, filename)
mapping_filename = "short_chain_fatty_acid_spoke_map.csv"
mapping_filepath = os.path.join(os.path.dirname(OUTPUT_PATH), mapping_filename)

# Measurements, restricted to the chosen sample matrix.
data = pd.read_excel(file_path, engine='openpyxl').loc[
    lambda frame: frame["Client Matrix"] == sample
]

# The distinct values of the mapping file's "name" column are the analytes to model.
mapping_data = pd.read_csv(mapping_filepath)
analytes = mapping_data["name"].unique()

data.head()

Unnamed: 0,Unique Tube Label ID,Client Sample ID,Client Matrix,Sample Description,Group Name,Treatment,Gender,Age,BMI,Race / Ethnicity,Analyte,Result,Unit,Dilution Factor,LLOQ,ULOQ,Analysis Comment
0,FB06341898,71801-0010,feces,Fecal sample,MS,Off,F,50.0,18.645344,Caucasian/Hispanic,2-Methylbutyric acid,159.251,µg/g,1,5,2000,
1,FB06341896,71801-0027,feces,Fecal sample,MS,Treated,M,29.0,23.038752,Caucasian/Hispanic,2-Methylbutyric acid,171.655,µg/g,1,5,2000,
2,FB06341882,71801-0022,feces,Fecal sample,MS,Treated,F,33.0,17.474717,Caucasian/Hispanic,2-Methylbutyric acid,26.897,µg/g,1,5,2000,
3,FB06341906,71801-0017,feces,Fecal sample,MS,Treated,F,53.0,25.721032,Caucasian/Hispanic,2-Methylbutyric acid,116.243,µg/g,1,5,2000,
4,FB06341902,71802-0027,feces,Fecal sample,Control,Control,F,32.0,18.426534,Caucasian/Hispanic,2-Methylbutyric acid,48.814,µg/g,1,5,2000,


In [7]:
# Attach the gARMSS score and group label to every measurement (inner join).
data_with_armss = data.merge(armss_data_combined, left_on='Client Sample ID', right_on='Record.ID')

# "house" key: first three plus last four characters of the sample id.
sample_id_text = data_with_armss['Client Sample ID'].str
data_with_armss['house'] = sample_id_text[:3] + sample_id_text[-4:]

# Any row with a missing value (ignoring "Analysis Comment") excludes its whole house.
data_exclude_outlier_threshold_column = data_with_armss.drop(columns="Analysis Comment")
rows_with_missing_values = data_exclude_outlier_threshold_column.isna().any(axis=1)
house_to_exclude = data_exclude_outlier_threshold_column.loc[rows_with_missing_values, 'house'].values
data_nan_removed = data_with_armss.loc[~data_with_armss["house"].isin(house_to_exclude)]

# Add the diet questionnaire answers (inner join) and drop the duplicate id column.
data_nan_removed_with_diet = data_nan_removed.merge(
    diet_info, left_on='Client Sample ID', right_on='Record ID'
).drop(columns=['Record ID'])

# Collapse the free-form questionnaire answers into two diet groups.
non_vegetarian_answers = [
    'No special dietary needs',
    'Other',
    'Gluten Free,No special dietary needs',
    'Gluten Free',
    'Pescetarian',
    'Lactose Intolerance,Gluten Free',
    'Gluten Free,Pescetarian',
    'Lactose Intolerance,Pescetarian',
]
vegetarian_answers = ['Gluten Free,Vegetarian', 'Vegetarian']
diet_map = {
    **dict.fromkeys(non_vegetarian_answers, 'Non-Vegetarian'),
    **dict.fromkeys(vegetarian_answers, 'Vegetarian'),
}

diet_question = 'Do you have any special dietary needs/restrictions?'
data_nan_removed_with_diet.loc[:, 'mapped_diet'] = data_nan_removed_with_diet[diet_question].map(diet_map)

# Answers missing from diet_map become NaN above and are removed here.
data_nan_removed_with_diet = data_nan_removed_with_diet.dropna(subset='mapped_diet')
data_nan_removed_with_diet.shape


(768, 26)

In [18]:
# Export one row per distinct combination of patient id, score, group and diet.
export_columns = [
    'Client Sample ID',
    'gARMSS',
    'armms_category',
    'Do you have any special dietary needs/restrictions?',
    'mapped_diet',
]
data_nan_removed_with_diet_ = (
    data_nan_removed_with_diet[export_columns]
    .drop_duplicates()
    .rename(columns={'Do you have any special dietary needs/restrictions?': 'diet_from_questionnaire'})
)

data_nan_removed_with_diet_.to_csv(
    '../../wetlab/data/patient_selection/patient_armss_score_with_diet_info.csv',
    index=False,
    header=True,
)



In [58]:
%%time

model_dict_list = []
for analyte in analytes:
    model_dict_list.append(glm_model(data_nan_removed_with_diet, analyte, 'feces'))
    
clear_output()


CPU times: user 130 ms, sys: 5.7 ms, total: 135 ms
Wall time: 134 ms


In [59]:
# Spot-check: p-values of every coefficient in the first fitted model.
model_dict_list[0]['model'].pvalues

const             8.576626e-22
Age               8.482188e-01
BMI               6.453670e-01
Gender_label      4.600111e-01
armms_category    7.715541e-02
diet_label        9.344025e-01
site_715          5.231362e-01
site_716          6.561615e-01
site_717          6.392070e-01
site_718          9.747522e-01
site_764          3.269864e-01
site_765          3.646293e-01
dtype: float64

In [60]:
# For every fitted model, extract the `armms_category` coefficient, its
# p-value, its confidence interval and the convergence flag.
summary_columns = ["analyte_name", "armss_coeff", "pvalue", "CI", "model_converged_flag"]

model_summary_list = []
for model in model_dict_list:
    analyte_name = model["analyte"]
    try:
        fitted = model["model"]
        record = (
            analyte_name,
            fitted.params["armms_category"],
            fitted.pvalues["armms_category"],
            tuple(fitted.conf_int().loc['armms_category']),
            fitted.converged,
        )
    except Exception as exc:
        # Keep the analyte in the table with empty statistics, but report the
        # failure instead of hiding it; `Exception` (unlike a bare `except`)
        # lets KeyboardInterrupt / SystemExit propagate.
        print(f"Could not summarise the model for {analyte_name!r}: {exc!r}")
        record = (analyte_name, None, None, None, None)
    model_summary_list.append(record)

# Build the table once from all records. It stays wrapped in a list because
# the next cell concatenates `model_summary_list_of_df`.
model_summary_list_of_df = [pd.DataFrame(model_summary_list, columns=summary_columns)]



In [61]:
# Combine the per-analyte summaries into one table and tag every row as 'targeted'.
model_summary = pd.concat(model_summary_list_of_df, ignore_index=True)
model_summary['analyte_type'] = 'targeted'

model_summary

# Optional follow-up steps, currently disabled: keep only significant, converged
# models and write them to disk.
# model_summary = model_summary[(model_summary.pvalue<0.05) & (model_summary.model_converged_flag==True)]

# model_summary.to_csv(f'../../wetlab/data/patient_selection/target_compounds_model_results_using_{sample}_sample.csv', index=False)


Unnamed: 0,analyte_name,armss_coeff,pvalue,CI,model_converged_flag,analyte_type
0,2-Methylbutyric acid,0.308011,0.077155,"(-0.03355206210589212, 0.6495743217959563)",True,targeted
1,Acetic acid,-0.167237,0.247204,"(-0.4504959876614269, 0.11602239308809853)",True,targeted
2,Butyric acid,-0.093407,0.690351,"(-0.5529534251376268, 0.366140277261949)",True,targeted
3,Hexanoic acid,-0.01319,0.976551,"(-0.8927238782600896, 0.866342973368677)",False,targeted
4,Isobutyric acid,0.24691,0.144739,"(-0.08492314140704832, 0.5787440816692186)",True,targeted
5,Isovaleric acid,0.325557,0.05832,"(-0.011465218886318795, 0.6625782696694361)",True,targeted
6,Propionic acid,-0.189454,0.239414,"(-0.5050836794942957, 0.12617567147515119)",True,targeted
7,Valeric acid,0.069209,0.688186,"(-0.26879725829150536, 0.4072162245102465)",True,targeted


In [261]:
# Display the summary table again.
# NOTE(review): this cell's execution count is out of sequence and its saved
# output differs from the table shown by the previous cell - presumably it was
# produced by a different run or model specification; re-run or remove it.
model_summary

Unnamed: 0,analyte_name,armss_coeff,pvalue,CI,model_converged_flag,analyte_type
0,2-Methylbutyric acid,0.398923,0.002354,"(0.14185098822643344, 0.6559946405415751)",True,targeted
1,Acetic acid,-0.258359,0.038999,"(-0.5036718307760064, -0.013045430213021209)",True,targeted
2,Butyric acid,-0.240478,0.214687,"(-0.6203428781540883, 0.13938652396638335)",True,targeted
3,Hexanoic acid,0.099496,0.790894,"(-0.6359660090439759, 0.8349571918373156)",True,targeted
4,Isobutyric acid,0.31281,0.014908,"(0.060984522964036614, 0.5646360419188723)",True,targeted
5,Isovaleric acid,0.4353,0.000802,"(0.18077220179846631, 0.68982771295882)",True,targeted
6,Propionic acid,-0.251199,0.054443,"(-0.5071871931299579, 0.00478959487613867)",True,targeted
7,Valeric acid,0.086141,0.541148,"(-0.19014766518020654, 0.3624304115825985)",True,targeted
