In [4]:
import pandas as pd
import joblib
import os
from paths import *


In [5]:
# Compound population whose fitted GLMs are summarised in this notebook;
# it is substituted into the input and output file names.
population_type = "global_compounds"
filename = "GLM_{}_without_outlier_sample_serum_feces_ratio.joblib".format(population_type)

# Worksheet names used when reading the workbooks; only the first one is read here.
sheet_name = ["Chemical Annotation", "Sample Meta Data", "Log Transformed Data"]
annotation_sheet = sheet_name[0]

# Chemical annotation sheet of the stool workbook.
file_path = os.path.join(DATA_ROOT_PATH, GLOBAL_STOOL_DATA_FILENAME)
analyte_metadata_feces = pd.read_excel(file_path, sheet_name=annotation_sheet, engine='openpyxl')

# Chemical annotation sheet of the serum workbook.
file_path = os.path.join(DATA_ROOT_PATH, GLOBAL_SERUM_DATA_FILENAME)
analyte_metadata_serum = pd.read_excel(file_path, sheet_name=annotation_sheet, engine='openpyxl')

# Stack serum on top of feces and keep the first row seen for each
# (CHEM_ID, CHEMICAL_NAME) pair.
stacked_metadata = pd.concat(
    [analyte_metadata_serum, analyte_metadata_feces],
    ignore_index=True,
)
analyte_metadata = stacked_metadata.drop_duplicates(subset=["CHEM_ID", "CHEMICAL_NAME"])


In [6]:
# Load the saved model collection from the local "result" directory.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.
model_path = os.path.join("result", filename)
model = joblib.load(model_path)


In [9]:
# Collect one summary row per entry of `model`:
# (analyte id, Disease_label coefficient, its p-value, its confidence interval,
#  convergence flag).
model_summary_list = []
for item in model:
    analyte_name = item["analyte"]
    try:
        disease_coeff = item["model"].params["Disease_label"]
        disease_coeff_pvalue = item["model"].pvalues["Disease_label"]
        disease_coeff_CI = tuple(item["model"].conf_int().loc['Disease_label'])
        model_converged_flag = item["model"].converged
    except Exception:
        # Best effort: an entry whose statistics cannot be read still gets a
        # row, filled with None. `Exception` (rather than a bare `except`)
        # keeps KeyboardInterrupt/SystemExit working so the cell can be stopped.
        disease_coeff = None
        disease_coeff_pvalue = None
        disease_coeff_CI = None
        model_converged_flag = None
    model_summary_list.append((analyte_name, disease_coeff, disease_coeff_pvalue, disease_coeff_CI, model_converged_flag))

# "analyte_name" holds the identifier stored under item["analyte"] at this stage.
model_summary_df = pd.DataFrame(model_summary_list, columns=["analyte_name", "disease_coeff", "pvalue", "CI", "model_converged_flag"])


In [32]:
# Keep only the rows that have a Disease_label coefficient.
model_summary_df_ = model_summary_df.dropna(subset=["disease_coeff"])
summary_columns = list(model_summary_df_.columns)
columns_selected = summary_columns + ["CHEMICAL_NAME"]

# Attach the chemical name by matching the analyte identifier against CHEM_ID
# (inner join: rows without a match in the annotation table are dropped).
merged_summary = pd.merge(
    model_summary_df_,
    analyte_metadata,
    left_on="analyte_name",
    right_on="CHEM_ID",
)

# Replace the identifier column with the chemical name, under the same header.
named_summary = (
    merged_summary[columns_selected]
    .drop("analyte_name", axis=1)
    .rename(columns={"CHEMICAL_NAME": "analyte_name"})
)

# Restore the original column order and sort by ascending p-value.
model_summary_df_2 = (
    named_summary[summary_columns]
    .sort_values(by="pvalue")
    .reset_index(drop=True)
)


In [33]:
# Write the sorted summary table to an Excel workbook in the "result" directory.
# The context manager saves and closes the workbook on exit; the explicit
# `writer.save()` call was removed in pandas 2.0 and is not needed.
output_path = os.path.join("result", 'GLM_result_{}_serum_feces_ratio.xlsx'.format(population_type))
with pd.ExcelWriter(output_path) as writer:
    model_summary_df_2.to_excel(writer, sheet_name="without_outlier", index=False)
