In [1]:
import pandas as pd
import numpy as np
from scipy.stats import rankdata
from statsmodels.formula.api import ols
from statsmodels.stats.multitest import multipletests

# Set a random seed for reproducibility
np.random.seed(531)

# Load data
qmp_slv_sp_kw_res = pd.read_csv("QMP_SLV_sp_kw_res.csv")
rmp_slv_sp_kw_res = pd.read_csv("RMP_SLV_sp_kw_res.csv")

# Filter significant results
qmp_slv_sp_kw_res_sig05 = qmp_slv_sp_kw_res[qmp_slv_sp_kw_res['AdjustedP'] < 0.05]
rmp_slv_sp_kw_res_sig05 = rmp_slv_sp_kw_res[rmp_slv_sp_kw_res['AdjustedP'] < 0.05]

# Load additional data for final analysis
qmp_slv_sp_kw_res = pd.read_csv("QMP_138sp_SLV_sp_kw_res.csv")
rmp_slv_sp_kw_res = pd.read_csv("RMP_138sp_SLV_sp_kw_res.csv")

# Filter significant results for final analysis
qmp_slv_sp_kw_res_sig05 = qmp_slv_sp_kw_res[qmp_slv_sp_kw_res['AdjustedP'] < 0.05]
rmp_slv_sp_kw_res_sig05 = rmp_slv_sp_kw_res[rmp_slv_sp_kw_res['AdjustedP'] < 0.05]

# Load and subset RMP data for GLM analysis (requires custom phylogenetic handling)
# Example placeholders for loading `p5_NPT.RMP_138_species` and `p5_NPT.QMP_138_species`
# Assume data loaded as pandas DataFrames named `rmp_data` and `qmp_data`

rmp_for_glm = rmp_data[rmp_data.index.isin(rmp_slv_sp_kw_res_sig05['X'])]
qmp_for_glm = qmp_data[qmp_data.index.isin(qmp_slv_sp_kw_res_sig05['QMP'])]

# Prepare metadata for GLM analysis
def prepare_metadata(obj_phylo, columns, new_column_names):
    mata_crc = obj_phylo[columns]
    mata_crc.columns = new_column_names
    mata_crc['CRCtype'] = pd.factorize(mata_crc['CRCtype'])[0] + 1
    return mata_crc

# For QMP data
qmp_mata_crc = prepare_metadata(qmp_for_glm, ['Calprotectin', 'CRC_general_status'], ['calpro', 'CRCtype'])
cohortdata = qmp_mata_crc[['calpro', 'CRCtype']]

# For RMP data
rmp_mata_crc = prepare_metadata(rmp_for_glm, ['Calprotectin', 'CRC_general_status'], ['calpro', 'CRCtype'])
cohortdata_rmp = rmp_mata_crc[['calpro', 'CRCtype']]

# GLM Analysis function
def perform_glm_analysis(datatab, cohortdata, additional_vars=None):
    results = []
    for feature in datatab.columns:
        y = rankdata(datatab[feature])
        
        if additional_vars:
            X = cohortdata[additional_vars].apply(rankdata)
            formula = "y ~ " + " + ".join(additional_vars) + " + CRCtype"
        else:
            X = cohortdata[['calpro']].apply(rankdata)
            formula = "y ~ calpro + CRCtype"
        
        X = pd.DataFrame(X, columns=additional_vars)
        X['CRCtype'] = rankdata(cohortdata['CRCtype'])
        
        model = ols(formula, data=X).fit()
        summary = model.summary()
        
        # Adjusted P-values
        adj_p = multipletests(model.pvalues, method='fdr_bh')[1]
        
        # Collect summary results for each variable
        result = {
            "dataset": "species_QMP" if additional_vars else "species_RMP",
            "feature": feature,
            "calprotectin_estimate": model.params[1],
            "calprotectin_std.Error": model.bse[1],
            "calprotectin_t.value": model.tvalues[1],
            "calprotectin_P": model.pvalues[1],
            "calprotectin_AdjP": adj_p[1]
        }
        results.append(result)
    return pd.DataFrame(results)

# Run GLM for both QMP and RMP data with and without additional covariates
qmp_glm_results = perform_glm_analysis(qmp_for_glm, cohortdata, additional_vars=['calpro', 'Moisture', 'BMI'])
rmp_glm_results = perform_glm_analysis(rmp_for_glm, cohortdata_rmp, additional_vars=['calpro', 'Moisture', 'BMI'])

# Filter results with adjusted p-value < 0.05
significant_qmp_results = qmp_glm_results[qmp_glm_results['calprotectin_AdjP'] < 0.05]
significant_rmp_results = rmp_glm_results[rmp_glm_results['calprotectin_AdjP'] < 0.05]

print(significant_qmp_results)
print(significant_rmp_results)


FileNotFoundError: [Errno 2] No such file or directory: 'QMP_SLV_sp_kw_res.csv'