# In [1]:
import os
import pandas as pd
import numpy as np
from scipy.stats import kruskal
from statsmodels.stats.multitest import multipletests
from skbio.stats.composition import multiplicative_replacement
from skbio.diversity import alpha_diversity, beta_diversity
import seaborn as sns

# Set the seed for reproducibility (seeds NumPy's legacy global RNG)
np.random.seed(531)

# Set the working directory to the folder that contains this script so the
# relative data paths below resolve.
# - `__file__` is not defined in a notebook / interactive session, so fall
#   back to the current working directory there.
# - `os.path.dirname(__file__)` is '' when the script is launched by its bare
#   file name, and `os.chdir('')` raises; resolving an absolute path first
#   avoids that.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()
os.chdir(script_dir)

# Load necessary data (assuming these are saved in pickle format).
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
LCPM_runs_589_seq = pd.read_pickle("LCPM_runs_589_seq_new_SLV_nr99_v138.1_filtered.pkl")
QMP_ASV_LCPM_SLV_species = pd.read_pickle("QMP_ASV_LCPM_SLV_species.pkl")
RMP_ASV_LCPM_SLV_species = pd.read_pickle("RMP_ASV_LCPM_SLV_species.pkl")
RMP_ASV_LCPM_SLV_species_RA = pd.read_pickle("RMP_ASV_LCPM_SLV_species_RA.pkl")

# Load metadata; the first CSV column becomes the index (used further down
# to match metadata rows to the abundance table -- presumably sample IDs).
LCMP_metadata_589 = pd.read_csv("LCMP_metadata_589.csv", index_col=0)

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Differential abundance tests %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% RMP @ Species SLV  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Assign variables
ps_sp = RMP_ASV_LCPM_SLV_species_RA

# Every later step (status subsetting, prune_taxa, the join with the
# metadata) needs samples in rows and taxa in columns. If the table is stored
# the other way round (its columns, not its rows, match the metadata index),
# transpose it once here so the taxa filters and the sample subsetting below
# act on the correct axes.
sample_ids = LCMP_metadata_589.index
if not ps_sp.index.isin(sample_ids).any() and ps_sp.columns.isin(sample_ids).any():
    ps_sp = ps_sp.T

# Remove taxa with 'unclassified' in their names.
# The pattern is evaluated as a regex, so the leading '.' matches any single
# character that precedes "unclassified".
is_unclassified = ps_sp.columns.str.contains('.unclassified')
ps_sp_noUnk = ps_sp.loc[:, ~is_unclassified]

# Further remove taxa containing "[" and "group." in their names
# (raw string: "\[" and "\." are regex escapes, not string escapes).
is_bracket_or_group = ps_sp_noUnk.columns.str.contains(r"\[|group\.")
ps_sp_noUnk = ps_sp_noUnk.loc[:, ~is_bracket_or_group].copy()

# Modify taxa names: drop everything up to and including the last "g_".
# The previous pattern r'.*\g_' is rejected by Python's `re` ("bad escape \g").
# NOTE(review): presumably "g_" is a genus-rank prefix in the taxon labels --
# confirm against the actual names.
ps_sp_noUnk.columns = ps_sp_noUnk.columns.str.replace(r'.*g_', '', regex=True)

# Subset samples based on CRC_general_status values. The status column is
# aligned to the table's rows first, so the boolean mask always matches the
# table; samples without a metadata row get NaN and fall into no group.
sample_status = LCMP_metadata_589['CRC_general_status'].reindex(ps_sp_noUnk.index)
LCPM_No_lesion = ps_sp_noUnk.loc[sample_status == "No_lesion"]
LCPM_Polyp = ps_sp_noUnk.loc[sample_status == "Polyp"]
LCPM_Tumor = ps_sp_noUnk.loc[sample_status == "Tumor"]

# Define pruning functions (example using thresholds for prevalence and detection)
def prune_taxa(taxa_df, prevalence, detection):
    """Keep the columns of *taxa_df* that are detected often enough.

    A column is kept when the fraction of rows whose value is strictly
    greater than ``detection`` is at least ``prevalence``.

    Parameters
    ----------
    taxa_df : pandas.DataFrame
        Table whose columns are filtered; the fraction is computed down
        each column (``axis=0``).
    prevalence : float
        Minimum fraction of rows (inclusive) in which a column must
        exceed ``detection``.
    detection : float
        Value a cell must strictly exceed to count as detected.

    Returns
    -------
    pandas.DataFrame
        ``taxa_df`` restricted to the columns that pass, all rows kept.
    """
    detected = taxa_df.gt(detection)
    fraction_detected = detected.mean(axis=0)
    passes = fraction_detected.ge(prevalence)
    return taxa_df.loc[:, passes]

# Per status group, keep the taxa that exceed the 1e-6 detection threshold in
# at least 5 % of that group's samples.
p5_N = prune_taxa(LCPM_No_lesion, prevalence=0.05, detection=1e-6)
p5_P = prune_taxa(LCPM_Polyp, prevalence=0.05, detection=1e-6)
p5_T = prune_taxa(LCPM_Tumor, prevalence=0.05, detection=1e-6)

# Compute set differences (taxa retained in the first group but not in the
# second); these are informational and not used by the steps below.
N_vs_P_diff = set(p5_N.columns).difference(p5_P.columns)
N_vs_T_diff = set(p5_N.columns).difference(p5_T.columns)
T_vs_N_diff = set(p5_T.columns).difference(p5_N.columns)
P_vs_T_diff = set(p5_P.columns).difference(p5_T.columns)

# Subset the full table to the taxa retained in at least one status group
retained_taxa = p5_N.columns.union(p5_P.columns).union(p5_T.columns)
p5_NPT = ps_sp_noUnk[retained_taxa]

# Save the retained taxa (the CSV header row carries the taxa names) and,
# in the same layout, the taxa that were NOT retained.
# p5_NPT is a column subset of ps_sp_noUnk, so both share the same row index;
# the complement therefore has to be taken over the columns. Comparing the
# row indexes, as before, always produced an empty table.
p5_NPT.to_csv("sp138_names.csv")
not_retained = ~ps_sp_noUnk.columns.isin(p5_NPT.columns)
ps_sp_noUnk.loc[:, not_retained].to_csv("spNot138_names.csv")

# Perform Kruskal-Wallis tests: one test per taxon column, comparing its
# values across the CRC_general_status groups.
df_status = LCMP_metadata_589[['CRC_general_status']].join(p5_NPT, how="inner")

# Build the per-status row masks once instead of re-filtering the frame for
# every taxon. Missing statuses are skipped: `== NaN` never matches, so a NaN
# status would otherwise contribute an empty group to every test.
status_col = df_status['CRC_general_status']
group_masks = [status_col == status for status in status_col.dropna().unique()]

kw_rows = {}
for col in df_status.columns[1:]:
    values = df_status[col]
    try:
        chi2_stat, p_value = kruskal(*(values[mask] for mask in group_masks))
    except ValueError:
        # scipy raises when all values are identical or fewer than two groups
        # exist; record the taxon as untestable instead of aborting the run.
        chi2_stat, p_value = np.nan, np.nan
    kw_rows[col] = (chi2_stat, p_value, np.nan)

# Assemble the result table in one go (row-by-row `.loc` growth re-allocates
# the frame on every insertion).
kw_res = pd.DataFrame.from_dict(
    kw_rows, orient="index", columns=["Chi2", "P", "AdjustedP"]
)

# Adjust p-values for multiple testing (Benjamini-Hochberg FDR). Only taxa
# with a defined p-value enter the correction; untestable taxa keep
# AdjustedP = NaN so they cannot distort the adjusted values of the others.
tested = kw_res['P'].notna()
if tested.any():
    kw_res.loc[tested, 'AdjustedP'] = multipletests(
        kw_res.loc[tested, 'P'], method='fdr_bh'
    )[1]
kw_res.to_csv("RMP_138sp_SLV_sp_kw_res.csv")

def kruskal_effsize(data, group_col, value_col):
    """Return the eta-squared effect size of a Kruskal-Wallis test.

    Uses the H-statistic based estimate
    ``eta2[H] = (H - k + 1) / (n - k)``, where ``H`` is the Kruskal-Wallis
    statistic, ``k`` the number of groups and ``n`` the number of
    observations.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``group_col`` and ``value_col``.
    group_col : str
        Column holding the group label of each row.
    value_col : str
        Column holding the values to compare between groups.

    Returns
    -------
    dict
        Keys ``feature`` (= ``value_col``), ``n``, ``k``, ``H`` and
        ``effsize``. ``H`` and ``effsize`` are NaN when the test cannot be
        computed (fewer than two groups, ``n <= k``, or all values identical).
    """
    # Rows with a missing group label or value take no part in the test.
    observed = data[[group_col, value_col]].dropna()
    samples = [grp.to_numpy() for _, grp in observed.groupby(group_col)[value_col]]
    n_obs = len(observed)
    n_groups = len(samples)

    h_stat = np.nan
    eta_squared = np.nan
    if n_groups >= 2 and n_obs > n_groups:
        try:
            h_stat = kruskal(*samples).statistic
        except ValueError:
            # scipy raises when all values are identical.
            h_stat = np.nan
        else:
            eta_squared = (h_stat - n_groups + 1) / (n_obs - n_groups)

    return {
        "feature": value_col,
        "n": n_obs,
        "k": n_groups,
        "H": h_stat,
        "effsize": eta_squared,
    }


# Select significant results (BH-adjusted p-value below 0.05)
significant_features = kw_res[kw_res['AdjustedP'] < 0.05].index
significant_df = df_status[["CRC_general_status"] + list(significant_features)]

# Perform Kruskal-Wallis effect size analysis: one eta-squared estimate per
# significant taxon.
kruskal_effsize_results = [
    kruskal_effsize(df_status[['CRC_general_status', feature]], 'CRC_general_status', feature)
    for feature in significant_features
]

# Additional analyses, like Dunn's post hoc tests and saving results to CSV, follow here.

# Remaining steps follow a similar pattern for the QMP dataset.


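# Notebook output from the run above:
#   ModuleNotFoundError: No module named 'skbio'
# The cell stopped at the `skbio` imports at the top of the file, so none of
# the analysis code ran; the `skbio` module is provided by the scikit-bio
# package.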