In [1]:
import numpy as np
import pandas as pd
import pickle
from skbio.stats.distance import DistanceMatrix
from skbio.stats.ordination import pcoa
from scipy.spatial.distance import braycurtis
from sklearn.decomposition import PCA
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns

# Fix NumPy's global random state so repeated runs of this cell give the
# same result for anything that draws from np.random.
np.random.seed(531)

# Unpickle the abundance container from the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open files that come from a trusted source.
with open("QMP_ASV_LCPM_SLV_species", "rb") as pickle_handle:
    QMP_ASV_LCPM_SLV_species = pickle.load(pickle_handle)

# Sample metadata; the first CSV column is used as the row index.
LCMP_metadata_589 = pd.read_csv("LCMP_metadata_589.csv", index_col=0)

# Store the metadata next to the abundance table inside the container.
QMP_ASV_LCPM_SLV_species["metadata"] = LCMP_metadata_589

# Working objects for the analysis below.
# NOTE(review): the code further down iterates otu_table rows as samples and
# masks them with metadata rows, so both are presumably indexed by sample id
# -- confirm against the pickle contents.
otu_table = QMP_ASV_LCPM_SLV_species["otu_table"]
excluded_columns = ["Lab_ID", "Enterotype", "Cell_counts"]
metadata = QMP_ASV_LCPM_SLV_species["metadata"].drop(
    columns=excluded_columns, errors='ignore'
)
significant_results = list()
significant_results = []

# ---------------------------------------------------------------------------
# Univariate distance-based redundancy analysis (dbRDA): test every metadata
# covariate, one at a time, against the Bray-Curtis distances between samples.
#
# Changes relative to the earlier draft of this section:
#   * the "p-value" is now a real permutation p-value (it used to be the
#     proportion of variance of the first PCoA axis, a placeholder);
#   * an adjusted R2 is stored, so the bar plot's "Effect Size (adj R2)"
#     column actually exists;
#   * Benjamini-Hochberg is applied to ALL tested covariates instead of a
#     pre-filtered subset (filtering first invalidates the q-values);
#   * significant covariates are selected as metadata *columns* and samples
#     as OTU-table *rows*;
#   * the Bray-Curtis matrix is computed once and sliced per covariate
#     (pairwise distances do not depend on which other samples are present).
# ---------------------------------------------------------------------------
N_PERMUTATIONS = 999   # permutations per covariate
ALPHA = 0.1            # significance threshold used throughout this section
EFFECT_COL = "Effect Size (adj R2)"


def _design_matrix(values):
    """Return a column-centred float design matrix for one covariate.

    Numeric (and boolean) covariates become a single column; anything else is
    dummy-coded with the first level dropped. ``values`` must not contain
    missing entries.
    """
    if pd.api.types.is_numeric_dtype(values):
        design = values.to_numpy(dtype=float).reshape(-1, 1)
    else:
        design = pd.get_dummies(values, drop_first=True).to_numpy(dtype=float)
    # Centring removes the intercept from the model space.
    return design - design.mean(axis=0, keepdims=True)


def _dbrda_test(dist, design, n_perm):
    """Distance-based RDA of ``dist`` on ``design`` with a permutation test.

    The squared distances are Gower-centred and the share of their trace that
    lies in the column space of ``design`` is taken as R2 (the McArdle &
    Anderson formulation of distance-based regression).

    Parameters
    ----------
    dist : (n, n) ndarray of pairwise distances.
    design : (n, k) ndarray, column-centred covariate matrix.
    n_perm : number of random sample permutations.

    Returns
    -------
    (adjusted R2, permutation p-value), or ``None`` when the covariate cannot
    be tested (no variation, too few samples, or a degenerate distance matrix).
    """
    n_samples = dist.shape[0]
    if design.shape[1] == 0 or n_samples < 3:
        return None

    # Orthonormal basis of the covariate's column space.
    left, singular, _ = np.linalg.svd(design, full_matrices=False)
    if singular[0] <= 0:
        return None  # constant covariate
    rank_tol = singular[0] * max(design.shape) * np.finfo(float).eps
    rank = int(np.sum(singular > rank_tol))
    resid_df = n_samples - rank - 1
    if rank == 0 or resid_df <= 0:
        return None
    basis = left[:, :rank]

    # Gower centring of -0.5 * D^2.
    half_sq = -0.5 * dist ** 2
    gower = (
        half_sq
        - half_sq.mean(axis=0, keepdims=True)
        - half_sq.mean(axis=1, keepdims=True)
        + half_sq.mean()
    )
    total_ss = float(np.trace(gower))
    if not total_ss > 0:  # also rejects NaN (e.g. undefined distances)
        return None

    def _explained(b):
        # trace(B' G B): variation captured by the model space spanned by b.
        return float(np.sum(b * (gower @ b)))

    observed = _explained(basis)
    r2 = observed / total_ss
    adj_r2 = 1.0 - (1.0 - r2) * (n_samples - 1) / resid_df

    # With total SS and model rank fixed, the pseudo-F statistic increases
    # monotonically with the explained SS, so comparing explained SS is
    # equivalent to comparing pseudo-F. Uses the global NumPy RNG so the seed
    # set at the top of the script applies.
    slack = 1e-12 * abs(total_ss)
    n_extreme = sum(
        _explained(basis[np.random.permutation(n_samples)]) >= observed - slack
        for _ in range(n_perm)
    )
    return adj_r2, (n_extreme + 1) / (n_perm + 1)


# Pair every OTU-table row with its metadata row by label; samples without
# metadata become all-NaN rows and are dropped per covariate below.
meta_aligned = metadata.reindex(otu_table.index)

# Bray-Curtis distances between all samples (rows of otu_table).
distmat = DistanceMatrix.from_iterable(otu_table.values, metric=braycurtis)
all_distances = distmat.data

covariate_records = []
for col in meta_aligned.columns:
    present = meta_aligned[col].notna().to_numpy()
    outcome = _dbrda_test(
        all_distances[np.ix_(present, present)],
        _design_matrix(meta_aligned.loc[present, col]),
        N_PERMUTATIONS,
    )
    if outcome is None:
        continue  # covariate could not be tested on the available samples
    adj_r2, pval = outcome
    covariate_records.append(
        {"Covariate": col, EFFECT_COL: adj_r2, "p-value": pval}
    )

# Results table over every tested covariate; the explicit column list keeps
# the frame well-formed even when nothing could be tested.
results_df = pd.DataFrame(
    covariate_records, columns=["Covariate", EFFECT_COL, "p-value"]
)
if results_df.empty:
    results_df["q-value"] = pd.Series(dtype=float)
else:
    results_df["q-value"] = multipletests(
        results_df["p-value"], method="fdr_bh"
    )[1]

# Keep the pre-existing list populated with the covariates that pass the
# unadjusted screen, which is what it held before.
significant_results.extend(
    results_df[results_df["p-value"] < ALPHA].to_dict("records")
)

# Save results to a CSV
results_df.to_csv("n589_RDA_94var_QMP_species_LCPM_SLV.csv", index=False)

# Metadata restricted to the covariates that remain significant after FDR.
sig_covariates = results_df.loc[
    results_df["q-value"] < ALPHA, "Covariate"
].tolist()
sig_metadata = meta_aligned[sig_covariates]

# Run PCA as a substitute for dbRDA
# NOTE(review): PCA here treats each row of the distance matrix as a feature
# vector; this is not the same as a constrained ordination (or a PCoA) of
# the distances -- confirm this is the intended stand-in.
pca = PCA()
ordination_result = pca.fit_transform(distmat.data)

# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x="Covariate", y=EFFECT_COL, data=results_df, palette="Set1")
plt.xticks(rotation=90)
plt.title("Effect Sizes of Covariates")
plt.show()

# Preparation for the PCoA plot: expose the CRC status under the column name
# "Diagnosis".
# NOTE(review): the original note assumes this column holds three categories;
# nothing visible here checks that -- confirm against the metadata file.
diagnosis_labels = metadata['CRC_general_status']
metadata['Diagnosis'] = diagnosis_labels


ModuleNotFoundError: No module named 'skbio'