In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
from skbio import DistanceMatrix
from skbio.stats.composition import clr, multiplicative_replacement
from skbio.stats.ordination import pcoa

# Set a random seed for reproducibility
np.random.seed(531)

# Load the microbiome data object (assuming it's loaded as a DataFrame)
# Replace 'load_data_function' with the actual function to load your data
RMP_ASV_LCPM_SLV_species = load_data_function("RMP_ASV_LCPM_SLV_species.pkl")

# Check and adjust taxonomic names if necessary.
# regex=False makes the "-" -> "." substitution a literal replacement on every
# pandas version.
# NOTE(review): the filtering below treats columns as samples (column sums are
# called sample_sums), so this renames column labels, not row labels -- confirm
# that the taxon names really live on the columns of this table.
RMP_ASV_LCPM_SLV_species.columns = RMP_ASV_LCPM_SLV_species.columns.str.replace(
    "-", ".", regex=False
)

# Create a working copy of the data
Obj = RMP_ASV_LCPM_SLV_species.copy()

# Set filtering parameters
min_reads = 10000   # minimum column (sample) total for a sample to be kept
min_prop = 0.001    # minimum within-sample proportion for a taxon to count as present
cutoff = 0          # a taxon must pass min_prop in more than this many samples

# Convert the species table and apply filtering
matrix = Obj.values

# Column totals of the UNFILTERED table. Kept under this name, one entry per
# original column, so later code can rebuild the "sample kept" mask.
sample_sums = matrix.sum(axis=0)
matrix = matrix[:, sample_sums >= min_reads]

# min_prop is a proportion, so compare it against within-sample proportions
# rather than raw counts (any non-zero count is >= 0.001, which would make the
# filter a no-op). Every retained column sums to at least min_reads, so the
# division is safe as long as min_reads > 0.
proportions = matrix / matrix.sum(axis=0)
matrix = matrix[(proportions >= min_prop).sum(axis=1) > cutoff, :]

# Function to estimate zeros (similar to R's estimate0.min function)
def estimate_zeros(matrix):
    """Replace zero counts with a small per-taxon pseudo-count.

    For every row, the smallest non-zero within-column proportion observed in
    that row is scaled by the total of each column in which the row is zero,
    and the result is written in place of the zero.

    Parameters
    ----------
    matrix : array-like, 2-D
        Count table; columns are normalised by their own sums, so each column
        is treated as one sample and each row as one feature.

    Returns
    -------
    numpy.ndarray
        Float copy of ``matrix`` with zeros replaced. The input is not
        modified. Rows that contain no positive value are left unchanged,
        because there is no minimum proportion to borrow.
    """
    # Work in float so fractional pseudo-counts are not truncated when the
    # input holds integer counts.
    counts = np.asarray(matrix, dtype=float)

    # Column totals come from the matrix actually being processed, not from a
    # module-level variable that may describe a differently-shaped table.
    col_sums = counts.sum(axis=0)

    # Proportionate normalisation; columns that sum to zero stay at zero
    # instead of producing NaN.
    proportions = np.divide(
        counts, col_sums, out=np.zeros_like(counts), where=col_sums != 0
    )

    filled = counts.copy()
    for row in range(counts.shape[0]):
        zero_mask = counts[row, :] == 0
        positive = proportions[row, proportions[row, :] > 0]
        if positive.size == 0 or not zero_mask.any():
            continue
        # Scale the row's minimum proportion by the totals of exactly those
        # columns that are being filled.
        filled[row, zero_mask] = positive.min() * col_sums[zero_mask]
    return filled

# Apply zero estimation and perform CLR transformation
matrix_filled = estimate_zeros(matrix)
# The table is transposed before the skbio calls and transposed back after, so
# matrix_clr has the same orientation as matrix (columns treated as samples).
matrix_clr = clr(multiplicative_replacement(matrix_filled.T)).T  # CLR transformation

# Plot histogram of row sums after CLR transformation
# NOTE(review): sums are taken along axis=1 (per row of matrix_clr); if the
# intent is to check that each sample's CLR values sum to ~0, that would be
# axis=0 -- confirm.
plt.hist(matrix_clr.sum(axis=1))
plt.title("Row Sums after CLR Transformation")
plt.show()

######### Create CLR-transformed phyloseq object #########

# Assuming you have metadata and taxonomy data as DataFrames
# NOTE(review): Obj is a plain DataFrame copy, so `.sample_metadata` only works
# if such an attribute/column exists -- replace with the real metadata source.
sample_metadata = Obj.sample_metadata
sample_metadata["Diagnosis"] = sample_metadata["CRC_general_status"]

# PCoA Ordination
# pcoa() expects a square distance matrix, not a feature table. Euclidean
# distance between samples in CLR space (the Aitchison distance) is computed
# first; matrix_clr is transposed so that each row passed to pdist is a sample.
sample_distances = DistanceMatrix(squareform(pdist(matrix_clr.T, metric="euclidean")))
ordination = pcoa(sample_distances)
# Copy so that adding a column does not mutate the ordination result object.
ordination_df = ordination.samples.copy()

# Samples below min_reads were dropped from the matrix, so drop the same
# samples from the labels before attaching them.
# Assumes metadata rows are in the same order as the table's columns -- TODO confirm.
kept_samples = sample_sums >= min_reads
diagnosis = sample_metadata["Diagnosis"].to_numpy()
if len(diagnosis) == len(kept_samples):
    diagnosis = diagnosis[kept_samples]
if len(diagnosis) != len(ordination_df):
    raise ValueError(
        "Diagnosis labels do not line up with the ordinated samples: "
        f"{len(diagnosis)} labels vs {len(ordination_df)} samples"
    )
ordination_df['Diagnosis'] = diagnosis

# Plot PCoA with Diagnosis as color
sns.set(style="whitegrid")
pcoa_plot = sns.scatterplot(
    x=ordination_df['PC1'], y=ordination_df['PC2'],
    hue=ordination_df['Diagnosis'],
    palette=["#1B9E77", "#D95F02", "#7570B3"]
)
pcoa_plot.legend(loc='upper right')
plt.title("PCoA Ordination with CLR Transformation")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()


ModuleNotFoundError: No module named 'skbio'