In [1]:
import numpy as np
import pandas as pd
import random
import pickle
from skbio import TreeNode
from skbio.stats.ordination import pcoa
from biom import Table

# Seed the stdlib and NumPy global generators with one shared value so that
# any random draws made later in the run are repeatable.
SEED = 531
random.seed(SEED)
np.random.seed(SEED)

def _read_pickle(path):
    """Return the object unpickled from the file at *path*."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load the three input objects (taxonomy assignments, sequence table, sample ids)
seqtab_all_no_chimeras_tax_SLV138 = _read_pickle("seqtab_all_no_chimeras_tax_SLV138.1")
seqtab_all_no_chimeras = _read_pickle("seqtab_all_no_chimeras")
LCPM_all_ids = _read_pickle("LCPM_all_ids")

# Wrap each loaded object in a pandas DataFrame (taxonomy, counts, sample ids)
tax_df, otu_df, sample_df = (
    pd.DataFrame(raw)
    for raw in (
        seqtab_all_no_chimeras_tax_SLV138,
        seqtab_all_no_chimeras,
        LCPM_all_ids,
    )
)

# Keep only the rows of the count table whose total across columns is positive.
# NOTE(review): axis=1 sums each row, so this assumes taxa are rows and samples
# are columns -- confirm the orientation of the sequence table.
taxa_sums = otu_df.sum(axis=1)
has_counts = taxa_sums > 0
otu_df = otu_df.loc[has_counts]

# Missing taxonomy entries become the literal label "unclassified"
tax_df.fillna("unclassified", inplace=True)

# Every value in each rank column gets a rank-specific prefix (not only the
# unclassified ones), so a label always reveals which rank it came from.
rank_prefixes = {
    'Genus': "g_",
    'Family': "uc_f_",
    'Order': "uc_o_",
    'Class': "uc_c_",
    'Phylum': "uc_p_",
}
for rank, prefix in rank_prefixes.items():
    # Bind the prefix as a default argument so each lambda keeps its own value
    tax_df[rank] = tax_df[rank].apply(lambda x, prefix=prefix: f"{prefix}{x}")

# Walk up the ranks: whenever Genus currently holds an "unclassified"
# placeholder, replace it with the label of the next higher rank. The order
# matters because each step tests the placeholder produced by the previous one.
genus_fallbacks = (
    ("g_unclassified", 'Family'),
    ("uc_f_unclassified", 'Order'),
    ("uc_o_unclassified", 'Class'),
    ("uc_c_unclassified", 'Phylum'),
)
for placeholder, higher_rank in genus_fallbacks:
    tax_df['Genus'] = np.where(
        tax_df['Genus'] == placeholder,
        tax_df[higher_rank],
        tax_df['Genus'],
    )

# Species label is "<Genus>.<Species>", using the (possibly substituted) Genus
genus_with_dot = tax_df['Genus'] + '.'
tax_df['Species'] = genus_with_dot + tax_df['Species']

# Drop rows assigned to chloroplasts (Order), mitochondria (Family), or whose
# Genus label resolved to an unclassified phylum / mitochondria.
# NOTE(review): no other exclusions (e.g. by kingdom) are applied here.
is_chloroplast = tax_df['Order'].isin(['uc_o_Chloroplast'])
is_mitochondria = tax_df['Family'].isin(['uc_f_Mitochondria'])
has_excluded_genus = tax_df['Genus'].isin(['uc_p_unclassified', 'uc_f_Mitochondria'])

rows_to_drop = is_chloroplast | is_mitochondria | has_excluded_genus
filtered_tax_df = tax_df[~rows_to_drop]

# Persist the filtered taxonomy table as a pickle, then report the file name
with open("LCPM_all_names_edited_filtered", "wb") as out_handle:
    pickle.dump(filtered_tax_df, out_handle)

print("Filtered taxonomy data saved as LCPM_all_names_edited_filtered")


ModuleNotFoundError: No module named 'skbio'