# In [1]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import random

# Seed both the standard-library and the NumPy global generators: the
# rarefaction step draws with np.random, which random.seed() does not affect.
random.seed(531)
np.random.seed(531)

# Path to the filtered FASTA file. Only the file name is stored here; the
# sequences themselves are not read into memory.
# Replace with the actual path if the data live elsewhere.
LCPM_seq_filtered = "LCPM_runs_589_seq_new_SLV_nr99_v138.1_filtered.fasta"

# Save the DNA sequences to a FASTA file
def write_dna_sequences(seqs, filename):
    """Write DNA sequences to ``filename`` in FASTA format.

    Records are written in input order with ids ``seq1``, ``seq2``, ... and an
    empty description.

    Args:
        seqs: Iterable of sequence strings (one entry per record).
        filename: Output target handed to ``SeqIO.write``.

    Raises:
        TypeError: If ``seqs`` is a single ``str``/``bytes`` object. Iterating
            it would yield one record per character, which is what happens
            when a file name is passed instead of the sequences.
    """
    if isinstance(seqs, (str, bytes)):
        raise TypeError(
            "seqs must be an iterable of sequence strings, not a single "
            "string (did you pass a file name?)"
        )
    records = [
        SeqRecord(Seq(seq), id=f"seq{number}", description="")
        for number, seq in enumerate(seqs, start=1)
    ]
    SeqIO.write(records, filename, "fasta")

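# Uncomment to save the filtered sequences.
# NOTE: LCPM_seq_filtered above is a file name, not a collection of sequences;
# read the sequences first, e.g.
#   [str(rec.seq) for rec in SeqIO.parse(LCPM_seq_filtered, "fasta")]
# and pass that list instead of the file name.
# write_dna_sequences(LCPM_seq_filtered, "LCPM_runs_589_seq_new_SLV_nr99_v138.1_filtered.fa")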

# The GCN predictions are expected to exist already as a CSV file; they are
# read into `df_cnv` and keyed by the 'label' column.
df_cnv = pd.read_csv(
    "LCPM_runs_589_seq_new_SLV_nr99_v138.1_filtered_GCN.csv"  # Example file path
).set_index('label')

# Microbiome abundance table standing in for Phyl.Object.ASV (illustrative
# data); the first CSV column becomes the index.
Phyl_Object_ASV = pd.read_csv("Phyl_Object_ASV.csv", index_col=0)  # Example file path

# Report the column labels of the abundance table (read as taxa here) that
# have no entry in the CNV table.
taxa_in_cnv = set(df_cnv.index)
taxa_in_data = set(Phyl_Object_ASV.columns)
missing_taxa = taxa_in_data.difference(taxa_in_cnv)
print("Taxa missing from CNV database:", missing_taxa)

# CNV correction
# Phyl_Object_ASV is read as samples (rows) x taxa (columns): that is how the
# missing-taxa check uses its columns and how the metadata index below uses
# its rows. The copy numbers in df_cnv are indexed by taxon label, so the
# table is transposed before the join; joining it untransposed would match
# sample names against taxon labels and leave every copy number NaN.
# NOTE(review): confirm the orientation of Phyl_Object_ASV.csv.
otu_table = Phyl_Object_ASV.copy()
otu_copy = otu_table.T.join(df_cnv[['x']], how='left')  # taxa x (samples + 'x')
# Only 'x' is joined, so no other CNV columns need to be removed afterwards.
# Divide each taxon's counts by its copy number; the result is taxa x samples.
# Taxa absent from df_cnv get x = NaN and therefore NaN corrected counts.
otu_cnv = otu_copy.div(otu_copy['x'], axis=0).drop(columns=['x'])

# Prepare metadata
# NOTE: this placeholder only carries the sample names. The rarefaction step
# selects a 'Cell_counts' column from it, so the real sample data must be
# loaded here for the rest of the script to run.
metadata_QMP = pd.DataFrame(index=otu_table.index)  # Load sample data as needed
# Rarefying function for even sampling depth
def rarefy_even_sampling_depth_opt(cnv_corrected_abundance_table, cell_counts_table, minimum_nr_reads):
    """Rarefy samples to an even sampling depth and scale them to cell counts.

    The sampling depth of a sample is its total read count divided by its
    cell count. Every retained sample is subsampled (reads drawn without
    replacement) to ``cell count * chosen depth`` reads, converted to
    proportions and multiplied by its cell count.

    The chosen depth is the smallest observed depth when that keeps every
    sample above ``minimum_nr_reads``. Otherwise each observed depth is tried
    in ascending order and the one minimising "samples with a lower depth +
    samples whose rarefied size falls below ``minimum_nr_reads``" is used.

    Args:
        cnv_corrected_abundance_table: DataFrame, samples in rows and taxa in
            columns. Values are rounded up to whole reads before sampling.
        cell_counts_table: DataFrame indexed by the same sample names; its
            first column is used as the cell count per sample.
        minimum_nr_reads: Samples whose rarefied size is not strictly larger
            than this value are left out of the result.

    Returns:
        DataFrame with one row per retained sample and one column per taxon of
        the input table (input column order).

    Raises:
        ValueError: If the two tables do not carry the same sample names.

    Note:
        Random draws use the global ``np.random`` state; call
        ``np.random.seed`` beforehand for reproducible output.
    """
    # Explicit raise instead of assert so the check survives `python -O`.
    if sorted(cnv_corrected_abundance_table.index) != sorted(cell_counts_table.index):
        raise ValueError(
            "cnv_corrected_abundance_table and cell_counts_table do not have the same sample names."
        )

    counts = np.ceil(cnv_corrected_abundance_table).astype(int)
    # First column of the cell-count table, reordered to match the abundance rows.
    cell_counts = cell_counts_table.loc[counts.index].iloc[:, 0]

    sample_sizes = counts.sum(axis=1)
    sampling_depths = sample_sizes / cell_counts
    minimum_sampling_depth = sampling_depths.min()

    if (cell_counts * minimum_sampling_depth > minimum_nr_reads).all():
        samples_to_exclude = []
        minimum_sampling_depth_opt = minimum_sampling_depth
    else:
        # Sort once; position k is the depth reached after dropping the k
        # samples with the lowest depths.
        depths_ascending = sorted(sampling_depths)
        lost_in_total = [
            int((cell_counts * depth < minimum_nr_reads).sum()) + n_dropped
            for n_dropped, depth in enumerate(depths_ascending)
        ]
        nr_samples_to_exclude = int(np.argmin(lost_in_total))
        minimum_sampling_depth_opt = depths_ascending[nr_samples_to_exclude]
        samples_to_exclude = sampling_depths[sampling_depths < minimum_sampling_depth_opt].index.tolist()

    rarefy_to_opt = np.round(cell_counts * minimum_sampling_depth_opt).astype(int)

    taxa = counts.columns
    taxon_positions = np.arange(len(taxa))
    excluded = set(samples_to_exclude)

    rarefied_rows = []
    samples_included = []
    for sample, row in counts.iterrows():
        target = int(rarefy_to_opt.loc[sample])
        if sample in excluded or target <= minimum_nr_reads:
            continue
        print(f"Sample {sample} rarefied to {target} reads.")
        # Expand the row into one entry per read so that reads, not taxon
        # labels, are drawn without replacement.
        reads = np.repeat(taxon_positions, row.to_numpy())
        drawn = np.random.choice(reads, size=target, replace=False)
        rarefied_rows.append(np.bincount(drawn, minlength=len(taxa)))
        samples_included.append(sample)

    print("Optimal sampling depth:", minimum_sampling_depth_opt)
    print(f"{len(samples_to_exclude)} samples excluded due to exclusion:", samples_to_exclude)

    # Index by sample name so the multiplication below aligns with cell_counts.
    rarefied_matrix_df = pd.DataFrame(
        np.array(rarefied_rows, dtype=float).reshape(len(samples_included), len(taxa)),
        index=samples_included,
        columns=taxa,
    )
    normalised_rarefied_matrix = rarefied_matrix_df.div(rarefied_matrix_df.sum(axis=1), axis=0)
    QMP = normalised_rarefied_matrix.multiply(cell_counts.loc[samples_included], axis=0)
    return QMP

# Inputs for the rarefaction step: the CNV-corrected table is transposed
# before being passed on, the cell counts come from the 'Cell_counts' metadata
# column, and the read threshold is 500 divided by the mean of column 'x'.
cnv_corrected_abundance_table = otu_cnv.transpose()
cell_counts_table = metadata_QMP.loc[:, ['Cell_counts']]
minimum_nr_reads = 500 / otu_copy['x'].mean()

QMP = rarefy_even_sampling_depth_opt(
    cnv_corrected_abundance_table=cnv_corrected_abundance_table,
    cell_counts_table=cell_counts_table,
    minimum_nr_reads=minimum_nr_reads,
)

# Bundle the results in a plain dictionary that mimics a phyloseq object
QMP_ASV_LCPM_SLV = dict(
    otu_table=QMP,
    sample_data=metadata_QMP,
    tax_table=Phyl_Object_ASV.columns,  # Modify as necessary
    refseq=LCPM_seq_filtered,  # Placeholder for DNA sequences
)

# Save QMP_ASV_LCPM_SLV as needed. It is a dict and has no .to_pickle method,
# so use the module-level helper instead:
# pd.to_pickle(QMP_ASV_LCPM_SLV, "QMP_ASV_LCPM_SLV.pkl")


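# Output of the cell above:
#   ModuleNotFoundError: No module named 'Bio'
# The `Bio` module is provided by the Biopython package (`pip install biopython`).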