Taxa overlap of query database with metagenomics taxa from raw metagenomics database

In [None]:
# === CONFIGURATION ===
# Taxonomic rank used to restrict the comparison (e.g., "species", "genus", etc.).
# Use None to compare every taxon regardless of its rank.
selected_rank = "species"  # ← modify this line only

# === File paths ===
# Location of the query DataFrame CSV file
query_df_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\pipelines\Diamond_alignments\diamond_df_rank_annotated.csv"

# Location of the raw metagenomics taxa CSV file
metagenomics_taxa_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\db_results_analysis\unique_taxa_in_metagendb.csv"

In [None]:
import pandas as pd

# === Load and filter metagenomics taxa ===
# Read the raw metagenomics taxa table. When a rank is selected, keep only the
# rows whose "rank" column equals it, then collect the unique taxon names.
df_meta = pd.read_csv(metagenomics_taxa_csv)
if selected_rank:
    df_meta = df_meta[df_meta["rank"] == selected_rank]
meta_taxa = set(df_meta["taxon_name"])

# === Load and filter DIAMOND-based taxa ===
# Same procedure for the query table: filter on "taxonomy_rank" and collect
# the unique values of the "organism" column.
df_diamond = pd.read_csv(query_df_csv)
if selected_rank:
    df_diamond = df_diamond[df_diamond["taxonomy_rank"] == selected_rank]
diamond_taxa = set(df_diamond["organism"])

# === Compare sets ===
# Names are matched by exact string equality between the two sets.
diamond_in_meta = diamond_taxa & meta_taxa
diamond_not_in_meta = diamond_taxa - meta_taxa

meta_in_diamond = meta_taxa & diamond_taxa
meta_not_in_diamond = meta_taxa - diamond_taxa

# === Percentages ===
# Build each percentage string separately from its label. A conditional
# expression placed after adjacent f-strings applies to the whole concatenated
# string, so an empty set would print a bare "0.00%" with no label.
# The emptiness guard also prevents a division by zero.
diamond_pct = (
    f"{(len(diamond_in_meta) / len(diamond_taxa)) * 100:.2f}%" if diamond_taxa else "0.00%"
)
meta_pct = (
    f"{(len(meta_in_diamond) / len(meta_taxa)) * 100:.2f}%" if meta_taxa else "0.00%"
)

# === Print results ===
print(f"\nDIAMOND DB taxa found in metagenomics raw DB ({selected_rank or 'all ranks'}): {len(diamond_in_meta)}")
print(f"DIAMOND DB taxa NOT found in metagenomics raw DB ({selected_rank or 'all ranks'}): {len(diamond_not_in_meta)}")
print(f"Percentage of DIAMOND DB taxa found: {diamond_pct}")

print(f"\nMetagenomics raw DB taxa found in DIAMOND DB ({selected_rank or 'all ranks'}): {len(meta_in_diamond)}")
print(f"Metagenomics raw DB taxa NOT found in DIAMOND DB ({selected_rank or 'all ranks'}): {len(meta_not_in_diamond)}")
print(f"Percentage of raw metagenomics taxa found in DIAMOND DB: {meta_pct}")



DIAMOND DB taxa found in metagenomics DB (species): 175
DIAMOND DB taxa NOT found in metagenomics DB (species): 92
Percentage of DIAMOND DB taxa found: 65.54%

Metagenomics DB taxa found in DIAMOND DB (species): 175
Metagenomics DB taxa NOT found in DIAMOND DB (species): 9299
Percentage of metagenomics taxa found in DIAMOND DB: 1.85%


Taxa overlap of query database with metagenomics taxa from PSMs

In [11]:
# === CONFIGURATION ===
# Taxonomic rank used to restrict the comparison (e.g., "species", "genus", etc.).
# Use None to compare every taxon regardless of its rank.
selected_rank = "species"  # ← modify this line only

# === File paths ===
# Location of the query DataFrame CSV file
query_df_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\pipelines\Diamond_alignments\diamond_df_rank_annotated.csv"

# Location of the metagenomics PSM taxa CSV file
metagenomics_psm_taxa_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\db_results_analysis\unique_taxa_in_metagenpsmdb.csv"

In [None]:
import pandas as pd

# === Load and filter metagenomics taxa ===
# Read the metagenomics PSM taxa table. When a rank is selected, keep only the
# rows whose "taxon_rank" column equals it, then collect the unique taxon names.
df_meta = pd.read_csv(metagenomics_psm_taxa_csv)
if selected_rank:
    df_meta = df_meta[df_meta["taxon_rank"] == selected_rank]
meta_taxa = set(df_meta["taxon_name"])

# === Load and filter DIAMOND-based taxa ===
# Same procedure for the query table: filter on "taxonomy_rank" and collect
# the unique values of the "organism" column.
df_diamond = pd.read_csv(query_df_csv)
if selected_rank:
    df_diamond = df_diamond[df_diamond["taxonomy_rank"] == selected_rank]
diamond_taxa = set(df_diamond["organism"])

# === Compare sets ===
# Names are matched by exact string equality between the two sets.
diamond_in_meta = diamond_taxa & meta_taxa
diamond_not_in_meta = diamond_taxa - meta_taxa

meta_in_diamond = meta_taxa & diamond_taxa
meta_not_in_diamond = meta_taxa - diamond_taxa

# === Percentages ===
# Build each percentage string separately from its label. A conditional
# expression placed after adjacent f-strings applies to the whole concatenated
# string, so an empty set would print a bare "0.00%" with no label.
# The emptiness guard also prevents a division by zero.
diamond_pct = (
    f"{(len(diamond_in_meta) / len(diamond_taxa)) * 100:.2f}%" if diamond_taxa else "0.00%"
)
meta_pct = (
    f"{(len(meta_in_diamond) / len(meta_taxa)) * 100:.2f}%" if meta_taxa else "0.00%"
)

# === Print results ===
print(f"\nDIAMOND DB taxa found in metagenomics psm DB ({selected_rank or 'all ranks'}): {len(diamond_in_meta)}")
print(f"DIAMOND DB taxa NOT found in metagenomics psm DB ({selected_rank or 'all ranks'}): {len(diamond_not_in_meta)}")
print(f"Percentage of DIAMOND DB taxa found: {diamond_pct}")

print(f"\nMetagenomics psm DB taxa found in DIAMOND DB ({selected_rank or 'all ranks'}): {len(meta_in_diamond)}")
print(f"Metagenomics psm DB taxa NOT found in DIAMOND DB ({selected_rank or 'all ranks'}): {len(meta_not_in_diamond)}")
print(f"Percentage of metagenomics psm taxa found in DIAMOND DB: {meta_pct}")


DIAMOND DB taxa found in metagenomics DB (species): 54
DIAMOND DB taxa NOT found in metagenomics DB (species): 213
Percentage of DIAMOND DB taxa found: 20.22%

Metagenomics DB taxa found in DIAMOND DB (species): 54
Metagenomics DB taxa NOT found in DIAMOND DB (species): 98
Percentage of metagenomics taxa found in DIAMOND DB: 35.53%
