Taxa overlap of the query database with metagenomics taxa from the raw metagenomics database

In [3]:
# === File paths ===
# Path of the raw metagenomics taxa CSV file.
# The analysis cell reads its "rank" and "taxon_name" columns.
# NOTE: machine-specific absolute path; adjust it before running elsewhere.
metagenomics_taxa_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\db_results_analysis\unique_taxa_in_metagendb.csv"

# Path of the query DataFrame CSV file (DIAMOND hits, judging by the file name).
# The analysis cell reads its "taxonomy_rank" and "organism" columns.
# NOTE: machine-specific absolute path; adjust it before running elsewhere.
query_df_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\pipelines\Diamond_alignments\diamond_df_rank_annotated.csv"

# Human-readable label for the query DataFrame; it only appears in the
# printed summary lines.
query_db_label = "Diamond hits dataframe"

# === CONFIGURATION ===
# Taxonomic rank to compare on (e.g., "species", "genus", etc.).
# Both input tables are filtered to rows whose rank column equals this value.
# Set to None to compare all taxa regardless of rank.
selected_rank = "species"  # <- the only value to change in this section

In [7]:
import pandas as pd

# Compare the taxa named in the query DataFrame with the taxa listed for the
# raw metagenomics database and print overlap counts in both directions.
# Inputs (set in the configuration cell): metagenomics_taxa_csv, query_df_csv,
# query_db_label, selected_rank.

# === Load and filter metagenomics taxa ===
df_meta = pd.read_csv(metagenomics_taxa_csv)
if selected_rank:
    df_meta = df_meta[df_meta["rank"] == selected_rank]
# Drop missing names so NaN is never counted as a taxon (or as a shared one).
meta_taxa = set(df_meta["taxon_name"].dropna())

# === Load and filter DIAMOND-based taxa ===
df_query = pd.read_csv(query_df_csv)
if selected_rank:
    df_query = df_query[df_query["taxonomy_rank"] == selected_rank]
query_taxa = set(df_query["organism"].dropna())

# === Compare sets ===
query_in_meta = query_taxa & meta_taxa
query_not_in_meta = query_taxa - meta_taxa

meta_in_query = meta_taxa & query_taxa
meta_not_in_query = meta_taxa - query_taxa

# === Summary values ===
# Text used in the headers/brackets; avoids printing "None" when no rank is selected.
rank_prefix = f"{selected_rank} " if selected_rank else ""
rank_label = selected_rank or "all ranks"

# Percentages are computed up front, guarded against empty sets. Putting the
# conditional inside print() next to adjacent string literals makes the
# `else` branch replace the whole message, dropping the label.
pct_query_in_meta = (len(query_in_meta) / len(query_taxa)) * 100 if query_taxa else 0.0
pct_meta_in_query = (len(meta_in_query) / len(meta_taxa)) * 100 if meta_taxa else 0.0

# === Print results ===
print(f"Total unique {rank_prefix}taxa in {query_db_label}: {len(query_taxa)}")
print(f"{query_db_label} taxa found in metagenomics raw DB ({rank_label}): {len(query_in_meta)}")
print(f"{query_db_label} taxa NOT found in metagenomics raw DB ({rank_label}): {len(query_not_in_meta)}")
print(f"Percentage of {query_db_label} taxa found: {pct_query_in_meta:.2f}%")

print(f"\nTotal unique {rank_prefix}taxa in metagenomics: {len(meta_taxa)}")
print(f"Metagenomics raw DB taxa found in {query_db_label} ({rank_label}): {len(meta_in_query)}")
print(f"Metagenomics raw DB taxa NOT found in {query_db_label} ({rank_label}): {len(meta_not_in_query)}")
print(f"Percentage of raw metagenomics taxa found in {query_db_label}: {pct_meta_in_query:.2f}%")


Total unique species taxa in Diamond hits dataframe: 267
Diamond hits dataframe taxa found in metagenomics raw DB (species): 175
Diamond hits dataframe taxa NOT found in metagenomics raw DB (species): 92
Percentage of Diamond hits dataframe taxa found: 65.54%

Total unique species taxa in metagenomics: 9474
Metagenomics raw DB taxa found in Diamond hits dataframe (species): 175
Metagenomics raw DB taxa NOT found in Diamond hits dataframe (species): 9299
Percentage of raw metagenomics taxa found in Diamond hits dataframe: 1.85%


Taxa overlap of the query database with metagenomics taxa from PSMs

In [12]:
# === File paths ===
# Path of the metagenomics PSM taxa CSV file.
# The analysis cell reads its "taxon_rank" and "taxon_name" columns.
# NOTE: machine-specific absolute path; adjust it before running elsewhere.
metagenomics_psm_taxa_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\db_results_analysis\unique_taxa_in_metagenpsmdb.csv"

# Path of the query DataFrame CSV file (DIAMOND hits, judging by the file name).
# The analysis cell reads its "taxonomy_rank" and "organism" columns.
# This re-assigns the same path that the earlier configuration cell already set.
query_df_csv = r"C:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db\pipelines\Diamond_alignments\diamond_df_rank_annotated.csv"

# Human-readable label for the query DataFrame; it only appears in the
# printed summary lines. It overwrites the label set in the earlier cell.
query_db_label = "Diamond hits"

# === CONFIGURATION ===
# Taxonomic rank to compare on (e.g., "species", "genus", etc.).
# Both input tables are filtered to rows whose rank column equals this value.
# Set to None to compare all taxa regardless of rank.
selected_rank = "species"  # <- the only value to change in this section

In [13]:
import pandas as pd

# Compare the taxa named in the query DataFrame with the taxa listed for the
# metagenomics PSM table and print overlap counts in both directions.
# Inputs (set in the configuration cell): metagenomics_psm_taxa_csv,
# query_df_csv, query_db_label, selected_rank.

# === Load and filter metagenomics taxa ===
df_meta = pd.read_csv(metagenomics_psm_taxa_csv)
if selected_rank:
    df_meta = df_meta[df_meta["taxon_rank"] == selected_rank]
# Drop missing names so NaN is never counted as a taxon (or as a shared one).
meta_taxa = set(df_meta["taxon_name"].dropna())

# === Load and filter DIAMOND-based taxa ===
df_query = pd.read_csv(query_df_csv)
if selected_rank:
    df_query = df_query[df_query["taxonomy_rank"] == selected_rank]
query_taxa = set(df_query["organism"].dropna())

# === Compare sets ===
query_in_meta = query_taxa & meta_taxa
query_not_in_meta = query_taxa - meta_taxa

meta_in_query = meta_taxa & query_taxa
meta_not_in_query = meta_taxa - query_taxa

# === Summary values ===
# Text used in the headers/brackets; avoids printing "None" when no rank is selected.
rank_prefix = f"{selected_rank} " if selected_rank else ""
rank_label = selected_rank or "all ranks"

# Percentages are computed up front, guarded against empty sets. Putting the
# conditional inside print() next to adjacent string literals makes the
# `else` branch replace the whole message, dropping the label.
pct_query_in_meta = (len(query_in_meta) / len(query_taxa)) * 100 if query_taxa else 0.0
pct_meta_in_query = (len(meta_in_query) / len(meta_taxa)) * 100 if meta_taxa else 0.0

# === Print results ===
print(f"Total unique {rank_prefix}taxa in {query_db_label}: {len(query_taxa)}")
print(f"{query_db_label} taxa found in metagenomics psm DB ({rank_label}): {len(query_in_meta)}")
print(f"{query_db_label} taxa NOT found in metagenomics psm DB ({rank_label}): {len(query_not_in_meta)}")
print(f"Percentage of {query_db_label} taxa found: {pct_query_in_meta:.2f}%")

print(f"\nTotal unique {rank_prefix}taxa in metagenomics psm: {len(meta_taxa)}")
print(f"Metagenomics psm DB taxa found in {query_db_label} ({rank_label}): {len(meta_in_query)}")
print(f"Metagenomics psm DB taxa NOT found in {query_db_label} ({rank_label}): {len(meta_not_in_query)}")
print(f"Percentage of metagenomics psm taxa found in {query_db_label}: {pct_meta_in_query:.2f}%")

Total unique species taxa in Diamond hits: 267
Diamond hits taxa found in metagenomics psm DB (species): 54
Diamond hits taxa NOT found in metagenomics psm DB (species): 213
Percentage of Diamond hits taxa found: 20.22%

Total unique species taxa in metagenomics psm: 152
Metagenomics psm DB taxa found in Diamond hits (species): 54
Metagenomics psm DB taxa NOT found in Diamond hits (species): 98
Percentage of metagenomics psm taxa found in Diamond hits: 35.53%
