In [1]:
from pathlib import Path

# Automatically get the base path of your project
base_path = Path.cwd().parents[0]  # adjust .parents[0] if needed
print("Base path of the project:", base_path)

Base path of the project: c:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db


In [None]:
# This code is more robust and will work regardless of the current working directory.
# Make sure that base_path is defined correctly.
# Ensure the required packages are installed from the requirements.txt file
#!pip3 install -r "{base_path}/requirements.txt"

### Step 13: Fetch Proteomes for Organisms Using a Ranked Strategy

This cell initiates the *proteome-based strategy* to collect protein data for custom database creation. Instead of directly retrieving proteins from UniProtKB, this approach:
- Queries **whole proteomes** from UniProt based on the identified organisms.
- **Weights proteome downloads** by the number of peptide hits per organism (top 5 get more proteomes, lower-ranking ones get fewer).
- Ensures that high-confidence organisms are well-represented while maintaining a manageable dataset size.

This strategy allows the construction of a biologically realistic and computationally efficient protein database enriched with full proteomes of organisms most likely present in the sample.


In [None]:
import pandas as pd
import os
import requests
from datetime import datetime
from tqdm import tqdm  # For progress bar

# === 1. Setup output path ===

# Create a timestamped output folder for this DIAMOND session
session_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"Diamond_alignments/session_diamond_align_{session_time}"
os.makedirs(output_dir, exist_ok=True)

# Define path for output CSV file containing matched proteomes
proteome_csv_path = os.path.join(output_dir, "diamond_proteome_matches_all.csv")

# === 2. Proteome limits per organism ranking group ===

# More peptide hits = fetch more proteomes
limits = {
    "top_5": 100,         # Organisms ranked 1–5
    "rank_6_10": 50,      # Organisms ranked 6–10
    "rank_11_20": 20,     # Organisms ranked 11–20
    "rank_21_50": 5,      # Organisms ranked 21–50
    "rank_51_plus": 1     # All others
}

# === 3. Load DIAMOND alignment results + count peptide hits ===

# Load previously annotated file (contains organism names and taxonomic ranks)
df = pd.read_csv("Diamond_alignments/diamond_df_rank_annotated_fixed.csv")

# Count how often each organism appears (i.e., number of peptide hits)
organism_counts = df["organism"].value_counts()

# Create an ordered list of organisms based on abundance
ranked_organisms = organism_counts.index.tolist()

# Create a lookup: organism name → its assigned taxonomy rank
rank_lookup = df.drop_duplicates("organism").set_index("organism")["taxonomy_rank"].to_dict()

# === 4. Plan how many proteomes to fetch per organism ===

fetch_plan = []
for i, org in enumerate(ranked_organisms):
    if i < 5:
        n = limits["top_5"]
    elif i < 10:
        n = limits["rank_6_10"]
    elif i < 20:
        n = limits["rank_11_20"]
    elif i < 50:
        n = limits["rank_21_50"]
    else:
        n = limits["rank_51_plus"]
    
    # Add to fetch plan: (ranking index, organism name, taxonomic rank, proteomes to fetch)
    fetch_plan.append((i + 1, org, rank_lookup.get(org, "unknown"), n))

# === 5. Function to fetch top proteomes for a given organism ===

def fetch_proteomes(organism, rank, n):
    """
    Queries UniProt's REST API to fetch up to `n` proteomes for a given organism.
    Prioritizes 'Reference' proteomes, then sorts by BUSCO completeness.
    """
    base_url = "https://rest.uniprot.org/proteomes/search"
    
    try:
        # Query UniProt API with organism name
        r = requests.get(base_url, params={"query": organism, "format": "json", "size": 500})
        r.raise_for_status()
        results = r.json().get("results", [])
    except Exception:
        return []  # On error or no results, return empty list

    # Prioritize reference proteomes first
    reference = [r for r in results if r.get("proteomeType") == "Reference"]
    if reference:
        results = reference + [r for r in results if r not in reference]

    # Sort by completeness using BUSCO scores
    results = sorted(results, key=lambda r: r.get("busco", {}).get("complete", 0), reverse=True)

    # Return top `n` proteomes (structured)
    return [
        (
            organism,
            r.get("id"),
            r.get("proteomeType"),
            r.get("taxonomy", {}).get("rank", "unknown"),
            "name"  # Placeholder; you can optionally include r.get("name")
        )
        for r in results[:n]
    ]

# === 6. Run fetch plan and collect results ===

results = []      # Holds all proteomes fetched
not_found = []    # Track organisms for which no proteomes were found

print("\nFetching proteomes based on rank...")

# Iterate through fetch plan and retrieve proteomes
for rank_idx, org, taxrank, limit in tqdm(fetch_plan):
    hits = fetch_proteomes(org, taxrank, limit)
    if hits:
        results.extend(hits)
    else:
        not_found.append(org)

# === 7. Save matched proteomes to CSV ===

df_out = pd.DataFrame(results, columns=["Organism", "Proteome ID", "Proteome Type", "Tax Rank", "Used Query"])
df_out.to_csv(proteome_csv_path, index=False)

# === 8. Summary logging ===

print("\n=== SUMMARY ===")
print(f"Proteomes matched for {len(set([r[0] for r in results]))} / {len(ranked_organisms)} organisms")
print(f"No proteomes found for: {len(not_found)}")
print(f"Output saved to: {proteome_csv_path}")



🚀 Fetching proteomes based on rank...


100%|██████████| 278/278 [00:29<00:00,  9.58it/s]


📊 === SUMMARY ===
✅ Proteomes matched for 261 / 278 organisms
❌ No proteomes found for: 17
📁 Output saved to: Diamond_alignments/session_diamond_align_2025-06-02_17-23-44\diamond_proteome_matches_all.csv





### Step 14: Download and Merge Protein FASTA Files from Matched Proteomes

After selecting organisms based on DIAMOND peptide hits and fetching their matching UniProt proteomes, this cell automates the download of **protein sequences** from each matched proteome.

Key features:
- Uses UniProt’s `/uniprotkb/stream` API to fetch FASTA data.
- Parallelizes requests using `ThreadPoolExecutor` to accelerate download.
- Collects and merges all results into a single multi-entry FASTA file.
- Skips failed downloads and prints a list of any failed proteomes.

This final merged FASTA file will be used to construct a custom database for downstream taxonomic or functional annotation.


In [None]:
import os
import pandas as pd
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# === 1. Config ===

# Directory where previous proteome matching results were saved
output_dir = os.path.dirname(proteome_csv_path)

# Output path for the combined FASTA containing all downloaded proteins
output_fasta = os.path.join(output_dir, "diamond_proteins_from_proteomes.fasta")

# Determine a reasonable number of threads to use (does not overload CPU because downloads are I/O-bound)
num_threads = os.cpu_count()

# === 2. Load proteome IDs ===

# Read CSV file created from matched proteomes
df = pd.read_csv(proteome_csv_path)

# Extract unique proteome IDs (to avoid redundant downloads)
proteome_ids = df["Proteome ID"].dropna().unique()

# === 3. Define the FASTA download function ===

def fetch_fasta(proteome_id):
    """
    Fetches all protein sequences in FASTA format from a given UniProt proteome ID.
    Returns (proteome_id, fasta_content) on success, (proteome_id, None) on failure.
    """
    url = f"https://rest.uniprot.org/uniprotkb/stream?query=proteome:{proteome_id}&format=fasta"
    try:
        response = requests.get(url, timeout=60)
        if response.ok:
            return proteome_id, response.text.strip()
    except:
        pass
    return proteome_id, None

# === 4. Parallel downloading of proteomes ===

all_entries = []      # To store all downloaded protein FASTA entries
failed_ids = []       # To track proteomes that could not be downloaded

print(f"\nDownloading {len(proteome_ids)} proteomes using {num_threads} threads...\n")

# Use ThreadPoolExecutor to parallelize requests for better performance
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit all proteome download tasks at once
    futures = {executor.submit(fetch_fasta, pid): pid for pid in proteome_ids}

    # Process results as they complete
    for future in tqdm(as_completed(futures), total=len(futures)):
        pid, result = future.result()
        if result:
            # Split result into FASTA entries (entries start with ">")
            entries = result.split("\n>")
            for entry in entries:
                # Ensure proper formatting (each entry must start with ">")
                if not entry.startswith(">"):
                    entry = ">" + entry
                all_entries.append(entry)
        else:
            failed_ids.append(pid)

# === 5. Write all protein sequences to one combined FASTA file ===

with open(output_fasta, "w") as f_out:
    f_out.write("\n".join(all_entries))

# === 6. Summary output ===

print("\nDownload complete.")
print(f"Total protein sequences written: {len(all_entries)}")
print(f"Output FASTA saved to:\n{output_fasta}")

if failed_ids:
    print(f"{len(failed_ids)} proteomes failed to download.")
    print("Failed Proteome IDs:", failed_ids)


🚀 Downloading FASTA files for 923 proteomes using 8 threads...



100%|██████████| 923/923 [24:11<00:00,  1.57s/it]



📦 Download complete.
✅ Total sequences written: 2755606
❌ Proteomes failed to download: 196
Failed IDs: ['UP000257281', 'UP000260238', 'UP000606398', 'UP000594619', 'UP000618573', 'UP000608564', 'UP000640770', 'UP000641310', 'UP000672553', 'UP000636347', 'UP000619799', 'UP000672815', 'UP000673606', 'UP000623454', 'UP000673624', 'UP000674615', 'UP000636420', 'UP000675066', 'UP000651759', 'UP000672736', 'UP000673195', 'UP000673466', 'UP000673536', 'UP000717363', 'UP000767385', 'UP000778227', 'UP000807157', 'UP001057786', 'UP000674281', 'UP001057842', 'UP000709408', 'UP001057854', 'UP000710986', 'UP001057884', 'UP000770306', 'UP000772915', 'UP000811667', 'UP000826969', 'UP001057836', 'UP001159746', 'UP001057852', 'UP001059466', 'UP001059564', 'UP001159151', 'UP001159715', 'UP001159720', 'UP001159738', 'UP001159756', 'UP001159771', 'UP001159801', 'UP001161004', 'UP000316179', 'UP000601385', 'UP000596954', 'UP000602056', 'UP000620433', 'UP000631491', 'UP000643674', 'UP000663967', 'UP000666