In [1]:
from pathlib import Path

# Derive the project root from the current working directory (one level up).
# Pick a different index into `.parents` if the notebook sits deeper in the tree.
cwd_ancestors = Path.cwd().parents
base_path = cwd_ancestors[0]
print("Base path of the project:", base_path)

Base path of the project: c:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db


In [None]:
# This code is more robust and will work regardless of the current working directory.
# Make sure that base_path is defined correctly.
# Ensure the required packages are installed from the requirements.txt file
!pip3 install -r "{base_path}/requirements.txt"

In [None]:
# Paste the path to the DIAMOND executable between the quotes below.
# The raw-string prefix (r"...") keeps Windows backslashes from being read as escapes.
diamond_exe = r"diamond_executable_path"

# Paste the path to the input FASTA file that you want to cluster between the quotes below.
input_fasta = r"input_fasta_path"

In [None]:
import subprocess
import pandas as pd
from pathlib import Path
from Bio import SeqIO

# === 1. Config ===
# `diamond_exe` and `input_fasta` must already be defined (path strings or Path objects).
diamond_exe = Path(diamond_exe)  # adjust path if needed
input_fasta = Path(input_fasta)  # Convert to Path object for consistency
if not input_fasta.is_file():
    # Fail early with a clear message instead of a less obvious DIAMOND error.
    raise FileNotFoundError(f"Input FASTA not found: {input_fasta}")

# All outputs are written next to the input FASTA.
output_dir = input_fasta.parent
diamond_db = output_dir / "diamond_db.dmnd"

identity_threshold = 0.95  # Set your sequence identity cutoff (0.0 to 1.0)
if not 0.0 <= identity_threshold <= 1.0:
    raise ValueError(f"identity_threshold must be between 0.0 and 1.0, got {identity_threshold}")
# DIAMOND's --approx-id expects a percentage (e.g. 95), not a fraction (0.95).
approx_id_percent = f"{identity_threshold * 100:g}"
memory_limit = "64G"  # Value passed to DIAMOND's -M option

cluster_output = output_dir / f"diamond_clust{identity_threshold}_output.tsv"
clustered_fasta = output_dir / f"diamond_clust{identity_threshold}_output.fasta"


def _run_diamond(cmd, step_name):
    """Run a DIAMOND command; print stderr and raise RuntimeError on a non-zero exit code."""
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode != 0:
        print(completed.stderr)
        raise RuntimeError(f"DIAMOND {step_name} failed.")
    return completed


# === 2. Make DIAMOND database ===
cmd_db = [str(diamond_exe), "makedb", "--in", str(input_fasta), "-d", str(diamond_db)]
print("Creating DIAMOND database...")
result = _run_diamond(cmd_db, "makedb")


# === 3. Run DIAMOND clustering ===
cmd_cluster = [
    str(diamond_exe), "linclust",
    "-d", str(diamond_db),
    "-o", str(cluster_output),
    "--approx-id", approx_id_percent,
    "-M", memory_limit,
]

print(f"Running DIAMOND clustering...\n{' '.join(cmd_cluster)}")
result = _run_diamond(cmd_cluster, "clustering")
print("Clustering complete.")

# === 4. Parse clustering output ===
# The output is read as two tab-separated columns: cluster label, member ID.
# Force string dtype and disable NA parsing so IDs such as "12345" or "NA"
# stay identical to the string IDs produced by SeqIO below.
df = pd.read_csv(
    cluster_output,
    sep="\t",
    header=None,
    names=["cluster", "member"],
    dtype=str,
    keep_default_na=False,
)
cluster_map = df.groupby("cluster")["member"].apply(list).to_dict()

# === 5. Load original sequences ===
records_dict = {rec.id: rec for rec in SeqIO.parse(input_fasta, "fasta")}

# === 6. Longest member becomes representative ===
rep_records = []
missing_member_count = 0  # clustered IDs that could not be found in the input FASTA
for cluster_id, members in cluster_map.items():
    valid_members = [records_dict[m] for m in members if m in records_dict]
    missing_member_count += len(members) - len(valid_members)
    if not valid_members:
        continue

    # The representative is the longest sequence of the cluster, not DIAMOND's own label.
    rep_seq = max(valid_members, key=lambda r: len(r.seq))
    rep_id = rep_seq.id
    size = len(valid_members)
    member_ids = [m.id for m in valid_members]

    # Append the cluster annotations to the representative's original header.
    original_desc = rep_seq.description
    new_desc = f"{original_desc} | Cluster={rep_id} | Members={','.join(member_ids)} | Size={size}"
    rep_seq.description = new_desc
    rep_seq.name = rep_id
    rep_seq.id = rep_id
    rep_records.append(rep_seq)

if missing_member_count:
    print(f"Warning: {missing_member_count} clustered ID(s) were not found in '{input_fasta.name}' and were skipped.")

# === 7. Save clustered FASTA ===
SeqIO.write(rep_records, clustered_fasta, "fasta")
print(f"Final clustered FASTA written to:\n{clustered_fasta}")

In [None]:
def count_proteins_in_fasta(fasta_path):
    """Return the number of FASTA header lines (lines starting with '>') in a file.

    The total is also printed together with the file path.
    """
    with open(fasta_path, "r") as handle:
        n_headers = sum(1 for row in handle if row.startswith(">"))
    print(f"Total proteins in '{fasta_path}': {n_headers}")
    return n_headers

# === Example usage ===
# Compare the number of entries before and after clustering.
clustered_path = clustered_fasta
original_path  = input_fasta

# Run both
original_count = count_proteins_in_fasta(original_path)
clustered_count = count_proteins_in_fasta(clustered_path)

# Difference
reduction = original_count - clustered_count
if original_count:
    print(f"Proteins clustered away: {reduction} ({(reduction/original_count)*100:.2f}%)")
else:
    # Avoid a ZeroDivisionError when the input FASTA contains no records.
    print("Proteins clustered away: 0 (input FASTA contains no sequences)")

In [None]:
from Bio import SeqIO
from pathlib import Path

# === 1. Input file paths ===
clustered_fasta_path = Path(clustered_fasta)
unclustered_fasta_path = Path(input_fasta)
output_fasta_path = clustered_fasta_path.with_name(clustered_fasta_path.stem + "_with_metadata.fasta")

# Text that introduces the cluster annotations in a clustered FASTA header.
cluster_marker = "| Cluster="

# === 2. Load unclustered headers (metadata) by protein ID ===
# Each header is stored under the full record ID and, for pipe-separated IDs such
# as tr|A0A679HT45|..., also under the accession (A0A679HT45). The clustered file
# can then be matched whichever of the two forms it uses as record ID.
id_to_metadata = {}
for rec in SeqIO.parse(unclustered_fasta_path, "fasta"):
    parts = rec.id.split("|")
    clean_id = parts[1] if len(parts) > 2 else rec.id
    # An accession key never overrides an entry stored under a full record ID.
    id_to_metadata.setdefault(clean_id, rec.description)
    id_to_metadata[rec.id] = rec.description

# === 3. Update clustered FASTA headers with metadata ===
updated_records = []
unmatched_count = 0  # clustered records without a header in the unclustered file
for rec in SeqIO.parse(clustered_fasta_path, "fasta"):
    cluster_id = rec.id

    # Everything in the clustered header after the record ID
    remainder = rec.description[len(cluster_id):].strip()

    # Get metadata-rich header from original file
    metadata_header = id_to_metadata.get(cluster_id)
    if metadata_header is None:
        # No match: keep the clustered header exactly as it is.
        unmatched_count += 1
        metadata_header = cluster_id
        cluster_info = remainder
    else:
        marker_at = remainder.find(cluster_marker)
        if marker_at != -1:
            # Keep only the cluster annotations; the text before the marker is the
            # original description, which metadata_header already provides.
            cluster_info = remainder[marker_at:]
        elif metadata_header.endswith(remainder):
            # The clustered header adds nothing beyond the metadata header.
            cluster_info = ""
        else:
            cluster_info = remainder

    # Combine metadata + cluster info
    rec.description = f"{metadata_header} {cluster_info}".strip()
    rec.name = cluster_id
    rec.id = cluster_id
    updated_records.append(rec)

if unmatched_count:
    print(f"Warning: {unmatched_count} clustered record(s) had no matching header in '{unclustered_fasta_path.name}'.")

# === 4. Write updated FASTA ===
SeqIO.write(updated_records, output_fasta_path, "fasta")
print(f"Final clustered FASTA written with full metadata:\n{output_fasta_path}")
print(f"Total sequences processed: {len(updated_records)}")