In [1]:
from pathlib import Path

# Project root: the directory one level above the current working directory.
# Change the index given to .parents if the notebook lives deeper in the tree.
base_path = Path.cwd().parents[0]
print("Base path of the project:", base_path)

Base path of the project: c:\Users\Yusuf\OneDrive\LST\Derde_jaar\Y3Q4\Metaproteomics_with_db


In [None]:
# This code is more robust and will work regardless of the current working directory.
# Make sure that base_path is defined correctly.
# Ensure the required packages are installed from the requirements.txt file
!pip3 install -r "{base_path}/requirements.txt"

In [None]:
# Windows path of the FASTA file to cluster: paste it between the quotes.
# The raw-string prefix (r"...") lets backslashes be pasted unchanged.
input_fasta_win = r"input_sequences.fasta"

In [None]:
import os
import subprocess
from Bio import SeqIO
import pandas as pd

# === 1. Input and Output paths ===
def _windows_to_wsl_path(win_path):
    """Translate a Windows path into the form used inside WSL.

    Backslashes become forward slashes and a leading drive prefix such as
    ``C:`` becomes ``/mnt/c`` (only the drive letter itself is lower-cased,
    the rest of the path keeps its case). A path without a drive prefix is
    returned with just the slashes converted, so relative paths stay relative.
    """
    unix_style = win_path.replace("\\", "/")
    has_drive = len(unix_style) >= 2 and unix_style[0].isalpha() and unix_style[1] == ":"
    if not has_drive:
        return unix_style
    drive_letter = unix_style[0].lower()
    remainder = unix_style[2:].lstrip("/")
    return f"/mnt/{drive_letter}/{remainder}"


input_fasta_wsl = _windows_to_wsl_path(input_fasta_win)
identity_threshold = 50  # Choose the cluster identity threshold in % (0-100)
if not 0 <= identity_threshold <= 100:
    raise ValueError(f"identity_threshold must be between 0 and 100, got {identity_threshold}")

# Build the output name from the input's stem and extension. A plain
# str.replace(".fasta", ...) leaves the name unchanged for other extensions
# (e.g. ".fa"), which would make the final write overwrite the input file.
input_stem, input_ext = os.path.splitext(input_fasta_win)
output_fasta_win = f"{input_stem}_clustered{identity_threshold}{input_ext or '.fasta'}"
output_fasta_wsl = _windows_to_wsl_path(output_fasta_win)

# === 2. MMseqs2 clustering steps ===
# All intermediate files (db, clust, tmp, rep, clusters.tsv, clustered_raw.fasta) are
# given as relative names, i.e. they are created in the current working directory.
# The FASTA path is wrapped in double quotes so a path containing spaces stays one argument.
commands = [
    f'wsl mmseqs createdb "{input_fasta_wsl}" db',
    # --min-seq-id expects a fraction, hence the percentage is divided by 100.
    f"wsl mmseqs cluster db clust tmp --min-seq-id {identity_threshold/100} -c 0.8",
    "wsl mmseqs createtsv db db clust clusters.tsv",
    "wsl mmseqs createsubdb clust db rep",
    "wsl mmseqs result2flat db db rep clustered_raw.fasta",
]

def run(cmd):
    """Run one command through the system shell and fail loudly on error.

    Args:
        cmd: The command line to execute (passed to the shell as a single string).

    Raises:
        RuntimeError: If the command exits with a non-zero status. The captured
            stdout and stderr are printed first; the exception message names the
            command and its exit code so the failing step can be identified.
    """
    print(f"Running: {cmd}")
    result = subprocess.run(cmd, shell=True, text=True, capture_output=True)
    if result.returncode != 0:
        print(result.stdout)
        print(result.stderr)
        raise RuntimeError(f"Command failed (exit code {result.returncode}): {cmd}")

for cmd in commands:
    run(cmd)

# === 3. Enrich headers ===
# MMseqs2's createtsv writes the cluster representative in the FIRST column and the
# cluster member in the SECOND, so the columns are named in that order.
# dtype=str / keep_default_na=False keep every identifier as plain text (numeric IDs
# or IDs such as "NA" would otherwise be converted and break the lookups and the join).
clusters_df = pd.read_csv(
    "clusters.tsv",
    sep="\t",
    header=None,
    names=["representative", "member"],
    dtype=str,
    keep_default_na=False,
)
# representative ID -> list of all member IDs in that cluster
cluster_map = clusters_df.groupby("representative")["member"].apply(list).to_dict()

# Store original full headers to keep protein name and taxid
original_headers = {}
for rec in SeqIO.parse(input_fasta_win, "fasta"):
    original_headers[rec.id] = rec.description

# Load representative sequences
rep_records = list(SeqIO.parse("clustered_raw.fasta", "fasta"))

for rec in rep_records:
    rep_id = rec.id
    # Fall back to a one-member cluster if the ID is missing from clusters.tsv.
    members = cluster_map.get(rep_id, [rep_id])
    size = len(members)

    # Retrieve original metadata-rich header (falls back to the bare ID when the
    # representative's ID is not a key of the input FASTA's record IDs).
    original_header = original_headers.get(rep_id, rep_id)

    # Replace default description with enriched metadata
    rec.description = f"{original_header} | Cluster={rep_id} | Members={','.join(members)} | Size={size}"

# === 4. Save final clustered file with enriched headers ===
SeqIO.write(rep_records, output_fasta_win, "fasta")
print(f"\nClustered file written to:\n{output_fasta_win}")

In [None]:
def count_proteins_in_fasta(fasta_path):
    """Return how many records a FASTA file holds, judged by its '>' header lines.

    The total is also printed together with the file path.
    """
    with open(fasta_path, "r") as handle:
        n_headers = sum(1 for row in handle if row.startswith(">"))
    print(f"Total proteins in '{fasta_path}': {n_headers}")
    return n_headers

# === Example usage ===
clustered_path = output_fasta_win
original_path  = input_fasta_win

# Count the records before and after clustering
original_count = count_proteins_in_fasta(original_path)
clustered_count = count_proteins_in_fasta(clustered_path)

# Difference; the percentage is only defined when the input contains at least one
# protein, so an empty input is reported without dividing by zero.
reduction = original_count - clustered_count
if original_count:
    print(f"Proteins clustered away: {reduction} ({(reduction/original_count)*100:.2f}%)")
else:
    print(f"Proteins clustered away: {reduction} (input contains no proteins)")

In [None]:
from Bio import SeqIO
from pathlib import Path

# === 1. Input file paths ===
clustered_fasta_path = Path(output_fasta_win)
# Read the input path from input_fasta_win directly so this cell does not depend on
# the optional counting cell (which is where original_path is defined) having been run.
unclustered_fasta_path = Path(input_fasta_win)
output_fasta_path = clustered_fasta_path.with_name(clustered_fasta_path.stem + "_with_metadata.fasta")

# === 2. Load unclustered headers (metadata) by protein ID ===
id_to_metadata = {}
for rec in SeqIO.parse(unclustered_fasta_path, "fasta"):
    # Extract the clean ID used by clustered file (e.g., A0A679HT45)
    # which can be extracted from headers like: >tr|A0A679HT45|...
    parts = rec.id.split("|")
    clean_id = parts[1] if len(parts) > 2 else rec.id
    id_to_metadata[clean_id] = rec.description
    # Also index by the full record ID so the lookup works when the clustered
    # file carries the unshortened ID.
    id_to_metadata.setdefault(rec.id, rec.description)

# === 3. Update clustered FASTA headers with metadata ===
# The clustering cell appends "| Cluster=... | Members=... | Size=..." to each header.
CLUSTER_MARKER = "| Cluster="

updated_records = []
for rec in SeqIO.parse(clustered_fasta_path, "fasta"):
    cluster_id = rec.id

    # Get metadata-rich header from original file
    metadata_header = id_to_metadata.get(cluster_id, cluster_id)

    # Extract only the cluster info. Slicing from the marker (instead of from the
    # first space) avoids repeating the original description when the clustered
    # header already contains it; without a marker, keep everything after the ID.
    marker_pos = rec.description.rfind(CLUSTER_MARKER)
    if marker_pos != -1:
        cluster_info = rec.description[marker_pos:].strip()
    else:
        cluster_info = rec.description[len(cluster_id):].strip()

    # Combine metadata + cluster info
    rec.description = f"{metadata_header} {cluster_info}".strip()
    rec.name = cluster_id
    rec.id = cluster_id
    updated_records.append(rec)

# === 4. Write updated FASTA ===
SeqIO.write(updated_records, output_fasta_path, "fasta")
print(f"Final clustered FASTA written with full metadata:\n{output_fasta_path}")
print(f"Total sequences processed: {len(updated_records)}")