<a href="https://colab.research.google.com/github/AhmedAboushanab/Genomic-Insights-into-Specialized-and-Primary-Metabolism-Gene-Clusters/blob/main/Data_Extraction_from_antiSMASH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install beautifulsoup4 pandas

In [None]:
import os
import hashlib
from bs4 import BeautifulSoup, Tag
import pandas as pd
import re

def generate_row_hash(entry: dict) -> str:
    """Return the MD5 hex digest of a row's values joined with ``|``.

    Only the values contribute to the digest (keys are ignored). Each value
    is converted with ``str`` and the values are taken in the dict's own
    iteration order, so two rows hash equal exactly when their pipe-joined
    value strings are identical.
    """
    stringified_values = map(str, entry.values())
    payload = "|".join(stringified_values).encode("utf-8")
    digest = hashlib.md5(payload)
    return digest.hexdigest()

# Patterns are compiled once at import time rather than on every call/row.
_REGIONS_START_RE = re.compile("Identified secondary metabolite regions.*strictness")
_REGIONS_END_RE = re.compile("No secondary metabolite regions were found")
_NODE_ID_RE = re.compile("NODE_")
_NODE_LABEL_RE = re.compile("NODE")
_WHITESPACE_RE = re.compile(r"\s+")
_PERCENT_RE = re.compile(r"(\d+)%")


def _cell_text(cells, index):
    """Return the stripped text of ``cells[index]``, or "" if that cell is absent."""
    return cells[index].get_text(strip=True) if len(cells) > index else ""


def _find_region_tables(soup):
    """Return the first ``<table>`` seen for each distinct preceding ``NODE_`` string.

    Only the part of the document between the "Identified secondary
    metabolite regions ... strictness" text and the "No secondary metabolite
    regions were found" text is scanned. If the start text is missing nothing
    is scanned; if the end text is missing the scan runs to the end of the
    document.
    """
    end_marker = soup.find(string=_REGIONS_END_RE)
    tables = []
    seen_node_ids = set()

    current = soup.find(string=_REGIONS_START_RE)
    # ``is not None`` (rather than truthiness) so an empty string node cannot
    # end the walk prematurely.
    while current is not None and current != end_marker:
        if isinstance(current, Tag) and current.name == "table":
            node_string = current.find_previous(string=_NODE_ID_RE)
            node_id = node_string.strip() if node_string else None
            # Keep only one table per node id; later tables that resolve to
            # the same id are ignored.
            if node_id and node_id not in seen_node_ids:
                tables.append(current)
                seen_node_ids.add(node_id)
        current = current.next_element
    return tables


def _nearest_node_label(table):
    """Return the stripped closest preceding string containing "NODE", or ""."""
    label = table.find_previous(string=_NODE_LABEL_RE)
    return label.strip() if label is not None else ""


def _parse_region_row(cols, sample_id, node_label):
    """Build one output record from the ``<td>`` cells of a region row.

    Cells are read by position: 0 region, 1 type, 2 from, 3 to, 4 (and 5 when
    present) most-similar-cluster text, 6 similarity. Missing trailing cells
    yield empty strings.
    """
    # Non-breaking spaces are turned into plain spaces before whitespace runs
    # are collapsed.
    region = _WHITESPACE_RE.sub(" ", _cell_text(cols, 0).replace("\xa0", " "))
    # Thousands separators are dropped from the coordinates.
    from_coord = _cell_text(cols, 2).replace(",", "")
    to_coord = _cell_text(cols, 3).replace(",", "")

    if len(cols) > 5:
        similar_lines = list(cols[4].stripped_strings) + list(cols[5].stripped_strings)
    elif len(cols) == 5:
        similar_lines = list(cols[4].stripped_strings)
    else:
        similar_lines = []
    # The first text fragment is taken as the cluster name and the second as
    # its class, regardless of which of the two cells they came from.
    cluster_name = similar_lines[0] if len(similar_lines) > 0 else ""
    cluster_class = similar_lines[1] if len(similar_lines) > 1 else ""

    # Keep only the digits of the first "<number>%" occurrence.
    percent_match = _PERCENT_RE.search(_cell_text(cols, 6))
    similarity_value = percent_match.group(1) if percent_match else ""

    return {
        "Sample-id": sample_id,
        "Node": node_label,
        "Region": region,
        "Type": _cell_text(cols, 1),
        "From": from_coord,
        "To": to_coord,
        "Most_similar_known cluster": cluster_name,
        "Similarity %": similarity_value,
        "cluster": cluster_class,
    }


def extract_antismash_data(html_path, sample_id):
    """Extract the region tables of one antiSMASH HTML report into a DataFrame.

    Args:
        html_path: Path of the HTML file; it is read as UTF-8.
        sample_id: Value stored in the "Sample-id" column of every row.

    Returns:
        A DataFrame with the columns "Sample-id", "Node", "Region", "Type",
        "From", "To", "Most_similar_known cluster", "Similarity %" and
        "cluster" (all values are strings). Exact duplicate rows are emitted
        once. When no rows are found the DataFrame is empty and has no
        columns.
    """
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    records = []
    seen_hashes = set()

    for table in _find_region_tables(soup):
        node_label = _nearest_node_label(table)

        # The first <tr> is treated as the header row and skipped.
        for row in table.find_all("tr")[1:]:
            cols = row.find_all("td")
            if not cols:
                # A row without data cells would otherwise become a record
                # whose parsed fields are all blank.
                continue

            row_data = _parse_region_row(cols, sample_id, node_label)

            row_hash = generate_row_hash(row_data)
            if row_hash in seen_hashes:
                continue
            seen_hashes.add(row_hash)
            records.append(row_data)

    return pd.DataFrame(records)

def process_antismash_folder(folder_path, output_csv="antismash_combined_output.csv"):
    """Parse every ``.html`` file under a folder and write one combined CSV.

    The folder is walked recursively. Each file's name without its extension
    is used as the sample id passed to ``extract_antismash_data``.

    Args:
        folder_path: Root directory to search.
        output_csv: Destination of the combined CSV. Defaults to
            "antismash_combined_output.csv" (relative to the current working
            directory).

    Returns:
        The combined DataFrame with exact duplicate rows dropped, or an empty
        DataFrame when no file produced any rows (no CSV is written then).
    """
    frames = []
    count_per_sample = {}

    for root, dirs, files in os.walk(folder_path):
        # Sorting in place makes os.walk descend in a reproducible order, so
        # the row order of the CSV does not depend on the filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".html"):
                continue
            html_path = os.path.join(root, file)
            sample_id = os.path.splitext(file)[0]
            print(f"Processing {sample_id}")
            df = extract_antismash_data(html_path, sample_id)
            # Files in different sub-folders can share a name; add up their
            # rows instead of letting the last one overwrite the count.
            count_per_sample[sample_id] = count_per_sample.get(sample_id, 0) + len(df)
            if not df.empty:
                frames.append(df)

    if not frames:
        print("No data extracted from any HTML files.")
        return pd.DataFrame()

    combined_df = pd.concat(frames, ignore_index=True)
    # Rows are already de-duplicated per file; this also removes rows that
    # are identical across files.
    combined_df = combined_df.drop_duplicates()
    combined_df.to_csv(output_csv, index=False)
    print(f"\n✅ {output_csv} saved.")

    # Counts are per-file row totals taken before the cross-file
    # de-duplication above; samples with zero rows are listed too.
    print("\n📊 BGC Count per Sample:")
    for sample, count in count_per_sample.items():
        print(f"  {sample}: {count} regions")
    return combined_df


In [None]:
# Process every .html report found under the Drive folder and write
# antismash_combined_output.csv to the current working directory.
# NOTE(review): the path contains a space before "/References" — confirm it
# matches the actual folder name on Drive.
process_antismash_folder("/content/drive/MyDrive/BGC-HTML-Files /References")