In [5]:
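### This script filters HMMER (hmmsearch) tabular output by E-value and saves the significant hits as TSV
### (note: no coverage filter or sorting is applied in the code below).
### Before running this script, run these two commands from the terminal: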
# 1. build an HMM from your seed alignment
# hmmbuild amylase.hmm seed_alignment.fasta

# 2. search the Coelicolor proteome
# hmmsearch -E 3 --domtblout amylase_hmm_domains.tbl --tblout amylase_hmm_hits.tbl --cpu 4 amylase.hmm coelicolor_proteins.faa > amylase_hmm.out

### This builds an HMMER profile from a FASTA alignment file,
### then uses that HMMER profile to search the coelicolor proteome for proteins with similar domains


import pandas as pd
import os
import glob

# Filter every HMMER table in `input_folder` by E-value, write the surviving
# hits to `output_folder` as TSV, and record the number of hits per input file.
input_folder = '/Users/andreaslawaetz/Streptomyces/Angeliga_project/SCOG/amylase_inhibitors/acarviosin/gator_redo/results21_Amy_is_only_Opt_allBGC_we30/hmmer_search'
output_folder = '/Users/andreaslawaetz/Streptomyces/Angeliga_project/SCOG/amylase_inhibitors/acarviosin/gator_redo/results21_Amy_is_only_Opt_allBGC_we30/hmmer_search_tsv'
os.makedirs(output_folder, exist_ok=True)

# Each data row is parsed as 18 whitespace-separated fields followed by a
# free-text description (which may itself contain spaces). This matches the
# per-target `--tblout` layout, not the 22-field `--domtblout` layout.
# NOTE(review): the glob below picks up every *.tbl file; a --domtblout file in
# the same folder would be parsed with the wrong column meaning -- confirm the
# folder only holds --tblout tables.
N_FIXED_FIELDS = 18

# Column names are unique: suffix _1 marks the first E-value/score/bias triplet,
# suffix _2 the second one (same convention as E_value_1 / E_value_2).
# The previous duplicated names ("accession", "score", "bias") made the frame
# and the TSV header ambiguous.
TBLOUT_COLUMNS = [
    "target_name", "target_accession", "query_name", "query_accession",
    "E_value_1", "score_1", "bias_1",
    "E_value_2", "score_2", "bias_2",
    "exp", "reg", "clu", "ov", "env", "dom", "rep", "inc",
    "description",
]

# Columns that are parsed but not kept in the output table.
UNUSED_COLUMNS = ["exp", "reg", "clu", "ov", "env", "dom", "rep", "inc"]

# Only hits with E_value_1 <= this cutoff are kept.
EVALUE_CUTOFF = 1e-10

# Number of trailing characters removed from the file name to obtain the label
# stored in `accessions`.
# NOTE(review): presumably a 9-character suffix such as "_hits.tbl" -- confirm
# against the actual file names.
ACCESSION_SUFFIX_LEN = 9


def _parse_tblout(path):
    """Read one HMMER table into a DataFrame of strings.

    Comment lines (starting with "#") and blank lines are ignored. Lines with
    fewer than ``N_FIXED_FIELDS`` fields cannot be aligned to the column layout,
    so they are skipped and reported instead of silently shifting columns.

    Parameters
    ----------
    path : str
        Path of the table file to read.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line with the columns in ``TBLOUT_COLUMNS``; empty
        (but with all columns present) when the file contains no data rows.
    """
    rows = []
    skipped = 0
    with open(path) as handle:
        for line in handle:
            if line.startswith("#") or not line.strip():
                continue  # skip comments and blank lines
            parts = line.split()
            if len(parts) < N_FIXED_FIELDS:
                skipped += 1
                continue
            # Everything after the fixed fields belongs to the description.
            description = " ".join(parts[N_FIXED_FIELDS:])
            rows.append(parts[:N_FIXED_FIELDS] + [description])
    if skipped:
        print(f"WARNING: skipped {skipped} malformed line(s) in {path}")
    return pd.DataFrame(rows, columns=TBLOUT_COLUMNS)


accessions = []  # one label per processed file (file name minus its suffix)
inhibitor = []   # number of hits passing the cutoff, parallel to `accessions`

# sorted() makes the processing order (and so the order of the two lists)
# reproducible; glob alone returns files in arbitrary order.
for tbl_path in sorted(glob.glob(os.path.join(input_folder, '*.tbl'))):
    df = _parse_tblout(tbl_path).drop(columns=UNUSED_COLUMNS)

    # Convert E-value columns to numeric; unparsable values become NaN.
    for evalue_col in ("E_value_1", "E_value_2"):
        df[evalue_col] = pd.to_numeric(df[evalue_col], errors="coerce")

    # Drop rows without a usable E-value, then keep only significant hits.
    df = df.dropna(subset=["E_value_1"])
    df = df[df["E_value_1"] <= EVALUE_CUTOFF]

    file_name = os.path.basename(tbl_path)
    accessions.append(file_name[:-ACCESSION_SUFFIX_LEN])
    inhibitor.append(len(df))

    # Save the filtered hits next to the other results as <input name>.tsv
    df.to_csv(os.path.join(output_folder, f'{file_name}.tsv'), sep="\t", index=False)

# master_inhibitor = pd.DataFrame()
# master_inhibitor['tip_label'] = accessions
# master_inhibitor['tendamistatPFAM'] = inhibitor

# master_table = '/Users/andreaslawaetz/Streptomyces/Angeliga_project/SCOG/amylase_inhibitors/Master_table_with_amyPhyloGroups_known_clusters_and_inhibitors_GatorAcarviosin_GatorAcarbose_gatorValidamycinA_gatorBafilomycin_tendamisatPFAM_tendamisatSMART.csv'
# master_df = pd.read_csv(master_table, sep = ',', header = 0)

# merged_df = pd.merge(master_df, master_inhibitor, on="tip_label", how="left")
# merged_df.to_csv('/Users/andreaslawaetz/Streptomyces/Angeliga_project/SCOG/amylase_inhibitors/Master_table_with_amyPhyloGroups_known_clusters_and_inhibitors_GatorAcarviosin_GatorAcarbose_gatorValidamycinA_gatorBafilomycin_tendamisatPFAM_tendamisatSMART_tendamisatPFAM.csv', sep = ',', index = False)

In [21]:
# Display the merged master table.
# NOTE(review): in the visible cells `merged_df` is only assigned in commented-out
# code, so this cell presumably relies on leftover kernel state -- confirm it
# survives Restart Kernel -> Run All.
merged_df

Unnamed: 0,tip_label,2dos,amglyccycl,aminocoumarin,aminopolycarboxylic-acid,arylpolyene,atropopeptide,azole-containing-RiPP,azoxy-crosslink,azoxy-dimer,...,transAT-PKS-like,triceptide,number_amylases,GH_13_occurences,Group_1_amylase,Group_2_amylase,Group_3_amylase,resistant_amylase,acarbose_BGC,tendamistat
0,GCF_000009765.2,0,0,0,1,0,0,0,0,0,...,0,1,2,16.0,1.0,1.0,0.0,0,0,1
1,GCF_000010605.1,0,1,0,0,0,0,0,0,0,...,0,0,3,13.0,1.0,1.0,1.0,0,0,0
2,GCF_000092385.1,0,0,0,0,1,0,1,0,0,...,0,0,0,10.0,0.0,0.0,0.0,0,0,0
3,GCF_000147815.2,0,0,0,0,0,0,0,0,0,...,0,1,1,11.0,1.0,0.0,0.0,0,0,0
4,GCF_000225525.1,0,0,0,0,0,0,0,0,0,...,0,0,2,15.0,1.0,1.0,0.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,GCF_900119365.1,0,0,0,0,0,0,0,0,0,...,0,0,1,11.0,1.0,0.0,0.0,0,0,0
291,GCF_900187385.1,0,0,0,0,0,1,0,0,0,...,0,1,2,13.0,1.0,1.0,0.0,0,0,0
292,GCF_900206255.1,0,0,0,0,0,0,0,1,0,...,0,0,3,18.0,1.0,2.0,0.0,0,0,0
293,GCF_900215595.1,0,0,0,1,0,0,0,0,0,...,0,0,2,15.0,1.0,1.0,0.0,0,0,0


In [11]:
# Display the master table.
# NOTE(review): in the visible cells `master_df` is only assigned in commented-out
# code (a pd.read_csv call), so this cell presumably relies on leftover kernel
# state -- confirm it survives Restart Kernel -> Run All.
master_df

Unnamed: 0,tip_label,2dos,amglyccycl,aminocoumarin,aminopolycarboxylic-acid,arylpolyene,atropopeptide,azole-containing-RiPP,azoxy-crosslink,azoxy-dimer,...,thioamide-NRP,thioamitides,transAT-PKS,transAT-PKS-like,triceptide,number_amylases,GH_13_occurences,Group_1_amylase,Group_2_amylase,Group_3_amylase
0,GCF_000009765.2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,2,16.0,1.0,1.0,0.0
1,GCF_000010605.1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,3,13.0,1.0,1.0,1.0
2,GCF_000092385.1,0,0,0,0,1,0,1,0,0,...,0,1,1,0,0,0,10.0,0.0,0.0,0.0
3,GCF_000147815.2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,11.0,1.0,0.0,0.0
4,GCF_000225525.1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,15.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,GCF_900119365.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,11.0,1.0,0.0,0.0
291,GCF_900187385.1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,2,13.0,1.0,1.0,0.0
292,GCF_900206255.1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,3,18.0,1.0,2.0,0.0
293,GCF_900215595.1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,2,15.0,1.0,1.0,0.0


In [12]:
# Display the per-genome inhibitor table.
# NOTE(review): in the visible cells `master_inhibitor` is only assigned in
# commented-out code, and that code names the count column 'tendamistatPFAM'
# while the output shown here has 'tendamistat' -- presumably produced by a
# different version of the code; confirm before relying on it.
master_inhibitor

Unnamed: 0,tip_label,tendamistat
0,GCF_016741855.1,1
1,GCF_000931445.1,1
2,GCF_004125245.1,1
3,GCF_008704515.1,1
4,GCF_017876235.1,1
5,GCF_000009765.2,1
6,GCF_000739045.1,1
7,GCF_017526105.1,1
8,GCF_004028635.1,1
9,GCF_000700005.2,1
