In [1]:
from Bio import SearchIO
import pandas as pd
import glob
import os

In [2]:
df = pd.DataFrame(columns=['full_bitscore', 'domain_bitscore'])
hmmfile = 'ko_hmms/ectoine/GCF_000006175.1/K00003'

In [3]:
def retrieve_max_hmm_scores(hmmfile):
    with open(hmmfile, 'r') as handle:
        max_hit = 0
        max_dom = 0
        for record in SearchIO.parse(handle, 'hmmer3-tab'):
            for hit in (record.hits):
                if hit.bitscore> max_hit:
                    max_hit = hit.bitscore 
            for dom in (record.hsps):
                if dom.bitscore > max_dom:
                    max_dom = dom.bitscore
    return(max_hit, max_dom)

In [5]:
df = pd.DataFrame(columns=['full_bitscore', 'domain_bitscore'])
for hmmfile in glob.glob('ko_hmms/ectoine/GCF_000006175.1/K*'):
    index_name = os.path.basename(hmmfile)
    full_bitscore, domain_bitscore = retrieve_max_hmm_scores(hmmfile)
    df.loc[index_name] = (full_bitscore, domain_bitscore)

In [6]:
for file in glob.glob('ko_tables/ectoine/*csv'):
    df=pd.read_csv(file, sep='\t', index_col=0)

In [27]:
kofamlist = pd.read_csv('/vortexfs1/home/halexander/kofamscan/db/ko_list', sep='\t', index_col=0)

In [28]:
KO_index = pd.read_csv(glob.glob('ko_tables/ectoine/*csv')[0], sep='\t', index_col=0).index

In [37]:
list(set(KO_index))

['K15783',
 'K00003',
 'K00133',
 'K00928',
 'K15785',
 'K06718',
 'K15784',
 'K10674',
 'K06720',
 'K00836']

In [30]:
kofamlist.loc[KO_index]

Unnamed: 0,threshold,score_type,profile_type,F-measure,nseq,nseq_used,alen,mlen,eff_nseq,re/pos,definition
K00836,440.63,full,all,0.991918,1706,1430,2856,767,2.97,0.59,diaminobutyrate-2-oxoglutarate transaminase [E...
K06718,102.47,full,all,0.986807,626,561,476,198,2.61,0.59,"L-2,4-diaminobutyric acid acetyltransferase [E..."
K06720,95.4,full,all,0.999237,755,656,346,146,1.69,0.59,L-ectoine synthase [EC:4.2.1.108]
K15785,638.1,full,all,0.990991,362,338,971,507,0.9,0.589,"L-2,4-diaminobutyrate transaminase [EC:2.6.1.76]"
K00928,410.23,domain,all,0.752206,7421,5909,3586,593,6.91,0.59,aspartate kinase [EC:2.7.2.4]
K10674,202.97,full,all,0.951124,603,566,832,356,5.53,0.59,ectoine hydroxylase [EC:1.14.11.55]
K15783,392.47,full,all,0.963616,479,449,709,392,2.54,0.59,ectoine hydrolase [EC:3.5.4.44]
K15784,426.77,full,trim,0.878495,296,263,628,332,3.03,0.59,"N2-acetyl-L-2,4-diaminobutanoate deacetylase [..."
K00003,279.1,domain,all,0.944943,4890,4055,2512,536,7.73,0.59,homoserine dehydrogenase [EC:1.1.1.3]
K00133,221.63,full,trim,0.993935,6668,5299,2177,525,5.53,0.59,aspartate-semialdehyde dehydrogenase [EC:1.2.1...


In [31]:
ko_subset = kofamlist.loc[KO_index]


In [32]:
out_df = pd.DataFrame(columns=KO_index)

for csv_file in glob.glob(os.path.join('ko_tables/ectoine/GCF*csv')):
    df = pd.read_csv(csv_file, sep='\t', index_col=0)
    for k in KO_index:
        column = None
        if kofamlist.loc[k, 'score_type']=='full':
            column = 'full_bitscore'
        elif kofamlist.loc[k, 'score_type']=='domain':
            column = 'domain_bitscore'
        if df.loc[k,column] > float(kofamlist.loc[k,'threshold']):
            out_df.loc[csv_file, k]=1
        else:
            out_df.loc[csv_file,k]=0

In [35]:
KO_index

Index(['K00836', 'K06718', 'K06720', 'K15785', 'K00928', 'K10674', 'K15783',
       'K15784', 'K00003', 'K00133'],
      dtype='object')

In [17]:
out_df

Unnamed: 0,K00836,K06718,K06720,K15785,K00928,K10674,K15783,K15784,K00003,K00133
name,0,0,0,0,0,0,0,0,0,1


In [38]:
pd.read_csv('ko_tables/ectoine_bact.csv')

Unnamed: 0.1,Unnamed: 0,K00003,K15783,K00928,K06718,K06720,K00836,K15785,K15784,K10674,K00133
0,kofamscan/ko_tables/ectoine/GCF_000210915.2.csv,0,0,1,0,0,0,0,0,0,1
1,kofamscan/ko_tables/ectoine/GCF_000590925.1.csv,1,1,1,0,0,0,1,0,0,1
2,kofamscan/ko_tables/ectoine/GCF_001044335.1.csv,1,0,1,1,1,1,0,0,0,1
3,kofamscan/ko_tables/ectoine/GCF_000015645.1.csv,1,0,1,0,0,0,0,0,0,1
4,kofamscan/ko_tables/ectoine/GCF_000485905.1.csv,1,0,1,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
892,kofamscan/ko_tables/ectoine/GCF_000010405.1.csv,1,0,1,0,0,0,0,0,0,1
893,kofamscan/ko_tables/ectoine/GCF_000178875.2.csv,0,0,1,0,0,0,0,0,0,1
894,kofamscan/ko_tables/ectoine/GCF_002983865.1.csv,1,1,1,0,1,0,0,1,0,1
895,kofamscan/ko_tables/ectoine/GCF_000439495.1.csv,0,0,1,0,0,0,0,0,0,1
