In [1]:
import Bio

In [51]:
import pandas as pd
import numpy as np

Clean up the `Resfams.tbl` file by making it tab-delimited and stripping off the header, footer (the `#`-prefixed comment lines), stray double quotes, and small woodland creatures that inhabit it.  Save the result to `Resfams.tsv`.

In [86]:
# Rewrite the whitespace-aligned 'Resfams.tbl' as tab-delimited 'Resfams.tsv'.
#
# Blank lines and lines whose first token starts with '#' (the header and
# footer comments) are dropped.  Every remaining line must split into at
# least 19 whitespace-separated tokens: the first 18 become individual
# columns, and all remaining tokens are re-joined with single spaces to form
# the final, free-text column.  Double quotes are removed from every token.
#
# Raises:
#     ValueError: if a data line has fewer than 19 tokens.
#
# The file handles are deliberately not called `input`/`output`, so the
# `input` builtin is not shadowed for the rest of the kernel session.
with open('Resfams.tbl', 'rt') as tbl_file, open('Resfams.tsv', 'wt') as tsv_file:
    # enumerate(..., start=1) gives 1-based line numbers that count every
    # line of the input, including blank and comment lines.
    for line_no, line in enumerate(tbl_file, start=1):
        fields = line.split()
        # Skip blank lines and '#' comment lines.
        if not fields or fields[0][0] == '#':
            continue
        if len(fields) < 19:
            raise ValueError("Incorrect number of fields: line %d has fewer than 19 fields after initial split" % line_no)
        fields = [field.replace('"', '') for field in fields]
        # 18 tab-terminated columns, then the space-joined trailing column.
        tsv_file.write('\t'.join(fields[:18]) + '\t' + ' '.join(fields[18:]) + '\n')

Read the cleaned table with `pd.read_csv` as tab-delimited text.  Note that we manually specify the dtypes and column names, because the header and footer were stripped in the previous step.  Bummer.

In [87]:
# Load the tab-delimited 'Resfams.tsv' into a DataFrame.
#
# The file has no header row (header=None), so the 19 column names and their
# dtypes are supplied by hand.
#
# on_bad_lines='warn' replaces the former error_bad_lines=False /
# warn_bad_lines=True pair, which was deprecated in pandas 1.3 and removed in
# pandas 2.0.  The behaviour is the same: a malformed row is skipped and a
# warning is emitted instead of an exception being raised.
df = pd.read_csv(
    'Resfams.tsv',
    sep='\t',
    engine='python',
    header=None,
    on_bad_lines='warn',
    # NOTE: "query_accesion" (sic) is kept as spelled so the resulting column
    # name does not change.
    names=[
        "target", "target_accession", "query", "query_accesion",
        "full_evalue", "full_score", "full_bias",
        "best_evalue", "best_score", "best_bias",
        "exp", "reg", "clu", "ov", "env", "dom", "rep", "inc",
        "description",
    ],
    dtype={
        "target": object,
        "target_accession": object,
        "query": object,
        "query_accesion": object,
        "full_evalue": np.float64,
        "full_score": np.float64,
        "full_bias": np.float64,
        "best_evalue": np.float64,
        "best_score": np.float64,
        "best_bias": np.float64,
        "exp": np.float64,
        "reg": np.float64,
        "clu": np.int64,
        "ov": np.int64,
        "env": np.int64,
        "dom": np.int64,
        "rep": np.int64,
        "inc": np.int64,
        "description": object,
    },
)
df.head()

Unnamed: 0,target,target_accession,query,query_accesion,full_evalue,full_score,full_bias,best_evalue,best_score,best_bias,exp,reg,clu,ov,env,dom,rep,inc,description
0,CTXM,RF0059,NP_774964.1,-,4.7e-193,632.8,4.5,5.199999999999999e-193,632.6,4.5,1.0,1.0,0,0,1,1,1,1,CTX-M beta-lactamase (class a) [ARO:3000016]
1,ClassA,RF0053,NP_774964.1,-,3.2e-129,422.9,0.5,3.8e-129,422.7,0.5,1.0,1.0,0,0,1,1,1,1,Class A beta-lactamase [ARO:3000078]
2,KPC,RF0083,NP_774964.1,-,6.8e-85,277.8,2.3,7.699999999999999e-85,277.6,2.3,1.0,1.0,0,0,1,1,1,1,Klebsiella pneumoniae carbapenem resistant (KP...
3,SME,RF0120,NP_774964.1,-,2e-73,239.9,0.0,2.4e-73,239.7,0.0,1.0,1.0,0,0,1,1,1,1,SME beta-lactamase (class a) [ARO:3000055]
4,CARB-PSE,RF0044,NP_774964.1,-,2.2e-72,236.8,0.3,2.7e-72,236.5,0.3,1.0,1.0,0,0,1,1,1,1,CARB-PSE beta-lactamases (class a) [ARO:300009...


In [109]:
# For every distinct `query`, find the index label of the row holding the
# smallest `full_evalue`, then pull exactly those rows out of `df`.
min_evalue_row_labels = df.groupby("query")["full_evalue"].idxmin()
df_min_evalue = df.loc[min_evalue_row_labels]
# Display (rows, columns) of the reduced table.
df_min_evalue.shape

(5660, 19)

In [107]:
# Spot-check: show the row(s) retained for the query 'NP_774964.1'.
is_selected_query = df_min_evalue["query"] == 'NP_774964.1'
df_min_evalue.loc[is_selected_query]

Unnamed: 0,target,target_accession,query,query_accesion,full_evalue,full_score,full_bias,best_evalue,best_score,best_bias,exp,reg,clu,ov,env,dom,rep,inc,description
0,CTXM,RF0059,NP_774964.1,-,4.7e-193,632.8,4.5,5.199999999999999e-193,632.6,4.5,1.0,1.0,0,0,1,1,1,1,CTX-M beta-lactamase (class a) [ARO:3000016]
