This notebookd is to format the domain annotation output for plasmid regions detected using KMER spectrum analysis. HMMER and the pfam database each of the plasmid proteomes was annotated. We used the NCBI annotation of the genome when possible. Otherwise, PATRIC annotations or the getORF tool ( from the EMBOSS suite ) were used.



In [75]:
import pandas as pd
from Bio import SeqIO
from matplotlib import pyplot as plt
import uniprot
import numpy as np
import itertools
import glob
import multiprocessing as mp
import subprocess
import shlex
import Bio.SeqFeature

In [76]:
hnatans = pd.read_table( './926690.3.PATRIC.features.tab')

In [77]:
sub = hnatans[ hnatans.accession == 'ATYM01000002']
sub = sub[sub.start > 1493414]
sub = sub[sub.end > 1495372]
print(sub)
print(len(sub))

      genome_id                  genome_name     accession annotation  \
2707   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
2708   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
2709   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
2710   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
2711   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
...         ...                          ...           ...        ...   
3405   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
3406   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
3407   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
3408   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   
3409   926690.3  Haloplanus natans DSM 17983  ATYM01000002     PATRIC   

     feature_type              patric_id  refseq_locus_tag    start      end  \
2707          CDS  fig|926690.3.peg.2651   

In [78]:
in_seq_file = "./926690.3.PATRIC.faa"
parsed = SeqIO.parse( in_seq_file , 'fasta')
keep = []
for i,seq in enumerate(parsed):
    if seq.id in sub.patric_id.unique():
        seqlist.add(seq.id)
        keep.append(seq)
outfile = './926690.3.PATRIC.proteo.fasta'
SeqIO.write( keep,outfile, 'fasta')

684


684

In [79]:
sub['protein_id'] = sub.patric_id
sub.index = sub.patric_id

In [80]:
featureDfs = {}

def rungetORF( geno , proteome , minsize = 600 ):
    args = 'getorf -sequence '+ geno + ' -outseq ' + proteome +' -minsize ' + str(minsize)
    p = subprocess.run( shlex.split(args) )
    return p, proteome

for filename in glob.glob('*.gb'):
    parsed = SeqIO.parse( filename , 'gb')
    featuredict={}
    fastastr = ''
    for i,seq in enumerate(parsed):
            print(seq)
            xrefs = {'genome xrefs' : seq.dbxrefs }
            for feature in seq.features:
                if feature.type == 'CDS':
                    if 'protein_id' in feature.qualifiers:
                        featuredict[feature.qualifiers['protein_id'][0]]  = {q:feature.qualifiers[q][0] for q in feature.qualifiers }
                        featuredict[feature.qualifiers['protein_id'][0]].update(xrefs)
                        fastastr+= '>'+ feature.qualifiers['protein_id'][0] + '\n' + feature.qualifiers['translation'][0] +'\n'
    
    if len(fastastr)==0:
        #fall back on getORF if NCBI annot unavailable
        geno = seq
        with open( filename+'.convert.fasta' , 'w') as fastout:
            SeqIO.write( geno , fastout, 'fasta')
        #predict orfs of all genos w hap2
        rungetORF(   filename+'.convert.fasta' , filename+'.convert.proteo.fasta')
        
        #rungetorf
        parsed = SeqIO.parse( filename+'.convert.proteo.fasta' , 'fasta')
        for orf in parsed:
            #JNCS01000001.1_1 [708 - 1748] Natrinema altunense strain AJ2 N_altunense_AJ2_contig_1, whole genome shotgun sequence
            featuredict[orf.id] = dict( zip (['protein_id', 'start' , 'dash' ,'stop'] , orf.description.split()[0:4] )) 
            del featuredict[orf.id]['dash']
            featuredict[orf.id]['start'] = featuredict[orf.id]['start'].replace('[','')
            featuredict[orf.id]['stop']= featuredict[orf.id]['stop'].replace(']','')
            featuredict[orf.id].update(xrefs)
            #parse output
    featureDf = pd.DataFrame.from_dict( featuredict , orient = 'index')
    featureDfs[filename] = featureDf
    print(featureDf)
    with open(filename+'.proteo.fasta', 'w') as fastout:
        fastout.write( fastastr )
        
#add patric annot for hnatans
featureDfs['hnatans'] = sub

ID: JNCS01000001.1
Name: JNCS01000001
Description: Natrinema altunense strain AJ2 N_altunense_AJ2_contig_1, whole genome shotgun sequence
Database cross-references: BioProject:PRJNA248700, BioSample:SAMN02800870
Number of features: 1
/molecule_type=DNA
/topology=linear
/data_file_division=BCT
/date=21-JUL-2014
/accessions=['JNCS01000001', 'REGION:', '496500..593500']
/sequence_version=1
/keywords=['WGS']
/source=Natrinema altunense
/organism=Natrinema altunense
/taxonomy=['Archaea', 'Euryarchaeota', 'Stenosarchaea group', 'Halobacteria', 'Natrialbales', 'Natrialbaceae', 'Natrinema']
/references=[Reference(title='Natrinema altunense sp. nov., an extremely halophilic archaeon isolated from a salt lake in Altun Mountain in Xinjiang, China', ...), Reference(title='Direct Submission', ...)]
/structured_comment=OrderedDict([('Genome-Assembly-Data', OrderedDict([('Assembly Method', 'Newbler v. Feb-2013'), ('Genome Representation', 'Full'), ('Expected Final Version', 'No'), ('Genome Coverage',

In [81]:
dfs = []
annotations = glob.glob( './*hmmerscan.csv')
header = 'targetname accession queryid accession2    E-value  score  bias   domain-E-value  domain-score  domain-bias   exp reg clu  ov env dom rep inc description'.split()
for filename in  annotations:
    framedict = {}
    with open(filename , 'r') as infile:
        for i,line in enumerate(infile):
            if line[0] != '#':
                words = line.split()
                words[18] = ''.join([ w + ' ' for w in words[18:-1]])
                #print(words)
                framedict[i] = dict(zip( header , words[0:19]))
        df = pd.DataFrame.from_dict(framedict , orient='index')
        df['infile'] = filename
        #df.to_csv(filename + '.reformat.csv')
        dfs.append(df)
globaldf = pd.concat(dfs)
globaldf['domain'] = globaldf.accession.map(lambda x : str(int(x.split('.')[0].replace('PF',''))) )

print(globaldf.infile.unique())
print(globaldf[globaldf.infile == './926690.3.PATRIC.proteo.fastahmmerscan.csv'])

['./Naltunense1.gb.fastahmmerscan.csv'
 './halogeometricum.gb.proteo.fastahmmerscan.csv'
 './haloterigena_reg.gb.proteo.fastahmmerscan.csv'
 './Nal.gb.convert.proteo.fastahmmerscan.csv'
 './halovivax.gb.proteo.fastahmmerscan.csv'
 './haloterigena_reg.gb.fastahmmerscan.csv'
 './halobonum.gb.fastahmmerscan.csv' './halovivax.gb.fastahmmerscan.csv'
 './Naltunense1.gb.proteo.fastahmmerscan.csv'
 './haloferax.gb.convert.proteo.fastahmmerscan.csv'
 './926690.3.PATRIC.proteo.fastahmmerscan.csv'
 './halogeometricum.gb.fastahmmerscan.csv'
 './halobonum.gb.proteo.fastahmmerscan.csv']
          targetname   accession                queryid accession2 E-value  \
3                AAA  PF00004.31  fig|926690.3.peg.2651          -  0.0016   
4             DUF853  PF05872.14  fig|926690.3.peg.2651          -  0.0068   
5              DUF87  PF01935.19  fig|926690.3.peg.2651          -  0.0085   
6           SHNi-TPR  PF10516.11  fig|926690.3.peg.2651          -    0.13   
7             DUF309  PF03745.

In [82]:
print(globaldf.columns)

Index(['targetname', 'accession', 'queryid', 'accession2', 'E-value', 'score',
       'bias', 'domain-E-value', 'domain-score', 'domain-bias', 'exp', 'reg',
       'clu', 'ov', 'env', 'dom', 'rep', 'inc', 'description', 'infile',
       'domain'],
      dtype='object')


In [83]:
globaldf['E-value'] =globaldf['E-value'].map(lambda x : float(x)) 

In [84]:
globaldf=globaldf[globaldf['E-value'] < .001]
globaldf.accession = globaldf.accession.map( lambda x : x.split('.')[0].lower().replace('pf', 'pfam'))
print(globaldf)

          targetname  accession     queryid accession2       E-value  score  \
16   Phage_integrase  pfam00589  ELY83596.1          -  7.800000e-18   64.9   
17   Phage_int_SAM_4  pfam13495  ELY83596.1          -  2.000000e-11   44.3   
18   Phage_int_SAM_1  pfam02899  ELY83596.1          -  6.900000e-05   23.1   
19   Phage_int_SAM_5  pfam13102  ELY83596.1          -  8.800000e-04   19.7   
22        OrfB_IS605  pfam01385  ELY83598.1          -  8.900000e-31  106.7   
..               ...        ...         ...        ...           ...    ...   
504            PQQ_2  pfam13360  QLG62040.1          -  1.200000e-48  166.0   
513           HalOD1  pfam18545  QLG62045.1          -  7.000000e-10   39.1   
514           TrkA_C  pfam02080  QLG62049.1          -  2.900000e-11   43.2   
540            DUF87  pfam01935  QLG62070.1          -  3.400000e-06   27.4   
541     FtsK_SpoIIIE  pfam01580  QLG62070.1          -  7.400000e-04   19.0   

     bias domain-E-value domain-score domain-bias  

In [85]:
#map to arcog
header = '<arCOG-id> <functional-class> <gene-name> <arCOG-annotation> <supercluster> <profile-pfam> <profile-cdd> <profile-tigrfam>'
header = header.replace('<', '').replace('>' , '').split()

arcogDF = pd.read_table( 'ar14.arCOGdef.tab'  ,engine='python' )
arcogDF.columns = header
print(len(arcogDF))
arcogDF = arcogDF[~arcogDF['profile-pfam'].isna()]
print(len(arcogDF))
print(arcogDF)

13443
5232
         arCOG-id functional-class gene-name  \
0      arCOG00001                K         -   
1      arCOG00002                K         -   
2      arCOG00004                K         -   
3      arCOG00005                K         -   
4      arCOG00006                K         -   
...           ...              ...       ...   
13280  arCOG15117                S         -   
13290  arCOG15127                S         -   
13318  arCOG15155                S         -   
13331  arCOG15168                P         -   
13417  arCOG15257                P         -   

                             arCOG-annotation supercluster profile-pfam  \
0      Transcriptional regulator, PadR family     COG01695    pfam03551   
1      Transcriptional regulator, PadR family     COG01695    pfam03551   
2      Transcriptional regulator, PadR family     COG01695    pfam03551   
3      Transcriptional regulator, PadR family     COG01695    pfam03551   
4      Transcriptional regulator, Pad

In [86]:
arcogmap = dict(zip( arcogDF['profile-pfam'] , arcogDF['arCOG-id']))
globaldf['arCOG-id'] = globaldf.accession.map(arcogmap)
globaldf = globaldf.merge( arcogDF , left_on = 'arCOG-id'  , right_on = 'arCOG-id' , how = 'left')

In [87]:

print(globaldf)

           targetname  accession     queryid accession2       E-value  score  \
0     Phage_integrase  pfam00589  ELY83596.1          -  7.800000e-18   64.9   
1     Phage_int_SAM_4  pfam13495  ELY83596.1          -  2.000000e-11   44.3   
2     Phage_int_SAM_1  pfam02899  ELY83596.1          -  6.900000e-05   23.1   
3     Phage_int_SAM_5  pfam13102  ELY83596.1          -  8.800000e-04   19.7   
4          OrfB_IS605  pfam01385  ELY83598.1          -  8.900000e-31  106.7   
...               ...        ...         ...        ...           ...    ...   
2144            PQQ_2  pfam13360  QLG62040.1          -  1.200000e-48  166.0   
2145           HalOD1  pfam18545  QLG62045.1          -  7.000000e-10   39.1   
2146           TrkA_C  pfam02080  QLG62049.1          -  2.900000e-11   43.2   
2147            DUF87  pfam01935  QLG62070.1          -  3.400000e-06   27.4   
2148     FtsK_SpoIIIE  pfam01580  QLG62070.1          -  7.400000e-04   19.0   

      bias domain-E-value domain-score 

In [88]:
print(globaldf)

           targetname  accession     queryid accession2       E-value  score  \
0     Phage_integrase  pfam00589  ELY83596.1          -  7.800000e-18   64.9   
1     Phage_int_SAM_4  pfam13495  ELY83596.1          -  2.000000e-11   44.3   
2     Phage_int_SAM_1  pfam02899  ELY83596.1          -  6.900000e-05   23.1   
3     Phage_int_SAM_5  pfam13102  ELY83596.1          -  8.800000e-04   19.7   
4          OrfB_IS605  pfam01385  ELY83598.1          -  8.900000e-31  106.7   
...               ...        ...         ...        ...           ...    ...   
2144            PQQ_2  pfam13360  QLG62040.1          -  1.200000e-48  166.0   
2145           HalOD1  pfam18545  QLG62045.1          -  7.000000e-10   39.1   
2146           TrkA_C  pfam02080  QLG62049.1          -  2.900000e-11   43.2   
2147            DUF87  pfam01935  QLG62070.1          -  3.400000e-06   27.4   
2148     FtsK_SpoIIIE  pfam01580  QLG62070.1          -  7.400000e-04   19.0   

      bias domain-E-value domain-score 

In [89]:
print(featureDfs['hnatans'])

                       genome_id                  genome_name     accession  \
patric_id                                                                     
fig|926690.3.peg.2651   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.peg.2652   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.peg.2653   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.peg.2654   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.peg.2655   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
...                          ...                          ...           ...   
fig|926690.3.peg.3332   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.peg.3333   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.peg.3334   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.peg.3335   926690.3  Haloplanus natans DSM 17983  ATYM01000002   
fig|926690.3.rna.50     926690.3  Haloplanus natans 

In [90]:
for key in featureDfs:
    print(key)
    if len( featureDfs[key]) > 0:
        featureDfs[key] = featureDfs[key].merge(globaldf, left_index = True , right_on = 'queryid' , how='left')
        for col in ['transl_table' , 'translation' , 'infile' , 'inference' , 'note' , 'profile-pfam' , 'score',
       'bias', 'domain-E-value', 'domain-score', 'domain-bias', 'exp', 'reg', 'EC_number'
       'clu', 'ov', 'env', 'dom', 'rep', 'inc' , 'accession2' ]:
            try:
                featureDfs[key] = featureDfs[key].drop( axis = 1, labels =[col]  )
            except:
                pass
        featureDfs[key].index = np.arange( len(featureDfs[key]))
        
        
        featureDfs[key].to_csv( key + 'arcog_features.csv')
        print(featureDfs[key].head())

Nal.gb
         protein_id  start   stop  \
0  JNCS01000001.1_1    708   1748   
1  JNCS01000001.1_2   3911   4552   
2  JNCS01000001.1_3   4354   5184   
3  JNCS01000001.1_4   8492   9190   
4  JNCS01000001.1_5  12484  13356   

                                       genome xrefs targetname  accession  \
0  [BioProject:PRJNA248700, BioSample:SAMN02800870]    Mrr_cat  pfam04471   
1  [BioProject:PRJNA248700, BioSample:SAMN02800870]        NaN        NaN   
2  [BioProject:PRJNA248700, BioSample:SAMN02800870]        NaN        NaN   
3  [BioProject:PRJNA248700, BioSample:SAMN02800870]        NaN        NaN   
4  [BioProject:PRJNA248700, BioSample:SAMN02800870]        NaN        NaN   

            queryid   E-value  clu   description domain    arCOG-id  \
0  JNCS01000001.1_1  0.000001    0  Restriction    4471  arCOG09571   
1  JNCS01000001.1_2       NaN  NaN           NaN    NaN         NaN   
2  JNCS01000001.1_3       NaN  NaN           NaN    NaN         NaN   
3  JNCS01000001.1_4    