In [135]:
locus_lines = r'''LOCUS       pGREG505     9465 bp    DNA   CIRCULAR SYN        09-OCT-2013
LOCUS       pGREG505                9465 bp    DNA     CIRCULAR SYN 09-OCT-2013
LOCUS       F-GFAP-gCaMP\3-W       11512 bp    DNA     circular     14-JUN-2010
LOCUS       BC068339                1466 bp    mRNA    linear   VRT 26-JUN-2007
LOCUS       New_DNA                 9465 bp ds-DNA     linear       01-NOV-2016
LOCUS       YEplac181	5741 bp 	DNA	SYN
LOCUS      YEplac181             5000 bp DNA  linear      01-NOV-2016'''.splitlines()

In [136]:
from pyparsing import *

In [137]:
GoodLocus =  ( Literal("LOCUS") +
               Word(alphas+nums+'-_().'+'\\').setResultsName("name") +
               Word(nums).setResultsName("size")+Suppress(CaselessLiteral('bp')) +
               Word(alphas+'-').setResultsName("seqtype") +
               (CaselessLiteral("linear")|CaselessLiteral("circular")).setResultsName("topology") +
               Optional(Word(alphas)).setResultsName("divcode") +
               Word(alphas+nums+'-').setResultsName("date") )

# Older versions of ApE don't include a LOCUS name! Need separate def for this case:
BrokenLocus1 =( Literal("LOCUS").setResultsName("name") +
                Word(nums).setResultsName("size")+Suppress(CaselessLiteral('bp')) +
                Word(alphas+'-').setResultsName("seqtype") +
                (CaselessLiteral("linear")|CaselessLiteral("circular")).setResultsName("topology") +
                Optional(Word(alphas)).setResultsName("divcode") +
                Word(alphas+nums+'-').setResultsName("date") )
            
# LOCUS       YEplac181	5741 bp 	DNA	SYN
BrokenLocus2 =( Literal("LOCUS") +
                Word(alphas+nums+'-_().'+'\\').setResultsName("name") +
                Word(nums).setResultsName("size")+Suppress(CaselessLiteral('bp')) +
                Word(alphas+'-').setResultsName("seqtype") +
                Optional(CaselessLiteral("linear")|CaselessLiteral("circular")).setResultsName("topology") +
                Optional(Word(alphas)).setResultsName("divcode") )


LocusEntry = (GoodLocus|BrokenLocus1|BrokenLocus2)

In [155]:
from pprint import pprint
from collections import defaultdict

for line in locus_lines:
    locusdict = defaultdict(str)
    print(line)
    locusdict.update( LocusEntry.parseString(line) )
    #pprint(locusdict)
    name    = locusdict["name"] or "default"
    size  = locusdict["size"] or "100"
    seqtype = "DNA" or locusdict["seqtype"]
    for p in ["ds-", "ss-", "ms-"]:
        a, *b = seqtype.split(p)
        if b: 
            prefix=p
            seqtype=b.pop()
            break
    prefix  = prefix or "ds-"    
    topology = locusdict["topology"] or "linear"
    divcode = locusdict["divcode"] or "   "
    date = locusdict["date"] or "19-MAR-1970"
    
    print("LOCUS       {name:<24}{size:>4} bp {prefix}{seqtype:<4}    {topology:<8} {divcode} {date}".format(name=name, 
                                                                                                         size=size,
                                                                                                         prefix=prefix,
                                                                                                         seqtype=seqtype, 
                                                                                                         topology=topology,
                                                                                                         divcode=divcode,
                                                                                                         date=date))
    
    
    print("-------------------------------------------------------------------------------")
    
    
    
    

LOCUS       pGREG505     9465 bp    DNA   CIRCULAR SYN        09-OCT-2013
LOCUS       pGREG505                9465 bp ds-DNA     circular SYN 09-OCT-2013
-------------------------------------------------------------------------------
LOCUS       pGREG505                9465 bp    DNA     CIRCULAR SYN 09-OCT-2013
LOCUS       pGREG505                9465 bp ds-DNA     circular SYN 09-OCT-2013
-------------------------------------------------------------------------------
LOCUS       F-GFAP-gCaMP\3-W       11512 bp    DNA     circular     14-JUN-2010
LOCUS       F-GFAP-gCaMP\3-W        11512 bp ds-DNA     circular     14-JUN-2010
-------------------------------------------------------------------------------
LOCUS       BC068339                1466 bp    mRNA    linear   VRT 26-JUN-2007
LOCUS       BC068339                1466 bp ds-DNA     linear   VRT 26-JUN-2007
-------------------------------------------------------------------------------
LOCUS       New_DNA                 9465 bp d