In [9]:
from pyparsing import *

In [10]:
# Building blocks shared by the LOCUS-line grammars below. setResultsName()
# returns a copy, so reusing one element in several grammars is safe.
_locus_kw = Literal("LOCUS")
_locus_name = Word(alphas + nums + "-_()." + "\\").setResultsName("name")
_locus_size = Word(nums).setResultsName("size")
_bp_unit = Suppress(CaselessLiteral("bp"))
_locus_seqtype = Word(alphas + "-").setResultsName("seqtype")
_locus_shape = CaselessLiteral("linear") | CaselessLiteral("circular")
_locus_divcode = Optional(Word(alphas)).setResultsName("divcode")
_locus_date = Word(alphas + nums + "-").setResultsName("date")

# Complete LOCUS line: keyword, name, size, "bp", sequence type, topology,
# optional division code and date.
GoodLocus = (
    _locus_kw
    + _locus_name
    + _locus_size
    + _bp_unit
    + _locus_seqtype
    + _locus_shape.setResultsName("topology")
    + _locus_divcode
    + _locus_date
)

# Older versions of ApE don't include a LOCUS name! Need separate def for this case:
# NOTE(review): the "LOCUS" keyword itself is stored under "name" here, so the
# parsed name is the string "LOCUS" - confirm that this is intended.
BrokenLocus1 = (
    _locus_kw.setResultsName("name")
    + _locus_size
    + _bp_unit
    + _locus_seqtype
    + _locus_shape.setResultsName("topology")
    + _locus_divcode
    + _locus_date
)

# Line without a date and with an optional topology, e.g.:
# LOCUS       YEplac181	5741 bp 	DNA	SYN
BrokenLocus2 = (
    _locus_kw
    + _locus_name
    + _locus_size
    + _bp_unit
    + _locus_seqtype
    + Optional(_locus_shape).setResultsName("topology")
    + _locus_divcode
)


# Alternatives are tried in this order; the first grammar that matches wins.
LocusEntry = GoodLocus | BrokenLocus1 | BrokenLocus2

In [42]:
import pydna
from pprint import pprint
from collections import defaultdict
import os

# Rewrite the LOCUS line of every file in TESTFILES_DIR into a normalised
# GenBank LOCUS line and parse the resulting record with Biopython.
#
# After the loop the following names refer to the LAST processed file:
#   locus_line - the rebuilt LOCUS line
#   ns         - the file text with its first line replaced by locus_line
#   rawseq     - the first sequence record found in ns
#   parsed     - the SeqRecord returned by SeqIO.read for rawseq
import io
import re
import textwrap

from Bio import SeqIO

try:
    from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
except ImportError:
    # Bio.Alphabet was removed in Biopython 1.78; newer releases do not accept
    # an alphabet argument, so parse without one there.
    IUPACAmbiguousDNA = None

TESTFILES_DIR = "testfiles"

# Matches either one FASTA record (">" header plus sequence) or one
# GenBank/EMBL record (from "LOCUS"/"ID" up to the terminating "//" line).
pattern = r"(?:>.+\n^(?:^[^>]+?)(?=\n\n|>|LOCUS|ID))|(?:(?:LOCUS|ID)(?:(?:.|\n)+?)^//)"

# Sorted so the processing order (and therefore the final value of `parsed`)
# does not depend on the arbitrary order returned by os.listdir.
for filename in sorted(os.listdir(TESTFILES_DIR)):
    if filename.startswith("."):
        continue  # skip hidden files

    # Plain "r" already uses universal newlines on Python 3; the "U" flag was
    # removed in Python 3.11.
    with open(os.path.join(TESTFILES_DIR, filename), "r") as fh:
        rawlines = fh.readlines()
    if not rawlines:
        continue  # empty file: there is no LOCUS line to repair

    line = rawlines[0]
    # defaultdict(str) makes every field the grammar did not capture read as "".
    locusdict = defaultdict(str)
    locusdict.update(LocusEntry.parseString(line))

    name = locusdict["name"] or "default"
    size = locusdict["size"] or "100"
    # Use the parsed sequence type and fall back to "DNA" only when missing
    # (the operands used to be swapped, which always produced "DNA").
    seqtype = locusdict["seqtype"] or "DNA"

    # Split a strandedness prefix such as "ds-" off the sequence type.
    prefix = ""
    for p in ("ds-", "ss-", "ms-"):
        _, found, remainder = seqtype.rpartition(p)
        if found:
            prefix = p
            seqtype = remainder
            break
    prefix = prefix or "ds-"

    topology = locusdict["topology"] or "linear"
    divcode = locusdict["divcode"] or "   "
    date = locusdict["date"] or "19-MAR-1970"

    locus_line = "LOCUS       {name:<24}{size:>4} bp {prefix}{seqtype:<4}    {topology:<8} {divcode} {date}".format(
        name=name,
        size=size,
        prefix=prefix,
        seqtype=seqtype,
        topology=topology,
        divcode=divcode,
        date=date,
    )
    print(locus_line)

    # Lines from readlines() keep their own "\n", so only the rebuilt LOCUS
    # line needs one added; joining with "\n" would double every line break.
    ns = locus_line + "\n" + "".join(rawlines[1:])
    raw = textwrap.dedent(ns).strip()

    rawseqs = re.findall(pattern, textwrap.dedent(raw + "\n\n"), flags=re.MULTILINE)
    if not rawseqs:
        raise ValueError("no sequence record found in {!r}".format(filename))
    rawseq = rawseqs.pop(0)

    handle = io.StringIO(rawseq)
    if IUPACAmbiguousDNA is None:
        parsed = SeqIO.read(handle, "genbank")
    else:
        parsed = SeqIO.read(handle, "genbank", alphabet=IUPACAmbiguousDNA())

LOCUS       BC068339                5741 bp ds-DNA     linear       26-JUN-2007




In [45]:
# `parsed` is reassigned on every iteration of the file-processing loop, so
# this prints the sequence of whichever record was parsed last.
print(parsed.seq)

GCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGTGAGCGCAACGCAATTAATGTGAGTTAGCTCACTCATTAGGCACCCCAGGCTTTACACTTTATGCTTCCGGCTCGTATGTTGTGTGGAATTGTGAGCGGATAACAATTTCACACAGGAAACAGCTATGACCATGATTACGCCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGGTACCGAGCTCGAATTCACTGGCCGTCGTTTTACAACGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACATCCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCCCAACAGTTGCGCAGCCTGAATGGCGAATGGCGCCTGATGCGGTATTTTCTCCTTACGCATCTGTGCGGTATTTCACACCGCATATATCGGATCGTACTTGTTACCCATCATTGAATTTTGAACATCCGAACCTGGGAGTTTTCCCTGAAACAGATAGTATATTTGAACCTGTATAATAATATATAGTCTAGCGCTTTACGGAAGACAATGTATGTATTTCGGTTCCTGGAGAAACTATTGCATCTATTGCATAGGTAATCTTGCACGTCGCATCCCCGGTTCATTTTCTGCGTTTCCATCTTGCACTTCAATAGCATATCTTTGTTAACGAAGCATCTGTGCTTCATTTTGTAGAACAAAAATGCAACGCGAGAGCGCTAATTTTTCAAACAAAGAATCTGAGCTGCATTTTTACAGAACAGAAATGCAACGCGAAAGCGCTATTTTACCAACGAAGAATCTGTGCTTCATTTTTGTAAAACAAAAATGCAACGCGAGAGCGCTAATTTTTCAAACAAAGAATCTGAGCTGCATTTTTACAGAACAGAAATGCAACGCGAGAGCGCTATTTTACCAACAAAGAA