Skip to content

Commit

Permalink
Support writing GenBank REFERENCE lines in SeqIO (Bug 2294, see also …
Browse files Browse the repository at this point in the history
…Bug 3000)
  • Loading branch information
peterjc committed Feb 1, 2010
1 parent 5a87b07 commit 42707bd
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 9 deletions.
48 changes: 47 additions & 1 deletion Bio/SeqIO/InsdcIO.py
Expand Up @@ -460,6 +460,49 @@ def _write_the_first_line(self, record):

self.handle.write(line)

def _write_references(self, record):
number = 0
for ref in record.annotations["references"]:
if not isinstance(ref, SeqFeature.Reference):
continue
number += 1
data = str(number)
#TODO - support more complex record reference locations?
if ref.location and len(ref.location)==1:
a = Alphabet._get_base_alphabet(record.seq.alphabet)
if isinstance(a, Alphabet.ProteinAlphabet):
units = "residues"
else:
units = "bases"
data += " (%s %i to %i)" % (units,
ref.location[0].nofuzzy_start+1,
ref.location[0].nofuzzy_end)
self._write_single_line("REFERENCE",data)
if ref.authors:
#We store the AUTHORS data as a single string
self._write_multi_line(" AUTHORS", ref.authors)
if ref.consrtm:
#We store the consortium as a single string
self._write_multi_line(" CONSRTM", ref.consrtm)
if ref.title:
#We store the title as a single string
self._write_multi_line(" TITLE", ref.title)
if ref.journal:
#We store this as a single string - holds the journal name,
#volume, year, and page numbers of the citation
self._write_multi_line(" JOURNAL", ref.journal)
if ref.medline_id:
#This line type is obsolete and was removed from the GenBank
#flatfile format in April 2005. Should we write it?
#Note this has a two space indent:
self._write_multi_line(" MEDLINE", ref.medline_id)
if ref.pubmed_id:
#Note this has a THREE space indent:
self._write_multi_line(" PUBMED", ref.pubmed_id)
if ref.comment:
self._write_multi_line(" REMARK", ref.comment)


def _write_comment(self, record):
#This is a bit complicated due to the range of possible
#ways people might have done their annotation...
Expand Down Expand Up @@ -576,9 +619,12 @@ def write_record(self, record):
taxonomy = "."
self._write_multi_line("", taxonomy)

#TODO - References...
if "references" in record.annotations:
self._write_references(record)

if "comment" in record.annotations:
self._write_comment(record)

handle.write("FEATURES Location/Qualifiers\n")
for feature in record.features:
self._write_feature(feature)
Expand Down
46 changes: 38 additions & 8 deletions Tests/test_SeqIO_features.py
Expand Up @@ -31,9 +31,9 @@ def write_read(filename, in_format="gb", out_format="gb"):
gb_records2 = list(SeqIO.parse(handle,out_format))
compare_records(gb_records, gb_records2)

def compare_record(old, new, ignore_name=False):
def compare_record(old, new, expect_minor_diffs=False):
#Note the name matching is a bit fuzzy
if not ignore_name \
if not expect_minor_diffs \
and old.id != new.id and old.name != new.name \
and (old.id not in new.id) and (new.id not in old.id) \
and (old.id.replace(" ","_") != new.id.replace(" ","_")):
Expand All @@ -47,21 +47,51 @@ def compare_record(old, new, ignore_name=False):
else:
raise ValueError("'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100]))
if old.features and new.features:
return compare_features(old.features, new.features)
if not compare_features(old.features, new.features):
return False
#Just insist on at least one word in common:
if (old.description or new.description) \
and not set(old.description.split()).intersection(new.description.split()):
raise ValueError("%s versus %s" \
% (repr(old.description), repr(new.description)))
#TODO - check annotation
#This only checks common annotation
#Would a white list be easier?
for key in set(old.annotations.keys()).intersection(new.annotations.keys()):
if key in ["data_file_division", "accessions"]:
#TODO - These are not yet supported on output, or
#have other complications (e.g. different number of accessions
#allowed in various file formats)
continue
if key == "comment":
#Ignore whitespace
if old.annotations[key].split() != new.annotations[key].split():
raise ValueError("Annotation mis-match for comment:\n%s\n%s" \
% (old.annotations[key], new.annotations[key]))
continue
if key == "references":
if expect_minor_diffs:
#TODO - Implement EMBL output of references
continue
assert len(old.annotations[key]) == len(new.annotations[key])
for r1, r2 in zip(old.annotations[key], new.annotations[key]):
assert r1.title == r2.title
assert r1.authors == r2.authors
assert r1.journal == r2.journal
assert r1.consrtm == r2.consrtm
assert r1.medline_id == r2.medline_id
assert r1.pubmed_id == r2.pubmed_id
continue
if repr(old.annotations[key]) != repr(new.annotations[key]):
raise ValueError("Annotation mis-match for %s:\n%s\n%s" \
% (key, old.annotations[key], new.annotations[key]))
return True

def compare_records(old_list, new_list, ignore_name=False):
def compare_records(old_list, new_list, expect_minor_diffs=False):
"""Check two lists of SeqRecords agree, raises a ValueError if mismatch."""
if len(old_list) != len(new_list):
raise ValueError("%i vs %i records" % (len(old_list), len(new_list)))
for old, new in zip(old_list, new_list):
if not compare_record(old,new,ignore_name):
if not compare_record(old,new,expect_minor_diffs):
return False
return True

Expand Down Expand Up @@ -608,7 +638,7 @@ def test_GenBank_vs_EMBL(self):
return
gb_record = SeqIO.read(open(self.gb_filename),"genbank")
embl_record = SeqIO.read(open(self.embl_filename),"embl")
return compare_record(gb_record, embl_record, ignore_name=True)
return compare_record(gb_record, embl_record, expect_minor_diffs=True)

def test_Translations(self):
#"""Checking translation of FASTA features (faa vs ffn)."""
Expand Down Expand Up @@ -636,7 +666,7 @@ def test_Genome(self):
if self.emblname is None:
return
embl_record = SeqIO.read(open(self.embl_filename),"embl")
compare_record(gb_record, embl_record, ignore_name=True)
compare_record(gb_record, embl_record, expect_minor_diffs=True)

def test_Features(self):
#"""Checking GenBank features sequences vs FASTA ffn file."""
Expand Down

0 comments on commit 42707bd

Please sign in to comment.