Skip to content

Commit

Permalink
Updated Record class in Bio.SCOP.Cla to use a dictionary instead of a…
Browse files Browse the repository at this point in the history
… list of pairs, because the file format definition specified by the maintainers of SCOP states that the order of the key-value pairs in the hierarchy field of the Record does not matter.

Also updated the string representation of the Record so that it does not end with a newline character.

Updated tests accordingly.
  • Loading branch information
jfinkels committed Jul 8, 2010
1 parent 7a0477c commit 6d2257d
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 22 deletions.
14 changes: 6 additions & 8 deletions Bio/SCOP/Cla.py
@@ -1,4 +1,5 @@
# Copyright 2001 by Gavin E. Crooks. All rights reserved.
# Copyright 2010 Jeffrey Finkelstein
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
Expand Down Expand Up @@ -40,7 +41,7 @@ def __init__(self, line=None):
self.residues = None
self.sccs = ''
self.sunid =''
self.hierarchy = []
self.hierarchy = {}
if line:
self._process(line)

Expand All @@ -57,8 +58,7 @@ def _process(self, line):

for ht in hierarchy.split(","):
key, value = ht.split('=')
value = int(value)
self.hierarchy.append([key, value])
self.hierarchy[key] = int(value)

def __str__(self):
s = []
Expand All @@ -67,12 +67,10 @@ def __str__(self):
s.append(self.sccs)
s.append(self.sunid)

h=[]
for ht in self.hierarchy:
h.append("=".join(map(str,ht)))
s.append(",".join(h))
s.append(",".join([key + '=' + str(value) for key, value \
in self.hierarchy.items()]))

return "\t".join(map(str,s)) + "\n"
return "\t".join(map(str,s)) #+ "\n"


def parse(handle):
Expand Down
10 changes: 6 additions & 4 deletions Bio/SCOP/__init__.py
@@ -1,5 +1,6 @@
# Copyright 2001 by Gavin E. Crooks. All rights reserved.
# Modifications Copyright 2004/2005 James Casbon. All rights Reserved.
# Copyright 2010 Jeffrey Finkelstein
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
Expand Down Expand Up @@ -318,8 +319,7 @@ def write_cla(self, handle):
nodes = self._sidDict.values()
# We order nodes to ease comparison with original file
nodes.sort(key = lambda n: n.sunid)
for n in nodes:
handle.write(str(n.toClaRecord()))
handle.writelines([str(n.toClaRecord()) + '\n' for n in nodes])


def getDomainFromSQL(self, sunid=None, sid=None):
Expand Down Expand Up @@ -661,10 +661,12 @@ def toClaRecord(self):

n = self
while n.sunid != 0: #Not root node
rec.hierarchy.append( (n.type, str(n.sunid)) )
rec.hierarchy[n.type] = n.sunid
n = n.getParent()

rec.hierarchy.reverse()
# Order does not matter in the hierarchy field. For more info, see
# http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html
#rec.hierarchy.reverse()

return rec

Expand Down
1 change: 1 addition & 0 deletions CONTRIB
Expand Up @@ -29,6 +29,7 @@ Andrew Dalke <dalke at domain acm.org>
Michiel de Hoon <mdehoon at domain c2b2.columbia.edu>
Sjoerd de Vries <sjoerd at domain nmr.chem.uu.nl>
Kyle Ellrott
Jeffrey Finkelstein <jeffrey.finkelstein@gmail.com>
Iddo Friedberg <idoerg at domain burnham.org>
Bertrand Frottier <bertrand.frottier at domain free.fr>
Frederik Gwinner
Expand Down
31 changes: 22 additions & 9 deletions Tests/test_SCOP_Cla.py
@@ -1,4 +1,5 @@
# Copyright 2001 by Gavin E. Crooks. All rights reserved.
# Copyright 2010 Jeffrey Finkelstein
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
Expand Down Expand Up @@ -36,8 +37,20 @@ def testStr(self):
try:
for line in f:
record = Cla.Record(line)
#End of line is platform dependent. Strip it off
self.assertEqual(str(record).rstrip(), line.rstrip())
# The SCOP Classification file format which can be found at
# http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html states
# that the list of classification hierarchy key-value pairs is
# unordered, therefore we need only check that they are all
# there, NOT that they are in the same order.
#
# End of line is platform dependent. Strip it off
expected_hierarchy = line.rstrip().split('\t')[5].split(',')
hierarchy = str(record).rstrip().split('\t')[5].split(',')
expected_hierarchy = dict([pair.split('=') for pair in expected_hierarchy])
hierarchy = dict([pair.split('=') for pair in hierarchy])
self.assertEqual(len(hierarchy), len(expected_hierarchy))
for key, value in hierarchy.items():
self.assertEqual(value, expected_hierarchy[key])
finally:
f.close()

Expand All @@ -56,13 +69,13 @@ def testRecord(self):
self.assertEqual(record.residues.fragments, (('T','',''),('U','91','106')))
self.assertEqual(record.sccs, 'b.1.2.1')
self.assertEqual(record.sunid, 21953)
self.assertEqual(record.hierarchy, [['cl',48724],
['cf',48725],
['sf',49265],
['fa',49266],
['dm',49267],
['sp',49268],
['px',21953]])
self.assertEqual(record.hierarchy, {'cl' : 48724,
'cf' : 48725,
'sf' : 49265,
'fa' : 49266,
'dm' : 49267,
'sp' : 49268,
'px' : 21953})

def testIndex(self):
"""Test CLA file indexing"""
Expand Down
25 changes: 24 additions & 1 deletion Tests/test_SCOP_Scop.py
@@ -1,4 +1,5 @@
# Copyright 2001 by Gavin E. Crooks. All rights reserved.
# Copyright 2010 Jeffrey Finkelstein
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
Expand All @@ -14,6 +15,25 @@

class ScopTests(unittest.TestCase):

def _compare_cla_lines(self, cla_line_1, cla_line_2):
""" Compares the two specified Cla lines for equality.
The order of the key-value pairs in the sixth field of the lines does
not matter. For more information, see
http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html
"""
fields1 = cla_line_1.rstrip().split('\t')
fields2 = cla_line_2.rstrip().split('\t')
# the first five fields in a Cla line should be the same
for i in range(5):
if fields1[i] != fields2[i]:
return False
# get the hierarchy key-value pairs, which are unordered
hierarchy1 = set(fields1[5].split(','))
hierarchy2 = set(fields2[5].split(','))
if hierarchy1 != hierarchy2:
return False
return True

def testParse(self):

Expand All @@ -35,7 +55,10 @@ def testParse(self):

cla_out = StringIO()
scop.write_cla(cla_out)
self.assertEqual(cla_out.getvalue(), cla)
for line, expected_line in zip(cla.rstrip().split('\n'), \
cla_out.getvalue().rstrip() \
.split('\n')):
self.assertTrue(self._compare_cla_lines(line, expected_line))

des_out = StringIO()
scop.write_des(des_out)
Expand Down

4 comments on commit 6d2257d

@peterjc
Copy link

@peterjc peterjc commented on 6d2257d Jul 8, 2010

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there anything in the spec about the possibility of duplicate keys? This can't be handled with a dict.

@jfinkels
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is true, but the file format specification is more of an informal description: http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html#scop-parseable-files

"the sixth column is the list of sunid for the domain: class (cl), fold (cf), superfamily (sf), family (fa), protein domain (dm), species (sp), and domain entry (px). They appear as pairs (key=sunid) so that order does not need to be taken into account in parsing the file and therefore intermediate levels can be added in the future without breaking code written now."

I don't know much about this, but I assumed that the hierarchical classification would remain for some time, and that a hierarchical classification implies a single value for each key.

@jfinkels
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any decision on this? Currently, to examine a member of the Record.hierarchy list, I must manually iterate over the list and check if the first member of each tuple is the key which I'm looking for. This is a pain and is easily remedied by this patch.

@peterjc
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to discuss this on the main list (and CC the original authors)
as it could break existing scripts. Would you like to write it up as a proposal
ith a before/after example? Assuming there are no objections it seems sensible
to me, but I don't use SCOP.

The newline thing seems harmless, and I was meaning to merge that anyway.
If you want that to go in quickly, could you make a clean branch with just that
change (or a patch for just that change)?

Peter

Please sign in to comment.