Skip to content

Commit

Permalink
Added a 'reader' for MoSS output files, which reads the given substru…
Browse files Browse the repository at this point in the history
…ctures as IMolecule's, as we do not have a proper data model for query structures yet

Signed-off-by: Rajarshi Guha <rajarshi.guha@gmail.com>
  • Loading branch information
egonw authored and rajarshi committed Oct 5, 2010
1 parent 1f1cf50 commit 19cf62c
Show file tree
Hide file tree
Showing 5 changed files with 316 additions and 2 deletions.
16 changes: 14 additions & 2 deletions doc/refs/cheminf.bibx
Expand Up @@ -12,7 +12,7 @@
<bibtex:url>http://www.openrasmol.org/doc/rasmol.html#cpkcolours</bibtex:url>
</bibtex:misc>
</bibtex:entry>

<bibtex:entry id="BGdV04a">
<bibtex:article>
<bibtex:author>Berger, F. and Gritzmann, P. and De Vries, S.</bibtex:author>
Expand Down Expand Up @@ -79,7 +79,19 @@
<bibtex:pages>271-275</bibtex:pages>
</bibtex:article>
</bibtex:entry>


<bibtex:entry id="BOR2002">
<bibtex:inproceedings>
<bibtex:author>Borgelt, C. and Berthold, M.R.</bibtex:author>
<bibtex:title>Mining Molecular Fragments: Finding Relevant Substructures of Molecules</bibtex:title>
<bibtex:year>2002</bibtex:year>
<bibtex:booktitle>ICDM '02: Proceedings of the 2002 IEEE International Conference on Data Mining (ICDM'02)</bibtex:booktitle>
<bibtex:isbn>0769517544</bibtex:isbn>
<bibtex:publisher>IEEE Computer Society</bibtex:publisher>
<bibtex:address>Washington, DC, USA</bibtex:address>
</bibtex:inproceedings>
</bibtex:entry>

<bibtex:entry id="BUR89">
<bibtex:article>
<bibtex:author>Burden, F.R.</bibtex:author>
Expand Down
196 changes: 196 additions & 0 deletions src/main/org/openscience/cdk/io/MoSSOutputReader.java
@@ -0,0 +1,196 @@
/* Copyright (C) 2010 Egon Willighagen <egonw@users.sf.net>
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version. All we ask is that proper credit is given for our work,
* which includes - but is not limited to - adding the above copyright notice to
* the beginning of your source code files, and to any copyright notice that you
* may distribute with programs based on this work.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package org.openscience.cdk.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.exception.InvalidSmilesException;
import org.openscience.cdk.interfaces.IChemFile;
import org.openscience.cdk.interfaces.IChemModel;
import org.openscience.cdk.interfaces.IChemObject;
import org.openscience.cdk.interfaces.IChemSequence;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.interfaces.IMoleculeSet;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.MoSSOutputFormat;
import org.openscience.cdk.smiles.SmilesParser;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;

/**
 * Reader for MoSS output files {@cdk.cite BOR2002} which present the results
 * of a substructure mining study. These files look like:
 * <pre>
 * id,description,nodes,edges,s_abs,s_rel,c_abs,c_rel
 * 1,S-c:c:c:c:c:c,7,6,491,5.055081,5,1.7421603
 * 2,S-c:c:c:c:c,6,5,493,5.0756717,5,1.7421603
 * </pre>
 *
 * <p>The first line is treated as a header and skipped. Every following line
 * is split on commas; the <code>description</code> column is parsed as SMILES
 * and the resulting molecule gets four properties, all stored as the raw
 * {@link String} taken from the file:
 * <ul>
 *   <li><code>atomCount</code> - the <code>nodes</code> column</li>
 *   <li><code>bondCount</code> - the <code>edges</code> column</li>
 *   <li><code>focusSupport</code> - the <code>s_rel</code> column</li>
 *   <li><code>complementSupport</code> - the <code>c_rel</code> column</li>
 * </ul>
 *
 * <p><b>Caution</b>: the output contains substructures, not full molecules,
 * even though they are read as such right now.
 *
 * @cdk.module smiles
 * @cdk.githash
 *
 * @cdk.keyword MoSS
 */
@TestClass("org.openscience.cdk.io.MoSSOutputReaderTest")
public class MoSSOutputReader extends DefaultChemObjectReader {

    /** Zero-based index of the <code>description</code> (SMILES) column. */
    private static final int COL_DESCRIPTION = 1;
    /** Zero-based index of the <code>nodes</code> column. */
    private static final int COL_NODES = 2;
    /** Zero-based index of the <code>edges</code> column. */
    private static final int COL_EDGES = 3;
    /** Zero-based index of the <code>s_rel</code> column. */
    private static final int COL_FOCUS_SUPPORT = 5;
    /** Zero-based index of the <code>c_rel</code> column. */
    private static final int COL_COMPLEMENT_SUPPORT = 7;
    /** Number of columns a data line must have to be usable. */
    private static final int MIN_COLUMN_COUNT = COL_COMPLEMENT_SUPPORT + 1;

    private BufferedReader input;
    private static final ILoggingTool logger =
        LoggingToolFactory.createLoggingTool(MoSSOutputReader.class);

    /**
     * Create a reader for MoSS output files from a {@link Reader}.
     *
     * @param input source of MoSS output data
     */
    public MoSSOutputReader(Reader input) {
        this.input = asBufferedReader(input);
    }

    /**
     * Create a reader for MoSS output files from an {@link InputStream}.
     *
     * @param input source of MoSS output data
     */
    public MoSSOutputReader(InputStream input) {
        this(new InputStreamReader(input));
    }

    /**
     * Create a reader for MoSS output files from an empty string.
     */
    public MoSSOutputReader() {
        this(new StringReader(""));
    }

    /** {@inheritDoc} */
    @TestMethod("testGetFormat")
    public IResourceFormat getFormat() {
        return MoSSOutputFormat.getInstance();
    }

    /**
     * Replaces the input source of this reader. The previously set source
     * is not closed by this method.
     *
     * @param reader the new source of MoSS output data
     */
    @TestMethod("testSetReader_Reader")
    public void setReader(Reader reader) throws CDKException {
        // Wrap the *argument*; the old code wrapped the stale field instead,
        // so the new source was silently ignored.
        this.input = asBufferedReader(reader);
    }

    /** {@inheritDoc} */
    @TestMethod("testSetReader_InputStream")
    public void setReader(InputStream input) throws CDKException {
        setReader(new InputStreamReader(input));
    }

    /**
     * Tests whether this reader can fill objects of the given class. That is
     * the case when the class is, or (through any of its super classes)
     * directly implements, {@link IMoleculeSet} or {@link IChemFile}.
     *
     * @param testClass the class to test
     * @return true if instances of the class can be passed to
     *         {@link #read(IChemObject)}
     */
    @TestMethod("testAccepts")
    public boolean accepts(Class testClass) {
        // the interfaces themselves are acceptable too
        if (IMoleculeSet.class.equals(testClass)) return true;
        if (IChemFile.class.equals(testClass)) return true;
        Class[] interfaces = testClass.getInterfaces();
        for (int i = 0; i < interfaces.length; i++) {
            if (IMoleculeSet.class.equals(interfaces[i])) return true;
            if (IChemFile.class.equals(interfaces[i])) return true;
        }
        Class superClass = testClass.getSuperclass();
        if (superClass != null) return this.accepts(superClass);
        return false;
    }

    /**
     * Read the substructures from the input source into the given object.
     * An {@link IMoleculeSet} is filled directly; for an {@link IChemFile}
     * a new sequence with a single model holding the molecule set is added.
     *
     * <p>An {@link IOException} raised while reading is logged and not
     * rethrown; whatever was read up to that point is returned.
     *
     * @param object an {@link IMoleculeSet} or {@link IChemFile} into which
     *               the data is stored.
     * @return the passed object, with the read content added
     * @throws CDKException if the object is neither an {@link IMoleculeSet}
     *                      nor an {@link IChemFile}
     */
    @SuppressWarnings("unchecked")
    public <T extends IChemObject> T read(T object) throws CDKException {
        if (object instanceof IMoleculeSet) {
            IMoleculeSet cf = (IMoleculeSet)object;
            try {
                cf = readMoleculeSet(cf);
            } catch (IOException e) {
                logIOException(e);
            }
            return (T)cf;
        } else if (object instanceof IChemFile) {
            IChemFile chemFile = (IChemFile)object;
            IChemSequence chemSeq = object.getBuilder().newInstance(IChemSequence.class);
            IChemModel chemModel = object.getBuilder().newInstance(IChemModel.class);
            IMoleculeSet molSet = object.getBuilder().newInstance(IMoleculeSet.class);
            try {
                molSet = readMoleculeSet(molSet);
            } catch (IOException e) {
                logIOException(e);
            }
            chemModel.setMoleculeSet(molSet);
            chemSeq.addChemModel(chemModel);
            chemFile.addChemSequence(chemSeq);
            return (T)chemFile;
        } else {
            throw new CDKException(
                "Only supported is reading of IMoleculeSet and IChemFile."
            );
        }
    }

    /**
     * Read the file content into a {@link IMoleculeSet}. The first line is
     * skipped as header. Blank lines, lines with too few columns and lines
     * whose description is not valid SMILES are logged and skipped, so that
     * one bad line does not abort reading of the remaining lines.
     *
     * @param molSet the set to which the read molecules are added
     * @return the passed set
     * @throws IOException if reading from the input source fails
     */
    private IMoleculeSet readMoleculeSet(IMoleculeSet molSet) throws IOException {
        SmilesParser parser = new SmilesParser(molSet.getBuilder());
        // MoSS writes aromatic atoms in lower case; keep them flagged as such
        parser.setPreservingAromaticity(true);
        String line = input.readLine(); // the header line, which is skipped
        if (line == null) return molSet; // empty input
        while ((line = input.readLine()) != null) {
            if (line.trim().length() == 0) continue;
            String[] cols = line.split(",");
            if (cols.length < MIN_COLUMN_COUNT) {
                logger.error("Skipping line with too few columns: " + line);
                continue;
            }
            try {
                IMolecule mol = parser.parseSmiles(cols[COL_DESCRIPTION]);
                mol.setProperty("focusSupport", cols[COL_FOCUS_SUPPORT]);
                mol.setProperty("complementSupport", cols[COL_COMPLEMENT_SUPPORT]);
                mol.setProperty("atomCount", cols[COL_NODES]);
                mol.setProperty("bondCount", cols[COL_EDGES]);
                molSet.addMolecule(mol);
            } catch (InvalidSmilesException exception) {
                logger.error("Skipping invalid SMILES: " + cols[COL_DESCRIPTION]);
                logger.debug(exception);
            }
        }
        return molSet;
    }

    /** Returns the reader itself if it is already buffered, a buffering wrapper otherwise. */
    private static BufferedReader asBufferedReader(Reader reader) {
        if (reader instanceof BufferedReader) {
            return (BufferedReader)reader;
        }
        return new BufferedReader(reader);
    }

    /** Logs a read failure, including the detail of the causing exception. */
    private static void logIOException(IOException exception) {
        logger.error(
            "Input/Output error while reading from input: " + exception.getMessage()
        );
        logger.debug(exception);
    }

    /** {@inheritDoc} */
    @TestMethod("testClose")
    public void close() throws IOException {
        input.close();
    }
}
20 changes: 20 additions & 0 deletions src/test/data/moss/TKO.mossoutput
@@ -0,0 +1,20 @@
id,description,nodes,edges,s_abs,s_rel,c_abs,c_rel
1,S-c:c:c:c:c:c,7,6,491,5.055081,5,1.7421603
2,S-c:c:c:c:c,6,5,493,5.0756717,5,1.7421603
3,S-c:c:c:c,5,4,496,5.1065583,5,1.7421603
4,S-c:c:c,4,3,498,5.127149,5,1.7421603
5,S-c:c,3,2,509,5.2403994,5,1.7421603
6,s(:c):c(-C):c,5,4,491,5.055081,2,0.6968641
7,s(:c):c-C,4,3,516,5.312468,3,1.0452962
8,s1:c:c:c:c:1,5,5,610,6.280243,5,1.7421603
9,s:c(-C):c,4,3,492,5.0653763,2,0.6968641
10,s:c-C,3,2,517,5.3227634,3,1.0452962
11,F-C-F,3,2,499,5.1374445,4,1.3937283
12,Cl-c1:c:c:c:c:c:1-O-C,9,9,598,6.1566973,3,1.0452962
13,Cl-c1:c:c:c:c:c:1-O,8,8,616,6.3420157,3,1.0452962
14,Cl-c(:c):c(-O-C):c:c,8,7,601,6.1875834,3,1.0452962
15,Cl-c(:c):c(-O):c:c,7,6,619,6.3729024,3,1.0452962
16,Cl-c(:c:c:c):c-O-C,8,7,599,6.1669927,3,1.0452962
17,Cl-c(:c:c:c):c-O,7,6,617,6.352311,3,1.0452962
18,Cl-c(:c):c-O-C,6,5,602,6.1978793,3,1.0452962
19,Cl-c(:c):c-O,5,4,620,6.383198,3,1.0452962
84 changes: 84 additions & 0 deletions src/test/org/openscience/cdk/io/MoSSOutputReaderTest.java
@@ -0,0 +1,84 @@
/* Copyright (C) 2010 Egon Willighagen <egonw@users.sf.net>
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version. All we ask is that proper credit is given for our work,
* which includes - but is not limited to - adding the above copyright notice to
* the beginning of your source code files, and to any copyright notice that you
* may distribute with programs based on this work.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package org.openscience.cdk.io;

import java.io.InputStream;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openscience.cdk.MoleculeSet;
import org.openscience.cdk.interfaces.IAtomContainer;

/**
 * Tests the {@link MoSSOutputReader} against the example MoSS output file
 * <code>data/moss/TKO.mossoutput</code>.
 *
 * @cdk.module test-smiles
 */
public class MoSSOutputReaderTest extends SimpleChemObjectReaderTest {

    /** Class path location of the example file shared by all tests. */
    private static final String EXAMPLE_FILE = "data/moss/TKO.mossoutput";

    @BeforeClass public static void setup() {
        setSimpleChemObjectReader(new MoSSOutputReader(), EXAMPLE_FILE);
    }

    @Test public void testAccepts() {
        MoSSOutputReader reader = new MoSSOutputReader();
        Assert.assertTrue(reader.accepts(MoleculeSet.class));
    }

    /**
     * The atom and bond counts of every parsed substructure must match the
     * node and edge counts listed in the file.
     */
    @Test public void testExampleFile_MolReading() throws Exception {
        MoleculeSet moleculeSet = readExampleFile();
        Assert.assertEquals(19, moleculeSet.getAtomContainerCount());
        for (IAtomContainer mol : moleculeSet.molecules()) {
            Assert.assertEquals(
                Integer.parseInt(mol.getProperty("atomCount").toString()),
                mol.getAtomCount()
            );
            Assert.assertEquals(
                Integer.parseInt(mol.getProperty("bondCount").toString()),
                mol.getBondCount()
            );
        }
    }

    /**
     * The relative support in the focus and in the complement must be taken
     * from the s_rel and c_rel columns, respectively.
     */
    @Test public void testExampleFile_SupportColumns() throws Exception {
        MoleculeSet moleculeSet = readExampleFile();
        Assert.assertEquals(
            5.06,
            Double.parseDouble(
                moleculeSet.getMolecule(0).getProperty("focusSupport").toString()
            ),
            0.01
        );
        Assert.assertEquals(
            1.74,
            Double.parseDouble(
                moleculeSet.getMolecule(0).getProperty("complementSupport").toString()
            ),
            0.01
        );
    }

    /**
     * Reads the example file into a new {@link MoleculeSet}, failing with a
     * clear message if the resource is missing, and closing the reader
     * whether or not reading succeeds.
     */
    private MoleculeSet readExampleFile() throws Exception {
        InputStream ins = this.getClass().getClassLoader().getResourceAsStream(EXAMPLE_FILE);
        Assert.assertNotNull("Test resource not found: " + EXAMPLE_FILE, ins);
        MoSSOutputReader reader = new MoSSOutputReader(ins);
        try {
            return reader.read(new MoleculeSet());
        } finally {
            reader.close();
        }
    }

}
2 changes: 2 additions & 0 deletions src/test/org/openscience/cdk/modulesuites/MsmilesTests.java
Expand Up @@ -27,6 +27,7 @@
import org.junit.runners.Suite;
import org.junit.runners.Suite.SuiteClasses;
import org.openscience.cdk.coverage.SmilesCoverageTest;
import org.openscience.cdk.io.MoSSOutputReaderTest;
import org.openscience.cdk.io.SMILESReaderTest;
import org.openscience.cdk.io.iterator.IteratingSMILESReaderTest;
import org.openscience.cdk.smiles.DeduceBondSystemToolTest;
Expand All @@ -45,6 +46,7 @@
SMILESReaderTest.class,
IteratingSMILESReaderTest.class,
DeduceBondSystemToolTest.class,
MoSSOutputReaderTest.class,
SmilesParserTest.class,
SmilesGeneratorTest.class,
NormalizerTest.class
Expand Down

0 comments on commit 19cf62c

Please sign in to comment.