Navigation Menu

Skip to content

Commit

Permalink
R-group query changes for MDL reading/writing
Browse files Browse the repository at this point in the history
Signed-off-by: Egon Willighagen <egonw@users.sourceforge.net>
  • Loading branch information
Mark Rynbeek authored and egonw committed Mar 30, 2010
1 parent 1fc3430 commit fb33d7e
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 19 deletions.
54 changes: 46 additions & 8 deletions src/main/org/openscience/cdk/io/MDLV2000Reader.java
Expand Up @@ -29,8 +29,12 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import javax.vecmath.Point2d;
Expand Down Expand Up @@ -100,6 +104,9 @@ public class MDLV2000Reader extends DefaultChemObjectReader {

private BooleanIOSetting forceReadAs3DCoords;
private BooleanIOSetting interpretHydrogenIsotopes;

//Keep track of atoms and the lines they were on in the atom block.
private List<IAtom> atomsByLinePosition;

public MDLV2000Reader() {
this(new StringReader(""));
Expand Down Expand Up @@ -341,6 +348,8 @@ private IMolecule readMolecule(IMolecule molecule) throws CDKException {
//String help;
IAtom atom;
String line = "";
//A map to keep track of R# atoms so that RGP line can be parsed
Map<Integer,IPseudoAtom> rAtoms = new HashMap<Integer,IPseudoAtom>();

try {
IsotopeFactory isotopeFactory = IsotopeFactory.getInstance(molecule.getBuilder());
Expand Down Expand Up @@ -400,8 +409,11 @@ private IMolecule readMolecule(IMolecule molecule) throws CDKException {

// read ATOM block
logger.info("Reading atom block");
atomsByLinePosition = new ArrayList<IAtom>();
atomsByLinePosition.add(null); // 0 is not a valid position
int atomBlockLineNumber=0;
for (int f = 0; f < atoms; f++) {
line = input.readLine(); linecount++;
line = input.readLine(); linecount++; atomBlockLineNumber++;
x = Double.parseDouble(line.substring(0, 10).trim());
y = Double.parseDouble(line.substring(10, 20).trim());
z = Double.parseDouble(line.substring(20, 30).trim());
Expand All @@ -425,21 +437,29 @@ private IMolecule readMolecule(IMolecule molecule) throws CDKException {
atom = molecule.getBuilder().newPseudoAtom(element);
} else if ("L".equals(element)) {
atom = molecule.getBuilder().newPseudoAtom(element);
} else if (element.length() > 0 && element.charAt(0) == 'R'){
logger.debug("Atom ", element, " is not an regular element. Creating a PseudoAtom.");
} else if ( element.equals("R") ||
(element.length() > 0 && element.charAt(0) == 'R')){
logger.debug("Atom ", element, " is not an regular element. Creating a PseudoAtom.");
//check if the element is R
rGroup=element.split("^R");
atom=null;
if (rGroup.length >1){
try{
Rnumber= Integer.valueOf(rGroup[(rGroup.length - 1)]);
RGroupCounter=Rnumber;
element="R"+Rnumber;
atom = molecule.getBuilder().newPseudoAtom(element);

}catch(Exception ex){
Rnumber=RGroupCounter;
RGroupCounter++;
// This happens for atoms labeled "R#".
// The Rnumber may be set later on, using RGP line
atom = molecule.getBuilder().newPseudoAtom("R");
rAtoms.put(atomBlockLineNumber,(IPseudoAtom)atom);
}
element="R"+Rnumber;
}
atom = molecule.getBuilder().newPseudoAtom(element);
else {
atom = molecule.getBuilder().newPseudoAtom("R");
}
} else {
handleError(
"Invalid element type. Must be an existing " +
Expand Down Expand Up @@ -472,7 +492,7 @@ private IMolecule readMolecule(IMolecule molecule) throws CDKException {
} else {
logger.error("Cannot set mass difference for a non-element!");
}


String chargeCodeString = line.substring(36,39).trim();
logger.debug("Atom charge code: ", chargeCodeString);
Expand Down Expand Up @@ -523,6 +543,7 @@ private IMolecule readMolecule(IMolecule molecule) throws CDKException {
}

molecule.addAtom(atom);
atomsByLinePosition.add(atom);
}

// convert to 2D, if totalZ == 0
Expand Down Expand Up @@ -750,6 +771,19 @@ private IMolecule readMolecule(IMolecule molecule) throws CDKException {
exception
);
}
} else if (line.startsWith("M RGP")) {
StringTokenizer st = new StringTokenizer(line);
//Ignore first 3 tokens (overhead).
st.nextToken(); st.nextToken(); st.nextToken();
//Process the R group numbers as defined in RGP line.
while (st.hasMoreTokens()) {
Integer position = new Integer(st.nextToken());
Rnumber = new Integer(st.nextToken());
IPseudoAtom pseudoAtom = rAtoms.get(position);
if (pseudoAtom!=null) {
pseudoAtom.setLabel("R"+Rnumber);
}
}
}
if (!lineRead) {
logger.warn("Skipping line in property block: ", line);
Expand Down Expand Up @@ -824,5 +858,9 @@ public IOSetting[] getIOSettings() {
settings[1] = interpretHydrogenIsotopes;
return settings;
}

/**
 * Returns the atoms indexed by the line they occupied in the atom block
 * of the most recently read molecule.
 *
 * <p>The list is rebuilt each time the atom block is read. Index 0 holds
 * <code>null</code> because 0 is not a valid atom block position, so the
 * atom read from atom block line <i>n</i> is found at index <i>n</i>.
 * The internal list itself is returned, not a copy.
 *
 * @return the atoms by atom block line position
 *         (NOTE(review): presumably <code>null</code> if no molecule has
 *         been read yet - confirm against the constructors)
 */
public List<IAtom> getAtomsByLinePosition() {
    return this.atomsByLinePosition;
}
}

53 changes: 43 additions & 10 deletions src/main/org/openscience/cdk/io/MDLWriter.java
Expand Up @@ -2,6 +2,7 @@
*
* Copyright (C) 1997-2007 The Chemistry Development Kit (CDK) project
* 2009 Egon Willighagen <egonw@users.sf.net>
* 2010 Mark Rijnbeek <mark_rynbeek@users.sf.net>
*
* Contact: cdk-devel@lists.sourceforge.net
*
Expand Down Expand Up @@ -33,9 +34,12 @@
import java.io.Writer;
import java.text.NumberFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.TimeZone;

import org.openscience.cdk.CDKConstants;
Expand Down Expand Up @@ -221,6 +225,7 @@ private void writeChemFile(IChemFile file) throws Exception {
*/
public void writeMolecule(IAtomContainer container) throws Exception {
String line = "";
List<Integer> rgroupList=null;
// write header block
// lines get shortened to 80 chars, that's in the spec
String title = (String)container.getProperty(CDKConstants.TITLE);
Expand All @@ -240,10 +245,8 @@ public void writeMolecule(IAtomContainer container) throws Exception {
* if input through MDL form.
* A blank line can be substituted for line 2.
*/
writer.write(" CDK ");
writer.write(new SimpleDateFormat("M/d/y,H:m",Locale.US).format(
Calendar.getInstance(TimeZone.getDefault()).getTime())
);
writer.write(" CDK ");
writer.write(new SimpleDateFormat("MMddyyHHmm").format(System.currentTimeMillis()));
writer.newLine();

String comment = (String)container.getProperty(CDKConstants.REMARK);
Expand All @@ -254,7 +257,7 @@ public void writeMolecule(IAtomContainer container) throws Exception {
writer.newLine();

// write Counts line
line += formatMDLInt(container.getAtomCount(), 3);
line += formatMDLInt(container.getAtomCount(), 3);
line += formatMDLInt(container.getBondCount(), 3);
line += " 0 0 0 0 0 0 0 0999 V2000";
writer.write(line);
Expand All @@ -281,8 +284,17 @@ public void writeMolecule(IAtomContainer container) throws Exception {
}
if(container.getAtom(f) instanceof IPseudoAtom){
//according to http://www.google.co.uk/url?sa=t&ct=res&cd=2&url=http%3A%2F%2Fwww.mdl.com%2Fdownloads%2Fpublic%2Fctfile%2Fctfile.pdf&ei=MsJjSMbjAoyq1gbmj7zCDQ&usg=AFQjCNGaJSvH4wYy4FTXIaQ5f7hjoTdBAw&sig2=eSfruNOSsdMFdlrn7nhdAw an R group is written as R#
if(((IPseudoAtom) container.getAtom(f)).getLabel().equals("R"))
line += "R#";
IPseudoAtom pseudoAtom = (IPseudoAtom) container.getAtom(f);
if (pseudoAtom.getSymbol().equals("R") && pseudoAtom.getLabel().length()>1) {
line += "R# ";
if (rgroupList==null) {
rgroupList = new ArrayList<Integer>();
}
Integer rGroupNumber = new Integer(pseudoAtom.getLabel().substring(1));
rgroupList.add(f+1);
rgroupList.add(rGroupNumber);

}
else
line += formatMDLString(((IPseudoAtom) container.getAtom(f)).getLabel(), 3);
}else{
Expand Down Expand Up @@ -370,6 +382,27 @@ public void writeMolecule(IAtomContainer container) throws Exception {
}
}
}

//write RGP line (max occurrence is 16 data points per line)
if (rgroupList!=null) {
StringBuffer rgpLine=new StringBuffer();
int cnt=0;
for (int i=1; i<= rgroupList.size(); i++) {

rgpLine.append(formatMDLInt((rgroupList.get(i-1)), 4));
i++;
rgpLine.append(formatMDLInt((rgroupList.get(i-1)), 4));

cnt++;
if (i==rgroupList.size() || i==16 ) {
rgpLine.insert(0, "M RGP"+formatMDLInt(cnt, 3));
writer.write(rgpLine.toString());
writer.newLine();
rgpLine=new StringBuffer();
cnt=0;
}
}
}

// close molecule
writer.write("M END");
Expand All @@ -385,7 +418,7 @@ public void writeMolecule(IAtomContainer container) throws Exception {
* @param l Length of the String
* @return The String to be written into the connectiontable
*/
private String formatMDLInt(int i, int l) {
protected static String formatMDLInt(int i, int l) {
String s = "", fs = "";
NumberFormat nf = NumberFormat.getNumberInstance(Locale.ENGLISH);
nf.setParseIntegerOnly(true);
Expand All @@ -410,7 +443,7 @@ private String formatMDLInt(int i, int l) {
* @param fl The float to be formated
* @return The String to be written into the connectiontable
*/
private String formatMDLFloat(float fl) {
protected static String formatMDLFloat(float fl) {
String s = "", fs = "";
int l;
NumberFormat nf = NumberFormat.getNumberInstance(Locale.ENGLISH);
Expand All @@ -436,7 +469,7 @@ private String formatMDLFloat(float fl) {
* @param le The length of the String
* @return The String to be written in the connectiontable
*/
private String formatMDLString(String s, int le) {
protected static String formatMDLString(String s, int le) {
s = s.trim();
if (s.length() > le)
return s.substring(0, le);
Expand Down
2 changes: 1 addition & 1 deletion src/test/org/openscience/cdk/io/ChemObjectIOTest.java
Expand Up @@ -117,7 +117,7 @@ public static void setChemObjectIO(IChemObjectIO aChemObjectIO) {
oneAccepted = true;
}
}
Assert.assertTrue("At least one of the following IChemObect's should be accepted: IChemFile, IChemModel, IMolecule, IReaction", oneAccepted);
Assert.assertTrue("At least one of the following IChemObect's should be accepted: IChemFile, IChemModel, IMolecule, IReaction, IRGroupQuery", oneAccepted);
}

@Test public void testClose() throws Exception {
Expand Down
75 changes: 75 additions & 0 deletions src/test/org/openscience/cdk/io/MDLV2000ReaderTest.java
Expand Up @@ -666,4 +666,79 @@ public void testQueryBondTypes() throws Exception {
reader.read(new ChemFile());
}

/**
 * Tests the numbering of R# atoms according to the RGP line.
 *
 * <p>Reads <code>data/mdl/rgroups.mol</code> and, for every bond, checks
 * the label of the R-group pseudo atom based on the element it is bonded
 * to: N and As expect "R4", P expects "R1" and Si expects a plain "R".
 * Each bond in the file is expected to contain a pseudo atom; if the
 * first atom of a bond is not one, the second atom is cast to
 * {@link PseudoAtom} without further checking.
 *
 * @throws Exception if the resource cannot be read or parsed
 */
@Test public void testRGroupHashNumbering() throws Exception {
    String filename = "data/mdl/rgroups.mol";
    logger.info("Testing: " + filename);
    InputStream ins = this.getClass().getClassLoader().getResourceAsStream(filename);
    MDLV2000Reader reader = new MDLV2000Reader(ins);
    Molecule mol = (Molecule)reader.read(new Molecule());
    for (IBond bond : mol.bonds()) {
        // Work out which end of the bond is the R-group and which is its partner.
        PseudoAtom rGroup = null;
        IAtom partner = null;
        if (bond.getAtom(0) instanceof PseudoAtom) {
            rGroup = (PseudoAtom)bond.getAtom(0);
            partner = bond.getAtom(1);
        } else {
            partner = bond.getAtom(0);
            rGroup = (PseudoAtom)bond.getAtom(1);
        }

        // JUnit expects (expected, actual); keeping that order makes
        // failure messages report the right value as "expected".
        if (partner.getSymbol().equals("N")) {
            Assert.assertEquals("R4", rGroup.getLabel());
        } else if (partner.getSymbol().equals("P")) {
            Assert.assertEquals("R1", rGroup.getLabel());
        } else if (partner.getSymbol().equals("As")) {
            Assert.assertEquals("R4", rGroup.getLabel());
        } else if (partner.getSymbol().equals("Si")) {
            Assert.assertEquals("R", rGroup.getLabel());
        }
    }
}


/**
 * Tests hard-coded R-group numbers in the atom block.
 *
 * <p>Hard coding the number (e.g. "R5") is accepted by the reader but
 * should not really be done; the CTFile spec uses a hash (R#) instead.
 * Reads <code>data/mdl/rgroupsNumbered.mol</code> and identifies each
 * R-group by the bond it sits on: the double bond expects "R32", the
 * DOWN stereo bond "R2", the UP stereo bond "R20" and any other bond "R5".
 *
 * @throws Exception if the resource cannot be read or parsed
 */
@Test public void testRGroupHardcodedNumbering() throws Exception {
    String filename = "data/mdl/rgroupsNumbered.mol";
    logger.info("Testing: " + filename);
    InputStream ins = this.getClass().getClassLoader().getResourceAsStream(filename);
    MDLV2000Reader reader = new MDLV2000Reader(ins);
    Molecule mol = (Molecule)reader.read(new Molecule());
    for (IBond bond : mol.bonds()) {
        // Each bond is expected to contain a pseudo atom at one of its ends.
        PseudoAtom rGroup = null;
        if (bond.getAtom(0) instanceof PseudoAtom) {
            rGroup = (PseudoAtom)bond.getAtom(0);
        } else {
            rGroup = (PseudoAtom)bond.getAtom(1);
        }

        // JUnit expects (expected, actual); keeping that order makes
        // failure messages report the right value as "expected".
        if (bond.getOrder() == IBond.Order.DOUBLE) {
            Assert.assertEquals("R32", rGroup.getLabel());
        } else if (bond.getStereo() == IBond.Stereo.DOWN) {
            Assert.assertEquals("R2", rGroup.getLabel());
        } else if (bond.getStereo() == IBond.Stereo.UP) {
            Assert.assertEquals("R20", rGroup.getLabel());
        } else {
            Assert.assertEquals("R5", rGroup.getLabel());
        }
    }
}

}
34 changes: 34 additions & 0 deletions src/test/org/openscience/cdk/io/MDLWriterTest.java
Expand Up @@ -46,6 +46,7 @@
import org.openscience.cdk.interfaces.IChemModel;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.io.listener.PropertiesListener;
import org.openscience.cdk.templates.MoleculeFactory;

Expand Down Expand Up @@ -219,5 +220,38 @@ public class MDLWriterTest extends ChemObjectIOTest {
String output = writer.toString();
Assert.assertTrue(output.contains("title1; title2"));
}

/**
 * Checks that an R-group pseudo atom is written with the hash notation
 * (R#) in the atom block and that its number ends up on a separate
 * RGP line.
 *
 * <p>The molecule is a carbon bonded to an "R12" pseudo atom (atom 1)
 * and to an "A" pseudo atom that must not be treated as an R-group.
 */
@Test public void testRGPLine() throws Exception {
    // Atom 1: the R-group whose number (12) must appear on the RGP line.
    IPseudoAtom rGroupAtom = builder.newPseudoAtom();
    rGroupAtom.setSymbol("R");
    rGroupAtom.setLabel("R12");

    // Atom 2: ordinary carbon linking both pseudo atoms.
    IAtom carbon = builder.newAtom("C");

    // Atom 3: a non-R pseudo atom.
    IPseudoAtom anyAtom = builder.newPseudoAtom();
    anyAtom.setSymbol("A");
    anyAtom.setLabel("A");

    IBond rGroupBond = builder.newBond(rGroupAtom, carbon);
    IBond anyAtomBond = builder.newBond(anyAtom, carbon);

    // Atom order matters: the RGP line refers to atoms by position.
    IMolecule molecule = builder.newMolecule();
    molecule.addAtom(rGroupAtom);
    molecule.addAtom(carbon);
    molecule.addAtom(anyAtom);
    molecule.addBond(rGroupBond);
    molecule.addBond(anyAtomBond);

    StringWriter writer = new StringWriter();
    MDLWriter mdlWriter = new MDLWriter(writer);
    mdlWriter.write(molecule);
    String output = writer.toString();

    Assert.assertTrue("Test for R#", output.contains("R#"));
    Assert.assertTrue("Test for RGP line", output.contains("M RGP 1 1 12"));
}


}

0 comments on commit fb33d7e

Please sign in to comment.