Include pdfx XMP metadata in document information dictionary.

Also populate pdfx namespace when downloading DOI metadata.
CrossRef · May 8, 2012 · ab785f6 · ab785f6
1 parent 4a58625
commit ab785f6
Show file tree

Hide file tree

Showing 9 changed files with 147 additions and 25 deletions.
diff --git a/src/org/crossref/pdfmark/Main.java b/src/org/crossref/pdfmark/Main.java
@@ -22,16 +22,22 @@
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.util.Map.Entry;
 
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.exceptions.COSVisitorException;
 import org.apache.pdfbox.pdfparser.PDFParser;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 
 import com.itextpdf.text.DocumentException;
 import com.itextpdf.text.pdf.PdfReader;
 import com.itextpdf.text.pdf.PdfStamper;
+import com.itextpdf.text.pdf.PdfWriter;
+import com.itextpdf.text.xml.xmp.XmpSchema;
 
 import static jargs.gnu.CmdLineParser.Option;
 
@@ -192,8 +198,10 @@ public Main(String[] args) {
 			}
 
 			try {
+				new File(outputFile.getPath() + ".tmp").deleteOnExit();
+
 				FileInputStream fileIn = new FileInputStream(pdfFile);
-				FileOutputStream fileOut = new FileOutputStream(outputFile);
+				FileOutputStream fileOut = new FileOutputStream(outputFile.getPath() + ".tmp");
 				PdfReader reader = new PdfReader(fileIn);
 				PdfStamper stamper = new PdfStamper(reader, fileOut);
 
@@ -206,11 +214,14 @@ public Main(String[] args) {
 				if (resolvedXmpData != null) {
 					merged = XmpUtils.mergeXmp(merged, resolvedXmpData);
 				}
-				
+
 				stamper.setXmpMetadata(merged);
 
 				stamper.close();
 				reader.close();
+
+				fileIn = new FileInputStream(outputFile.getPath() + ".tmp");
+				writeInfoDictionary(fileIn, outputFile.getPath(), merged);
 			} catch (IOException e) {
 				exitWithError(2, "Error: Couldn't handle '" + pdfFilePath 
 						+ "' because of:\n" + e);
@@ -220,12 +231,33 @@ public Main(String[] args) {
 			} catch (XmpException e) {
 				exitWithError(2, "Error: Couldn't handle '" + pdfFilePath
 						+ "' because of:\n" + e);
+			} catch (COSVisitorException e) {
+				exitWithError(2, "Error: Couldn't write document info dictionary"
+						+ " because of:\n" + e);
 			}
 		}
 
 		shutDown();
 	}
 
+	/**
+	 * Copy the pdfx properties of an XMP packet into the document information
+	 * dictionary of a PDF and save the result.
+	 *
+	 * The input stream is always closed before this method returns, whether
+	 * or not it succeeds.
+	 *
+	 * @param in stream positioned at the start of the PDF to read
+	 * @param outputFile path the updated PDF is saved to
+	 * @param xmp serialized XMP packet passed to XmpUtils.toInfo
+	 * @throws IOException if the PDF cannot be parsed, saved or closed
+	 * @throws COSVisitorException if PDFBox fails while saving the document
+	 */
+	public static void writeInfoDictionary(FileInputStream in, 
+			String outputFile, byte[] xmp) throws IOException, COSVisitorException {
+
+		PDDocument document = null;
+
+		try {
+			PDFParser parser = new PDFParser(in);
+			parser.parse();
+
+			document = parser.getPDDocument();
+			PDDocumentInformation info = document.getDocumentInformation();
+
+			for (Entry<String, String> entry : XmpUtils.toInfo(xmp).entrySet()) {
+				info.setCustomMetadataValue(entry.getKey(), entry.getValue());
+			}
+
+			document.setDocumentInformation(info);
+			document.save(outputFile);
+		} finally {
+			// Release the document and the source stream on every path, so
+			// that a failed save does not leave the input file open.
+			try {
+				if (document != null) {
+					document.close();
+				}
+			} finally {
+				in.close();
+			}
+		}
+	}
+
 	/**
 	 * According to the PDF Reference Manual (appendix F) a linearized PDF
 	 * must have as its first object after the PDF header an indirect

diff --git a/src/org/crossref/pdfmark/MarkBuilder.java b/src/org/crossref/pdfmark/MarkBuilder.java
@@ -71,7 +71,7 @@ public void onPublisher(String requestedDoi, Publisher pub) {
 	@Override
 	public void onComplete(String requestedDoi) {
 		ByteArrayOutputStream bout = new ByteArrayOutputStream();
-		DcPrismSet dcPrism = new DcPrismSet();
+		SchemaSet schemaSet = new SchemaSet();
 
 		try {
 		    Work work = null;
@@ -90,23 +90,23 @@ public void onComplete(String requestedDoi) {
 		    if (work != null) {
 		        XmpWriter writer = new XmpWriter(bout);
 
-		        work.writeXmp(dcPrism);
+		        work.writeXmp(schemaSet);
 
     		    if (publisher != null) {
     	            if (generateCopyright) {
     	                String cp = getCopyright(work);
-    	                Work.addToSchema(dcPrism.getDc(), DublinCoreSchema.RIGHTS, cp);
-    	                Work.addToSchema(dcPrism.getPrism(), Prism21Schema.COPYRIGHT, cp);
+    	                Work.addToSchema(schemaSet.getDc(), DublinCoreSchema.RIGHTS, cp);
+    	                Work.addToSchema(schemaSet.getPrism(), Prism21Schema.COPYRIGHT, cp);
     	            }
-    	            Work.addToSchema(dcPrism.getDc(), DublinCoreSchema.PUBLISHER, 
+    	            Work.addToSchema(schemaSet.getDc(), DublinCoreSchema.PUBLISHER, 
     	                             publisher.getName());
     	        }
 
-    		    Work.addToSchema(dcPrism.getPrism(), Prism21Schema.RIGHTS_AGENT, 
+    		    Work.addToSchema(schemaSet.getPrism(), Prism21Schema.RIGHTS_AGENT, 
     		                     rightsAgent);
 
-    		    writer.addRdfDescription(dcPrism.getDc());
-                writer.addRdfDescription(dcPrism.getPrism());
+    		    writer.addRdfDescription(schemaSet.getDc());
+                writer.addRdfDescription(schemaSet.getPrism());
                 writer.close();
 		    }
 

diff --git a/src/org/crossref/pdfmark/PdfxSchema.java b/src/org/crossref/pdfmark/PdfxSchema.java
@@ -0,0 +1,19 @@
+package org.crossref.pdfmark;
+
+import com.itextpdf.text.xml.xmp.XmpSchema;
+
+/**
+ * XMP schema for properties in the pdfx namespace.
+ */
+public class PdfxSchema extends XmpSchema {
+
+	/** Namespace prefix used in the schema's xmlns declaration. */
+	public static final String DEFAULT_XPATH_ID = "pdfx";
+
+	/** Namespace URI bound to the prefix in the xmlns declaration. */
+	public static final String DEFAULT_XPATH_URI = "http://ns.adobe.com/pdfx/1.3/";
+
+	/** Property name under which a DOI is stored. */
+	public static final String DOI = "doi";
+
+	/**
+	 * Create a schema whose xmlns declaration binds the pdfx prefix to its
+	 * namespace URI.
+	 */
+	public PdfxSchema() {
+		super(String.format("xmlns:%s=\"%s\"", DEFAULT_XPATH_ID, DEFAULT_XPATH_URI));
+	}
+
+}
diff --git a/src/org/crossref/pdfmark/DcPrismSet.java → src/org/crossref/pdfmark/SchemaSet.java b/src/org/crossref/pdfmark/DcPrismSet.java → src/org/crossref/pdfmark/SchemaSet.java
@@ -5,10 +5,11 @@
 import com.itextpdf.text.xml.xmp.DublinCoreSchema;
 import com.itextpdf.text.xml.xmp.XmpSchema;
 
-public class DcPrismSet {
+public class SchemaSet {
 
     private XmpSchema dc = new DublinCoreSchema();
     private XmpSchema prism = new Prism21Schema();
+    private XmpSchema pdfx = new PdfxSchema();
 
     public XmpSchema getPrism() {
         return prism;
@@ -17,5 +18,9 @@ public XmpSchema getPrism() {
     public XmpSchema getDc() {
         return dc;
     }
+
+    /**
+     * @return the schema held by this set for the pdfx namespace
+     */
+    public XmpSchema getPdfx() {
+        return pdfx;
+    }
 
 }
diff --git a/src/org/crossref/pdfmark/XmpUtils.java b/src/org/crossref/pdfmark/XmpUtils.java
@@ -20,10 +20,12 @@
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.ObjectInputStream.GetField;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 
 import javax.sql.rowset.spi.XmlWriter;
 import javax.xml.XMLConstants;
@@ -32,6 +34,7 @@
 import javax.xml.parsers.ParserConfigurationException;
 
 import org.w3c.dom.Attr;
+import org.w3c.dom.DOMException;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -40,6 +43,8 @@
 import org.w3c.dom.Text;
 import org.xml.sax.SAXException;
 
+import com.itextpdf.text.pdf.PdfReader;
+import com.itextpdf.text.pdf.PdfStamper;
 import com.itextpdf.text.xml.xmp.XmpArray;
 import com.itextpdf.text.xml.xmp.XmpReader;
 import com.itextpdf.text.xml.xmp.XmpSchema;
@@ -208,4 +213,57 @@ public static byte[] mergeXmp(byte[] left, byte[] right) throws XmpException {
 			throw new XmpException(e);
 		}
 	}
+
+	/**
+	 * Extract the properties of the pdfx namespace from an XMP packet as a
+	 * flat map of key value pairs, suitable for a PDF's document information
+	 * dictionary. Nothing is written to a PDF here; the caller stores the
+	 * returned pairs.
+	 *
+	 * Keys lose their namespace prefix ("pdfx:doi" becomes "doi"). A value
+	 * holding an rdf:Seq or rdf:Bag is flattened into one entry per rdf:li
+	 * item, keyed "name[1]", "name[2]" and so on.
+	 *
+	 * @param xmp serialized XMP packet
+	 * @return map from information dictionary key to value; empty when no
+	 *         schema's xmlns declaration contains "pdfx"
+	 * @throws XmpException if an array value cannot be parsed as XML
+	 */
+	public static Map<String, String> toInfo(byte[] xmp) throws XmpException {
+		try {
+			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+			factory.setNamespaceAware(true);
+
+			// Created on first use, then shared by every array value.
+			DocumentBuilder builder = null;
+
+			Map<String, String> info = new HashMap<String, String>();
+
+			XmpSchema[] schemata = XmpUtils.parseSchemata(xmp);
+			for (XmpSchema schema : schemata) {
+				if (!schema.getXmlns().contains("pdfx")) {
+					continue;
+				}
+
+				for (Entry<Object, Object> entry : schema.entrySet()) {
+					String key = (String) entry.getKey();
+					String[] parts = key.split(":");
+					String infoKey = parts.length == 2 ? parts[1] : parts[0];
+
+					String val = (String) entry.getValue();
+
+					if (val.contains("<rdf:Seq>") || val.contains("<rdf:Bag>")) {
+						// Wrap the fragment in a root element declaring the rdf
+						// prefix so it parses as a document of its own.
+						String wrapped = "<xml xmlns:rdf=\"rdf\">" + val + "</xml>";
+
+						if (builder == null) {
+							builder = factory.newDocumentBuilder();
+						}
+
+						// A document without an XML declaration is read as
+						// UTF-8, so encode explicitly rather than with the
+						// platform default charset.
+						Document doc = builder.parse(
+								new ByteArrayInputStream(wrapped.getBytes("UTF-8")));
+
+						NodeList nodes = doc.getElementsByTagName("rdf:li");
+						for (int i=0; i<nodes.getLength(); i++) {
+							Element item = (Element) nodes.item(i);
+							info.put(infoKey + "[" + (i + 1) + "]", item.getTextContent());
+						}
+					} else {
+						info.put(infoKey, val);
+					}
+				}
+			}
+
+			return info;
+		} catch (DOMException e) {
+			throw new XmpException(e);
+		} catch (IOException e) {
+			throw new XmpException(e);
+		} catch (SAXException e) {
+			throw new XmpException(e);
+		} catch (ParserConfigurationException e) {
+			throw new XmpException(e);
+		}
+	}
+
 }
diff --git a/src/org/crossref/pdfmark/unixref/Book.java b/src/org/crossref/pdfmark/unixref/Book.java
@@ -4,7 +4,8 @@
 import javax.xml.xpath.XPathExpression;
 import javax.xml.xpath.XPathExpressionException;
 
-import org.crossref.pdfmark.DcPrismSet;
+import org.crossref.pdfmark.PdfxSchema;
+import org.crossref.pdfmark.SchemaSet;
 import org.crossref.pdfmark.MarkBuilder;
 import org.crossref.pdfmark.XPathHelpers;
 import org.crossref.pdfmark.prism.Prism21Schema;
@@ -109,23 +110,26 @@ public String getYear() throws XPathExpressionException {
         return year;
     }
 
-    public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
-        XmpSchema dc = dcPrism.getDc();
-        XmpSchema prism = dcPrism.getPrism();
+    public void writeXmp(SchemaSet schemaSet) throws XPathExpressionException {
+        XmpSchema dc = schemaSet.getDc();
+        XmpSchema prism = schemaSet.getPrism();
+        XmpSchema pdfx = schemaSet.getPdfx();
 
         addToSchema(dc, DublinCoreSchema.CREATOR, getContributors());
         addToSchema(dc, DublinCoreSchema.TITLE, getTitles());
         addToSchema(dc, DublinCoreSchema.DATE, getPublicationDate());
-        addToSchema(dc, DublinCoreSchema.IDENTIFIER, "doi:" + getDoi());
+        addToSchema(dc, DublinCoreSchema.IDENTIFIER, getDoi());
 
         addToSchema(prism, Prism21Schema.PUBLICATION_DATE, getPublicationDate());
         addToSchema(prism, Prism21Schema.DOI, getDoi());
         addToSchema(prism, Prism21Schema.URL, MarkBuilder.getUrlForDoi(getDoi()));
-        addToSchema(prism, Prism21Schema.ISSUE_IDENTIFIER, "doi:" + getDoi());
+        addToSchema(prism, Prism21Schema.ISSUE_IDENTIFIER, getDoi());
         addToSchema(prism, Prism21Schema.EDITION, getEditionNumber());
         addToSchema(prism, Prism21Schema.ISBN, getIsbn());
         addToSchema(prism, Prism21Schema.ISSN, getIssn());
 
+        addToSchema(pdfx, PdfxSchema.DOI, getDoi());
+
         // TODO:
         //addToSchema(prism, Prism21Schema.PUBLICATION_NAME, getFullTitle());
     }

diff --git a/src/org/crossref/pdfmark/unixref/Journal.java b/src/org/crossref/pdfmark/unixref/Journal.java
@@ -22,7 +22,8 @@
 import javax.xml.xpath.XPathExpression;
 import javax.xml.xpath.XPathExpressionException;
 
-import org.crossref.pdfmark.DcPrismSet;
+import org.crossref.pdfmark.PdfxSchema;
+import org.crossref.pdfmark.SchemaSet;
 import org.crossref.pdfmark.MarkBuilder;
 import org.crossref.pdfmark.XPathHelpers;
 import org.crossref.pdfmark.prism.Prism21Schema;
@@ -146,10 +147,11 @@ public String getYear() throws XPathExpressionException {
 	    return getArticle().getYear();
 	}
 
-	public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
+	public void writeXmp(SchemaSet schemaSet) throws XPathExpressionException {
 	    JournalArticle article = getArticle();
-	    XmpSchema dc = dcPrism.getDc();
-	    XmpSchema prism = dcPrism.getPrism();
+	    XmpSchema dc = schemaSet.getDc();
+	    XmpSchema prism = schemaSet.getPrism();
+	    XmpSchema pdfx = schemaSet.getPdfx();
 
         addToSchema(dc, DublinCoreSchema.CREATOR, article.getContributors());
         addToSchema(dc, DublinCoreSchema.TITLE, article.getTitles());
@@ -167,6 +169,8 @@ public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
         addToSchema(prism, Prism21Schema.STARTING_PAGE, article.getFirstPage());
         addToSchema(prism, Prism21Schema.ENDING_PAGE, article.getLastPage());
         addToSchema(prism, Prism21Schema.URL, MarkBuilder.getUrlForDoi(article.getDoi()));
+
+        addToSchema(pdfx, PdfxSchema.DOI, article.getDoi());
 	}
 
 }
diff --git a/src/org/crossref/pdfmark/unixref/Standard.java b/src/org/crossref/pdfmark/unixref/Standard.java
@@ -4,7 +4,7 @@
 import javax.xml.xpath.XPathExpression;
 import javax.xml.xpath.XPathExpressionException;
 
-import org.crossref.pdfmark.DcPrismSet;
+import org.crossref.pdfmark.SchemaSet;
 import org.crossref.pdfmark.XPathHelpers;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
@@ -73,7 +73,7 @@ public String getYear() throws XPathExpressionException {
     }
 
     @Override
-    public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
+    public void writeXmp(SchemaSet dcPrism) throws XPathExpressionException {
         // TODO Auto-generated method stub
 
     }

diff --git a/src/org/crossref/pdfmark/unixref/Work.java b/src/org/crossref/pdfmark/unixref/Work.java
@@ -2,7 +2,7 @@
 
 import javax.xml.xpath.XPathExpressionException;
 
-import org.crossref.pdfmark.DcPrismSet;
+import org.crossref.pdfmark.SchemaSet;
 
 import com.itextpdf.text.xml.xmp.XmpArray;
 import com.itextpdf.text.xml.xmp.XmpSchema;
@@ -31,7 +31,7 @@ public static void addToSchema(XmpSchema schema, String key, String[] vals) {
         }
     }
 
-    public abstract void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException;
+    public abstract void writeXmp(SchemaSet schemaSet) throws XPathExpressionException;
 
     public abstract String getYear() throws XPathExpressionException;