Skip to content
This repository has been archived by the owner on Nov 29, 2019. It is now read-only.

Commit

Permalink
Include pdfx XMP metadata in document information dictionary.
Browse files Browse the repository at this point in the history
Also populate pdfx namespace when downloading DOI metadata.
  • Loading branch information
kjw committed May 8, 2012
1 parent 4a58625 commit ab785f6
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 25 deletions.
36 changes: 34 additions & 2 deletions src/org/crossref/pdfmark/Main.java
Expand Up @@ -22,16 +22,22 @@
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map.Entry;

import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;

import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import com.itextpdf.text.pdf.PdfWriter;
import com.itextpdf.text.xml.xmp.XmpSchema;

import static jargs.gnu.CmdLineParser.Option;

Expand Down Expand Up @@ -192,8 +198,10 @@ public Main(String[] args) {
}

try {
new File(outputFile.getPath() + ".tmp").deleteOnExit();

FileInputStream fileIn = new FileInputStream(pdfFile);
FileOutputStream fileOut = new FileOutputStream(outputFile);
FileOutputStream fileOut = new FileOutputStream(outputFile.getPath() + ".tmp");
PdfReader reader = new PdfReader(fileIn);
PdfStamper stamper = new PdfStamper(reader, fileOut);

Expand All @@ -206,11 +214,14 @@ public Main(String[] args) {
if (resolvedXmpData != null) {
merged = XmpUtils.mergeXmp(merged, resolvedXmpData);
}

stamper.setXmpMetadata(merged);

stamper.close();
reader.close();

fileIn = new FileInputStream(outputFile.getPath() + ".tmp");
writeInfoDictionary(fileIn, outputFile.getPath(), merged);
} catch (IOException e) {
exitWithError(2, "Error: Couldn't handle '" + pdfFilePath
+ "' because of:\n" + e);
Expand All @@ -220,12 +231,33 @@ public Main(String[] args) {
} catch (XmpException e) {
exitWithError(2, "Error: Couldn't handle '" + pdfFilePath
+ "' because of:\n" + e);
} catch (COSVisitorException e) {
exitWithError(2, "Error: Couldn't write document info dictionary"
+ " because of:\n" + e);
}
}

shutDown();
}

/**
 * Copies the key/value pairs derived from an XMP packet into a PDF's
 * document information dictionary and saves the result.
 *
 * The PDF is parsed from {@code in}; every entry returned by
 * {@link XmpUtils#toInfo(byte[])} is stored as a custom metadata value on
 * the document information, and the document is then saved to
 * {@code outputFile}.
 *
 * @param in stream from which the PDF is parsed
 * @param outputFile path the updated PDF is saved to
 * @param xmp XMP packet whose entries are copied into the info dictionary
 * @throws IOException if parsing, saving or closing the document fails
 * @throws COSVisitorException if saving the document fails inside PDFBox
 */
public static void writeInfoDictionary(FileInputStream in,
        String outputFile, byte[] xmp) throws IOException, COSVisitorException {

    PDFParser parser = new PDFParser(in);
    parser.parse();

    PDDocument document = parser.getPDDocument();
    try {
        PDDocumentInformation info = document.getDocumentInformation();

        for (Entry<String, String> entry : XmpUtils.toInfo(xmp).entrySet()) {
            info.setCustomMetadataValue(entry.getKey(), entry.getValue());
        }

        document.setDocumentInformation(info);
        document.save(outputFile);
    } finally {
        // Release the parsed document even when updating or saving fails;
        // previously a failure above skipped close() entirely.
        document.close();
    }
}

/**
* According to the PDF Reference Manual (appendix F) a linearized PDF
* must have as its first object after the PDF header an indirect
Expand Down
16 changes: 8 additions & 8 deletions src/org/crossref/pdfmark/MarkBuilder.java
Expand Up @@ -71,7 +71,7 @@ public void onPublisher(String requestedDoi, Publisher pub) {
@Override
public void onComplete(String requestedDoi) {
ByteArrayOutputStream bout = new ByteArrayOutputStream();
DcPrismSet dcPrism = new DcPrismSet();
SchemaSet schemaSet = new SchemaSet();

try {
Work work = null;
Expand All @@ -90,23 +90,23 @@ public void onComplete(String requestedDoi) {
if (work != null) {
XmpWriter writer = new XmpWriter(bout);

work.writeXmp(dcPrism);
work.writeXmp(schemaSet);

if (publisher != null) {
if (generateCopyright) {
String cp = getCopyright(work);
Work.addToSchema(dcPrism.getDc(), DublinCoreSchema.RIGHTS, cp);
Work.addToSchema(dcPrism.getPrism(), Prism21Schema.COPYRIGHT, cp);
Work.addToSchema(schemaSet.getDc(), DublinCoreSchema.RIGHTS, cp);
Work.addToSchema(schemaSet.getPrism(), Prism21Schema.COPYRIGHT, cp);
}
Work.addToSchema(dcPrism.getDc(), DublinCoreSchema.PUBLISHER,
Work.addToSchema(schemaSet.getDc(), DublinCoreSchema.PUBLISHER,
publisher.getName());
}

Work.addToSchema(dcPrism.getPrism(), Prism21Schema.RIGHTS_AGENT,
Work.addToSchema(schemaSet.getPrism(), Prism21Schema.RIGHTS_AGENT,
rightsAgent);

writer.addRdfDescription(dcPrism.getDc());
writer.addRdfDescription(dcPrism.getPrism());
writer.addRdfDescription(schemaSet.getDc());
writer.addRdfDescription(schemaSet.getPrism());
writer.close();
}

Expand Down
19 changes: 19 additions & 0 deletions src/org/crossref/pdfmark/PdfxSchema.java
@@ -0,0 +1,19 @@
package org.crossref.pdfmark;

import com.itextpdf.text.xml.xmp.XmpSchema;

/**
 * XMP schema bound to the {@code pdfx} namespace prefix.
 *
 * The schema is created with the namespace declaration
 * {@code xmlns:pdfx="http://ns.adobe.com/pdfx/1.3/"}.
 */
public class PdfxSchema extends XmpSchema {

    /** Namespace prefix used in the xmlns declaration. */
    public static final String DEFAULT_XPATH_ID = "pdfx";

    /** Namespace URI the prefix is bound to. */
    public static final String DEFAULT_XPATH_URI = "http://ns.adobe.com/pdfx/1.3/";

    /** Property key under which a DOI is stored in this schema. */
    public static final String DOI = "doi";

    /** Creates an empty schema carrying the pdfx namespace declaration. */
    public PdfxSchema() {
        super(xmlnsDeclaration());
    }

    /** Builds the {@code xmlns:prefix="uri"} attribute text passed to the superclass. */
    private static String xmlnsDeclaration() {
        StringBuilder declaration = new StringBuilder("xmlns:");
        declaration.append(DEFAULT_XPATH_ID);
        declaration.append("=\"").append(DEFAULT_XPATH_URI).append("\"");
        return declaration.toString();
    }

}
Expand Up @@ -5,10 +5,11 @@
import com.itextpdf.text.xml.xmp.DublinCoreSchema;
import com.itextpdf.text.xml.xmp.XmpSchema;

public class DcPrismSet {
public class SchemaSet {

private XmpSchema dc = new DublinCoreSchema();
private XmpSchema prism = new Prism21Schema();
private XmpSchema pdfx = new PdfxSchema();

public XmpSchema getPrism() {
return prism;
Expand All @@ -17,5 +18,9 @@ public XmpSchema getPrism() {
/**
 * @return the Dublin Core schema held by this set
 */
public XmpSchema getDc() {
    return this.dc;
}

/**
 * @return the pdfx schema held by this set
 */
public XmpSchema getPdfx() {
    return this.pdfx;
}

}
58 changes: 58 additions & 0 deletions src/org/crossref/pdfmark/XmpUtils.java
Expand Up @@ -20,10 +20,12 @@
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream.GetField;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import javax.sql.rowset.spi.XmlWriter;
import javax.xml.XMLConstants;
Expand All @@ -32,6 +34,7 @@
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Attr;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
Expand All @@ -40,6 +43,8 @@
import org.w3c.dom.Text;
import org.xml.sax.SAXException;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import com.itextpdf.text.xml.xmp.XmpArray;
import com.itextpdf.text.xml.xmp.XmpReader;
import com.itextpdf.text.xml.xmp.XmpSchema;
Expand Down Expand Up @@ -208,4 +213,57 @@ public static byte[] mergeXmp(byte[] left, byte[] right) throws XmpException {
throw new XmpException(e);
}
}

/**
 * Copy key value pairs from PDFX namespace into a map suitable for a PDF's
 * document information dictionary.
 *
 * Only schemata whose xmlns declaration contains "pdfx" are considered.
 * A property key of the form {@code prefix:name} is reduced to
 * {@code name}. A value containing an {@code <rdf:Seq>} or
 * {@code <rdf:Bag>} is expanded into one entry per {@code rdf:li} item,
 * keyed {@code name[1]}, {@code name[2]}, and so on; any other value is
 * copied unchanged.
 *
 * @param xmp the XMP packet to read
 * @return info dictionary keys mapped to their string values
 * @throws XmpException if an array value cannot be parsed as XML
 */
public static Map<String, String> toInfo(byte[] xmp) throws XmpException {
    try {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);

        Map<String, String> info = new HashMap<String, String>();

        for (XmpSchema schema : XmpUtils.parseSchemata(xmp)) {
            if (!schema.getXmlns().contains("pdfx")) {
                continue;
            }

            for (Entry<Object, Object> entry : schema.entrySet()) {
                String key = (String) entry.getKey();
                String[] parts = key.split(":");
                String infoKey = parts.length == 2 ? parts[1] : parts[0];

                String val = (String) entry.getValue();

                if (val.contains("<rdf:Seq>") || val.contains("<rdf:Bag>")) {
                    // Wrap the fragment in a root element that declares the
                    // rdf prefix so it parses as a standalone document.
                    String wrapped = "<xml xmlns:rdf=\"rdf\">" + val + "</xml>";

                    // The fragment has no XML declaration, so the parser
                    // decodes it as UTF-8; encode explicitly rather than
                    // with the platform default charset.
                    DocumentBuilder builder = factory.newDocumentBuilder();
                    Document doc = builder.parse(
                            new ByteArrayInputStream(wrapped.getBytes("UTF-8")));

                    NodeList nodes = doc.getElementsByTagName("rdf:li");
                    for (int i = 0; i < nodes.getLength(); i++) {
                        Element item = (Element) nodes.item(i);
                        info.put(infoKey + "[" + (i + 1) + "]", item.getTextContent());
                    }
                } else {
                    info.put(infoKey, val);
                }
            }
        }

        return info;
    } catch (DOMException e) {
        throw new XmpException(e);
    } catch (IOException e) {
        throw new XmpException(e);
    } catch (SAXException e) {
        throw new XmpException(e);
    } catch (ParserConfigurationException e) {
        throw new XmpException(e);
    }
}

}
16 changes: 10 additions & 6 deletions src/org/crossref/pdfmark/unixref/Book.java
Expand Up @@ -4,7 +4,8 @@
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;

import org.crossref.pdfmark.DcPrismSet;
import org.crossref.pdfmark.PdfxSchema;
import org.crossref.pdfmark.SchemaSet;
import org.crossref.pdfmark.MarkBuilder;
import org.crossref.pdfmark.XPathHelpers;
import org.crossref.pdfmark.prism.Prism21Schema;
Expand Down Expand Up @@ -109,23 +110,26 @@ public String getYear() throws XPathExpressionException {
return year;
}

public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
XmpSchema dc = dcPrism.getDc();
XmpSchema prism = dcPrism.getPrism();
public void writeXmp(SchemaSet schemaSet) throws XPathExpressionException {
XmpSchema dc = schemaSet.getDc();
XmpSchema prism = schemaSet.getPrism();
XmpSchema pdfx = schemaSet.getPdfx();

addToSchema(dc, DublinCoreSchema.CREATOR, getContributors());
addToSchema(dc, DublinCoreSchema.TITLE, getTitles());
addToSchema(dc, DublinCoreSchema.DATE, getPublicationDate());
addToSchema(dc, DublinCoreSchema.IDENTIFIER, "doi:" + getDoi());
addToSchema(dc, DublinCoreSchema.IDENTIFIER, getDoi());

addToSchema(prism, Prism21Schema.PUBLICATION_DATE, getPublicationDate());
addToSchema(prism, Prism21Schema.DOI, getDoi());
addToSchema(prism, Prism21Schema.URL, MarkBuilder.getUrlForDoi(getDoi()));
addToSchema(prism, Prism21Schema.ISSUE_IDENTIFIER, "doi:" + getDoi());
addToSchema(prism, Prism21Schema.ISSUE_IDENTIFIER, getDoi());
addToSchema(prism, Prism21Schema.EDITION, getEditionNumber());
addToSchema(prism, Prism21Schema.ISBN, getIsbn());
addToSchema(prism, Prism21Schema.ISSN, getIssn());

addToSchema(pdfx, PdfxSchema.DOI, getDoi());

// TODO:
//addToSchema(prism, Prism21Schema.PUBLICATION_NAME, getFullTitle());
}
Expand Down
12 changes: 8 additions & 4 deletions src/org/crossref/pdfmark/unixref/Journal.java
Expand Up @@ -22,7 +22,8 @@
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;

import org.crossref.pdfmark.DcPrismSet;
import org.crossref.pdfmark.PdfxSchema;
import org.crossref.pdfmark.SchemaSet;
import org.crossref.pdfmark.MarkBuilder;
import org.crossref.pdfmark.XPathHelpers;
import org.crossref.pdfmark.prism.Prism21Schema;
Expand Down Expand Up @@ -146,10 +147,11 @@ public String getYear() throws XPathExpressionException {
return getArticle().getYear();
}

public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
public void writeXmp(SchemaSet schemaSet) throws XPathExpressionException {
JournalArticle article = getArticle();
XmpSchema dc = dcPrism.getDc();
XmpSchema prism = dcPrism.getPrism();
XmpSchema dc = schemaSet.getDc();
XmpSchema prism = schemaSet.getPrism();
XmpSchema pdfx = schemaSet.getPdfx();

addToSchema(dc, DublinCoreSchema.CREATOR, article.getContributors());
addToSchema(dc, DublinCoreSchema.TITLE, article.getTitles());
Expand All @@ -167,6 +169,8 @@ public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
addToSchema(prism, Prism21Schema.STARTING_PAGE, article.getFirstPage());
addToSchema(prism, Prism21Schema.ENDING_PAGE, article.getLastPage());
addToSchema(prism, Prism21Schema.URL, MarkBuilder.getUrlForDoi(article.getDoi()));

addToSchema(pdfx, PdfxSchema.DOI, article.getDoi());
}

}
4 changes: 2 additions & 2 deletions src/org/crossref/pdfmark/unixref/Standard.java
Expand Up @@ -4,7 +4,7 @@
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;

import org.crossref.pdfmark.DcPrismSet;
import org.crossref.pdfmark.SchemaSet;
import org.crossref.pdfmark.XPathHelpers;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
Expand Down Expand Up @@ -73,7 +73,7 @@ public String getYear() throws XPathExpressionException {
}

@Override
public void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException {
public void writeXmp(SchemaSet dcPrism) throws XPathExpressionException {
// TODO Auto-generated method stub

}
Expand Down
4 changes: 2 additions & 2 deletions src/org/crossref/pdfmark/unixref/Work.java
Expand Up @@ -2,7 +2,7 @@

import javax.xml.xpath.XPathExpressionException;

import org.crossref.pdfmark.DcPrismSet;
import org.crossref.pdfmark.SchemaSet;

import com.itextpdf.text.xml.xmp.XmpArray;
import com.itextpdf.text.xml.xmp.XmpSchema;
Expand Down Expand Up @@ -31,7 +31,7 @@ public static void addToSchema(XmpSchema schema, String key, String[] vals) {
}
}

public abstract void writeXmp(DcPrismSet dcPrism) throws XPathExpressionException;
public abstract void writeXmp(SchemaSet schemaSet) throws XPathExpressionException;

public abstract String getYear() throws XPathExpressionException;

Expand Down

0 comments on commit ab785f6

Please sign in to comment.