Skip to content
This repository has been archived by the owner on Nov 29, 2019. It is now read-only.

Commit

Permalink
Look for citations in mongo rather than over OAI-PMH.
Browse files Browse the repository at this point in the history
  • Loading branch information
kjw committed Apr 26, 2012
1 parent 591ae6b commit 3ed930a
Showing 1 changed file with 41 additions and 155 deletions.
196 changes: 41 additions & 155 deletions test/catalog
@@ -1,4 +1,5 @@
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# Grab some DOIs and metadata via OAI-PMH.

Expand All @@ -7,116 +8,18 @@
# If there is no corresponding PDF in the test-data dir, download it using
# get-pdf.

require 'cgi'
require 'net/http'
require 'uri/http'
require 'commander/import'
require 'nokogiri'
require 'json'
require 'getpdf'
require 'mongo'
require_relative '../lib/pdf-extract'

program :name, "catalog"
program :version, "0.0.1"
program :description, "Build a PDF catalog, with metadata."

# Build the URI of a CrossRef OAI-PMH request for the given verb.
#
# verb    - OAI-PMH verb placed in the query string as-is (callers in this
#           file pass "ListRecords", "ListIdentifiers" and "ListSets").
# options - Hash that may hold :prefix, :journal and :year. The non-nil
#           values are joined with ":" in that order to form the set spec.
#
# Returns a URI::HTTP pointing at oai.crossref.org/OAIHandler.
# Fails (RuntimeError) when no prefix is given, or when a year is given
# without a journal.
def query_uri verb, options={}
  prefix, journal, year = options.values_at :prefix, :journal, :year

  # A year only makes sense inside a journal, so prefix:year is rejected.
  year_without_journal = journal.nil? && !year.nil?
  if prefix.nil? || year_without_journal
    fail "Must specify one of prefix, prefix:journal, or prefix:journal:year"
  end

  set_spec = [prefix, journal, year].compact.join(":")
  query = "verb=#{verb}&metadataPrefix=cr_unixml&set=#{CGI.escape(set_spec)}"

  URI::HTTP.build(:host => "oai.crossref.org",
                  :path => "/OAIHandler",
                  :query => query)
end

# Extract DOIs from an OAI-PMH XML response.
#
# xml - String of XML containing <identifier> elements.
#
# Returns an Array with the text of every <identifier> element, with the
# first "info:doi/" occurrence stripped from each.
def parse_dois xml
  document = Nokogiri::XML::Document.parse(xml)

  document.css("identifier").map do |node|
    node.text.sub("info:doi/", "")
  end
end

# Turn an OAI-PMH ListRecords response (cr_unixml) into article records.
#
# xml - String of XML containing <crossref> elements in the
#       http://www.crossref.org/xschema/1.0 namespace.
#
# Returns an Array of Hashes, one per <crossref> element, with keys:
#   :doi          - text of the first <doi> (required; raises NoMethodError
#                   if the record has none)
#   :title        - text of the first <title>, or nil when absent
#   :publication  - Hash of :title (<full_title>), :issue, :volume; each value
#                   is nil when the element is absent
#   :contributors - Array of "given surname" Strings, one per <person_name>
#   :citations    - Array of <unstructured_citation> texts
#   :first_page / :last_page - page text, present only when the element exists
def parse_records xml
  doc = Nokogiri::XML::Document.parse xml
  ns = {"cr" => "http://www.crossref.org/xschema/1.0"}

  # Text of the first node matching path under node, or nil when there is no
  # match, so an absent element does not raise NoMethodError.
  # NOTE(review): presumably issue, volume and given_name are optional in
  # CrossRef deposits -- confirm against the schema.
  text_at = lambda do |node, path|
    found = node.at_xpath(path, ns)
    found && found.text
  end

  doc.xpath("//cr:crossref", ns).map do |metadata|
    publication = {
      :title => text_at.call(metadata, ".//cr:full_title"),
      :issue => text_at.call(metadata, ".//cr:issue"),
      :volume => text_at.call(metadata, ".//cr:volume")
    }

    contributors = metadata.xpath(".//cr:person_name", ns).map do |name|
      given = text_at.call(name, ".//cr:given_name")
      surname = text_at.call(name, ".//cr:surname")
      # Same "given surname" format as before; a missing part is dropped.
      [given, surname].compact.join(" ")
    end

    citations = metadata.xpath(".//cr:unstructured_citation", ns).map { |c| c.text }

    article = {
      # Kept strict: a record without a DOI is left to raise rather than
      # being returned with a nil :doi.
      :doi => metadata.at_xpath(".//cr:doi", ns).text,
      :title => text_at.call(metadata, ".//cr:title"),
      :publication => publication,
      :contributors => contributors,
      :citations => citations
    }

    # Store the page *text*, like every other field, not the XML node itself.
    first_page = text_at.call(metadata, ".//cr:first_page")
    article[:first_page] = first_page if first_page

    last_page = text_at.call(metadata, ".//cr:last_page")
    article[:last_page] = last_page if last_page

    article
  end
end

# Extract the distinct set specs from an OAI-PMH XML response.
#
# xml - String of XML containing <setSpec> elements.
#
# Returns an Array of setSpec texts with duplicates removed.
def parse_setspecs xml
  spec_nodes = Nokogiri::XML::Document.parse(xml).css("setSpec")
  spec_nodes.map(&:text).uniq
end

# Perform an OAI-PMH GET request and return the response body.
#
# verb    - OAI-PMH verb, passed through to query_uri.
# options - Hash of set-spec parts, passed through to query_uri.
#
# Returns the response body when the server answers 200.
# Fails (RuntimeError) with the status code for any other response.
def get_xml verb, options
  uri = query_uri(verb, options)

  Net::HTTP.start(uri.host) do |http|
    response = http.get(uri.request_uri)

    unless response.code.to_i == 200
      fail "Failed to get metadata. OAI server returned: #{response.code}"
    end

    response.body
  end
end

# Fetch the "ListRecords" response for the given set-spec options and parse
# it with parse_records.
def get_records options
  xml = get_xml("ListRecords", options)
  parse_records(xml)
end

# Fetch the "ListIdentifiers" response for the given set-spec options and
# parse it with parse_dois.
def get_dois options
  xml = get_xml("ListIdentifiers", options)
  parse_dois(xml)
end

# Fetch the "ListSets" response for the given set-spec options and parse it
# with parse_setspecs.
def get_sets options
  xml = get_xml("ListSets", options)
  parse_setspecs(xml)
end

def catalog_filename
File.join File.dirname(__FILE__), "catalog.json"
"catalog.json"
end

def read_catalog filename=catalog_filename
Expand Down Expand Up @@ -151,66 +54,48 @@ def diff_list left, right
(left.count - right.count).abs
end

$set_spec = {}
def get_references doi
$mongo ||= Mongo::Connection.new($mongo_host || "192.168.1.152")
citations = $mongo["crossref"]["citations"]

["prefix", "journal", "year"].each do |item|
global_option "--#{item.downcase}=#{item.upcase}" do |value|
$set_spec[item.to_sym] = value
end
docs = citations.find({"from.doi" => doi})
docs.map { |doc| doc["to"]["unstructured_citation"] }.compact
end

command :list do |c|
c.syntax = "catalog list --prefix=10.1109"
c.description = "List set specs in OAI metadata"

c.action do |args, options|
get_sets($set_spec).each do |setspec|
say setspec
end
end
global_option "--mongo host" do |val|
$mongo_host = val
end

command :populate do |c|
c.syntax = "catalog populate --prefix=10.5555 --journal=5 --year=2002"
c.description = "Add CrossRef metadata to a catalog"
command :add do |c|
# add doi for PDF filename

c.syntax = "catalog add doi pdf"
c.description = "Add a PDF to the catalog with given DOI"

c.action do |args, options|
records = get_records $set_spec

doi, filename = args
with_catalog do |catalog|
records.each do |record|
catalog[record[:doi]] = record.merge(:from => $set_spec)
end
catalog[doi] = {
:pdf => filename,
:citations => get_references(doi)
}
say "Found #{catalog[doi][:citations].count} citations in CrossRef Cited-by data"
end

say "Added or updated #{records.count} records"
end
end

command :pdfs do |c|
c.syntax = "catalog pdfs"
c.description = "Locate and download PDFs for DOIs in a catalog"
command :accept do |c|
c.syntax = "catalog accept doi"
c.description = "Accept extracted references as test examples"

c.action do |args, options|
pdfs_added = 0

with_catalog do |catalog|
catalog.each do |doi, record|
unless record["pdf"]
say "Crawling for #{doi}..."
pdf_filename = GetPdf.get_from_doi doi
if pdf_filename.nil?
say "Couldn't find PDF"
else
record[:pdf] = pdf_filename
say "Found PDF"
pdfs_added = pdfs_added.next
end
end
args.each do |doi|
xml_filename = catalog[doi]["pdf"] + ".xml"
refs = File.open(xml_filename) { |file| parse_xml_citations(file.read) }
catalog[doi]["citations"] = refs
end
end

say "Found #{pdfs_added} PDFs."
end
end

Expand All @@ -233,16 +118,16 @@ command :run do |c|

with_catalog do |catalog|
catalog.each do |doi, record|
if record["pdf"] and not record["extracted"]
say "Running pdfextract for #{doi}..."
if record["pdf"]
say "Running pdfextract for #{record["pdf"]} ( #{doi} )..."

begin
xml = PdfExtract.view(record["pdf"], :as => :xml) do |pdf|
pdf.references
end

references_filename = record["pdf"] + ".xml"

File.open(references_filename, "w") do |file|
file.write xml
end
Expand All @@ -265,27 +150,28 @@ command :stats do |c|
c.action do |args, options|

with_catalog do |catalog|

diffs = []

catalog.each do |doi, record|
if record["pdf"] and record["extracted"]
diffs << diff_list(parse_xml_citations(xml), record[:citations])
if record["pdf"]
xml = File.open(record["pdf"] + ".xml") { |file| file.read }
diffs << diff_list(parse_xml_citations(xml), record["citations"])
end
end
end

if diffs.count.zero?
say "No records with a PDF and pdf-extract results"
else
successful = diffs.count { |diff| diff.zero? }
unsuccessful = diffs.count - successful

success_percent = (successful / diffs.count) * 100.0
say "Successfully matched all references: #{successful} (#{success_percent}%)"
success_percent = (successful / diffs.count.to_f) * 100.0

say "Successfully matched all references: #{successful}/#{diffs.count} (#{success_percent}%)"
end
end

end
end

0 comments on commit 3ed930a

Please sign in to comment.