This repository has been archived by the owner on Nov 29, 2019. It is now read-only.
/
catalog
executable file
·291 lines (229 loc) · 6.71 KB
/
catalog
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#!/usr/bin/env ruby
# Grab some DOIs and metadata via OAI-PMH.
# Record metadata in the test-data dir.
# If there is no corresponding PDF in the test-data dir, download it using
# get-pdf.
require 'cgi'
require 'net/http'
require 'uri/http'
require 'commander/import'
require 'nokogiri'
require 'json'
require 'getpdf'
require_relative '../lib/pdf-extract'
# Program metadata registered through the `program` DSL call
# (presumably provided by commander/import).
program(:name, "catalog")
program(:version, "0.0.1")
program(:description, "Build a PDF catalog, with metadata.")
# Build the request URI for an OAI verb against oai.crossref.org/OAIHandler.
#
# verb    - value placed in the "verb" query parameter.
# options - hash that may carry :prefix, :journal and :year; the non-nil
#           values are joined with ":" to form the "set" query parameter.
#
# Returns a URI::HTTP instance.
# Fails (RuntimeError) when :prefix is missing, or when :year is given
# without :journal.
def query_uri(verb, options = {})
  prefix, journal, year = options.values_at(:prefix, :journal, :year)

  missing_prefix = prefix.nil?
  year_without_journal = !year.nil? && journal.nil?
  if missing_prefix || year_without_journal
    fail "Must specify one of prefix, prefix:journal, or prefix:journal:year"
  end

  set_spec = [prefix, journal, year].compact.join(":")
  query = "verb=#{verb}&metadataPrefix=cr_unixml&set=#{CGI.escape(set_spec)}"

  URI::HTTP.build(
    :host => "oai.crossref.org",
    :path => "/OAIHandler",
    :query => query
  )
end
# Collect the text of every "identifier" element in the given XML,
# with the first "info:doi/" occurrence removed from each value.
#
# Returns an array of strings.
def parse_dois(xml)
  document = Nokogiri::XML::Document.parse(xml)
  document.css("identifier").map do |node|
    node.text.sub("info:doi/", "")
  end
end
# Text of the first node matching +path+ beneath +node+, or nil when no node
# matches (at_xpath returns nil in that case).
def xpath_text(node, path, ns)
  match = node.at_xpath(path, ns)
  match && match.text
end

# Turn a CrossRef metadata document into an array of article hashes.
#
# xml - the document source handed to Nokogiri::XML::Document.parse.
#
# Each hash holds :doi, :title, :publication (a hash of :title, :issue and
# :volume), :contributors (array of names) and :citations (array of strings).
# :first_page and :last_page are added only when present, and hold the
# element text. Absent optional elements yield nil rather than raising.
def parse_records(xml)
  doc = Nokogiri::XML::Document.parse(xml)
  ns = {"cr" => "http://www.crossref.org/xschema/1.0"}

  doc.xpath("//cr:crossref", ns).map do |metadata|
    publication = {
      :title => xpath_text(metadata, ".//cr:full_title", ns),
      :issue => xpath_text(metadata, ".//cr:issue", ns),
      :volume => xpath_text(metadata, ".//cr:volume", ns)
    }

    contributors = metadata.xpath(".//cr:person_name", ns).map do |name|
      given = xpath_text(name, ".//cr:given_name", ns)
      surname = xpath_text(name, ".//cr:surname", ns)
      # Tolerate a name that carries only one of the two parts.
      [given, surname].compact.join(" ")
    end

    citations = metadata.xpath(".//cr:unstructured_citation", ns).map { |c| c.text }

    article = {
      # The populate command keys the catalog on :doi, so this lookup is
      # deliberately left strict: a record without a DOI still raises.
      :doi => metadata.at_xpath(".//cr:doi", ns).text,
      :title => xpath_text(metadata, ".//cr:title", ns),
      :publication => publication,
      :contributors => contributors,
      :citations => citations
    }

    # Store the page text, not the XML node, so the record serializes cleanly.
    first_page = xpath_text(metadata, ".//cr:first_page", ns)
    article[:first_page] = first_page if first_page

    last_page = xpath_text(metadata, ".//cr:last_page", ns)
    article[:last_page] = last_page if last_page

    article
  end
end
# Collect the distinct text values of every "setSpec" element in the XML,
# in order of first appearance.
def parse_setspecs(xml)
  document = Nokogiri::XML::Document.parse(xml)
  specs = document.css("setSpec").map(&:text)
  specs.uniq
end
# Issue an HTTP GET for the URI built by query_uri and return the body.
#
# verb    - passed through to query_uri.
# options - passed through to query_uri.
#
# Returns the response body when the status code is 200.
# Fails (RuntimeError) with the returned status code otherwise.
def get_xml(verb, options)
  target = query_uri(verb, options)

  Net::HTTP.start(target.host) do |connection|
    reply = connection.get(target.request_uri)
    unless reply.code.to_i == 200
      fail "Failed to get metadata. OAI server returned: #{reply.code}"
    end
    reply.body
  end
end
# Fetch the "ListRecords" response for the given options and parse it
# with parse_records.
def get_records(options)
  xml = get_xml("ListRecords", options)
  parse_records(xml)
end
# Fetch the "ListIdentifiers" response for the given options and parse it
# with parse_dois.
def get_dois(options)
  xml = get_xml("ListIdentifiers", options)
  parse_dois(xml)
end
# Fetch the "ListSets" response for the given options and parse it
# with parse_setspecs.
def get_sets(options)
  xml = get_xml("ListSets", options)
  parse_setspecs(xml)
end
# Path of "catalog.json" in the same directory as this script file.
def catalog_filename
  script_dir = File.dirname(__FILE__)
  File.join(script_dir, "catalog.json")
end
# Load the catalog from a JSON file.
#
# filename - path of the catalog file (defaults to catalog_filename).
#
# Returns the parsed JSON content. When the file does not exist, announces
# "Created a new catalog" and returns an empty hash. A file that parses to
# nil (for example an empty file) also yields an empty hash, so callers can
# always treat the result as a catalog.
def read_catalog(filename = catalog_filename)
  # File.exist? replaces File.exists?, which is gone from Ruby 3.2 onwards.
  unless File.exist?(filename)
    say "Created a new catalog"
    return {}
  end

  File.open(filename) { |file| JSON.load(file) } || {}
end
# Serialize the catalog with to_json and write it to the file, replacing
# any previous content.
#
# catalog  - object responding to to_json.
# filename - destination path (defaults to catalog_filename).
#
# Returns whatever the underlying write call returns.
def write_catalog(catalog, filename = catalog_filename)
  # Open first, serialize second, so the order of effects is unchanged.
  File.open(filename, "w") { |out| out.write(catalog.to_json) }
end
# Read the catalog, yield it to the given block, then write it back.
# The same object that was yielded is the one written.
def with_catalog(&block)
  write_catalog(read_catalog.tap(&block))
end
# Collect the text of every "reference" element in the given XML.
#
# Returns an array of strings.
def parse_xml_citations(xml)
  document = Nokogiri::XML::Document.parse(xml)
  document.css("reference").map(&:text)
end
# Absolute difference between the element counts of two collections.
def diff_list(left, right)
  gap = left.count - right.count
  gap.abs
end
# Values of the --prefix, --journal and --year global options, keyed by
# :prefix, :journal and :year; filled in as the options are parsed.
$set_spec = {}

%w[prefix journal year].each do |name|
  key = name.to_sym
  global_option("--#{name.downcase}=#{name.upcase}") { |value| $set_spec[key] = value }
end
# "list": print each set spec returned by get_sets for the global set options.
command :list do |cmd|
  cmd.syntax = "catalog list --prefix=10.1109"
  cmd.description = "List set specs in OAI metadata"
  cmd.action do |_args, _options|
    get_sets($set_spec).each { |spec| say spec }
  end
end
# "populate": fetch records for the global set options and store each one in
# the catalog under its DOI, tagged with the set options it came from.
command :populate do |cmd|
  cmd.syntax = "catalog populate --prefix=10.5555 --journal=5 --year=2002"
  cmd.description = "Add CrossRef metadata to a catalog"
  cmd.action do |_args, _options|
    fetched = get_records($set_spec)

    with_catalog do |catalog|
      fetched.each do |entry|
        catalog[entry[:doi]] = entry.merge(:from => $set_spec)
      end
    end

    say "Added or updated #{fetched.count} records"
  end
end
# "pdfs": for every catalog record without a "pdf" entry, ask
# GetPdf.get_from_doi for a file and record the result when one is returned.
command :pdfs do |cmd|
  cmd.syntax = "catalog pdfs"
  cmd.description = "Locate and download PDFs for DOIs in a catalog"
  cmd.action do |_args, _options|
    found = 0

    with_catalog do |catalog|
      catalog.each do |doi, entry|
        next if entry["pdf"]

        say "Crawling for #{doi}..."
        location = GetPdf.get_from_doi(doi)

        if location.nil?
          say "Couldn't find PDF"
          next
        end

        entry[:pdf] = location
        say "Found PDF"
        found += 1
      end
    end

    say "Found #{found} PDFs."
  end
end
# "count": print the number of entries in the catalog.
command :count do |cmd|
  cmd.syntax = "catalog count"
  cmd.description = "Count records in a catalog"
  cmd.action do |_args, _options|
    with_catalog { |catalog| say(catalog.count.to_s) }
  end
end
# "run": for every record that has a "pdf" but no "extracted" entry, run
# PdfExtract over the PDF, write the XML result next to it as "<pdf>.xml",
# and remember that path on the record. A failure on one record is reported
# and does not stop the remaining records.
command :run do |cmd|
  cmd.syntax = "catalog run"
  cmd.description = "Run pdfextract on PDFs in a catalog"
  cmd.action do |_args, _options|
    with_catalog do |catalog|
      catalog.each do |doi, entry|
        next unless entry["pdf"] && !entry["extracted"]

        say "Running pdfextract for #{doi}..."
        begin
          pdf_path = entry["pdf"]
          xml = PdfExtract.view(pdf_path, :as => :xml) { |pdf| pdf.references }

          output_path = pdf_path + ".xml"
          File.open(output_path, "w") { |file| file.write xml }

          entry[:extracted] = output_path
        rescue StandardError => error
          say "Failed because of: #{error}"
        end
      end
    end
  end
end
# "stats": compare, per record, the number of references found in the
# extracted XML file with the number of citations stored in the catalog, and
# report how many records matched exactly.
#
# Only records that have both a "pdf" and an "extracted" entry are counted.
command :stats do |c|
  c.syntax = "catalog stats"
  c.description = "Success rate of pdfextract"
  c.action do |_args, _options|
    with_catalog do |catalog|
      diffs = []

      catalog.each do |_doi, record|
        next unless record["pdf"] && record["extracted"]

        # "extracted" holds the path of the XML file written by the run command.
        extracted_xml = File.read(record["extracted"])
        # Records come back from JSON with string keys; a record that has no
        # stored citations is compared against an empty list.
        expected = record["citations"] || []

        diffs << diff_list(parse_xml_citations(extracted_xml), expected)
      end

      if diffs.empty?
        say "No records with a PDF and pdf-extract results"
      else
        successful = diffs.count { |diff| diff.zero? }
        # Multiply by a float before dividing; integer division would
        # truncate every partial success rate to zero.
        success_percent = (successful * 100.0 / diffs.count).round(1)
        say "Successfully matched all references: #{successful} (#{success_percent}%)"
      end
    end
  end
end