Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

catalog: Simple pdf-extract success rate.

  • Loading branch information...
commit a41d81df4aece3ccccdda82f9293e372351092c2 1 parent b708b2b
@kjw kjw authored
Showing with 42 additions and 2 deletions.
  1. +42 −2 test/catalog
View
44 test/catalog
@@ -142,6 +142,15 @@ def with_catalog &block
write_catalog catalog
end
+# Parses the given XML string with Nokogiri and returns an array holding the
+# text content of every node matched by the CSS selector "reference".
+# NOTE(review): presumably xml is the output pdf-extract wrote for one PDF -
+# confirm against the caller.
+def parse_xml_citations xml
+ doc = Nokogiri::XML::Document.parse xml
+ doc.css("reference").map { |r| r.text }
+end
+
+# Returns the absolute difference between the element counts of left and
+# right. Only the sizes are compared, never the contents, so a result of 0
+# means "same number of items", not "same items".
+def diff_list left, right
+ (left.count - right.count).abs
+end
+
$set_spec = {}
["prefix", "journal", "year"].each do |item|
@@ -224,7 +233,7 @@ command :run do |c|
with_catalog do |catalog|
catalog.each do |doi, record|
- if record["pdf"] and not record["references"]
+ if record["pdf"] and not record["extracted"]
say "Running pdfextract for #{doi}..."
begin
@@ -238,7 +247,7 @@ command :run do |c|
file.write xml
end
- record[doi][:references] = references_filename
+ record[:extracted] = references_filename
rescue StandardError => e
say "Failed because of: #{e}"
end
@@ -249,3 +258,34 @@ command :run do |c|
end
end
+command :stats do |c|
+ c.syntax = "catalog stats"
+ c.description = "Success rate of pdfextract"
+
+ c.action do |args, options|
+
+ with_catalog do |catalog|
+
+ diffs = []
+
+ catalog.each do |doi, record|
+ if record["pdf"] and record["extracted"]
+ diffs << diff_list(parse_xml_citations(xml), record[:citations])
+ end
+ end
+
+ if diffs.count.zero?
+ say "No records with a PDF and pdf-extract results"
+ else
+ successful = diffs.count { |diff| diff.zero? }
+ unsuccessful = diffs.count - successful
+
+ success_percent = (successful / diffs.count) * 100.0
+
+ say "Successfully matched all references: #{successful} (#{success_percent}%)"
+ end
+ end
+
+ end
+end
+
Please sign in to comment.
Something went wrong with that request. Please try again.