Skip to content

Commit

Permalink
Move cli application to lib and improve cov tool.
Browse files Browse the repository at this point in the history
  • Loading branch information
EvgeneOskin committed Oct 20, 2015
1 parent 412423a commit 33ebc48
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 133 deletions.
3 changes: 3 additions & 0 deletions .simplecov
@@ -0,0 +1,3 @@
SimpleCov.start do
add_filter "/spec/"
end
135 changes: 2 additions & 133 deletions exe/mathnet
@@ -1,136 +1,5 @@
#!/usr/bin/env ruby
require 'rubygems'
require 'commander'
require 'parallel'
require 'net/http'
require 'mathnet/crawler'
require 'exponential_backoff'
require 'mathnet/cli'

# CLI for search and download articles from mathnet site.
class MathnetApplication
include Commander::Methods

# Initialize constances.
def initialize
@minimal_interval = 1.0
@maximal_elapsed_time = 600.0
@base_dir = 'mathnet'
end

# Process programm call via command line.
def run
program :name, 'Mathnet crawler.'
program :version, Mathnet::Crawler::VERSION
program :description, 'Command that act like missed mathnet client.'
add_download_all_command
add_articles_command
add_journals_command
run!
end

def add_download_all_command
command :'download all' do |c|
c.description = 'Download all articals.'
c.action do
journals = list_journals
issues = list_issues journals
articles = list_articals issues
download_aricles articles
end
end
end

def add_articles_command
command :articles do |c|
c.description = 'List articles per journal.'
c.action do
articles = list_articals list_issues list_journals
articles.each do |article|
say "#{article.journal_title}/#{article.title}"
end
end
end
end

def add_journals_command
command :journals do |c|
c.description = 'List journals on mathnet.'
c.action do
journals = list_journals
journals.each do |journal|
say journal.title
end
end
end
end

# Return all journals published on mathnet.
def list_journals
process_backoff do
Mathnet::Crawler::Journal.list Mathnet::Crawler::Library.new
end
end

# Return all issues that existing in passed journals.
# @param journals [Array] of [Mathnet::Crawler::Journals] that having issues.
def list_issues(journals)
issues_lists = Parallel.map(journals, progress: 'List issues') do |journal|
process_backoff { Mathnet::Crawler::Issue.list journal }
end
issues_lists.reduce do |initial, item|
initial + item
end
end

# Return all articles that existing in passed issues.
# @param issues [Array] of [Mathnet::Crawler::Issue] that having articles.
def list_articals(issues)
articals_lists = Parallel.map(issues, progress: 'List articals') do |issue|
process_backoff { Mathnet::Crawler::Article.list issue }
end
articals_lists.reduce do |initial, item|
(item && initial + item) || initial
end
end

# Store full texts of passed articles.
# @param articles [Array] of [Mathnet::Crawler::Article] that having
# full texts.
def download_aricles(articles)
Parallel.each(articles, progress: 'Download texts') do |article|
pdf_path = article_path article
process_backoff do
article.full_text do |body|
pdf = File.new pdf_path, 'w'
pdf.write body
pdf.close
end
end
end
end

# Execut block with exponential backoff
# @param &block [block] do http request and if http error occured,
# we would retry it.
def process_backoff(&block)
backoff.until_success do
begin
block.call
rescue Net::HTTPServerException
false
end
end
end

def article_path(article)
directory = File.join @base_dir, article.journal_title
FileUtils.mkdir_p directory if Dir.exist?(directory)
File.join directory, "#{article.title}.pdf"
end

def backoff
ExponentialBackoff.new @minimal_interval, @maximal_elapsed_time
end
end

MathnetApplication.new.run if $PROGRAM_NAME == __FILE__
Mathnet::Application.new.run if $PROGRAM_NAME == __FILE__
136 changes: 136 additions & 0 deletions lib/mathnet/cli.rb
@@ -0,0 +1,136 @@
#!/usr/bin/env ruby
require 'rubygems'
require 'commander'
require 'parallel'
require 'net/http'
require 'mathnet/crawler'
require 'exponential_backoff'

module Mathnet
# CLI for search and download articles from mathnet site.
class Application
include Commander::Methods

# Initialize constances.
def initialize
@minimal_interval = 1.0
@maximal_elapsed_time = 600.0
@base_dir = 'mathnet'
end

# Process programm call via command line.
def run
program :name, 'Mathnet crawler.'
program :version, Crawler::VERSION
program :description, 'Command that act like missed mathnet client.'
add_download_all_command
add_articles_command
add_journals_command
run!
end

def add_download_all_command
command :'download all' do |c|
c.description = 'Download all articals.'
c.action do
journals = list_journals
issues = list_issues journals
articles = list_articals issues
download_aricles articles
end
end
end

def add_articles_command
command :articles do |c|
c.description = 'List articles per journal.'
c.action do
articles = list_articals list_issues list_journals
articles.each do |article|
say "#{article.journal_title}/#{article.title}"
end
end
end
end

def add_journals_command
command :journals do |c|
c.description = 'List journals on mathnet.'
c.action do
journals = list_journals
journals.each do |journal|
say journal.title
end
end
end
end

# Return all journals published on mathnet.
def list_journals
process_backoff do
Crawler::Journal.list Crawler::Library.new
end
end

# Return all issues that existing in passed journals.
# @param journals [Array] of [Mathnet::Crawler::Journals] with issues.
def list_issues(journals)
issues_lists = Parallel.map(journals, progress: 'List issues') do |j|
process_backoff { Crawler::Issue.list j }
end
issues_lists.reduce do |initial, item|
initial + item
end
end

# Return all articles that existing in passed issues.
# @param issues [Array] of [Mathnet::Crawler::Issue] that having articles.
def list_articals(issues)
articals_lists = Parallel.map(issues, progress: 'List articals') do |i|
process_backoff { Crawler::Article.list i }
end
articals_lists.reduce do |initial, item|
(item && initial + item) || initial
end
end

# Store full texts of passed articles.
# @param articles [Array] of [Mathnet::Crawler::Article] that having
# full texts.
def download_aricles(articles)
Parallel.each(articles, progress: 'Download texts') do |article|
pdf_path = article_path article
process_backoff do
article.full_text do |body|
pdf = File.new pdf_path, 'w'
pdf.write body
pdf.close
end
end
end
end

# Execut block with exponential backoff
# @param &block [block] do http request and if http error occured,
# we would retry it.
def process_backoff(&block)
backoff.until_success do
begin
block.call
rescue Net::HTTPServerException
false
end
end
end

def article_path(article)
directory = File.join @base_dir, article.journal_title
FileUtils.mkdir_p directory if Dir.exist?(directory)
File.join directory, "#{article.title}.pdf"
end

def backoff
ExponentialBackoff.new @minimal_interval, @maximal_elapsed_time
end
end
end

0 comments on commit 33ebc48

Please sign in to comment.