Skip to content

Commit

Permalink
Add ci.
Browse files Browse the repository at this point in the history
Add travis, coverall and fix rubocop issues.
  • Loading branch information
EvgeneOskin committed Oct 19, 2015
1 parent a19725c commit 412423a
Show file tree
Hide file tree
Showing 11 changed files with 324 additions and 184 deletions.
1 change: 1 addition & 0 deletions .coveralls.yml
@@ -0,0 +1 @@
service_name: travis-ci
8 changes: 8 additions & 0 deletions .travis.yml
@@ -0,0 +1,8 @@
language: ruby
rvm:
- 2.0.0
- 2.1.4
- ruby-head
before_install:
- gem update --system
- gem --version
5 changes: 5 additions & 0 deletions .yardopts
@@ -0,0 +1,5 @@
--no-private
lib/**/*.rb
exe/mathnet
-
LICENSE
32 changes: 29 additions & 3 deletions Gemfile.lock
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
mathnet-crawler (0.1.0)
mathnet-crawler (0.1.1)
commander (~> 4.3)
exponential-backoff
nokogiri (~> 1.6)
Expand All @@ -18,28 +18,44 @@ GEM
coderay (1.1.0)
commander (4.3.5)
highline (~> 1.7.2)
coveralls (0.8.3)
json (~> 1.8)
rest-client (>= 1.6.8, < 2)
simplecov (~> 0.10.0)
term-ansicolor (~> 1.3)
thor (~> 0.19.1)
crack (0.4.2)
safe_yaml (~> 1.0.0)
diff-lcs (1.2.5)
docile (1.1.5)
domain_name (0.5.25)
unf (>= 0.0.5, < 1.0.0)
exponential-backoff (0.0.2)
hashdiff (0.2.2)
highline (1.7.8)
http-cookie (1.0.2)
domain_name (~> 0.5)
json (1.8.3)
method_source (0.8.2)
mime-types (2.6.2)
mini_portile (0.6.2)
netrc (0.10.3)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
parallel (1.6.1)
parser (2.2.2.6)
parser (2.2.3.0)
ast (>= 1.1, < 3.0)
powerpack (0.1.1)
pry (0.10.1)
pry (0.10.3)
coderay (~> 1.1.0)
method_source (~> 0.8.1)
slop (~> 3.4)
rainbow (2.0.0)
rake (10.4.2)
rest-client (1.8.0)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 3.0)
netrc (~> 0.7)
rspec (3.3.0)
rspec-core (~> 3.3.0)
rspec-expectations (~> 3.3.0)
Expand Down Expand Up @@ -67,24 +83,34 @@ GEM
simplecov-html (~> 0.10.0)
simplecov-html (0.10.0)
slop (3.6.0)
term-ansicolor (1.3.2)
tins (~> 1.0)
thor (0.19.1)
tins (1.6.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
webmock (1.22.1)
addressable (>= 2.3.6)
crack (>= 0.3.2)
hashdiff
yard (0.8.7.6)

PLATFORMS
ruby

DEPENDENCIES
bundler (~> 1.10)
byebug (~> 6.0)
coveralls (~> 0.8.3)
mathnet-crawler!
pry (~> 0.10)
rake (~> 10.0)
rspec (~> 3.3.0)
rubocop (~> 0.34.2)
simplecov (~> 0.10.0)
webmock (~> 1.22.1)
yard (~> 0.8)

BUNDLED WITH
1.10.6
4 changes: 3 additions & 1 deletion README.md
@@ -1,4 +1,6 @@
# Mathnet::Crawler
[![Build Status](https://travis-ci.org/EvgeneOskin/ruby-mathnet-crawler.svg)](https://travis-ci.org/EvgeneOskin/ruby-mathnet-crawler)
[![Coverage Status](https://coveralls.io/repos/EvgeneOskin/ruby-mathnet-crawler/badge.svg?branch=master&service=github)](https://coveralls.io/github/EvgeneOskin/ruby-mathnet-crawler?branch=master)

This project is a Ruby library and CLI tool to search and download articles from [MathNet](http://mathnet.ru/).

Expand All @@ -22,7 +24,7 @@ Or install it yourself as:
$ gem install mathnet-crawler
```

## Usagerm
## Usage

TODO: Write usage instructions here

Expand Down
75 changes: 41 additions & 34 deletions exe/mathnet
Expand Up @@ -6,118 +6,125 @@ require 'net/http'
require 'mathnet/crawler'
require 'exponential_backoff'

# CLI to search and download articles from the mathnet site.
class MathnetApplication
include Commander::Methods

# Initialize constants.
def initialize
# Interval / elapsed-time limits; presumably seconds used to configure the
# retry backoff, whose construction is not visible here — TODO confirm.
@minimal_interval = 1.0
@maximal_elapsed_time = 600.0
# Root directory under which article_path places the downloaded files.
@base_dir = 'mathnet'
end

# Process the program call made from the command line.
def run
# Program metadata shown by the command-line interface.
program :name, 'Mathnet crawler.'
program :version, Mathnet::Crawler::VERSION
program :description, 'Command that act like missed mathnet client.'

# Register every sub-command before handing control to run!.
add_download_all_command
add_articles_command
add_journals_command
run!
end

# Register the `download all` command: list journals, then their issues,
# then the articles of those issues, and finally download every article.
def add_download_all_command
  command :'download all' do |c|
    c.syntax = 'download all'
    c.description = 'Download all articals.'
    # The action ignores the arguments and options it is invoked with.
    c.action do
      journals = list_journals
      issues = list_issues(journals)
      articles = list_articals(issues)
      download_aricles(articles)
    end
  end
end

# Register the `articles` command, which prints one
# "<journal title>/<article title>" line per article found.
def add_articles_command
  command :articles do |c|
    c.syntax = 'articles'
    c.description = 'List articles per journal.'
    c.action do
      # Explicit parentheses keep the journals -> issues -> articles
      # pipeline unambiguous.
      articles = list_articals(list_issues(list_journals))
      articles.each do |article|
        say "#{article.journal_title}/#{article.title}"
      end
    end
  end
end

# Register the `journals` command, which prints the title of every journal.
# Registration only declares the command; starting the program is left to
# `run`, so no `run!` call belongs here.
def add_journals_command
  command :journals do |c|
    c.syntax = 'journals'
    c.description = 'List journals on mathnet.'
    c.action do
      list_journals.each { |journal| say journal.title }
    end
  end
end

# Return all journals published on mathnet.
def list_journals
  # A single retry wrapper is enough; nesting process_backoff would
  # multiply the retry attempts.
  process_backoff do
    Mathnet::Crawler::Journal.list(Mathnet::Crawler::Library.new)
  end
end

# Return all issues that exist in the passed journals.
# @param journals [Array<Mathnet::Crawler::Journal>] journals that have issues.
def list_issues(journals)
  # One Issue.list call per journal, each wrapped in the retry helper.
  issues_lists = Parallel.map(journals, progress: 'List issues') do |journal|
    process_backoff { Mathnet::Crawler::Issue.list journal }
  end
  # Seeding the fold with [] returns an empty array (not nil) when no
  # journals were given, so the result can always be iterated.
  issues_lists.reduce([], :+)
end

# Return all articles that exist in the passed issues.
# @param issues [Array<Mathnet::Crawler::Issue>] issues that have articles.
def list_articals(issues)
  # One Article.list call per issue, each wrapped in the retry helper.
  articals_lists = Parallel.map(issues, progress: 'List articals') do |issue|
    process_backoff { Mathnet::Crawler::Article.list issue }
  end
  # Issues that produced no list (nil/false) are skipped wherever they
  # occur, including in first position; the [] seed makes empty input
  # return [] instead of nil.
  articals_lists.select { |list| list }.reduce([], :+)
end

# Store the full texts of the passed articles.
# @param articles [Array<Mathnet::Crawler::Article>] articles that have
# full texts.
def download_aricles(articles)
  Parallel.each(articles, progress: 'Download texts') do |article|
    pdf_path = article_path article
    process_backoff do
      article.full_text do |body|
        # Binary mode keeps the bytes untouched, and the block form closes
        # the file even when the write raises.
        File.open(pdf_path, 'wb') { |pdf| pdf.write body }
      end
      # full_text may not yield at all; report success so the retry
      # helper does not repeat the request.
      true
    end
  end
end

# Execute the block with exponential backoff.
# @param block [Proc] performs the HTTP request; when an HTTP error occurs
# the request is retried.
def process_backoff(&block)
  # The last attempt's value is captured in a local instead of relying on
  # whatever until_success itself returns (NOTE(review): the backoff
  # helper's return value is not visible here — confirm).
  outcome = nil
  backoff.until_success do
    outcome =
      begin
        # Called exactly once per attempt.
        block.call
      rescue Net::HTTPServerException
        # A falsy value tells the backoff loop to try again.
        false
      end
  end
  outcome
end

# Build the path of the PDF file for an article, creating the journal
# directory when it does not exist yet.
# @param article [#journal_title, #title] article to be stored.
# @return [String] "<base dir>/<journal title>/<article title>.pdf".
def article_path(article)
  directory = File.join @base_dir, article.journal_title
  # mkdir_p does nothing for an existing directory, so no existence guard
  # is needed; the `if Dir.exist?` guard was inverted and never created a
  # missing directory.
  FileUtils.mkdir_p directory
  File.join directory, "#{article.title}.pdf"
end

Expand Down
23 changes: 15 additions & 8 deletions lib/mathnet/crawler.rb
Expand Up @@ -3,10 +3,14 @@
require 'net/http'
require 'nokogiri'

module Mathnet # :nodoc:
module Crawler # :nodoc:
# Major module for Mathnet API.
module Mathnet
# WEB Crawler to simulate mathnet web site via API.
module Crawler
# Base class for main kind of data on mathnet.
class Entry
module Listable
# Module to store list operations with entries.
module Listable
CSS_FILTER = 'a.SLink'

def list(parent)
Expand All @@ -20,12 +24,13 @@ def list(parent)
end
end
end

# Title with every carriage return and line feed character removed.
# @return [String]
def title
  # String#delete treats its argument as a character set, so one call
  # strips both "\r" and "\n".
  @title.delete("\r\n")
end
end

# Custom client to make http requests.
class HTTPClient
def initialize(host: 'www.mathnet.ru')
@base_uri = URI('http://' + host)
Expand Down Expand Up @@ -64,12 +69,14 @@ def uri(path)
end
end

# Class representing the mathnet main page.
class Library
# Site-relative path of the page that lists this entry's children
# (presumably the journals index — TODO confirm).
# @return [String]
def children_url
'/ej.phtml'
end
end

# Science journal.
class Journal < Entry
@detail_url_reqexp = %r{/php/journal.phtml}

Expand All @@ -89,6 +96,7 @@ def children_url
end
end

# Single issue of a journal.
class Issue < Entry
@detail_url_reqexp = %r{/php/archive.phtml?.*wshow=issue}

Expand All @@ -110,6 +118,7 @@ def journal_title
end
end

# Single article of an issue.
class Article < Entry
@detail_url_reqexp = %r{/rus/}

Expand Down Expand Up @@ -140,14 +149,12 @@ def full_text_url
# Fetch the article's full text and pass the response body to the block.
# The block is not called when the response is an HTML page.
# @param block [Proc] receives the response body.
# @return [Object, nil] the block's result, or nil when it was not called.
def full_text(&block)
  client = HTTPClient.new
  payload = client.get full_text_url
  # The header may carry parameters ("text/html; charset=utf-8") or differ
  # in case, so only the media-type prefix is compared. A missing header
  # still counts as "not HTML".
  content_type = payload['Content-Type'].to_s.downcase
  block.call payload.body unless content_type.start_with?('text/html')
end

# Journal title, delegated to the parent entry held in @parent.
def journal_title
@parent.journal_title
end
end
end
end
end
2 changes: 1 addition & 1 deletion lib/mathnet/crawler/version.rb
@@ -1,5 +1,5 @@
module Mathnet # :nodoc:
  module Crawler # :nodoc:
    # Gem version; assigned once so that loading the file does not emit an
    # "already initialized constant" warning.
    VERSION = '0.1.1'
  end
end
5 changes: 4 additions & 1 deletion mathnet-crawler.gemspec
Expand Up @@ -10,7 +10,8 @@ Gem::Specification.new do |spec|
spec.email = ['eoskin@crystalnix.com']

spec.summary = 'Tool kit to operate with mathnet.ru'
# The trailing space keeps the two adjacent literals from fusing into
# "...CLI tooperate...".
spec.description = 'The Library provides API and CLI to ' \
                   'operate with mathnet.ru.'
spec.homepage = 'https://github.com/EvgeneOskin/ruby-mathnet-crawler'

# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
Expand Down Expand Up @@ -41,4 +42,6 @@ Gem::Specification.new do |spec|
spec.add_development_dependency 'rspec', '~> 3.3.0'
spec.add_development_dependency 'simplecov', '~> 0.10.0'
spec.add_development_dependency 'webmock', '~> 1.22.1'
spec.add_development_dependency 'coveralls', '~> 0.8.3'
spec.add_development_dependency 'yard', '~> 0.8'
end

0 comments on commit 412423a

Please sign in to comment.