Feature/bulk reindex urls #175

Merged · 27 commits · Dec 12, 2018
14 changes: 11 additions & 3 deletions app/models/searchgov_url.rb
@@ -14,7 +14,11 @@ class SearchgovUrl < ActiveRecord::Base
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
]

attr_accessible :last_crawl_status, :last_crawled_at, :url, :lastmod
attr_accessible :last_crawl_status,
:last_crawled_at,
:url,
:lastmod,
:enqueued_for_reindex
attr_reader :response, :document, :tempfile
attr_readonly :url

@@ -35,14 +39,18 @@ class SearchgovUrl < ActiveRecord::Base
column_name: proc {|url| !url.fetched? ? 'unfetched_urls_count' : nil },
column_names: { ['searchgov_urls.last_crawled_at IS NULL'] => 'unfetched_urls_count' }

scope :fetch_required, -> { where('last_crawled_at IS NULL OR lastmod > last_crawled_at') }
scope :fetch_required, -> do
where('last_crawled_at IS NULL
OR lastmod > last_crawled_at
OR enqueued_for_reindex')
end

class SearchgovUrlError < StandardError; end
class DomainError < StandardError; end

def fetch
raise DomainError.new("#{searchgov_domain.domain}: #{searchgov_domain.status}") if !searchgov_domain.available?
self.update_attributes(last_crawled_at: Time.now)
update(last_crawled_at: Time.now, enqueued_for_reindex: false)
self.load_time = Benchmark.realtime do
DocumentFetchLogger.new(url, 'searchgov_url').log
begin
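
The new enqueued_for_reindex flag lets a bulk operation mark many records for refetching without touching their crawl timestamps: the widened fetch_required scope picks flagged records up, and fetch clears the flag once the URL has actually been recrawled. (The switch from update_attributes to update is a modernization; the two are aliases in Rails 5.) A minimal sketch of how a bulk entry point might set the flag, assuming a rake task and a searchgov_domain association by these names, neither of which appears in this diff:

# Illustrative sketch, not part of this PR. The task name, argument
# handling, and association name are assumptions.
namespace :searchgov do
  desc 'Enqueue every URL for a domain for reindexing'
  task :bulk_reindex, [:domain] => :environment do |_t, args|
    # update_all issues a single UPDATE and skips callbacks and validations,
    # which is fine for flipping a boolean on many rows.
    SearchgovUrl.joins(:searchgov_domain).
      where(searchgov_domains: { domain: args[:domain] }).
      update_all(enqueued_for_reindex: true)
  end
end

Records flagged this way satisfy the third branch of fetch_required and are refetched on the next crawl cycle, at which point fetch resets the flag.
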
18 changes: 18 additions & 0 deletions spec/fixtures/searchgov_urls.yml
@@ -0,0 +1,18 @@
new:
url: http://www.agency.gov/new

outdated:
url: http://www.agency.gov/outdated
last_crawled_at: <%= 1.week.ago.to_s(:db) %>
lastmod: <%= 1.day.ago.to_s(:db) %>

current:
url: http://www.agency.gov/current
last_crawled_at: <%= 1.day.ago.to_s(:db) %>
lastmod: <%= 1.week.ago.to_s(:db) %>

enqueued:
url: http://www.agency.gov/enqueued
last_crawled_at: <%= 1.day.ago.to_s(:db) %>
lastmod: <%= 1.week.ago.to_s(:db) %>
enqueued_for_reindex: true
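
The four fixtures cover each branch of the fetch_required scope. Given these records, the scope should behave roughly as follows (an illustrative console session, not part of the diff):

# new: included, never crawled (last_crawled_at is NULL)
# outdated: included, lastmod > last_crawled_at
# enqueued: included, enqueued_for_reindex is true
# current: excluded, crawled after its lastmod and not enqueued
SearchgovUrl.fetch_required.pluck(:url)
# => ["http://www.agency.gov/new",
#     "http://www.agency.gov/outdated",
#     "http://www.agency.gov/enqueued"]
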
2 changes: 1 addition & 1 deletion spec/jobs/searchgov_domain_destroyer_job_spec.rb
@@ -28,7 +28,7 @@
let!(:searchgov_url2) { SearchgovUrl.create!(url: url2) }

it 'destroys the searchgov_urls' do
expect{ perform }.to change{ SearchgovUrl.count }.from(2).to(0)
expect { perform }.to change{ SearchgovUrl.count }.by(-2)
end
end
end
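
This matcher change, repeated throughout the PR, is what makes the new fixtures safe to introduce: a relative count assertion survives preloaded fixture rows, while an absolute one encodes the table's exact starting state. An illustrative comparison:

expect { perform }.to change { SearchgovUrl.count }.by(-2)        # robust
expect { perform }.to change { SearchgovUrl.count }.from(2).to(0) # brittle once fixtures exist
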
2 changes: 1 addition & 1 deletion spec/lib/tasks/searchgov_spec.rb
@@ -77,7 +77,7 @@
it 'indexes new urls' do
allow(I14yDocument).to receive(:promote).
with(handle: 'searchgov', document_id: doc_id, bool: 'true').at_least(:once)
expect{ promote_urls }.to change{ SearchgovUrl.count }.from(0).to(1)
expect { promote_urls }.to change{ SearchgovUrl.count }.by(1)
end

it 'creates new urls' do
39 changes: 27 additions & 12 deletions spec/models/searchgov_url_spec.rb
@@ -1,6 +1,8 @@
require 'spec_helper'

describe SearchgovUrl do
fixtures :searchgov_urls

let(:url) { 'http://www.agency.gov/boring.html' }
let(:html) { read_fixture_file("/html/page_with_og_metadata.html") }
let(:valid_attributes) { { url: url } }
@@ -22,19 +24,20 @@

describe 'scopes' do
describe '.fetch_required' do
before do
SearchgovUrl.create!(url: 'http://www.agency.gov/new')
SearchgovUrl.create!(
url: 'http://www.agency.gov/outdated', last_crawled_at: 1.week.ago, lastmod: 1.day.ago
)
SearchgovUrl.create!(
url: 'http://www.agency.gov/current', last_crawled_at: 1.day.ago, lastmod: 1.week.ago
)
end

it 'includes urls that have never been crawled and outdated urls' do
expect(SearchgovUrl.fetch_required.pluck(:url)).
to eq %w[http://www.agency.gov/new http://www.agency.gov/outdated]
to include('http://www.agency.gov/new', 'http://www.agency.gov/outdated')
end

it 'does not include current, crawled and not enqueued urls' do
expect(SearchgovUrl.fetch_required.pluck(:url)).
not_to include('http://www.agency.gov/current')
end

it 'includes urls that have been enqueued for reindexing' do
expect(SearchgovUrl.fetch_required.pluck(:url)).
to include 'http://www.agency.gov/enqueued'
end
end
end
@@ -43,7 +46,8 @@
it 'requires a valid domain' do
searchgov_url = SearchgovUrl.new(url: 'https://foo/bar')
expect(searchgov_url).not_to be_valid
expect(searchgov_url.errors.messages[:searchgov_domain]).to include 'is invalid'
expect(searchgov_url.errors.messages[:searchgov_domain]).
to include 'is invalid'
end

describe 'validating url uniqueness' do
@@ -75,7 +79,7 @@
end

it 'deletes the Searchgov Url' do
expect{ searchgov_url.destroy }.to change{ SearchgovUrl.count }.from(1).to(0)
expect { searchgov_url.destroy }.to change{ SearchgovUrl.count }.by(-1)
end
end
end
@@ -128,6 +132,17 @@
fetch
end

context 'when the record is enqueued for reindex' do
let(:searchgov_url) do
SearchgovUrl.create!(valid_attributes.merge(enqueued_for_reindex: true))
end

it 'sets enqueued_for_reindex to false' do
expect { fetch }.to change{ searchgov_url.enqueued_for_reindex }.
from(true).to(false)
end
end

context 'when the record includes a lastmod value' do
let(:valid_attributes) { { url: url, lastmod: '2018-01-01' } }

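
Together with the scope spec above, this closes the loop on the feature: enqueue a URL, see it appear in fetch_required, fetch it, and the flag is cleared. An end-to-end sketch, assuming the domain is available and the HTTP fetch succeeds (the specs stub this; nothing below appears in the diff):

# Illustrative lifecycle, not part of this PR.
url = SearchgovUrl.create!(url: 'http://www.agency.gov/doc')
url.update(enqueued_for_reindex: true)
SearchgovUrl.fetch_required.include?(url) # => true
url.fetch                                 # recrawls, then clears the flag
url.enqueued_for_reindex                  # => false
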
6 changes: 3 additions & 3 deletions spec/models/sitemap_indexer_spec.rb
@@ -24,7 +24,7 @@
subject(:index) { indexer.index }

it 'creates searchgov urls' do
expect{ index }.to change{SearchgovUrl.count}.from(0).to(1)
expect { index }.to change{ SearchgovUrl.count }.by(1)
end

it 'updates the counter cache columns' do
@@ -115,7 +115,7 @@
let(:sitemap_entries) { "<url>\n <loc>\n http://agency.gov/doc1 \n </loc>\n </url>" }

it 'creates a searchgov_url record' do
expect{index}.to change{SearchgovUrl.count}.from(0).to(1)
expect { index }.to change{ SearchgovUrl.count }.by(1)
end
end

@@ -143,7 +143,7 @@

it 'ignores them' do
index
expect(SearchgovUrl.pluck(:url)).to eq ['http://agency.gov/doc1']
expect(SearchgovUrl.pluck(:url)).not_to include 'http://other.gov/doc1'
end
end

44 changes: 33 additions & 11 deletions spec/support/fetchable_behavior.rb
@@ -30,34 +30,52 @@
describe 'scopes' do
context 'by last_crawl_status or last_crawled_at' do
before do
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok', last_crawl_status: 'OK', last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed', last_crawl_status: 'failed', last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched', last_crawl_status: nil, last_crawled_at: nil))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok',
last_crawl_status: 'OK',
last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed',
last_crawl_status: 'failed',
last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched',
last_crawl_status: nil,
last_crawled_at: nil))
end

describe '.fetched' do
it 'includes successfully and unsuccessfully fetched records' do
expect(described_class.fetched.pluck(:url)).
to match_array %w[http://agency.gov/ok http://agency.gov/failed]
to include('http://agency.gov/ok', 'http://agency.gov/failed')
end

it 'does not include unfetched records' do
expect(described_class.fetched.pluck(:url)).
not_to include 'http://agency.gov/unfetched'
end
end

describe '.unfetched' do
it 'includes unfetched records' do
expect(described_class.unfetched.pluck(:url)).to eq ['http://agency.gov/unfetched']
expect(described_class.unfetched.pluck(:url)).
to include 'http://agency.gov/unfetched'
end

it 'does not include fetched records' do
expect(described_class.unfetched.pluck(:url)).
not_to include 'http://agency.gov/ok'
end
end

describe '.ok' do
it 'includes successfully fetched records' do
expect(described_class.ok.pluck(:url)).to match_array ['http://agency.gov/ok']
expect(described_class.ok.pluck(:url)).
to match_array ['http://agency.gov/ok']
end
end

describe '.not_ok' do
it 'includes failed or unfetched records' do
expect(described_class.not_ok.pluck(:url)).
to match_array %w[http://agency.gov/unfetched http://agency.gov/failed]
to include('http://agency.gov/unfetched', 'http://agency.gov/failed')
end
end
end
@@ -76,28 +94,32 @@
context "when an URL contains an anchor tag" do
let(:url) { "http://www.nps.gov/sdfsdf#anchorme" }
it "should remove it" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/sdfsdf")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/sdfsdf")
end
end

context "when URL is mixed case" do
let(:url) { "HTTP://Www.nps.GOV/UsaGovLovesToCapitalize" }
it "should downcase the scheme and host only" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
end
end

context "when URL is missing trailing slash for a scheme+host URL" do
let(:url) { "http://www.nps.gov" }
it "should append a /" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/")
end
end

context "when URL contains duplicate leading slashes in request" do
let(:url) { "http://www.nps.gov//hey/I/am/usagov/and/love/extra////slashes.shtml" }
it "should collapse the slashes" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
end
end

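
These shared examples run against any fetchable model via described_class, so loosening eq and match_array to include keeps them reusable by specs that load fixtures. A minimal sketch of how a host spec might pull the behavior in (the shared example group's name is an assumption):

# Hypothetical host spec; the shared-example name is assumed.
describe SearchgovUrl do
  it_behaves_like 'a fetchable model' do
    let(:valid_attributes) { { url: 'http://www.agency.gov/page' } }
  end
end
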
30 changes: 15 additions & 15 deletions spec/vcr_cassettes/jobs/search_options_.yml
@@ -1222,9 +1222,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -1877,9 +1877,9 @@
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -3061,7 +3061,7 @@
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -3111,9 +3111,9 @@
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -3163,9 +3163,9 @@
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -4345,9 +4345,9 @@
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -4975,9 +4975,9 @@
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -5229,9 +5229,9 @@
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- parissa.eggleston@gmail.com
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
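
The cassette edits above replace a recorded API key and email address with placeholders. To keep future recordings from capturing live credentials, VCR can substitute values at record time; a minimal configuration sketch, assuming the real values come from environment variables (the ENV names are assumptions):

# Hypothetical VCR setup; ENV variable names are assumptions.
VCR.configure do |config|
  config.filter_sensitive_data('<JOBS_AUTHORIZATION_KEY>') do
    ENV['JOBS_AUTHORIZATION_KEY']
  end
  config.filter_sensitive_data('<JOBS_USER_AGENT>') do
    ENV['JOBS_USER_AGENT']
  end
end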