Feature/bulk reindex urls (#175)
* SRCH-202 - scrub job keywords from query

* update jobs.rb to have options in the jobs module

* rename job_scrub to scrub_keyword; use squish instead of strip and delete instead of gsub

* add a comment to explain why we keep the keyword

* fix bracket and update scrub_keyword spec

* update jobs.rb to scrub the keyword when the query contains 'employment', 'jobs', 'posting', or 'opening'

* update spec/vcr_cassettes/govbox_set/new_query_/affiliate_geoip_info_when_the_affiliate_has_the_jobs_govbox_enabled.yml

* update vcr cassettes

* SRCH-183, SRCH-181 - bulk reindex urls

* add a test to check that urls can be enqueued for reindexing

* wrap line for attr_accessible;
remove parentheses in rspec test

* Feature/bool db column urls (#172)

* SRCH-179 - Add a boolean column to the searchgov_urls table

* update structure.sql

* add migration file for enqueued_for_reindex to searchgov_url (sketched after the change summary below)
peggles2 committed Dec 12, 2018
1 parent 50329a8 commit 30fca50
Showing 11 changed files with 127 additions and 64 deletions.
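The migration and structure.sql changes mentioned in the commit message are not among the files loaded below. A minimal sketch of what such a migration plausibly looks like (the class name, false default, and migration superclass are assumptions, not the committed code):

# Hypothetical sketch; the committed migration file is not shown in this diff.
class AddEnqueuedForReindexToSearchgovUrls < ActiveRecord::Migration
  def change
    # A false default keeps existing rows out of the fetch_required scope
    # until they are explicitly enqueued.
    add_column :searchgov_urls, :enqueued_for_reindex, :boolean,
               null: false, default: false
  end
end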
14 changes: 11 additions & 3 deletions app/models/searchgov_url.rb
@@ -14,7 +14,11 @@ class SearchgovUrl < ActiveRecord::Base
     application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
   ]

-  attr_accessible :last_crawl_status, :last_crawled_at, :url, :lastmod
+  attr_accessible :last_crawl_status,
+                  :last_crawled_at,
+                  :url,
+                  :lastmod,
+                  :enqueued_for_reindex
   attr_reader :response, :document, :tempfile
   attr_readonly :url

@@ -35,14 +39,18 @@ class SearchgovUrl < ActiveRecord::Base
     column_name: proc {|url| !url.fetched? ? 'unfetched_urls_count' : nil },
     column_names: { ['searchgov_urls.last_crawled_at IS NULL'] => 'unfetched_urls_count' }

-  scope :fetch_required, -> { where('last_crawled_at IS NULL OR lastmod > last_crawled_at') }
+  scope :fetch_required, -> do
+    where('last_crawled_at IS NULL
+           OR lastmod > last_crawled_at
+           OR enqueued_for_reindex')
+  end

   class SearchgovUrlError < StandardError; end
   class DomainError < StandardError; end

   def fetch
     raise DomainError.new("#{searchgov_domain.domain}: #{searchgov_domain.status}") if !searchgov_domain.available?
-    self.update_attributes(last_crawled_at: Time.now)
+    update(last_crawled_at: Time.now, enqueued_for_reindex: false)
     self.load_time = Benchmark.realtime do
       DocumentFetchLogger.new(url, 'searchgov_url').log
       begin
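Taken together, the new column, the widened fetch_required scope, and the update call in #fetch let URLs be flagged for reindexing in bulk and then cleared one by one as they are re-fetched. A hypothetical usage sketch (the bulk-enqueue entry point itself is not part of the visible diff, and `domain` stands in for a SearchgovDomain record):

# Hypothetical sketch, not code from this commit.
# Flag every URL in a domain for reindexing, bypassing the lastmod check;
# update_all writes directly to the table, skipping callbacks.
SearchgovUrl.where(searchgov_domain_id: domain.id).
  update_all(enqueued_for_reindex: true)

# The regular fetch cycle then picks the flagged rows up via the scope,
# and each successful fetch resets the flag to false:
SearchgovUrl.fetch_required.find_each(&:fetch)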
18 changes: 18 additions & 0 deletions spec/fixtures/searchgov_urls.yml
@@ -0,0 +1,18 @@
+new:
+  url: http://www.agency.gov/new
+
+outdated:
+  url: http://www.agency.gov/outdated
+  last_crawled_at: <%= 1.week.ago.to_s(:db) %>
+  lastmod: <%= 1.day.ago.to_s(:db) %>
+
+current:
+  url: http://www.agency.gov/current
+  last_crawled_at: <%= 1.day.ago.to_s(:db) %>
+  lastmod: <%= 1.week.ago.to_s(:db) %>
+
+enqueued:
+  url: http://www.agency.gov/enqueued
+  last_crawled_at: <%= 1.day.ago.to_s(:db) %>
+  lastmod: <%= 1.week.ago.to_s(:db) %>
+  enqueued_for_reindex: true
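These fixtures replace the per-spec create! calls removed below; any spec that declares them (as searchgov_url_spec.rb now does) can reference a row by name through the standard Rails fixture accessor. For illustration (not a spec from this commit):

# Illustration only: the fixture accessor is named after the fixture file.
describe SearchgovUrl do
  fixtures :searchgov_urls

  it 'loads the enqueued fixture with the flag set' do
    expect(searchgov_urls(:enqueued).enqueued_for_reindex).to be true
  end
end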
2 changes: 1 addition & 1 deletion spec/jobs/searchgov_domain_destroyer_job_spec.rb
@@ -28,7 +28,7 @@
     let!(:searchgov_url2) { SearchgovUrl.create!(url: url2) }

     it 'destroys the searchgov_urls' do
-      expect{ perform }.to change{ SearchgovUrl.count }.from(2).to(0)
+      expect { perform }.to change{ SearchgovUrl.count }.by(-2)
     end
   end
 end
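This and the similar expectation changes in the specs below trade absolute counts for relative ones. With fixtures now able to seed the searchgov_urls table, an assertion pinned to exact totals depends on the table's starting state, while a relative delta does not:

# Brittle: assumes the table starts with exactly two rows.
expect { perform }.to change { SearchgovUrl.count }.from(2).to(0)
# Robust: holds no matter how many fixture rows are preloaded.
expect { perform }.to change { SearchgovUrl.count }.by(-2)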
2 changes: 1 addition & 1 deletion spec/lib/tasks/searchgov_spec.rb
@@ -77,7 +77,7 @@
     it 'indexes new urls' do
       allow(I14yDocument).to receive(:promote).
         with(handle: 'searchgov', document_id: doc_id, bool: 'true').at_least(:once)
-      expect{ promote_urls }.to change{ SearchgovUrl.count }.from(0).to(1)
+      expect { promote_urls }.to change{ SearchgovUrl.count }.by(1)
     end

     it 'creates new urls' do
39 changes: 27 additions & 12 deletions spec/models/searchgov_url_spec.rb
@@ -1,6 +1,8 @@
 require 'spec_helper'

 describe SearchgovUrl do
+  fixtures :searchgov_urls
+
   let(:url) { 'http://www.agency.gov/boring.html' }
   let(:html) { read_fixture_file("/html/page_with_og_metadata.html") }
   let(:valid_attributes) { { url: url } }
@@ -22,19 +24,20 @@

   describe 'scopes' do
     describe '.fetch_required' do
-      before do
-        SearchgovUrl.create!(url: 'http://www.agency.gov/new')
-        SearchgovUrl.create!(
-          url: 'http://www.agency.gov/outdated', last_crawled_at: 1.week.ago, lastmod: 1.day.ago
-        )
-        SearchgovUrl.create!(
-          url: 'http://www.agency.gov/current', last_crawled_at: 1.day.ago, lastmod: 1.week.ago
-        )
-      end

       it 'includes urls that have never been crawled and outdated urls' do
         expect(SearchgovUrl.fetch_required.pluck(:url)).
-          to eq %w[http://www.agency.gov/new http://www.agency.gov/outdated]
+          to include('http://www.agency.gov/new', 'http://www.agency.gov/outdated')
       end
+
+      it 'does not include current, crawled and not enqueued urls' do
+        expect(SearchgovUrl.fetch_required.pluck(:url)).
+          not_to include('http://www.agency.gov/current')
+      end
+
+      it 'includes urls that have been enqueued for reindexing' do
+        expect(SearchgovUrl.fetch_required.pluck(:url)).
+          to include 'http://www.agency.gov/enqueued'
+      end
     end
   end
@@ -43,7 +46,8 @@
   it 'requires a valid domain' do
     searchgov_url = SearchgovUrl.new(url: 'https://foo/bar')
     expect(searchgov_url).not_to be_valid
-    expect(searchgov_url.errors.messages[:searchgov_domain]).to include 'is invalid'
+    expect(searchgov_url.errors.messages[:searchgov_domain]).
+      to include 'is invalid'
   end

   describe 'validating url uniqueness' do
@@ -75,7 +79,7 @@
     end

     it 'deletes the Searchgov Url' do
-      expect{ searchgov_url.destroy }.to change{ SearchgovUrl.count }.from(1).to(0)
+      expect { searchgov_url.destroy }.to change{ SearchgovUrl.count }.by(-1)
     end
   end
 end
@@ -128,6 +132,17 @@
       fetch
     end

+    context 'when the record is enqueued for reindex' do
+      let(:searchgov_url) do
+        SearchgovUrl.create!(valid_attributes.merge(enqueued_for_reindex: true))
+      end
+
+      it 'sets enqueued_for_reindex to false' do
+        expect { fetch }.to change{ searchgov_url.enqueued_for_reindex }.
+          from(true).to(false)
+      end
+    end
+
     context 'when the record includes a lastmod value' do
       let(:valid_attributes) { { url: url, lastmod: '2018-01-01' } }
6 changes: 3 additions & 3 deletions spec/models/sitemap_indexer_spec.rb
@@ -24,7 +24,7 @@
   subject(:index) { indexer.index }

   it 'creates searchgov urls' do
-    expect{ index }.to change{SearchgovUrl.count}.from(0).to(1)
+    expect { index }.to change{ SearchgovUrl.count }.by(1)
   end

   it 'updates the counter cache columns' do
@@ -115,7 +115,7 @@
     let(:sitemap_entries) { "<url>\n <loc>\n http://agency.gov/doc1 \n </loc>\n </url>" }

     it 'creates a searchgov_url record' do
-      expect{index}.to change{SearchgovUrl.count}.from(0).to(1)
+      expect { index }.to change{ SearchgovUrl.count }.by(1)
     end
   end

@@ -143,7 +143,7 @@

   it 'ignores them' do
     index
-    expect(SearchgovUrl.pluck(:url)).to eq ['http://agency.gov/doc1']
+    expect(SearchgovUrl.pluck(:url)).not_to include 'http://other.gov/doc1'
   end
 end
44 changes: 33 additions & 11 deletions spec/support/fetchable_behavior.rb
@@ -30,34 +30,52 @@
   describe 'scopes' do
     context 'by last_crawl_status or last_crawled_at' do
       before do
-        described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok', last_crawl_status: 'OK', last_crawled_at: 1.day.ago))
-        described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed', last_crawl_status: 'failed', last_crawled_at: 1.day.ago))
-        described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched', last_crawl_status: nil, last_crawled_at: nil))
+        described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok',
+                                                       last_crawl_status: 'OK',
+                                                       last_crawled_at: 1.day.ago))
+        described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed',
+                                                       last_crawl_status: 'failed',
+                                                       last_crawled_at: 1.day.ago))
+        described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched',
+                                                       last_crawl_status: nil,
+                                                       last_crawled_at: nil))
       end

       describe '.fetched' do
         it 'includes successfully and unsuccessfully fetched records' do
           expect(described_class.fetched.pluck(:url)).
-            to match_array %w[http://agency.gov/ok http://agency.gov/failed]
+            to include('http://agency.gov/ok', 'http://agency.gov/failed')
         end
+
+        it 'does not include unfetched records' do
+          expect(described_class.fetched.pluck(:url)).
+            not_to include 'http://agency.gov/unfetched'
+        end
       end

       describe '.unfetched' do
         it 'includes unfetched records' do
-          expect(described_class.unfetched.pluck(:url)).to eq ['http://agency.gov/unfetched']
+          expect(described_class.unfetched.pluck(:url)).
+            to include 'http://agency.gov/unfetched'
         end
+
+        it 'does not include fetched records' do
+          expect(described_class.unfetched.pluck(:url)).
+            not_to include 'http://agency.gov/ok'
+        end
       end

       describe '.ok' do
         it 'includes successfully fetched records' do
-          expect(described_class.ok.pluck(:url)).to match_array ['http://agency.gov/ok']
+          expect(described_class.ok.pluck(:url)).
+            to match_array ['http://agency.gov/ok']
         end
       end

       describe '.not_ok' do
         it 'includes failed or unfetched records' do
           expect(described_class.not_ok.pluck(:url)).
-            to match_array %w[http://agency.gov/unfetched http://agency.gov/failed]
+            to include('http://agency.gov/unfetched', 'http://agency.gov/failed')
         end
       end
     end
@@ -76,28 +94,32 @@
     context "when an URL contains an anchor tag" do
       let(:url) { "http://www.nps.gov/sdfsdf#anchorme" }
       it "should remove it" do
-        expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/sdfsdf")
+        expect(described_class.create!(valid_attributes.merge(url: url)).url).
+          to eq("http://www.nps.gov/sdfsdf")
       end
     end

     context "when URL is mixed case" do
       let(:url) { "HTTP://Www.nps.GOV/UsaGovLovesToCapitalize" }
       it "should downcase the scheme and host only" do
-        expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
+        expect(described_class.create!(valid_attributes.merge(url: url)).url).
+          to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
       end
     end

     context "when URL is missing trailing slash for a scheme+host URL" do
       let(:url) { "http://www.nps.gov" }
       it "should append a /" do
-        expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/")
+        expect(described_class.create!(valid_attributes.merge(url: url)).url).
+          to eq("http://www.nps.gov/")
       end
     end

     context "when URL contains duplicate leading slashes in request" do
       let(:url) { "http://www.nps.gov//hey/I/am/usagov/and/love/extra////slashes.shtml" }
       it "should collapse the slashes" do
-        expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
+        expect(described_class.create!(valid_attributes.merge(url: url)).url).
+          to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
       end
     end

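The fetched, unfetched, ok, and not_ok scopes exercised by this shared example live in the Fetchable module, which is not part of this diff. A plausible sketch of the definitions under test, inferred from the expectations above (assumed, not the repository's actual code):

# Assumed definitions, reconstructed from the spec behavior:
scope :fetched,   -> { where.not(last_crawled_at: nil) }
scope :unfetched, -> { where(last_crawled_at: nil) }
scope :ok,        -> { where(last_crawl_status: 'OK') }
scope :not_ok,    -> { where("last_crawl_status <> 'OK' OR last_crawled_at IS NULL") }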
30 changes: 15 additions & 15 deletions spec/vcr_cassettes/jobs/search_options_.yml
@@ -1222,9 +1222,9 @@ http_interactions:
       string: ''
     headers:
       Authorization-Key:
-      - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+      - "<JOBS_AUTHORIZATION_KEY>"
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
@@ -1877,9 +1877,9 @@
       string: ''
     headers:
       Authorization-Key:
-      - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+      - "<JOBS_AUTHORIZATION_KEY>"
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
@@ -3061,7 +3061,7 @@ http_interactions:
       Authorization-Key:
       - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
@@ -3111,9 +3111,9 @@
       string: ''
     headers:
       Authorization-Key:
-      - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+      - "<JOBS_AUTHORIZATION_KEY>"
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
@@ -3163,9 +3163,9 @@
       string: ''
     headers:
       Authorization-Key:
-      - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+      - "<JOBS_AUTHORIZATION_KEY>"
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
@@ -4345,9 +4345,9 @@
       string: ''
     headers:
       Authorization-Key:
-      - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+      - "<JOBS_AUTHORIZATION_KEY>"
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
@@ -4975,9 +4975,9 @@
       string: ''
     headers:
       Authorization-Key:
-      - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+      - "<JOBS_AUTHORIZATION_KEY>"
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
@@ -5229,9 +5229,9 @@
       string: ''
     headers:
       Authorization-Key:
-      - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+      - "<JOBS_AUTHORIZATION_KEY>"
       User-Agent:
-      - parissa.eggleston@gmail.com
+      - "<JOBS_USER_AGENT>"
       Accept-Encoding:
       - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
       Accept:
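The <JOBS_AUTHORIZATION_KEY> and <JOBS_USER_AGENT> placeholders show that the previously recorded credentials are now scrubbed from these cassettes, presumably via VCR's filter_sensitive_data hook. A minimal sketch of such a configuration (the actual VCR setup change and the environment variable names are assumptions):

# Hypothetical sketch; the real VCR configuration is not in the visible diff.
VCR.configure do |config|
  config.filter_sensitive_data('<JOBS_AUTHORIZATION_KEY>') { ENV['JOBS_AUTHORIZATION_KEY'] }
  config.filter_sensitive_data('<JOBS_USER_AGENT>') { ENV['JOBS_USER_AGENT'] }
end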