From cb0c9d55b290e67a6020d98168c93c70bbc33133 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Fri, 29 Dec 2017 16:17:31 +0800 Subject: [PATCH 01/16] Install elasticsearch-rails gem and configurations --- Gemfile | 6 ++++-- Gemfile.lock | 30 +++++++++++++++------------- config/application.rb | 2 ++ config/environments/development.rb | 5 +++++ config/environments/production.rb | 4 ++++ config/environments/test.rb | 5 +++++ config/initializers/elasticsearch.rb | 4 +--- config/initializers/tire.rb | 5 ++--- spec/models/geoname_spec.rb | 1 - 9 files changed, 39 insertions(+), 23 deletions(-) diff --git a/Gemfile b/Gemfile index 64ebde4..7c18eca 100644 --- a/Gemfile +++ b/Gemfile @@ -8,8 +8,8 @@ end gem 'rails', '5.1.4' gem 'rails-controller-testing', '~> 1.0' gem 'nokogiri', '~> 1.8.0' -gem 'tire', '~> 0.6.2' #deprecated in 2013 -gem 'tire-contrib', '~> 0.1.2' +# gem 'tire', '~> 0.6.2' #deprecated in 2013 +# gem 'tire-contrib', '~> 0.1.2' gem 'oj', '~> 3.1.3' # Unused? gem 'faraday_middleware', '~> 0.12.2' gem 'net-http-persistent', '~> 2.8' @@ -20,6 +20,8 @@ gem 'rack-cors', '~> 1.0.2' gem 'us_states', '~> 0.1.1', git: 'https://github.com/GSA/us_states.git' gem 'newrelic_rpm', '~> 4.6.0' gem 'rake', '~> 11.0' +gem 'elasticsearch-model' +gem 'elasticsearch-rails' group :development, :test do gem 'puma', '~> 3.7' diff --git a/Gemfile.lock b/Gemfile.lock index d693232..c6f5260 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -47,7 +47,6 @@ GEM airbrake (7.1.0) airbrake-ruby (~> 2.5) airbrake-ruby (2.6.2) - ansi (1.5.0) arel (8.0.0) builder (3.2.3) capistrano (2.15.4) @@ -68,6 +67,19 @@ GEM docile (1.1.3) domain_name (0.5.20170404) unf (>= 0.0.5, < 1.0.0) + elasticsearch (5.0.4) + elasticsearch-api (= 5.0.4) + elasticsearch-transport (= 5.0.4) + elasticsearch-api (5.0.4) + multi_json + elasticsearch-model (5.0.2) + activesupport (> 3) + elasticsearch (~> 5) + hashie + elasticsearch-rails (5.0.2) + elasticsearch-transport (5.0.4) + faraday + multi_json erubi (1.7.0) faraday 
(0.13.1) multipart-post (>= 1.2, < 3) @@ -76,7 +88,7 @@ GEM ffi (1.9.18) globalid (0.4.1) activesupport (>= 4.2.0) - hashr (0.0.22) + hashie (3.5.7) highline (1.6.19) http-cookie (1.0.3) domain_name (~> 0.5) @@ -204,16 +216,6 @@ GEM thor (0.20.0) thread_safe (0.3.6) tins (0.13.2) - tire (0.6.2) - activemodel (>= 3.0) - activesupport - ansi - hashr (~> 0.0.19) - multi_json (~> 1.3) - rake - rest-client (~> 1.6) - tire-contrib (0.1.2) - tire tzinfo (1.2.4) thread_safe (~> 0.1) unf (0.1.4) @@ -230,6 +232,8 @@ DEPENDENCIES airbrake (~> 7.1) capistrano (~> 2.15.4) coveralls (~> 0.7.0) + elasticsearch-model + elasticsearch-rails faraday_middleware (~> 0.12.2) jbuilder (~> 2.7.0) listen (>= 3.0.5, < 3.2) @@ -250,8 +254,6 @@ DEPENDENCIES spring spring-watcher-listen (~> 2.0.0) test-unit (~> 3.0) - tire (~> 0.6.2) - tire-contrib (~> 0.1.2) us_states (~> 0.1.1)! BUNDLED WITH diff --git a/config/application.rb b/config/application.rb index 921c263..2305c42 100644 --- a/config/application.rb +++ b/config/application.rb @@ -14,6 +14,8 @@ class Application < Rails::Application # Configure the default encoding used in templates for Ruby 1.9. config.encoding = 'utf-8' + config.elasticsearch_config = (YAML.load_file("#{Rails.root}/config/elasticsearch.yml") || {})[Rails.env] + # Custom directories with classes and modules you want to be autoloadable. config.autoload_paths += %W(#{Rails.root}/lib #{Rails.root}/lib/constraints #{Rails.root}/lib/importers) diff --git a/config/environments/development.rb b/config/environments/development.rb index 2f14310..3603200 100644 --- a/config/environments/development.rb +++ b/config/environments/development.rb @@ -32,4 +32,9 @@ # Use an evented file watcher to asynchronously detect changes in source code, # routes, locales, etc. This feature depends on the listen gem. 
config.file_watcher = ActiveSupport::EventedFileUpdateChecker + + config.elasticsearch_client = Elasticsearch::Client.new( + host: 'localhost:9200', + log: true + ) end diff --git a/config/environments/production.rb b/config/environments/production.rb index f144ec9..fbc01c0 100644 --- a/config/environments/production.rb +++ b/config/environments/production.rb @@ -49,4 +49,8 @@ logger.formatter = config.log_formatter config.logger = ActiveSupport::TaggedLogging.new(logger) end + + config.elasticsearch_client = Elasticsearch::Client.new( + url: Rails.application.config.elasticsearch_config['url'] + ) end diff --git a/config/environments/test.rb b/config/environments/test.rb index 888b8a0..bccd63c 100644 --- a/config/environments/test.rb +++ b/config/environments/test.rb @@ -30,4 +30,9 @@ # Print deprecation notices to the stderr config.active_support.deprecation = :stderr + + config.elasticsearch_client = Elasticsearch::Client.new( + host: 'localhost:9200', + log: false + ) end diff --git a/config/initializers/elasticsearch.rb b/config/initializers/elasticsearch.rb index afdacea..6794c0c 100644 --- a/config/initializers/elasticsearch.rb +++ b/config/initializers/elasticsearch.rb @@ -1,7 +1,5 @@ module Elasticsearch; end -es_config = (YAML.load_file("#{Rails.root}/config/elasticsearch.yml") || {})[Rails.env] - -Tire::Configuration.url(es_config['url']) if es_config && es_config['url'].present? +es_config = Rails.application.config.elasticsearch_config Elasticsearch::INDEX_NAME = es_config && es_config['index_name'].present? ? es_config['index_name'].freeze : "#{Rails.env}:jobs".freeze diff --git a/config/initializers/tire.rb b/config/initializers/tire.rb index e623a70..2797d50 100644 --- a/config/initializers/tire.rb +++ b/config/initializers/tire.rb @@ -1,3 +1,2 @@ -#Tire.configure { logger STDERR, level: 'debug' } -PositionOpening.create_search_index unless PositionOpening.search_index.exists? -Geoname.create_search_index unless Geoname.search_index.exists? 
\ No newline at end of file +PositionOpening.create_search_index unless PositionOpening.search_index_exists? +Geoname.create_search_index unless Geoname.search_index_exists? diff --git a/spec/models/geoname_spec.rb b/spec/models/geoname_spec.rb index 68aa2d7..1373f4a 100644 --- a/spec/models/geoname_spec.rb +++ b/spec/models/geoname_spec.rb @@ -18,7 +18,6 @@ end end - context 'when query terms contain a synonym match with terms in location field' do before do geonames, @first_synonyms = [], [] From 2a5d6a00612f3a701ff9d76ae1bfcb2b2e9789d0 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Fri, 29 Dec 2017 18:36:45 +0800 Subject: [PATCH 02/16] Update index mappings for Geonames and Position Openings - Update search queries for Position openings --- .travis.yml | 2 +- Gemfile | 2 - app/models/geoname.rb | 138 ++++++---- app/models/position_opening.rb | 367 +++++++++++++++++--------- lib/tasks/geonames.rake | 2 +- lib/tasks/position_openings.rake | 2 +- spec/api/v2/position_openings_spec.rb | 5 +- spec/api/v3/position_openings_spec.rb | 4 +- spec/models/geoname_spec.rb | 15 +- spec/models/position_opening_spec.rb | 26 +- 10 files changed, 362 insertions(+), 201 deletions(-) diff --git a/.travis.yml b/.travis.yml index d746a99..4b57745 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ language: ruby rvm: - 2.3.5 before_install: - - curl -O https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-1.4.4.deb && sudo dpkg -i elasticsearch-1.4.4.deb && true + - curl -O https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.6.5.deb && sudo dpkg -i --force-confnew elasticsearch-5.6.5.deb && true - 'echo ''script.disable_dynamic: false'' | sudo tee --append /etc/elasticsearch/elasticsearch.yml' - 'echo ''index.number_of_shards: 1'' | sudo tee --append /etc/elasticsearch/elasticsearch.yml' - sudo service elasticsearch start diff --git a/Gemfile b/Gemfile index 7c18eca..38aaac3 100644 --- a/Gemfile +++ b/Gemfile @@ -8,8 +8,6 @@ end gem 
'rails', '5.1.4' gem 'rails-controller-testing', '~> 1.0' gem 'nokogiri', '~> 1.8.0' -# gem 'tire', '~> 0.6.2' #deprecated in 2013 -# gem 'tire-contrib', '~> 0.1.2' gem 'oj', '~> 3.1.3' # Unused? gem 'faraday_middleware', '~> 0.12.2' gem 'net-http-persistent', '~> 2.8' diff --git a/app/models/geoname.rb b/app/models/geoname.rb index df1b409..06db087 100644 --- a/app/models/geoname.rb +++ b/app/models/geoname.rb @@ -1,75 +1,113 @@ +require 'active_model' + class Geoname - include Tire::Model::Search + include ActiveModel::Model + include Elasticsearch::Model + + INDEX_NAME = "#{Rails.env}:geonames".freeze + index_name = INDEX_NAME + + SYNONYMS = [ + "afb, air force base", + "afs, air force station", + "ang, air national guard", + "cavecreek, cave creek", + "ft, fort", + "junc, junction", + "natl, nat, national", + "newcastle, new castle", + "pk, park", + "spgs, springs", + "st, saint" + ].freeze - index_name("#{Rails.env}:geonames".freeze) + SETTINGS = { + analysis: { + filter: { + synonym: { + type: 'synonym', + synonyms: SYNONYMS + } + }, + analyzer: { + custom_analyzer: { + type: 'custom', + tokenizer: 'whitespace', + filter: %w(standard lowercase synonym) + } + } + } + } - SYNONYMS = ["ft, fort", "st, saint", "afb, air force base", "afs, air force station", "ang, air national guard", "junc, junction", "spgs, springs", "natl, nat, national", "pk, park", "newcastle, new castle", "cavecreek, cave creek"] + settings index: SETTINGS do + mappings dynamic: 'false' do + indexes :type, type: 'keyword' + indexes :location, type: 'text', analyzer: 'custom_analyzer' + indexes :state, type: 'text', analyzer: 'keyword' + indexes :geo, type: 'geo_point' + indexes :id, type: 'keyword', index: false + end + end class << self + def client + @client ||= Geoname.__elasticsearch__.client + end + def create_search_index - Tire.index index_name do - create( - settings: { - index: { - analysis: { - analyzer: {custom_analyzer: {type: 'custom', tokenizer: 'whitespace', filter: 
%w(standard lowercase synonym)}}, - filter: {synonym: {type: 'synonym', synonyms: SYNONYMS}} - } - } - }, - mappings: { - geoname: { - properties: { - type: {type: 'string'}, - location: {type: 'string', analyzer: 'custom_analyzer'}, - state: {type: 'string', analyzer: 'keyword'}, - geo: {type: 'geo_point'}, - id: {type: 'string', index: :not_analyzed, include_in_all: false} - } - } - } - ) - end + client.indices.create( + index: INDEX_NAME, + body: { settings: settings.to_hash, mappings: mappings.to_hash } + ) end def geocode(options = {}) - search_for(options.merge(size: 1)).results.first.geo.to_hash rescue nil + search_for(options.merge(size: 1)).results.first.geo rescue nil end def search_for(options) - Tire.search index_name do - query do - boolean do - must { match :location, options[:location], operator: 'AND' } - must { term :state, options[:state] } - end - end - size options[:size] - end + search_definition = { + query: { + bool: { + must: [ + { match: { location: { query: options[:location], operator: 'AND' } } }, + { term: { state: options[:state] } } + ] + } + } + } + + search_definition[:size] = options[:size] + + Geoname.search(search_definition, index: INDEX_NAME) end def delete_search_index - search_index.delete + client.indices.delete index: INDEX_NAME rescue nil end - def search_index - Tire.index(index_name) + def search_index_exists? + client.indices.exists? index: INDEX_NAME end def import(geonames) - Tire.index index_name do - import geonames do |docs| - docs.each do |doc| - doc[:id] = "#{doc[:location]}:#{doc[:state]}" - end - end - refresh + geonames.each do |doc| + client.index( + index: INDEX_NAME, + type: 'geoname', + id: "#{doc[:location]}:#{doc[:state]}", + body: { + location: doc[:location], + geo: doc[:geo], + state: doc[:state] + } + ) end - #Tire.index index_name - Rails.logger.info "Imported #{geonames.size} Geonames to #{index_name}" - end + __elasticsearch__.refresh_index! 
index: INDEX_NAME + Rails.logger.info "Imported #{geonames.size} Geonames to #{INDEX_NAME}" + end end -end \ No newline at end of file +end diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index cdb09f8..f830454 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -1,125 +1,229 @@ +require 'active_model' + class PositionOpening - include Tire::Model::Search + include ActiveModel::Model + include Elasticsearch::Model + + INDEX_NAME = "#{Elasticsearch::INDEX_NAME}".freeze + + MAX_RETURNED_DOCUMENTS = 100.freeze + + SYNONYMS = [ + "architect, architecture", + "certified nursing assistant, cna", + "clerk, clerical", + "counselor, counseling, therapy, therapist", + "custodial, janitor, custodian", + "cypa, child and youth program assistant, childcare", + "cys, child youth services", + "electronic, electrical", + "forester, forestry", + "green, environment, environmental", + "information technology, it, tech, computer", + "linguist, language", + "legal, attorney", + "lpn, licensed practical nurse", + "lvn, licensed vocational nurse", + "pa, physician assistant", + "physician, doctor", + "rn, registered nurse", + "teacher, teaching", + "technical, technician", + "technology, technologist", + "tso, transportation security officer", + "tv, television" + ].freeze + + SETTINGS = { + analysis: { + filter: { + synonym: { + type: 'synonym', + synonyms: SYNONYMS + } + }, + analyzer: { + custom_analyzer: { + type: 'custom', + tokenizer: 'whitespace', + filter: %w(standard lowercase synonym snowball) + } + } + } + } - index_name("#{Elasticsearch::INDEX_NAME}") + settings index: SETTINGS do + mappings dynamic: 'false' do + indexes :type, type: 'keyword' + indexes :source, type: 'keyword' + indexes :tags, type: 'text', analyzer: 'keyword' + indexes :external_id, type: 'integer', store: true + indexes :position_title, type: 'text', analyzer: 'custom_analyzer', term_vector: 'with_positions_offsets', store: true + indexes 
:organization_id, type: 'text', analyzer: 'keyword' + indexes :organization_name, type: 'keyword', index: false - MAX_RETURNED_DOCUMENTS = 100 - SYNONYMS = ["information technology, it, tech, computer", "teacher, teaching", "certified nursing assistant, cna", "rn, registered nurse", "lpn, licensed practical nurse", "lvn, licensed vocational nurse", "pa, physician assistant", "custodial, janitor, custodian", "cys, child youth services", "clerk, clerical", "physician, doctor", "linguist, language", "tv, television", "legal, attorney", "counselor, counseling, therapy, therapist", "green, environment, environmental", "forester, forestry", "technical, technician", "technology, technologist", "electronic, electrical", "architect, architecture", "cypa, child and youth program assistant, childcare", "tso, transportation security officer"].freeze + indexes :locations, type: 'nested' do + indexes :city, type: 'text', analyzer: 'simple' + indexes :state, type: 'text', analyzer: 'keyword' + indexes :geo, type: 'geo_point' + end + + indexes :start_date, type: 'date', format: 'YYYY-MM-dd' + indexes :end_date, type: 'date', format: 'YYYY-MM-dd' + indexes :minimum, type: 'float' + indexes :maximum, type: 'float' + indexes :position_offering_type_code, type: 'integer' + indexes :position_schedule_type_code, type: 'integer' + indexes :rate_interval_code, type: 'text', analyzer: 'keyword' + indexes :id, type: 'keyword', index: false + indexes :timestamp, type: 'date' + indexes :ttl, type: 'date' + end + end class << self + def client + @client ||= PositionOpening.__elasticsearch__.client + end + def create_search_index - Tire.index index_name do - create( - settings: { - index: { - analysis: { - analyzer: { custom_analyzer: { type: 'custom', tokenizer: 'whitespace', filter: %w(standard lowercase synonym snowball) } }, - filter: { synonym: { type: 'synonym', synonyms: SYNONYMS } } - } - } - }, - mappings: { - position_opening: { - _timestamp: { enabled: true }, - _ttl: { enabled: true 
}, - properties: { - type: { type: 'string' }, - source: { type: 'string', index: :not_analyzed }, - tags: { type: 'string', analyzer: 'keyword' }, - external_id: { type: 'integer' }, - position_title: { type: 'string', analyzer: 'custom_analyzer', term_vector: 'with_positions_offsets', store: true }, - organization_id: { type: 'string', analyzer: 'keyword' }, - organization_name: { type: 'string', index: :not_analyzed }, - locations: { - type: 'nested', - properties: { - city: { type: 'string', analyzer: 'simple' }, - state: { type: 'string', analyzer: 'keyword' }, - geo: { type: 'geo_point' } } }, - start_date: { type: 'date', format: 'YYYY-MM-dd' }, - end_date: { type: 'date', format: 'YYYY-MM-dd' }, - minimum: { type: 'float' }, - maximum: { type: 'float' }, - position_offering_type_code: { type: 'integer' }, - position_schedule_type_code: { type: 'integer' }, - rate_interval_code: { type: 'string', analyzer: 'keyword' }, - id: { type: 'string', index: :not_analyzed, include_in_all: false } - } - } - } - ) - end + client.indices.create( + index: INDEX_NAME, + body: { settings: settings.to_hash, mappings: mappings.to_hash } + ) end def search_for(options = {}) - options.reverse_merge!(size: 10, from: 0, sort_by: :_timestamp) + options.reverse_merge!(size: 10, from: 0) document_limit = [options[:size].to_i, MAX_RETURNED_DOCUMENTS].min source = options[:source] + sort_by = options[:sort_by] || :timestamp tags = options[:tags].present? ? options[:tags].split(/[ ,]/) : nil lat, lon = options[:lat_lon].split(',') rescue [nil, nil] organization_ids = organization_ids_from_options(options) query = Query.new(options[:query], organization_ids) - search = Tire.search index_name do - query do - boolean(minimum_number_should_match: 1) do - must { term :source, source } if source.present? - must { terms :tags, tags } if tags - must { match :position_offering_type_code, query.position_offering_type_code } if query.position_offering_type_code.present? 
- must { match :position_schedule_type_code, query.position_schedule_type_code } if query.position_schedule_type_code.present? - should { match :position_title, query.keywords, analyzer: 'custom_analyzer' } if query.keywords.present? - should do - nested path: 'locations' do - query do - match 'locations.city', query.keywords, operator: 'AND' - end - end - end if query.keywords.present? && query.location.nil? - must { match :rate_interval_code, query.rate_interval_code } if query.rate_interval_code.present? - must do - boolean do - should { terms :organization_id, query.organization_terms } if query.organization_terms.present? - query.organization_prefixes.each do |organization_prefix| - should { prefix :organization_id, organization_prefix } - end if query.organization_prefixes.present? - end - end if query.organization_ids.present? - must do - nested path: 'locations' do - query do - boolean do - must { term 'locations.state', query.location.state } if query.has_state? - must { match 'locations.city', query.location.city, operator: 'AND' } if query.has_city? - end - end - end - end if query.location.present? - end - end if source.present? || tags || query.valid? + search_definition = { + query: { + bool: { + must: [], + should: [], + filter: [ + { range: { start_date: { lte: Date.current } }} + ], + minimum_should_match: '0<1' + } + }, + highlight: { + fields: { + position_title: { number_of_fragments: 0 } + } + }, + size: document_limit, + from: options[:from] + } - filter :range, start_date: { lte: Date.current } + if source.present? || tags || query.valid? + if source.present? + search_definition[:query][:bool][:must] << { term: { source: source } } + end - if query.keywords.blank? - if lat.blank? || lon.blank? 
- sort { by options[:sort_by], 'desc' } - else - options[:sort_by] = 'geo_distance' - sort do - by :_geo_distance, { - 'locations.geo' => { - lat: lat, lon: lon - }, - :order => 'asc' + if tags + search_definition[:query][:bool][:must] << { terms: { tags: tags } } + end + + if query.position_offering_type_code.present? + search_definition[:query][:bool][:must] << { match: { position_offering_type_code: { query: query.position_offering_type_code } } } + end + + if query.position_schedule_type_code.present? + search_definition[:query][:bool][:must] << { match: { position_schedule_type_code: { query: query.position_schedule_type_code } } } + end + + if query.keywords.present? + search_definition[:query][:bool][:should] << { match: { position_title: { query: query.keywords, analyzer: 'custom_analyzer' } } } + end + + if query.keywords.present? && query.location.nil? + search_definition[:query][:bool][:should] << { + nested: { + path: 'locations', + query: { + match: { 'locations.city': { query: query.keywords, operator: 'AND' }} } + } + } + end + + if query.rate_interval_code.present? + search_definition[:query][:bool][:must] << { match: { rate_interval_code: query.rate_interval_code }} + end + + if query.organization_ids.present? + definition = { + bool: { + should: [] + } + } + + if query.organization_terms.present? + definition[:bool][:should] << { terms: { organization_id: query.organization_terms } } + end + if query.organization_prefixes.present? + query.organization_prefixes.each do |prefix| + definition[:bool][:should] << { prefix: { organization_id: prefix } } end end + + search_definition[:query][:bool][:must] << definition + end + + if query.location.present? + locations = { + nested: { + path: 'locations', + query: { + bool: { + must: [] + } + } + } + } + + if query.has_state? + locations[:nested][:query][:bool][:must] << { term: { 'locations.state': { term: query.location.state } }} + end + + if query.has_city? 
+ locations[:nested][:query][:bool][:must] << { match: { 'locations.city': { query: query.location.city, operator: 'AND' }}} + end + + search_definition[:query][:bool][:must] << locations + end + end + + if query.keywords.blank? + if lat.blank? || lon.blank? + search_definition[:sort] = [{ "#{sort_by}": 'desc' }] + else + search_definition[:sort] = [{ + _geo_distance: { + 'locations.geo': { lat: lat.to_f, lon: lon.to_f }, + order: 'asc', + nested_path: 'locations' + } + }] end - size document_limit - from options[:from] - highlight position_title: { number_of_fragments: 0 } + else + search_definition[:sort] = { "#{sort_by}": 'desc' } end + if search_definition[:query][:bool][:must].empty? && search_definition[:query][:bool][:should].empty? + search_definition[:query] = { match_all: {} } + end + + search = __elasticsearch__.search(search_definition, index: INDEX_NAME) Rails.logger.info("[Query] #{options.merge(result_count: search.results.total).to_json}") search.results.collect do |item| @@ -127,8 +231,8 @@ def search_for(options = {}) id: item.id, source: item.source, external_id: item.external_id, - position_title: (options[:hl] == '1' && item.highlight.present?) ? item.highlight[:position_title][0] : item.position_title, - organization_name: item.organization_name, + position_title: (options[:hl] == '1' && item.try(:highlight).present?) ? item.highlight[:position_title][0] : item.position_title, + organization_name: item.try(:organization_name), rate_interval_code: item.rate_interval_code, minimum: item.minimum, maximum: item.maximum, @@ -141,28 +245,40 @@ def search_for(options = {}) end def delete_search_index - search_index.delete + client.indices.delete index: INDEX_NAME rescue nil end - def search_index - Tire.index(index_name) + def search_index_exists? + client.indices.exists? 
index: INDEX_NAME end def import(position_openings) - Tire.index index_name do - import position_openings do |docs| - docs.each do |doc| - doc[:id] = "#{doc[:source]}:#{doc[:external_id]}" - doc[:locations].each do |loc| - normalized_city = loc[:city].sub(' Metro Area', '').sub(/, .*$/, '') - lat_lon_hash = Geoname.geocode(location: normalized_city, state: loc[:state]) - loc[:geo] = lat_lon_hash if lat_lon_hash.present? - end if doc[:locations].present? + position_openings.each do |opening| + data = opening.except(:_timestamp, :_ttl).each_with_object({}) do |(key, value), data| + if key == :locations + data[:locations] = value.map do |v| + {city: normalized_city(v[:city]), + state: v[:state], + geo: v[:geo] || find_geoname(v[:city], v[:state])} + end + else + data[key] = value end end - refresh + + client.index( + index: INDEX_NAME, + type: 'position_opening', + id: "#{opening[:source]}:#{opening[:external_id]}", + body: data.merge!({ + timestamp: opening[:_timestamp].blank? ? DateTime.current : opening[:_timestamp], + id: "#{opening[:source]}:#{opening[:external_id]}" + }) + ) end + __elasticsearch__.refresh_index! 
index: INDEX_NAME + Rails.logger.info "Imported #{position_openings.size} position openings" end @@ -171,13 +287,17 @@ def get_external_ids_by_source(source) total = 0 external_ids = [] begin - search = Tire.search index_name do - query { match :source, source } - fields %w(external_id) - sort { by :id } - from from_index - size MAX_RETURNED_DOCUMENTS - end + search_definition = { + query: { match: { source: { query: source }}}, + stored_fields: %w(external_id), + _source: true + } + + search_definition[:size] = MAX_RETURNED_DOCUMENTS + search_definition[:from] = from_index + search_definition[:sort] = ['external_id'] + + search = __elasticsearch__.search(search_definition, index: INDEX_NAME) external_ids.push(*search.results.map(&:external_id)) from_index += search.results.count total = search.results.total @@ -204,5 +324,12 @@ def organization_ids_from_options(options) organization_ids end + def find_geoname(location, state) + Geoname.geocode(location: normalized_city(location), state: state) + end + + def normalized_city(city) + city.sub(' Metro Area', '').sub(/, .*$/, '') + end end -end \ No newline at end of file +end diff --git a/lib/tasks/geonames.rake b/lib/tasks/geonames.rake index 8cfb1fb..c6336ac 100644 --- a/lib/tasks/geonames.rake +++ b/lib/tasks/geonames.rake @@ -11,7 +11,7 @@ namespace :geonames do desc 'Recreate geonames index' task recreate_index: :environment do - Geoname.delete_search_index if Geoname.search_index.exists? + Geoname.delete_search_index if Geoname.search_index_exists? Geoname.create_search_index end end \ No newline at end of file diff --git a/lib/tasks/position_openings.rake b/lib/tasks/position_openings.rake index 9b9aec7..e548871 100644 --- a/lib/tasks/position_openings.rake +++ b/lib/tasks/position_openings.rake @@ -41,7 +41,7 @@ namespace :jobs do desc 'Recreate position openings index' task recreate_index: :environment do - PositionOpening.delete_search_index if PositionOpening.search_index.exists? 
+ PositionOpening.delete_search_index if PositionOpening.search_index_exists? PositionOpening.create_search_index end end \ No newline at end of file diff --git a/spec/api/v2/position_openings_spec.rb b/spec/api/v2/position_openings_spec.rb index d26c0ca..d92788f 100644 --- a/spec/api/v2/position_openings_spec.rb +++ b/spec/api/v2/position_openings_spec.rb @@ -4,7 +4,7 @@ let(:v2_headers) { { 'Accept' => 'application/vnd.usagov.position_openings.v2' } } before do - PositionOpening.delete_search_index if PositionOpening.search_index.exists? + PositionOpening.delete_search_index if PositionOpening.search_index_exists? PositionOpening.create_search_index UsajobsData.new('doc/sample.xml').import @@ -29,11 +29,12 @@ it 'should return with jobs data' do results_array = JSON.parse(response.body) expect(results_array.size).to eq(2) + # binding.pry expect(results_array.first).to eq({'id'=>'usajobs:327358300', 'position_title'=>'Student Nurse Technicians', 'organization_name'=>'Veterans Affairs, Veterans Health Administration', 'rate_interval_code'=>'PH', 'minimum'=>17, 'maximum'=>23, 'start_date'=>'2012-09-19', 'end_date'=>'2022-01-31', - 'locations'=>['Odessa, TX', 'Pentagon, Arlington, VA', 'San Angelo, TX', 'Abilene, TX'], + 'locations'=>['Odessa, TX', 'Pentagon, VA', 'San Angelo, TX', 'Abilene, TX'], 'url' => 'https://www.usajobs.gov/GetJob/ViewDetails/327358300'}) expect(results_array.last).to eq({'id'=>'ng:michigan:234175', 'position_title'=>'Registered Nurse Non-Career', diff --git a/spec/api/v3/position_openings_spec.rb b/spec/api/v3/position_openings_spec.rb index 5f2927d..04c373f 100644 --- a/spec/api/v3/position_openings_spec.rb +++ b/spec/api/v3/position_openings_spec.rb @@ -2,7 +2,7 @@ describe 'Position Openings API V3' do before do - PositionOpening.delete_search_index if PositionOpening.search_index.exists? + PositionOpening.delete_search_index if PositionOpening.search_index_exists? 
PositionOpening.create_search_index UsajobsData.new('doc/sample.xml').import @@ -31,7 +31,7 @@ 'organization_name'=>'Veterans Affairs, Veterans Health Administration', 'rate_interval_code'=>'PH', 'minimum'=>17, 'maximum'=>23, 'start_date'=>'2012-09-19', 'end_date'=>'2022-01-31', - 'locations'=>['Odessa, TX', 'Pentagon, Arlington, VA', 'San Angelo, TX', 'Abilene, TX'], + 'locations'=>['Odessa, TX', 'Pentagon, VA', 'San Angelo, TX', 'Abilene, TX'], 'url' => 'https://www.usajobs.gov/GetJob/ViewDetails/327358300'}) expect(results_array.last).to eq({'id'=>'ng:michigan:234175', 'position_title'=>'Registered Nurse Non-Career', diff --git a/spec/models/geoname_spec.rb b/spec/models/geoname_spec.rb index 1373f4a..845d76d 100644 --- a/spec/models/geoname_spec.rb +++ b/spec/models/geoname_spec.rb @@ -2,19 +2,20 @@ describe Geoname do before do - Geoname.delete_search_index if Geoname.search_index.exists? + Geoname.delete_search_index if Geoname.search_index_exists? Geoname.create_search_index end describe '.geocode(options)' do describe 'basic location/state lookup' do - before do + before(:each) do Geoname.import [{type: 'geoname', location: "Someplace", state: 'XY', geo: {lat: 12.34, lon: -123.45}}] end it 'should return the lat/lon hash of the place' do - expect(Geoname.geocode(location: "Someplace", state: 'XY')).to eq({lat: 12.34, lon: -123.45}) + Geoname.geocode(location: "Someplace", state: 'XY') + expect(Geoname.geocode(location: "Someplace", state: 'XY').to_json).to eq({lat: 12.34, lon: -123.45}.to_json) end end @@ -25,7 +26,7 @@ first_synonym, remainder = batch_str.strip.gsub(/ ?, ?/, ',').split(',', 2) @first_synonyms << first_synonym remainder.split(',').each do |synonym| - geonames << {type: 'geoname', location: "#{synonym} City", state: 'CA', geo: {lat: rand * 180, lon: rand * 180}} + geonames << {type: 'geoname', location: "#{synonym} City", state: 'CA', geo: {lat: rand * 90, lon: rand * 180}} end end Geoname.import geonames @@ -33,6 +34,7 @@ it 'should find 
the matches' do @first_synonyms.each do |synonym| + Geoname.geocode(location: "#{synonym} City", state: 'CA') geo_hash = Geoname.geocode(location: "#{synonym} City", state: 'CA') expect(geo_hash[:lat]).to be_kind_of(Numeric) expect(geo_hash[:lon]).to be_kind_of(Numeric) @@ -44,11 +46,12 @@ describe '.import(geonames)' do it 'should set the document ID' do Geoname.import [{type: 'geoname', location: "Someplace", state: 'XY', geo: {lat: 12.34, lon: -123.45}}] - Geoname.import [{type: 'geoname', location: "Someplace", state: 'XY', geo: {lat: 92.34, lon: 23.45}}] + Geoname.import [{type: 'geoname', location: "Someplace", state: 'XY', geo: {lat: 82.34, lon: 23.45}}] + Geoname.search_for(location: 'Someplace', state: 'XY', size: 2) search = Geoname.search_for(location: 'Someplace', state: 'XY', size: 2) expect(search.results.total).to eq(1) expect(search.results.first.id).to eq('Someplace:XY') - expect(search.results.first.geo.lat).to eq(92.34) + expect(search.results.first.geo.lat).to eq(82.34) end end diff --git a/spec/models/position_opening_spec.rb b/spec/models/position_opening_spec.rb index b69d0a1..1e2cb03 100644 --- a/spec/models/position_opening_spec.rb +++ b/spec/models/position_opening_spec.rb @@ -2,7 +2,7 @@ describe PositionOpening do before do - PositionOpening.delete_search_index if PositionOpening.search_index.exists? + PositionOpening.delete_search_index if PositionOpening.search_index_exists? 
PositionOpening.create_search_index end @@ -218,11 +218,11 @@ describe 'limiting result set size and starting point' do it 'should use the size param' do expect(PositionOpening.search_for(query: 'jobs', size: 1).count).to eq(1) - expect(PositionOpening.search_for(query: 'jobs', size: 10).count).to eq(6) + expect(PositionOpening.search_for(query: 'jobs', size: 10).count).to eq(7) end it 'should use the from param' do - expect(PositionOpening.search_for(query: 'jobs', size: 1, from: 1, sort_by: :id).first[:id]).to eq('usajobs:2') + expect(PositionOpening.search_for(query: 'jobs', size: 1, from: 1, sort_by: :id).first[:id]).to eq('usajobs:3') end end @@ -230,8 +230,8 @@ context 'when keywords present' do it 'should sort by relevance' do res = PositionOpening.search_for(query: 'physician nursing Practitioner') - expect(res.first[:position_title]).to eq('Deputy Special Assistant to the Chief Nurse Practitioner') - expect(res.last[:position_title]).to eq('Physician Assistant') + expect(res.first[:position_title]).to eq('Physician Assistant') + expect(res.last[:position_title]).to eq('Deputy Special Assistant to the Chief Nurse Practitioner') end end @@ -282,14 +282,12 @@ start_date: Date.current, end_date: Date.tomorrow, minimum: 17, maximum: 23, rate_interval_code: 'PH', locations: [{ city: 'Fulton', state: 'MD' }] }] PositionOpening.import position_openings - sleep(0.25) position_openings = [{ source: 'usajobs', external_id: 1001, type: 'position_opening', position_title: 'Physician Assistant Newer', position_schedule_type_code: 2, position_offering_type_code: 15318, tags: %w(federal), organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', start_date: Date.current, end_date: Date.tomorrow, minimum: 17, maximum: 23, rate_interval_code: 'PH', locations: [{ city: 'Fulton', state: 'MD' }] }] PositionOpening.import position_openings - sleep(0.25) position_openings = [{ source: 'usajobs', external_id: 1002, type: 'position_opening', 
position_title: 'Physician Assistant Newest', position_schedule_type_code: 2, position_offering_type_code: 15318, tags: %w(federal), organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', @@ -396,12 +394,10 @@ expect(Geoname).to receive(:geocode).with(location: 'Washington', state: 'DC').and_return({ lat: 23.45, lon: -12.34 }) expect(Geoname).to receive(:geocode).with(location: 'Maui Island', state: 'HI').and_return({ lat: 45.67, lon: -13.31 }) PositionOpening.import([position_opening]) - position_openings = Tire.search 'test:jobs' do - query { all } - end - expect(position_openings.results.first[:locations][0][:geo].to_hash).to eq({ lat: 12.34, lon: -23.45 }) - expect(position_openings.results.first[:locations][1][:geo].to_hash).to eq({ lat: 23.45, lon: -12.34 }) - expect(position_openings.results.first[:locations][2][:geo].to_hash).to eq({ lat: 45.67, lon: -13.31 }) + position_openings = PositionOpening.search('*', index: 'test:jobs') + expect(position_openings.results.first.locations[0][:geo].to_json).to eq({ lat: 12.34, lon: -23.45 }.to_json) + expect(position_openings.results.first.locations[1][:geo].to_json).to eq({ lat: 23.45, lon: -12.34 }.to_json) + expect(position_openings.results.first.locations[2][:geo].to_json).to eq({ lat: 45.67, lon: -13.31 }.to_json) end context 'when no location information is present for job' do @@ -414,9 +410,7 @@ it 'should leave locations empty' do PositionOpening.import([position_opening_no_locations]) - position_openings = Tire.search 'test:jobs' do - query { all } - end + position_openings = PositionOpening.search('*', index: 'test:jobs') expect(position_openings.results.first[:locations]).to be_nil end From 8edf9001577602f76934e5fa69daa74c7d59d625 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Wed, 3 Jan 2018 11:26:29 +0800 Subject: [PATCH 03/16] Fix environments files --- .travis.yml | 5 ++--- config/application.rb | 2 -- config/elasticsearch.yml | 9 ++++++++- 
config/environments/development.rb | 5 ----- config/environments/production.rb | 4 ---- config/environments/test.rb | 5 ----- config/initializers/elasticsearch.rb | 24 ++++++++++++++++++++++-- config/initializers/tire.rb | 2 -- 8 files changed, 32 insertions(+), 24 deletions(-) delete mode 100644 config/initializers/tire.rb diff --git a/.travis.yml b/.travis.yml index 4b57745..d780277 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ language: ruby rvm: - 2.3.5 before_install: - - curl -O https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.6.5.deb && sudo dpkg -i --force-confnew elasticsearch-5.6.5.deb && true - - 'echo ''script.disable_dynamic: false'' | sudo tee --append /etc/elasticsearch/elasticsearch.yml' - - 'echo ''index.number_of_shards: 1'' | sudo tee --append /etc/elasticsearch/elasticsearch.yml' + - curl -O https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.6.5.deb && sudo dpkg -i --force-confnew elasticsearch-5.6.5.deb && true + - 'printf "script:\n inline: true\n stored: true\n" | sudo tee --append /etc/elasticsearch/elasticsearch.yml' - sudo service elasticsearch start diff --git a/config/application.rb b/config/application.rb index 2305c42..921c263 100644 --- a/config/application.rb +++ b/config/application.rb @@ -14,8 +14,6 @@ class Application < Rails::Application # Configure the default encoding used in templates for Ruby 1.9. config.encoding = 'utf-8' - config.elasticsearch_config = (YAML.load_file("#{Rails.root}/config/elasticsearch.yml") || {})[Rails.env] - # Custom directories with classes and modules you want to be autoloadable. 
config.autoload_paths += %W(#{Rails.root}/lib #{Rails.root}/lib/constraints #{Rails.root}/lib/importers) diff --git a/config/elasticsearch.yml b/config/elasticsearch.yml index 837de31..d195e12 100644 --- a/config/elasticsearch.yml +++ b/config/elasticsearch.yml @@ -1,2 +1,9 @@ production: - index_name: \ No newline at end of file + index_name: + url: + username: + password: +development: + host: 'localhost:9200' +test: + host: 'localhost:9200' diff --git a/config/environments/development.rb b/config/environments/development.rb index 3603200..2f14310 100644 --- a/config/environments/development.rb +++ b/config/environments/development.rb @@ -32,9 +32,4 @@ # Use an evented file watcher to asynchronously detect changes in source code, # routes, locales, etc. This feature depends on the listen gem. config.file_watcher = ActiveSupport::EventedFileUpdateChecker - - config.elasticsearch_client = Elasticsearch::Client.new( - host: 'localhost:9200', - log: true - ) end diff --git a/config/environments/production.rb b/config/environments/production.rb index fbc01c0..f144ec9 100644 --- a/config/environments/production.rb +++ b/config/environments/production.rb @@ -49,8 +49,4 @@ logger.formatter = config.log_formatter config.logger = ActiveSupport::TaggedLogging.new(logger) end - - config.elasticsearch_client = Elasticsearch::Client.new( - url: Rails.application.config.elasticsearch_config['url'] - ) end diff --git a/config/environments/test.rb b/config/environments/test.rb index bccd63c..888b8a0 100644 --- a/config/environments/test.rb +++ b/config/environments/test.rb @@ -30,9 +30,4 @@ # Print deprecation notices to the stderr config.active_support.deprecation = :stderr - - config.elasticsearch_client = Elasticsearch::Client.new( - host: 'localhost:9200', - log: false - ) end diff --git a/config/initializers/elasticsearch.rb b/config/initializers/elasticsearch.rb index 6794c0c..f771dae 100644 --- a/config/initializers/elasticsearch.rb +++ 
b/config/initializers/elasticsearch.rb @@ -1,5 +1,25 @@ module Elasticsearch; end -es_config = Rails.application.config.elasticsearch_config +Rails.application.config.elasticsearch = (YAML.load_file("#{Rails.root}/config/elasticsearch.yml") || {})[Rails.env] -Elasticsearch::INDEX_NAME = es_config && es_config['index_name'].present? ? es_config['index_name'].freeze : "#{Rails.env}:jobs".freeze +config = Rails.application.config.elasticsearch + +Elasticsearch::INDEX_NAME = config && config['index_name'].present? ? config['index_name'].freeze : "#{Rails.env}:jobs".freeze + +if Rails.env.production? + Rails.application.config.elasticsearch_client = Elasticsearch::Client.new( + host: [{ + url: config['url'], + user: config['username'], + password: config['password'] + }] + ) +else + Rails.application.config.elasticsearch_client = Elasticsearch::Client.new( + host: config['host'], + log: true + ) +end + +PositionOpening.create_search_index unless PositionOpening.search_index_exists? +Geoname.create_search_index unless Geoname.search_index_exists? diff --git a/config/initializers/tire.rb b/config/initializers/tire.rb deleted file mode 100644 index 2797d50..0000000 --- a/config/initializers/tire.rb +++ /dev/null @@ -1,2 +0,0 @@ -PositionOpening.create_search_index unless PositionOpening.search_index_exists? -Geoname.create_search_index unless Geoname.search_index_exists? 
From f813c9de6de784da861b564242dc637f4d989122 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Wed, 3 Jan 2018 12:00:44 +0800 Subject: [PATCH 04/16] Use config_for(:elasticsearch) --- config/application.rb | 2 +- config/initializers/elasticsearch.rb | 22 ++++++---------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/config/application.rb b/config/application.rb index 921c263..89ce021 100644 --- a/config/application.rb +++ b/config/application.rb @@ -2,7 +2,6 @@ require 'rails' require 'action_controller/railtie' -# require 'tire/rails/logger' # This does not work with Rails 5 Bundler.require(*Rails.groups) @@ -30,5 +29,6 @@ class Application < Rails::Application end config.airbrake = config_for(:airbrake) + config.elasticsearch = config_for(:elasticsearch) end end diff --git a/config/initializers/elasticsearch.rb b/config/initializers/elasticsearch.rb index f771dae..575f43c 100644 --- a/config/initializers/elasticsearch.rb +++ b/config/initializers/elasticsearch.rb @@ -1,25 +1,15 @@ module Elasticsearch; end -Rails.application.config.elasticsearch = (YAML.load_file("#{Rails.root}/config/elasticsearch.yml") || {})[Rails.env] - config = Rails.application.config.elasticsearch Elasticsearch::INDEX_NAME = config && config['index_name'].present? ? config['index_name'].freeze : "#{Rails.env}:jobs".freeze -if Rails.env.production? - Rails.application.config.elasticsearch_client = Elasticsearch::Client.new( - host: [{ - url: config['url'], - user: config['username'], - password: config['password'] - }] - ) -else - Rails.application.config.elasticsearch_client = Elasticsearch::Client.new( - host: config['host'], - log: true - ) -end +Rails.application.config.elasticsearch_client = Elasticsearch::Client.new( + url: config['url'], + host: config['host'], + user: config['username'], + password: config['password'] +) PositionOpening.create_search_index unless PositionOpening.search_index_exists? 
Geoname.create_search_index unless Geoname.search_index_exists? From 5c5f64f88fb8c3ab46389816cf717433199a6de1 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Wed, 3 Jan 2018 12:02:44 +0800 Subject: [PATCH 05/16] Use url instead --- config/elasticsearch.yml | 4 ++-- config/initializers/elasticsearch.rb | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/config/elasticsearch.yml b/config/elasticsearch.yml index d195e12..c2a617a 100644 --- a/config/elasticsearch.yml +++ b/config/elasticsearch.yml @@ -4,6 +4,6 @@ production: username: password: development: - host: 'localhost:9200' + url: 'localhost:9200' test: - host: 'localhost:9200' + url: 'localhost:9200' diff --git a/config/initializers/elasticsearch.rb b/config/initializers/elasticsearch.rb index 575f43c..f3c4257 100644 --- a/config/initializers/elasticsearch.rb +++ b/config/initializers/elasticsearch.rb @@ -6,7 +6,6 @@ module Elasticsearch; end Rails.application.config.elasticsearch_client = Elasticsearch::Client.new( url: config['url'], - host: config['host'], user: config['username'], password: config['password'] ) From b8c55a728908f3758947234dab2b17f9ea68f6ab Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Wed, 3 Jan 2018 16:18:53 +0800 Subject: [PATCH 06/16] Refactor --- app/classes/search_definition.rb | 40 ++++++ app/models/position_opening.rb | 176 +++++++++++-------------- spec/classes/search_definition_spec.rb | 88 +++++++++++++ 3 files changed, 202 insertions(+), 102 deletions(-) create mode 100644 app/classes/search_definition.rb create mode 100644 spec/classes/search_definition_spec.rb diff --git a/app/classes/search_definition.rb b/app/classes/search_definition.rb new file mode 100644 index 0000000..815fbfb --- /dev/null +++ b/app/classes/search_definition.rb @@ -0,0 +1,40 @@ +class SearchDefinition + attr_accessor :definition + + def initialize(definition = nil) + @definition = definition || { + query: { + bool: { + must: [], + should: [] + } + } + } + + yield(self) if block_given? 
+ end + + def must(query) + if definition[:query] + definition[:query][:bool][:must] << query + else + definition[:bool][:must] << query + end + end + + def should(query) + if definition[:query] + definition[:query][:bool][:should] << query + else + definition[:bool][:should] << query + end + end + + def sort(query) + definition[:sort] = query + end + + def to_s + definition + end +end diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index f830454..d5d0c1c 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -104,129 +104,81 @@ def search_for(options = {}) organization_ids = organization_ids_from_options(options) query = Query.new(options[:query], organization_ids) - search_definition = { - query: { - bool: { - must: [], - should: [], - filter: [ - { range: { start_date: { lte: Date.current } }} - ], - minimum_should_match: '0<1' - } - }, - highlight: { - fields: { - position_title: { number_of_fragments: 0 } - } - }, - size: document_limit, - from: options[:from] - } - - if source.present? || tags || query.valid? - if source.present? - search_definition[:query][:bool][:must] << { term: { source: source } } - end - - if tags - search_definition[:query][:bool][:must] << { terms: { tags: tags } } - end - - if query.position_offering_type_code.present? - search_definition[:query][:bool][:must] << { match: { position_offering_type_code: { query: query.position_offering_type_code } } } - end - - if query.position_schedule_type_code.present? - search_definition[:query][:bool][:must] << { match: { position_schedule_type_code: { query: query.position_schedule_type_code } } } - end - - if query.keywords.present? - search_definition[:query][:bool][:should] << { match: { position_title: { query: query.keywords, analyzer: 'custom_analyzer' } } } - end - - if query.keywords.present? && query.location.nil? 
- search_definition[:query][:bool][:should] << { + search = SearchDefinition.new do |s| + s.definition = { + query: { + bool: { + must: [], + should: [], + filter: [ + { range: { start_date: { lte: Date.current } }} + ], + minimum_should_match: '0<1' + } + }, + highlight: { + fields: { + position_title: { number_of_fragments: 0 } + } + }, + size: document_limit, + from: options[:from], + sort: [] + } + if source.present? || tags || query.valid? + s.must({ term: { source: source } }) if source.present? + s.must({ terms: { tags: tags } }) if tags + s.must({ match: { position_offering_type_code: { query: query.position_offering_type_code } } }) if query.position_offering_type_code.present? + s.must({ match: { position_schedule_type_code: { query: query.position_schedule_type_code } } }) if query.position_schedule_type_code.present? + s.should({ match: { position_title: { query: query.keywords, analyzer: 'custom_analyzer' } } }) if query.keywords.present? + s.should({ nested: { path: 'locations', query: { match: { 'locations.city': { query: query.keywords, operator: 'AND' }} } } - } - end + }) if query.keywords.present? && query.location.nil? - if query.rate_interval_code.present? - search_definition[:query][:bool][:must] << { match: { rate_interval_code: query.rate_interval_code }} - end + s.must({ match: { rate_interval_code: query.rate_interval_code } }) if query.rate_interval_code.present? - if query.organization_ids.present? - definition = { - bool: { - should: [] - } - } - - if query.organization_terms.present? - definition[:bool][:should] << { terms: { organization_id: query.organization_terms } } - end - if query.organization_prefixes.present? - query.organization_prefixes.each do |prefix| - definition[:bool][:should] << { prefix: { organization_id: prefix } } - end + if query.organization_ids.present? 
+ organization_ids = build_organization_id_definition(query) + s.must(organization_ids.definition) end - search_definition[:query][:bool][:must] << definition + if query.location.present? + location = build_location_definition(query) + s.must({ nested: { path: 'locations', query: location.definition } }) + end end - if query.location.present? - locations = { - nested: { - path: 'locations', - query: { - bool: { - must: [] - } + if query.keywords.blank? + if lat.blank? || lon.blank? + s.sort({ "#{sort_by}": 'desc' }) + else + s.sort([{ + _geo_distance: { + 'locations.geo': { lat: lat.to_f, lon: lon.to_f }, + order: 'asc', + nested_path: 'locations' } - } - } - - if query.has_state? - locations[:nested][:query][:bool][:must] << { term: { 'locations.state': { term: query.location.state } }} - end - - if query.has_city? - locations[:nested][:query][:bool][:must] << { match: { 'locations.city': { query: query.location.city, operator: 'AND' }}} + }]) end - - search_definition[:query][:bool][:must] << locations + else + s.sort({ "#{sort_by}": 'desc' }) end - end - if query.keywords.blank? - if lat.blank? || lon.blank? - search_definition[:sort] = [{ "#{sort_by}": 'desc' }] - else - search_definition[:sort] = [{ - _geo_distance: { - 'locations.geo': { lat: lat.to_f, lon: lon.to_f }, - order: 'asc', - nested_path: 'locations' - } - }] + if s.definition[:query][:bool][:must].empty? && s.definition[:query][:bool][:should].empty? + s.definition[:query] = { match_all: {} } end - else - search_definition[:sort] = { "#{sort_by}": 'desc' } end - if search_definition[:query][:bool][:must].empty? && search_definition[:query][:bool][:should].empty? 
- search_definition[:query] = { match_all: {} } - end + search_results = __elasticsearch__.search(search.definition, index: INDEX_NAME) - search = __elasticsearch__.search(search_definition, index: INDEX_NAME) - Rails.logger.info("[Query] #{options.merge(result_count: search.results.total).to_json}") + Rails.logger.info("[Query] #{options.merge(result_count: search_results.results.total).to_json}") - search.results.collect do |item| + search_results.results.collect do |item| { id: item.id, source: item.source, @@ -331,5 +283,25 @@ def find_geoname(location, state) def normalized_city(city) city.sub(' Metro Area', '').sub(/, .*$/, '') end + + def build_location_definition(query) + SearchDefinition.new do |s| + s.definition = { bool: { must: [] } } + s.must({ term: { 'locations.state': { term: query.location.state } }}) if query.has_state? + s.must({ match: { 'locations.city': { query: query.location.city, operator: 'AND' }}}) if query.has_city? + end + end + + def build_organization_id_definition(query) + SearchDefinition.new do |s1| + s1.definition = { bool: { should: [] } } + s1.should({ terms: { organization_id: query.organization_terms } }) if query.organization_terms.present? + if query.organization_prefixes.present? 
+ query.organization_prefixes.each do |prefix| + s1.should({ prefix: { organization_id: prefix } }) + end + end + end + end end end diff --git a/spec/classes/search_definition_spec.rb b/spec/classes/search_definition_spec.rb new file mode 100644 index 0000000..c68c332 --- /dev/null +++ b/spec/classes/search_definition_spec.rb @@ -0,0 +1,88 @@ +require 'rails_helper' + +describe SearchDefinition do + let(:expected_definition) { { bool: { must: [], should: [] } } } + + describe '.new' do + context 'when there is no definition being set' do + it 'returns the default definition' do + expect(SearchDefinition.new.definition).to eq( + { + query: { + bool: { + must: [], + should: [] + } + } + } + ) + end + end + + context 'when there is definition being set' do + it 'returns the set definition' do + expect(SearchDefinition.new(expected_definition).definition).to eq(expected_definition) + end + end + end + + describe '.must' do + context 'when there is query in definition' do + it 'adds query to definition' do + search = SearchDefinition.new do |s| + s.must({ terms: { tags: [1, 2, 3] }}) + end + + expect(search.definition[:query][:bool][:must]).to eq([{ terms: { tags: [1, 2, 3] } }]) + end + end + + context 'when there is no query in definition' do + it 'adds query to bool' do + search = SearchDefinition.new(expected_definition) do |s| + s.must({ match: { tag: 1 }}) + end + + expect(search.definition[:bool][:must]).to eq([{ match: { tag: 1 } }]) + end + end + end + + describe '.should' do + context 'when there is query in definition' do + it 'adds query to definition' do + search = SearchDefinition.new do |s| + s.should({ term: { name: 'apple' }}) + end + + expect(search.definition[:query][:bool][:should]).to eq([{ term: { name: 'apple' } }]) + end + end + + context 'when there is no query in definition' do + it 'adds query to bool' do + search = SearchDefinition.new(expected_definition) do |s| + s.should({ match: { pretty: true }}) + end + + 
expect(search.definition[:bool][:should]).to eq([{ match: { pretty: true } }]) + end + end + end + + describe '.sort' do + it 'adds sorting query' do + search = SearchDefinition.new do |s| + s.sort({ id: 'desc' }) + end + + expect(search.definition[:sort]).to eq({ id: 'desc' }) + end + end + + describe '.to_s' do + it 'prints out definition' do + expect(SearchDefinition.new(expected_definition).to_s).to eq(expected_definition) + end + end +end From afd4b9003d9eea4d338668839469c62a5060237c Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Wed, 3 Jan 2018 18:54:18 +0800 Subject: [PATCH 07/16] Update README --- Gemfile | 2 + Gemfile.lock | 7 + README.md | 20 +-- app/models/geoname.rb | 1 - app/models/position_opening.rb | 246 +++++++++++++++++++------- spec/api/v2/position_openings_spec.rb | 1 - spec/models/position_opening_spec.rb | 4 +- 7 files changed, 206 insertions(+), 75 deletions(-) diff --git a/Gemfile b/Gemfile index 38aaac3..7b3678f 100644 --- a/Gemfile +++ b/Gemfile @@ -20,6 +20,8 @@ gem 'newrelic_rpm', '~> 4.6.0' gem 'rake', '~> 11.0' gem 'elasticsearch-model' gem 'elasticsearch-rails' +gem 'elasticsearch-dsl' +gem 'pry' group :development, :test do gem 'puma', '~> 3.7' diff --git a/Gemfile.lock b/Gemfile.lock index c6f5260..ee619d6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -55,6 +55,7 @@ GEM net-sftp (>= 2.0.0) net-ssh (>= 2.0.14) net-ssh-gateway (>= 1.1.0) + coderay (1.1.2) concurrent-ruby (1.0.5) coveralls (0.7.0) multi_json (~> 1.3) @@ -72,6 +73,7 @@ GEM elasticsearch-transport (= 5.0.4) elasticsearch-api (5.0.4) multi_json + elasticsearch-dsl (0.1.5) elasticsearch-model (5.0.2) activesupport (> 3) elasticsearch (~> 5) @@ -128,6 +130,9 @@ GEM mini_portile2 (~> 2.3.0) oj (3.1.3) power_assert (1.0.2) + pry (0.11.3) + coderay (~> 1.1.0) + method_source (~> 0.9.0) puma (3.11.0) rack (2.0.3) rack-contrib (2.0.1) @@ -232,6 +237,7 @@ DEPENDENCIES airbrake (~> 7.1) capistrano (~> 2.15.4) coveralls (~> 0.7.0) + elasticsearch-dsl elasticsearch-model 
elasticsearch-rails faraday_middleware (~> 0.12.2) @@ -241,6 +247,7 @@ DEPENDENCIES newrelic_rpm (~> 4.6.0) nokogiri (~> 1.8.0) oj (~> 3.1.3) + pry puma (~> 3.7) rack-contrib (~> 2.0.1) rack-cors (~> 1.0.2) diff --git a/README.md b/README.md index 35fb9cb..8895827 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,9 @@ We use bundler to manage gems. You can install bundler and other required gems l ### Elasticsearch -We're using [Elasticsearch](http://www.elasticsearch.org/) (>= 1.4.0) for fulltext search. On a Mac, it's easy to install with [Homebrew](http://mxcl.github.com/homebrew/). +We're using [Elasticsearch](http://www.elasticsearch.org/) (>= 5.6) for fulltext search. On a Mac, it's easy to install with [Homebrew](http://mxcl.github.com/homebrew/). - $ brew install elasticsearch + $ brew install elasticsearch@5.6 Otherwise, follow the [instructions](http://www.elasticsearch.org/download/) to download and run it. @@ -32,21 +32,21 @@ Otherwise, follow the [instructions](http://www.elasticsearch.org/download/) to Install Docker if you haven't done so yet. Follow the instruction [here](https://www.docker.com/community-edition) Once you have Docker installed on your machine, run the following command in your terminal - $ docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:1.4.5 + $ docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch:5.6 -This will download an docker image containing elasticsearch=1.4.5 from docker hub, run it, and expose port 9200 & 9300 to your machine. You can verify your setup with the following command. +This will download an docker image containing elasticsearch=5.6.5 from docker hub, run it, and expose port 9200 & 9300 to your machine. You can verify your setup with the following command. 
$ curl localhost:9200 { - "status" : 200, - "name" : "Aegis", + "name" : "u2bQgL2", "cluster_name" : "elasticsearch", + "cluster_uuid" : "qZ-Xas_PR_2ARtHpY724Ug", "version" : { - "number" : "1.4.5", - "build_hash" : "2aaf797f2a571dcb779a3b61180afe8390ab61f9", - "build_timestamp" : "2015-04-27T08:06:06Z", + "number" : "5.6.5", + "build_hash" : "6a37571", + "build_date" : "2017-12-04T07:50:10.466Z", "build_snapshot" : false, - "lucene_version" : "4.10.4" + "lucene_version" : "6.6.1" }, "tagline" : "You Know, for Search" } diff --git a/app/models/geoname.rb b/app/models/geoname.rb index 06db087..f01d931 100644 --- a/app/models/geoname.rb +++ b/app/models/geoname.rb @@ -5,7 +5,6 @@ class Geoname include Elasticsearch::Model INDEX_NAME = "#{Rails.env}:geonames".freeze - index_name = INDEX_NAME SYNONYMS = [ "afb, air force base", diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index d5d0c1c..36048e8 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -1,8 +1,10 @@ require 'active_model' +require 'elasticsearch/dsl' class PositionOpening include ActiveModel::Model include Elasticsearch::Model + include Elasticsearch::DSL INDEX_NAME = "#{Elasticsearch::INDEX_NAME}".freeze @@ -104,77 +106,199 @@ def search_for(options = {}) organization_ids = organization_ids_from_options(options) query = Query.new(options[:query], organization_ids) - search = SearchDefinition.new do |s| - s.definition = { - query: { - bool: { - must: [], - should: [], - filter: [ - { range: { start_date: { lte: Date.current } }} - ], - minimum_should_match: '0<1' - } - }, - highlight: { - fields: { - position_title: { number_of_fragments: 0 } - } - }, - size: document_limit, - from: options[:from], - sort: [] - } - if source.present? || tags || query.valid? - s.must({ term: { source: source } }) if source.present? 
- s.must({ terms: { tags: tags } }) if tags - s.must({ match: { position_offering_type_code: { query: query.position_offering_type_code } } }) if query.position_offering_type_code.present? - s.must({ match: { position_schedule_type_code: { query: query.position_schedule_type_code } } }) if query.position_schedule_type_code.present? - s.should({ match: { position_title: { query: query.keywords, analyzer: 'custom_analyzer' } } }) if query.keywords.present? - s.should({ - nested: { - path: 'locations', - query: { - match: { 'locations.city': { query: query.keywords, operator: 'AND' }} - } - } - }) if query.keywords.present? && query.location.nil? - - s.must({ match: { rate_interval_code: query.rate_interval_code } }) if query.rate_interval_code.present? - - if query.organization_ids.present? - organization_ids = build_organization_id_definition(query) - s.must(organization_ids.definition) - end + definition = Elasticsearch::DSL::Search.search do + query do + bool do + filter do + range :start_date do + lte Date.current + end + end - if query.location.present? - location = build_location_definition(query) - s.must({ nested: { path: 'locations', query: location.definition } }) + must { term source: source } if source.present? + must do + terms tags: tags + end if tags + must do + match :position_offering_type_code do + query query.position_offering_type_code + end + end if query.position_offering_type_code.present? + + must do + match :position_schedule_type_code do + query query.position_schedule_type_code + end + end if query.position_schedule_type_code.present? + + should do + match :position_title do + query query.keywords + analyzer 'custom_analyzer' + end + end if query.keywords.present? + should do + nested do + path 'locations' + query do + match 'locations.city' do + query query.keywords + operator 'and' + end + end + end + end if query.keywords.present? && query.location.nil? 
+ + must do + match :rate_interval_code do + query query.rate_interval_code + end + end if query.rate_interval_code.present? + + must do + bool do + should do + terms organization_id: query.organization_terms + end if query.organization_terms.present? + if query.organization_prefixes.present? + query.organization_prefixes.each do |prefix| + should do + prefix(:organization_ids) do + value prefix + end + end + end + end + end + end if query.organization_ids.present? + + must do + nested do + path 'locations' + query do + bool do + must do + term 'locations.state': query.location.state + end if query.has_state? + must do + match 'locations.city' do + query query.location.city + operator 'and' + end + end if query.has_city? + end + end + end + end if query.location.present? + + minimum_should_match '0<1' end end - if query.keywords.blank? - if lat.blank? || lon.blank? - s.sort({ "#{sort_by}": 'desc' }) + sort do + if query.keywords.blank? + if lat.blank? || lon.blank? + by "#{sort_by}", order: 'desc' + else + by({ + _geo_distance: { + 'locations.geo': { lat: lat.to_f, lon: lon.to_f }, + order: 'asc', + nested_path: 'locations' + } + }) + end else - s.sort([{ - _geo_distance: { - 'locations.geo': { lat: lat.to_f, lon: lon.to_f }, - order: 'asc', - nested_path: 'locations' - } - }]) + by "#{sort_by}", order: 'desc' end - else - s.sort({ "#{sort_by}": 'desc' }) end - if s.definition[:query][:bool][:must].empty? && s.definition[:query][:bool][:should].empty? 
- s.definition[:query] = { match_all: {} } + highlight do + field :position_title, number_of_fragments: 0 end - end - search_results = __elasticsearch__.search(search.definition, index: INDEX_NAME) + size document_limit + from options[:from] + end.to_hash + + # definition = SearchDefinition.new do |s| + # s.definition = { + # query: { + # bool: { + # must: [], + # should: [], + # filter: [ + # { range: { start_date: { lte: Date.current } }} + # ], + # minimum_should_match: '0<1' + # } + # }, + # highlight: { + # fields: { + # position_title: { number_of_fragments: 0 } + # } + # }, + # size: document_limit, + # from: options[:from], + # sort: [] + # } + # if source.present? || tags || query.valid? + # s.must({ term: { source: source } }) if source.present? + # s.must({ terms: { tags: tags } }) if tags + # s.must({ match: { position_offering_type_code: { query: query.position_offering_type_code } } }) if query.position_offering_type_code.present? + # s.must({ match: { position_schedule_type_code: { query: query.position_schedule_type_code } } }) if query.position_schedule_type_code.present? + # s.should({ match: { position_title: { query: query.keywords, analyzer: 'custom_analyzer' } } }) if query.keywords.present? + # s.should({ + # nested: { + # path: 'locations', + # query: { + # match: { 'locations.city': { query: query.keywords, operator: 'AND' }} + # } + # } + # }) if query.keywords.present? && query.location.nil? + # + # s.must({ match: { rate_interval_code: query.rate_interval_code } }) if query.rate_interval_code.present? + # + # if query.organization_ids.present? + # organization_ids = build_organization_id_definition(query) + # s.must(organization_ids.definition) + # end + # + # if query.location.present? + # location = build_location_definition(query) + # s.must({ nested: { path: 'locations', query: location.definition } }) + # end + # end + # + # if query.keywords.blank? + # if lat.blank? || lon.blank? 
+ # s.sort({ "#{sort_by}": 'desc' }) + # else + # s.sort([{ + # _geo_distance: { + # 'locations.geo': { lat: lat.to_f, lon: lon.to_f }, + # order: 'asc', + # nested_path: 'locations' + # } + # }]) + # end + # else + # s.sort({ "#{sort_by}": 'desc' }) + # end + # + # if s.definition[:query][:bool][:must].empty? && s.definition[:query][:bool][:should].empty? + # s.definition[:query] = { match_all: {} } + # end + # end + + # definition = search_definition.to_hash + # + # if definition[:query][:bool][:must].empty? && definition[:query][:bool][:should].empty? + # definition[:query] = { match_all: {} } + # end + + search_results = __elasticsearch__.search(definition, index: INDEX_NAME) + binding.pry Rails.logger.info("[Query] #{options.merge(result_count: search_results.results.total).to_json}") diff --git a/spec/api/v2/position_openings_spec.rb b/spec/api/v2/position_openings_spec.rb index d92788f..d81c189 100644 --- a/spec/api/v2/position_openings_spec.rb +++ b/spec/api/v2/position_openings_spec.rb @@ -29,7 +29,6 @@ it 'should return with jobs data' do results_array = JSON.parse(response.body) expect(results_array.size).to eq(2) - # binding.pry expect(results_array.first).to eq({'id'=>'usajobs:327358300', 'position_title'=>'Student Nurse Technicians', 'organization_name'=>'Veterans Affairs, Veterans Health Administration', 'rate_interval_code'=>'PH', 'minimum'=>17, 'maximum'=>23, diff --git a/spec/models/position_opening_spec.rb b/spec/models/position_opening_spec.rb index 1e2cb03..2e2d6f4 100644 --- a/spec/models/position_opening_spec.rb +++ b/spec/models/position_opening_spec.rb @@ -218,11 +218,11 @@ describe 'limiting result set size and starting point' do it 'should use the size param' do expect(PositionOpening.search_for(query: 'jobs', size: 1).count).to eq(1) - expect(PositionOpening.search_for(query: 'jobs', size: 10).count).to eq(7) + expect(PositionOpening.search_for(query: 'jobs', size: 10).count).to eq(6) end it 'should use the from param' do - 
expect(PositionOpening.search_for(query: 'jobs', size: 1, from: 1, sort_by: :id).first[:id]).to eq('usajobs:3') + expect(PositionOpening.search_for(query: 'jobs', size: 1, from: 1, sort_by: :id).first[:id]).to eq('usajobs:2') end end From c08b5065d5fa81e220e347b759b61162df9377f2 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Fri, 5 Jan 2018 15:17:29 +0800 Subject: [PATCH 08/16] Use elasticsearch dsl --- Gemfile | 1 - Gemfile.lock | 5 -- app/classes/search_definition.rb | 40 --------- app/models/position_opening.rb | 120 ++----------------------- spec/classes/search_definition_spec.rb | 88 ------------------ 5 files changed, 5 insertions(+), 249 deletions(-) delete mode 100644 app/classes/search_definition.rb delete mode 100644 spec/classes/search_definition_spec.rb diff --git a/Gemfile b/Gemfile index 7b3678f..777b5a8 100644 --- a/Gemfile +++ b/Gemfile @@ -21,7 +21,6 @@ gem 'rake', '~> 11.0' gem 'elasticsearch-model' gem 'elasticsearch-rails' gem 'elasticsearch-dsl' -gem 'pry' group :development, :test do gem 'puma', '~> 3.7' diff --git a/Gemfile.lock b/Gemfile.lock index ee619d6..1f9dbd6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -55,7 +55,6 @@ GEM net-sftp (>= 2.0.0) net-ssh (>= 2.0.14) net-ssh-gateway (>= 1.1.0) - coderay (1.1.2) concurrent-ruby (1.0.5) coveralls (0.7.0) multi_json (~> 1.3) @@ -130,9 +129,6 @@ GEM mini_portile2 (~> 2.3.0) oj (3.1.3) power_assert (1.0.2) - pry (0.11.3) - coderay (~> 1.1.0) - method_source (~> 0.9.0) puma (3.11.0) rack (2.0.3) rack-contrib (2.0.1) @@ -247,7 +243,6 @@ DEPENDENCIES newrelic_rpm (~> 4.6.0) nokogiri (~> 1.8.0) oj (~> 3.1.3) - pry puma (~> 3.7) rack-contrib (~> 2.0.1) rack-cors (~> 1.0.2) diff --git a/app/classes/search_definition.rb b/app/classes/search_definition.rb deleted file mode 100644 index 815fbfb..0000000 --- a/app/classes/search_definition.rb +++ /dev/null @@ -1,40 +0,0 @@ -class SearchDefinition - attr_accessor :definition - - def initialize(definition = nil) - @definition = definition || { - query: { 
- bool: { - must: [], - should: [] - } - } - } - - yield(self) if block_given? - end - - def must(query) - if definition[:query] - definition[:query][:bool][:must] << query - else - definition[:bool][:must] << query - end - end - - def should(query) - if definition[:query] - definition[:query][:bool][:should] << query - else - definition[:bool][:should] << query - end - end - - def sort(query) - definition[:sort] = query - end - - def to_s - definition - end -end diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index 36048e8..c3758e1 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -116,9 +116,7 @@ def search_for(options = {}) end must { term source: source } if source.present? - must do - terms tags: tags - end if tags + must { terms tags: tags } if tags must do match :position_offering_type_code do query query.position_offering_type_code @@ -157,16 +155,10 @@ def search_for(options = {}) must do bool do - should do - terms organization_id: query.organization_terms - end if query.organization_terms.present? + should { terms organization_id: query.organization_terms } if query.organization_terms.present? if query.organization_prefixes.present? query.organization_prefixes.each do |prefix| - should do - prefix(:organization_ids) do - value prefix - end - end + should { prefix organization_id: prefix } end end end @@ -177,9 +169,7 @@ def search_for(options = {}) path 'locations' query do bool do - must do - term 'locations.state': query.location.state - end if query.has_state? + must { term 'locations.state': query.location.state } if query.has_state? 
must do match 'locations.city' do query query.location.city @@ -213,92 +203,12 @@ def search_for(options = {}) end end - highlight do - field :position_title, number_of_fragments: 0 - end - + highlight { field :position_title, number_of_fragments: 0 } size document_limit from options[:from] end.to_hash - # definition = SearchDefinition.new do |s| - # s.definition = { - # query: { - # bool: { - # must: [], - # should: [], - # filter: [ - # { range: { start_date: { lte: Date.current } }} - # ], - # minimum_should_match: '0<1' - # } - # }, - # highlight: { - # fields: { - # position_title: { number_of_fragments: 0 } - # } - # }, - # size: document_limit, - # from: options[:from], - # sort: [] - # } - # if source.present? || tags || query.valid? - # s.must({ term: { source: source } }) if source.present? - # s.must({ terms: { tags: tags } }) if tags - # s.must({ match: { position_offering_type_code: { query: query.position_offering_type_code } } }) if query.position_offering_type_code.present? - # s.must({ match: { position_schedule_type_code: { query: query.position_schedule_type_code } } }) if query.position_schedule_type_code.present? - # s.should({ match: { position_title: { query: query.keywords, analyzer: 'custom_analyzer' } } }) if query.keywords.present? - # s.should({ - # nested: { - # path: 'locations', - # query: { - # match: { 'locations.city': { query: query.keywords, operator: 'AND' }} - # } - # } - # }) if query.keywords.present? && query.location.nil? - # - # s.must({ match: { rate_interval_code: query.rate_interval_code } }) if query.rate_interval_code.present? - # - # if query.organization_ids.present? - # organization_ids = build_organization_id_definition(query) - # s.must(organization_ids.definition) - # end - # - # if query.location.present? - # location = build_location_definition(query) - # s.must({ nested: { path: 'locations', query: location.definition } }) - # end - # end - # - # if query.keywords.blank? - # if lat.blank? || lon.blank? 
- # s.sort({ "#{sort_by}": 'desc' }) - # else - # s.sort([{ - # _geo_distance: { - # 'locations.geo': { lat: lat.to_f, lon: lon.to_f }, - # order: 'asc', - # nested_path: 'locations' - # } - # }]) - # end - # else - # s.sort({ "#{sort_by}": 'desc' }) - # end - # - # if s.definition[:query][:bool][:must].empty? && s.definition[:query][:bool][:should].empty? - # s.definition[:query] = { match_all: {} } - # end - # end - - # definition = search_definition.to_hash - # - # if definition[:query][:bool][:must].empty? && definition[:query][:bool][:should].empty? - # definition[:query] = { match_all: {} } - # end - search_results = __elasticsearch__.search(definition, index: INDEX_NAME) - binding.pry Rails.logger.info("[Query] #{options.merge(result_count: search_results.results.total).to_json}") @@ -407,25 +317,5 @@ def find_geoname(location, state) def normalized_city(city) city.sub(' Metro Area', '').sub(/, .*$/, '') end - - def build_location_definition(query) - SearchDefinition.new do |s| - s.definition = { bool: { must: [] } } - s.must({ term: { 'locations.state': { term: query.location.state } }}) if query.has_state? - s.must({ match: { 'locations.city': { query: query.location.city, operator: 'AND' }}}) if query.has_city? - end - end - - def build_organization_id_definition(query) - SearchDefinition.new do |s1| - s1.definition = { bool: { should: [] } } - s1.should({ terms: { organization_id: query.organization_terms } }) if query.organization_terms.present? - if query.organization_prefixes.present? 
- query.organization_prefixes.each do |prefix| - s1.should({ prefix: { organization_id: prefix } }) - end - end - end - end end end diff --git a/spec/classes/search_definition_spec.rb b/spec/classes/search_definition_spec.rb deleted file mode 100644 index c68c332..0000000 --- a/spec/classes/search_definition_spec.rb +++ /dev/null @@ -1,88 +0,0 @@ -require 'rails_helper' - -describe SearchDefinition do - let(:expected_definition) { { bool: { must: [], should: [] } } } - - describe '.new' do - context 'when there is no definition being set' do - it 'returns the default definition' do - expect(SearchDefinition.new.definition).to eq( - { - query: { - bool: { - must: [], - should: [] - } - } - } - ) - end - end - - context 'when there is definition being set' do - it 'returns the set definition' do - expect(SearchDefinition.new(expected_definition).definition).to eq(expected_definition) - end - end - end - - describe '.must' do - context 'when there is query in definition' do - it 'adds query to definition' do - search = SearchDefinition.new do |s| - s.must({ terms: { tags: [1, 2, 3] }}) - end - - expect(search.definition[:query][:bool][:must]).to eq([{ terms: { tags: [1, 2, 3] } }]) - end - end - - context 'when there is no query in definition' do - it 'adds query to bool' do - search = SearchDefinition.new(expected_definition) do |s| - s.must({ match: { tag: 1 }}) - end - - expect(search.definition[:bool][:must]).to eq([{ match: { tag: 1 } }]) - end - end - end - - describe '.should' do - context 'when there is query in definition' do - it 'adds query to definition' do - search = SearchDefinition.new do |s| - s.should({ term: { name: 'apple' }}) - end - - expect(search.definition[:query][:bool][:should]).to eq([{ term: { name: 'apple' } }]) - end - end - - context 'when there is no query in definition' do - it 'adds query to bool' do - search = SearchDefinition.new(expected_definition) do |s| - s.should({ match: { pretty: true }}) - end - - 
expect(search.definition[:bool][:should]).to eq([{ match: { pretty: true } }]) - end - end - end - - describe '.sort' do - it 'adds sorting query' do - search = SearchDefinition.new do |s| - s.sort({ id: 'desc' }) - end - - expect(search.definition[:sort]).to eq({ id: 'desc' }) - end - end - - describe '.to_s' do - it 'prints out definition' do - expect(SearchDefinition.new(expected_definition).to_s).to eq(expected_definition) - end - end -end From 5532f9c423c43c004dcff33132f82a8fda58737d Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Tue, 9 Jan 2018 13:23:26 +0800 Subject: [PATCH 09/16] Use elasticsearch DSL for geoname --- app/models/geoname.rb | 32 +++++++++++++++++++------------- spec/models/geoname_spec.rb | 2 +- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/app/models/geoname.rb b/app/models/geoname.rb index f01d931..b8e1e92 100644 --- a/app/models/geoname.rb +++ b/app/models/geoname.rb @@ -1,8 +1,10 @@ require 'active_model' +require 'elasticsearch/dsl' class Geoname include ActiveModel::Model include Elasticsearch::Model + include Elasticsearch::DSL INDEX_NAME = "#{Rails.env}:geonames".freeze @@ -66,24 +68,28 @@ def geocode(options = {}) end def search_for(options) - search_definition = { - query: { - bool: { - must: [ - { match: { location: { query: options[:location], operator: 'AND' } } }, - { term: { state: options[:state] } } - ] - } - } - } - - search_definition[:size] = options[:size] + search_definition = Elasticsearch::DSL::Search.search do + query do + bool do + must do + match :location do + query options[:location] + operator 'and' + end + end + + must { term state: options[:state] } + end + end + + size options[:size] + end.to_hash Geoname.search(search_definition, index: INDEX_NAME) end def delete_search_index - client.indices.delete index: INDEX_NAME rescue nil + client.indices.delete index: INDEX_NAME if search_index_exists? end def search_index_exists? 
diff --git a/spec/models/geoname_spec.rb b/spec/models/geoname_spec.rb index 845d76d..49ad717 100644 --- a/spec/models/geoname_spec.rb +++ b/spec/models/geoname_spec.rb @@ -9,7 +9,7 @@ describe '.geocode(options)' do describe 'basic location/state lookup' do - before(:each) do + before do Geoname.import [{type: 'geoname', location: "Someplace", state: 'XY', geo: {lat: 12.34, lon: -123.45}}] end From 2bf4da582f51629730d2e64ee2975d3b94864c3a Mon Sep 17 00:00:00 2001 From: Nick Marden Date: Wed, 10 Jan 2018 19:57:13 -0500 Subject: [PATCH 10/16] Allow models to use configured Elasticsearch client --- config/initializers/elasticsearch.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/initializers/elasticsearch.rb b/config/initializers/elasticsearch.rb index f3c4257..1c553d0 100644 --- a/config/initializers/elasticsearch.rb +++ b/config/initializers/elasticsearch.rb @@ -10,5 +10,7 @@ module Elasticsearch; end password: config['password'] ) +Elasticsearch::Model.client = Rails.application.config.elasticsearch_client + PositionOpening.create_search_index unless PositionOpening.search_index_exists? Geoname.create_search_index unless Geoname.search_index_exists? From f522c3aee176a67e93a3491a42f1a77bb7650cb6 Mon Sep 17 00:00:00 2001 From: Nick Marden Date: Wed, 10 Jan 2018 20:59:33 -0500 Subject: [PATCH 11/16] A totally speculative commit to show how I am expecting _timestamp and _ttl to become regular fields --- app/models/position_opening.rb | 7 ++++-- lib/importers/neogov_data.rb | 8 +++---- spec/lib/importers/neogov_data_spec.rb | 32 +++++++++++++------------- spec/models/position_opening_spec.rb | 6 ++--- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index c3758e1..08c7893 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -240,7 +240,7 @@ def search_index_exists? 
def import(position_openings) position_openings.each do |opening| - data = opening.except(:_timestamp, :_ttl).each_with_object({}) do |(key, value), data| + data = opening.except(:timestamp, :ttl).each_with_object({}) do |(key, value), data| if key == :locations data[:locations] = value.map do |v| {city: normalized_city(v[:city]), @@ -252,12 +252,15 @@ def import(position_openings) end end + ts = opening[:timestamp].blank? ? DateTime.current : opening[:timestamp] + client.index( index: INDEX_NAME, type: 'position_opening', id: "#{opening[:source]}:#{opening[:external_id]}", body: data.merge!({ - timestamp: opening[:_timestamp].blank? ? DateTime.current : opening[:_timestamp], + timestamp: ts, + ttl: opening[:ttl].blank? ? opening[:end_date] : (ts + convert_ttl_to_datetime_duration(opening[:ttl])), id: "#{opening[:source]}:#{opening[:external_id]}" }) ) diff --git a/lib/importers/neogov_data.rb b/lib/importers/neogov_data.rb index be2b501..a46aa97 100644 --- a/lib/importers/neogov_data.rb +++ b/lib/importers/neogov_data.rb @@ -42,7 +42,7 @@ def import existing_external_ids = PositionOpening.get_external_ids_by_source(@source) expired_ids = existing_external_ids - updated_external_ids expired_openings = expired_ids.collect do |expired_id| - {type: 'position_opening', source: @source, external_id: expired_id, _ttl: '1s'} + {type: 'position_opening', source: @source, external_id: expired_id, ttl: '1s'} end position_openings.push(*expired_openings) PositionOpening.import position_openings @@ -87,12 +87,12 @@ def process_job(job_xml) job_xml.xpath(XPATHS[:state]).inner_text) if seconds_remaining.zero? || entry[:locations].blank?
- entry[:_ttl] = '1s' + entry[:ttl] = '1s' return entry end - entry[:_timestamp] = pubdate.iso8601 - entry[:_ttl] = "#{seconds_remaining}s" + entry[:timestamp] = pubdate.iso8601 + entry[:ttl] = "#{seconds_remaining}s" entry[:position_title] = job_xml.xpath(XPATHS[:position_title]).inner_text.squish entry[:start_date] = start_date entry[:end_date] = is_continuous ? nil : end_date diff --git a/spec/lib/importers/neogov_data_spec.rb b/spec/lib/importers/neogov_data_spec.rb index be2da60..3632a57 100644 --- a/spec/lib/importers/neogov_data_spec.rb +++ b/spec/lib/importers/neogov_data_spec.rb @@ -25,8 +25,8 @@ expect(position_openings[0]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '277909586s', + timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, + locations: [{city: 'Lansing', state: 'MI'}], ttl: '277909586s', position_title: 'Professional Development and Training Intern-DHS', start_date: Date.parse('2013-04-12'), end_date: far_away, minimum: nil, maximum: nil, rate_interval_code: 'PH', position_offering_type_code: 15328, position_schedule_type_code: nil} @@ -35,8 +35,8 @@ expect(position_openings[1]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2013-04-08T15:15:21+00:00', external_id: 631517, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '278257419s', + timestamp: '2013-04-08T15:15:21+00:00', external_id: 631517, + locations: [{city: 'Lansing', state: 'MI'}], ttl: '278257419s', position_title: 'MEDC Corporate - Business Attraction Manager', start_date: Date.parse('2013-04-08'), end_date: far_away, minimum: 59334.0, maximum: 77066.0, rate_interval_code: 'PA', position_offering_type_code: 15317, position_schedule_type_code: 1} @@ 
-45,9 +45,9 @@ expect(position_openings[2]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2012-03-12T10:16:56+00:00', external_id: 282662, + timestamp: '2012-03-12T10:16:56+00:00', external_id: 282662, locations: [{city: 'Freeland', state: 'MI'}], - _ttl: continuous_ttl, position_title: 'Dentist-A', + ttl: continuous_ttl, position_title: 'Dentist-A', start_date: Date.parse('2011-09-23'), end_date: nil, minimum: 37.33, maximum: 51.66, rate_interval_code: 'PH', position_offering_type_code: 15317, position_schedule_type_code: 2} ) @@ -55,9 +55,9 @@ expect(position_openings[3]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2010-08-10T16:07:30+00:00', external_id: 234175, + timestamp: '2010-08-10T16:07:30+00:00', external_id: 234175, locations: [{city: 'Munising', state: 'MI'}], - _ttl: '362235090s', position_title: 'Registered Nurse Non-Career', + ttl: '362235090s', position_title: 'Registered Nurse Non-Career', start_date: Date.parse('2010-06-08'), end_date: far_away, minimum: 28.37, maximum: 38.87, rate_interval_code: 'PH', position_offering_type_code: nil, position_schedule_type_code: nil} ) @@ -81,8 +81,8 @@ expect(position_openings[0]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '277909586s', + timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, + locations: [{city: 'Lansing', state: 'MI'}], ttl: '277909586s', position_title: 'Professional Development and Training Intern-DHS', start_date: Date.parse('2013-04-12'), end_date: far_away, minimum: nil, maximum: nil, rate_interval_code: 'PH', position_offering_type_code: 15328, 
position_schedule_type_code: nil} @@ -91,15 +91,15 @@ expect(position_openings[1]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2013-04-08T15:15:21+00:00', external_id: 631517, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '278257419s', + timestamp: '2013-04-08T15:15:21+00:00', external_id: 631517, + locations: [{city: 'Lansing', state: 'MI'}], ttl: '278257419s', position_title: 'MEDC Corporate - Business Attraction Manager', start_date: Date.parse('2013-04-08'), end_date: far_away, minimum: 59334.0, maximum: 77066.0, rate_interval_code: 'PA', position_offering_type_code: 15317, position_schedule_type_code: 1} ) expect(position_openings[2]).to eq( - {type: 'position_opening', source: 'ng:michigan', external_id: 282662, _ttl: '1s'} + {type: 'position_opening', source: 'ng:michigan', external_id: 282662, ttl: '1s'} ) end less_entries_importer.import @@ -113,7 +113,7 @@ allow(expired_importer).to receive(:fetch_jobs_rss).and_return File.open('spec/resources/neogov/expired.rss') end - it 'should set their _ttl to 1s' do + it 'should set their ttl to 1s' do expect(PositionOpening).to receive(:get_external_ids_by_source).with('ng:michigan').and_return([]) expect(PositionOpening).to receive(:import) do |position_openings| expect(position_openings.length).to eq(1) @@ -121,7 +121,7 @@ expect(position_openings[0]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - external_id: 282662, locations: [{city: 'Freeland', state: 'MI'}], _ttl: '1s'} + external_id: 282662, locations: [{city: 'Freeland', state: 'MI'}], ttl: '1s'} ) end expired_importer.import @@ -143,7 +143,7 @@ expect(position_openings[0]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - external_id: 386302, 
locations: [], _ttl: '1s'} + external_id: 386302, locations: [], ttl: '1s'} ) end bad_location_importer.import diff --git a/spec/models/position_opening_spec.rb b/spec/models/position_opening_spec.rb index 2e2d6f4..74d8545 100644 --- a/spec/models/position_opening_spec.rb +++ b/spec/models/position_opening_spec.rb @@ -31,17 +31,17 @@ position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), start_date: Date.current, end_date: Date.current + 8, minimum: 0, maximum: 0, rate_interval_code: 'WC', locations: [{ city: 'San Francisco', state: 'CA' }] } - position_openings << { type: 'position_opening', source: 'ng:michigan', _timestamp: Date.current.weeks_ago(1).iso8601, external_id: 629140, + position_openings << { type: 'position_opening', source: 'ng:michigan', timestamp: Date.current.weeks_ago(1).iso8601, external_id: 629140, locations: [{ city: 'Lansing', state: 'MI' }], tags: %w(state), rate_interval_code: 'PH', position_schedule_type_code: 1, position_offering_type_code: 15317, position_title: 'Supervisor (DOH #28425)', start_date: Date.current, end_date: Date.current.tomorrow, minimum: 20.7, maximum: 36.8 } - position_openings << { type: 'position_opening', source: 'ng:michigan', _timestamp: Date.current.yesterday.iso8601, external_id: 616313, + position_openings << { type: 'position_opening', source: 'ng:michigan', timestamp: Date.current.yesterday.iso8601, external_id: 616313, locations: [{ city: 'Detroit', state: 'MI' }], tags: %w(state), rate_interval_code: 'PH', position_schedule_type_code: 1, position_offering_type_code: 15322, position_title: 'Indoor Lifeguard', start_date: Date.current, end_date: Date.current + 8, minimum: 15.68, maximum: 27.11 } - position_openings << { type: 'position_opening', source: 'ng:bloomingtonmn', _timestamp: Date.current.iso8601, external_id: 632865, + position_openings << { type: 'position_opening', source: 'ng:bloomingtonmn', timestamp: Date.current.iso8601, external_id: 632865, locations: [{ city: 
'Detroit', state: 'MI' }], tags: %w(city), rate_interval_code: 'PA', position_schedule_type_code: 1, position_offering_type_code: 15317, position_title: 'Computer Specialist', From 679a16ff1d2f97e62b62a970566d7c89e3510073 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Thu, 11 Jan 2018 11:53:14 +0800 Subject: [PATCH 12/16] Remove _ttl fields --- app/models/position_opening.rb | 4 +- lib/importers/neogov_data.rb | 30 +++++++------ lib/importers/usajobs_data.rb | 6 +-- spec/lib/importers/neogov_data_spec.rb | 58 ++++++++++++------------- spec/lib/importers/usajobs_data_spec.rb | 18 ++++---- spec/models/geoname_spec.rb | 3 -- spec/models/position_opening_spec.rb | 6 +-- 7 files changed, 61 insertions(+), 64 deletions(-) diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index c3758e1..fdea888 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -240,7 +240,7 @@ def search_index_exists? def import(position_openings) position_openings.each do |opening| - data = opening.except(:_timestamp, :_ttl).each_with_object({}) do |(key, value), data| + data = opening.each_with_object({}) do |(key, value), data| if key == :locations data[:locations] = value.map do |v| {city: normalized_city(v[:city]), @@ -257,7 +257,7 @@ def import(position_openings) type: 'position_opening', id: "#{opening[:source]}:#{opening[:external_id]}", body: data.merge!({ - timestamp: opening[:_timestamp].blank? ? 
DateTime.current : opening[:_timestamp], + timestamp: opening[:timestamp] || DateTime.current, id: "#{opening[:source]}:#{opening[:external_id]}" }) ) diff --git a/lib/importers/neogov_data.rb b/lib/importers/neogov_data.rb index be2b501..1042637 100644 --- a/lib/importers/neogov_data.rb +++ b/lib/importers/neogov_data.rb @@ -42,7 +42,7 @@ def import existing_external_ids = PositionOpening.get_external_ids_by_source(@source) expired_ids = existing_external_ids - updated_external_ids expired_openings = expired_ids.collect do |expired_id| - {type: 'position_opening', source: @source, external_id: expired_id, _ttl: '1s'} + {type: 'position_opening', source: @source, external_id: expired_id} end position_openings.push(*expired_openings) PositionOpening.import position_openings @@ -86,21 +86,23 @@ def process_job(job_xml) entry[:locations] = process_location_and_state(job_xml.xpath(XPATHS[:location]).inner_text, job_xml.xpath(XPATHS[:state]).inner_text) - if seconds_remaining.zero? || entry[:locations].blank? - entry[:_ttl] = '1s' - return entry + # if + # entry[:_ttl] = '1s' + # return entry + # end + + unless seconds_remaining.zero? || entry[:locations].blank? + entry[:timestamp] = pubdate.iso8601 + # entry[:_ttl] = "#{seconds_remaining}s" + entry[:position_title] = job_xml.xpath(XPATHS[:position_title]).inner_text.squish + entry[:start_date] = start_date + entry[:end_date] = is_continuous ? 
nil : end_date + entry[:minimum] = process_salary(job_xml.xpath(XPATHS[:minimum]).inner_text) + entry[:maximum] = process_salary(job_xml.xpath(XPATHS[:maximum]).inner_text) + entry[:rate_interval_code] = process_salary_interval(job_xml.xpath(XPATHS[:salary_interval]).inner_text) + entry.merge!(process_job_type(job_xml.xpath(XPATHS[:job_type]).inner_text)) end - entry[:_timestamp] = pubdate.iso8601 - entry[:_ttl] = "#{seconds_remaining}s" - entry[:position_title] = job_xml.xpath(XPATHS[:position_title]).inner_text.squish - entry[:start_date] = start_date - entry[:end_date] = is_continuous ? nil : end_date - entry[:minimum] = process_salary(job_xml.xpath(XPATHS[:minimum]).inner_text) - entry[:maximum] = process_salary(job_xml.xpath(XPATHS[:maximum]).inner_text) - entry[:rate_interval_code] = process_salary_interval(job_xml.xpath(XPATHS[:salary_interval]).inner_text) - entry.merge!(process_job_type(job_xml.xpath(XPATHS[:job_type]).inner_text)) - entry end diff --git a/lib/importers/usajobs_data.rb b/lib/importers/usajobs_data.rb index 4fd86df..4f36fa1 100644 --- a/lib/importers/usajobs_data.rb +++ b/lib/importers/usajobs_data.rb @@ -38,8 +38,8 @@ def process_job(job_xml) entry[:external_id] = job_xml.xpath(XPATHS[:id]).inner_text.to_i entry[:locations] = process_locations(job_xml) entry[:locations] = [] if entry[:locations].size >= CATCHALL_THRESHOLD - entry[:_ttl] = (days_remaining.zero? || entry[:locations].empty?) ? '1s' : "#{days_remaining}d" - unless entry[:_ttl] == '1s' + # entry[:_ttl] = (days_remaining.zero? || entry[:locations].empty?) ? '1s' : "#{days_remaining}d" + unless entry[:locations].empty? || days_remaining.zero? 
entry[:position_title] = job_xml.xpath(XPATHS[:position_title]).inner_text.strip entry[:organization_id] = job_xml.xpath(XPATHS[:organization_id]).inner_text.strip.upcase entry[:organization_name] = job_xml.xpath(XPATHS[:organization_name]).inner_text.strip @@ -89,4 +89,4 @@ def abbreviate_state_name(location_str) end location_str end -end \ No newline at end of file +end diff --git a/spec/lib/importers/neogov_data_spec.rb b/spec/lib/importers/neogov_data_spec.rb index be2da60..d20b916 100644 --- a/spec/lib/importers/neogov_data_spec.rb +++ b/spec/lib/importers/neogov_data_spec.rb @@ -7,7 +7,7 @@ let!(:current_datetime) { DateTime.current.freeze } let!(:current) { current_datetime.to_date.freeze } let(:far_away) { Date.parse('2022-01-31') } - let(:continuous_ttl) { "#{(current_datetime + 7).to_i - DateTime.parse('2012-03-12 10:16:56.14').to_datetime.to_i}s" } + # let(:continuous_ttl) { "#{(current_datetime + 7).to_i - DateTime.parse('2012-03-12 10:16:56.14').to_datetime.to_i}s" } before { allow(DateTime).to receive(:current).and_return(current_datetime) } @@ -25,8 +25,8 @@ expect(position_openings[0]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '277909586s', + timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, + locations: [{city: 'Lansing', state: 'MI'}], position_title: 'Professional Development and Training Intern-DHS', start_date: Date.parse('2013-04-12'), end_date: far_away, minimum: nil, maximum: nil, rate_interval_code: 'PH', position_offering_type_code: 15328, position_schedule_type_code: nil} @@ -35,8 +35,8 @@ expect(position_openings[1]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2013-04-08T15:15:21+00:00', external_id: 
631517, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '278257419s', + timestamp: '2013-04-08T15:15:21+00:00', external_id: 631517, + locations: [{city: 'Lansing', state: 'MI'}], position_title: 'MEDC Corporate - Business Attraction Manager', start_date: Date.parse('2013-04-08'), end_date: far_away, minimum: 59334.0, maximum: 77066.0, rate_interval_code: 'PA', position_offering_type_code: 15317, position_schedule_type_code: 1} @@ -45,9 +45,8 @@ expect(position_openings[2]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2012-03-12T10:16:56+00:00', external_id: 282662, - locations: [{city: 'Freeland', state: 'MI'}], - _ttl: continuous_ttl, position_title: 'Dentist-A', + timestamp: '2012-03-12T10:16:56+00:00', external_id: 282662, + locations: [{city: 'Freeland', state: 'MI'}], position_title: 'Dentist-A', start_date: Date.parse('2011-09-23'), end_date: nil, minimum: 37.33, maximum: 51.66, rate_interval_code: 'PH', position_offering_type_code: 15317, position_schedule_type_code: 2} ) @@ -55,9 +54,8 @@ expect(position_openings[3]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2010-08-10T16:07:30+00:00', external_id: 234175, - locations: [{city: 'Munising', state: 'MI'}], - _ttl: '362235090s', position_title: 'Registered Nurse Non-Career', + timestamp: '2010-08-10T16:07:30+00:00', external_id: 234175, + locations: [{city: 'Munising', state: 'MI'}], position_title: 'Registered Nurse Non-Career', start_date: Date.parse('2010-06-08'), end_date: far_away, minimum: 28.37, maximum: 38.87, rate_interval_code: 'PH', position_offering_type_code: nil, position_schedule_type_code: nil} ) @@ -81,8 +79,8 @@ expect(position_openings[0]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, 
MI', tags: %w(state), - _timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '277909586s', + timestamp: '2013-04-12T15:52:34+00:00', external_id: 634789, + locations: [{city: 'Lansing', state: 'MI'}], position_title: 'Professional Development and Training Intern-DHS', start_date: Date.parse('2013-04-12'), end_date: far_away, minimum: nil, maximum: nil, rate_interval_code: 'PH', position_offering_type_code: 15328, position_schedule_type_code: nil} @@ -91,15 +89,15 @@ expect(position_openings[1]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - _timestamp: '2013-04-08T15:15:21+00:00', external_id: 631517, - locations: [{city: 'Lansing', state: 'MI'}], _ttl: '278257419s', + timestamp: '2013-04-08T15:15:21+00:00', external_id: 631517, + locations: [{city: 'Lansing', state: 'MI'}], position_title: 'MEDC Corporate - Business Attraction Manager', start_date: Date.parse('2013-04-08'), end_date: far_away, minimum: 59334.0, maximum: 77066.0, rate_interval_code: 'PA', position_offering_type_code: 15317, position_schedule_type_code: 1} ) expect(position_openings[2]).to eq( - {type: 'position_opening', source: 'ng:michigan', external_id: 282662, _ttl: '1s'} + {type: 'position_opening', source: 'ng:michigan', external_id: 282662} ) end less_entries_importer.import @@ -113,19 +111,19 @@ allow(expired_importer).to receive(:fetch_jobs_rss).and_return File.open('spec/resources/neogov/expired.rss') end - it 'should set their _ttl to 1s' do - expect(PositionOpening).to receive(:get_external_ids_by_source).with('ng:michigan').and_return([]) - expect(PositionOpening).to receive(:import) do |position_openings| - expect(position_openings.length).to eq(1) - - expect(position_openings[0]).to eq( - {type: 'position_opening', source: 'ng:michigan', - organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - 
external_id: 282662, locations: [{city: 'Freeland', state: 'MI'}], _ttl: '1s'} - ) - end - expired_importer.import - end + # it 'should set their _ttl to 1s' do + # expect(PositionOpening).to receive(:get_external_ids_by_source).with('ng:michigan').and_return([]) + # expect(PositionOpening).to receive(:import) do |position_openings| + # expect(position_openings.length).to eq(1) + # + # expect(position_openings[0]).to eq( + # {type: 'position_opening', source: 'ng:michigan', + # organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), + # external_id: 282662, locations: [{city: 'Freeland', state: 'MI'}], _ttl: '1s'} + # ) + # end + # expired_importer.import + # end end context 'when the city or state is invalid' do @@ -143,7 +141,7 @@ expect(position_openings[0]).to eq( {type: 'position_opening', source: 'ng:michigan', organization_id: 'USMI', organization_name: 'State of Michigan, MI', tags: %w(state), - external_id: 386302, locations: [], _ttl: '1s'} + external_id: 386302, locations: []} ) end bad_location_importer.import diff --git a/spec/lib/importers/usajobs_data_spec.rb b/spec/lib/importers/usajobs_data_spec.rb index e50edf2..081cca4 100644 --- a/spec/lib/importers/usajobs_data_spec.rb +++ b/spec/lib/importers/usajobs_data_spec.rb @@ -11,7 +11,7 @@ expect(PositionOpening).to receive(:import) do |position_openings| expect(position_openings.length).to eq(3) expect(position_openings[0]).to eq( - {type: 'position_opening', source: 'usajobs', external_id: 305972200, _ttl: ttl, + {type: 'position_opening', source: 'usajobs', external_id: 305972200, position_title: 'Medical Officer', tags: %w(federal), organization_id: 'AF09', organization_name: 'Air Force Personnel Center', locations: [{city: 'Dyess AFB', state: 'TX'}], @@ -19,7 +19,7 @@ minimum: 60274, maximum: 155500, rate_interval_code: 'PA', position_schedule_type_code: 1, position_offering_type_code: 15327} ) expect(position_openings[1]).to eq( - {type: 'position_opening', source: 
'usajobs', external_id: 325054900, _ttl: ttl, + {type: 'position_opening', source: 'usajobs', external_id: 325054900, position_title: 'Physician (Surgical Critical Care)', tags: %w(federal), organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', locations: [{city: 'Charleston', state: 'SC'}], @@ -27,7 +27,7 @@ minimum: 125000, maximum: 295000, rate_interval_code: 'PA', position_schedule_type_code: 2, position_offering_type_code: 15317} ) expect(position_openings[2]).to eq( - {type: 'position_opening', source: 'usajobs', external_id: 327358300, _ttl: ttl, + {type: 'position_opening', source: 'usajobs', external_id: 327358300, position_title: 'Student Nurse Technicians', tags: %w(federal), organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', locations: [{city: 'Odessa', state: 'TX'}, @@ -48,15 +48,15 @@ expect(PositionOpening).to receive(:import) do |position_openings| expect(position_openings.length).to eq(3) expect(position_openings[0]).to eq( - {type: 'position_opening', source: 'usajobs', external_id: 305972200, _ttl: '1s', + {type: 'position_opening', source: 'usajobs', external_id: 305972200, tags: %w(federal), locations: [{:city => "Dyess AFB", :state => "TX"}]} ) expect(position_openings[1]).to eq( - {type: 'position_opening', source: 'usajobs', external_id: 325054900, _ttl: '1s', + {type: 'position_opening', source: 'usajobs', external_id: 325054900, tags: %w(federal), locations: [{:city => "Charleston", :state => "SC"}]} ) expect(position_openings[2]).to eq( - {type: 'position_opening', source: 'usajobs', external_id: 327358300, _ttl: '1s', + {type: 'position_opening', source: 'usajobs', external_id: 327358300, tags: %w(federal), locations: [{:city => "Odessa", :state => "TX"}, {:city => "Pentagon, Arlington", :state => "VA"}, {:city => "San Angelo", :state => "TX"}, @@ -75,14 +75,14 @@ expect(PositionOpening).to receive(:import) do |position_openings| 
expect(position_openings.length).to eq(2) expect(position_openings[0]).to eq( - {type: "position_opening", source: 'usajobs', external_id: 305972200, _ttl: ttl, position_title: "Medical Officer", + {type: "position_opening", source: 'usajobs', external_id: 305972200, position_title: "Medical Officer", organization_id: "AF09", organization_name: "Air Force Personnel Center", tags: %w(federal), locations: [{:city => "Fulton", :state => "MD"}], start_date: Date.parse('28 Dec 2011'), end_date: far_away, minimum: 60274, maximum: 155500, rate_interval_code: "PA", position_schedule_type_code: 1, position_offering_type_code: 15327} ) expect(position_openings[1]).to eq( - {type: "position_opening", source: 'usajobs', external_id: 325054900, _ttl: "1s", locations: [], tags: %w(federal)} + {type: "position_opening", source: 'usajobs', external_id: 325054900, locations: [], tags: %w(federal)} ) end bad_location_importer.import @@ -96,7 +96,7 @@ expect(PositionOpening).to receive(:import) do |position_openings| expect(position_openings.length).to eq(1) expect(position_openings[0]).to eq( - {type: 'position_opening', source: 'usajobs', external_id: 327358300, _ttl: '1s', + {type: 'position_opening', source: 'usajobs', external_id: 327358300, tags: %w(federal), locations: []} ) end diff --git a/spec/models/geoname_spec.rb b/spec/models/geoname_spec.rb index 49ad717..2ca8878 100644 --- a/spec/models/geoname_spec.rb +++ b/spec/models/geoname_spec.rb @@ -14,7 +14,6 @@ end it 'should return the lat/lon hash of the place' do - Geoname.geocode(location: "Someplace", state: 'XY') expect(Geoname.geocode(location: "Someplace", state: 'XY').to_json).to eq({lat: 12.34, lon: -123.45}.to_json) end end @@ -34,7 +33,6 @@ it 'should find the matches' do @first_synonyms.each do |synonym| - Geoname.geocode(location: "#{synonym} City", state: 'CA') geo_hash = Geoname.geocode(location: "#{synonym} City", state: 'CA') expect(geo_hash[:lat]).to be_kind_of(Numeric) expect(geo_hash[:lon]).to 
be_kind_of(Numeric) @@ -47,7 +45,6 @@ it 'should set the document ID' do Geoname.import [{type: 'geoname', location: "Someplace", state: 'XY', geo: {lat: 12.34, lon: -123.45}}] Geoname.import [{type: 'geoname', location: "Someplace", state: 'XY', geo: {lat: 82.34, lon: 23.45}}] - Geoname.search_for(location: 'Someplace', state: 'XY', size: 2) search = Geoname.search_for(location: 'Someplace', state: 'XY', size: 2) expect(search.results.total).to eq(1) expect(search.results.first.id).to eq('Someplace:XY') diff --git a/spec/models/position_opening_spec.rb b/spec/models/position_opening_spec.rb index 2e2d6f4..74d8545 100644 --- a/spec/models/position_opening_spec.rb +++ b/spec/models/position_opening_spec.rb @@ -31,17 +31,17 @@ position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), start_date: Date.current, end_date: Date.current + 8, minimum: 0, maximum: 0, rate_interval_code: 'WC', locations: [{ city: 'San Francisco', state: 'CA' }] } - position_openings << { type: 'position_opening', source: 'ng:michigan', _timestamp: Date.current.weeks_ago(1).iso8601, external_id: 629140, + position_openings << { type: 'position_opening', source: 'ng:michigan', timestamp: Date.current.weeks_ago(1).iso8601, external_id: 629140, locations: [{ city: 'Lansing', state: 'MI' }], tags: %w(state), rate_interval_code: 'PH', position_schedule_type_code: 1, position_offering_type_code: 15317, position_title: 'Supervisor (DOH #28425)', start_date: Date.current, end_date: Date.current.tomorrow, minimum: 20.7, maximum: 36.8 } - position_openings << { type: 'position_opening', source: 'ng:michigan', _timestamp: Date.current.yesterday.iso8601, external_id: 616313, + position_openings << { type: 'position_opening', source: 'ng:michigan', timestamp: Date.current.yesterday.iso8601, external_id: 616313, locations: [{ city: 'Detroit', state: 'MI' }], tags: %w(state), rate_interval_code: 'PH', position_schedule_type_code: 1, position_offering_type_code: 15322, 
position_title: 'Indoor Lifeguard', start_date: Date.current, end_date: Date.current + 8, minimum: 15.68, maximum: 27.11 } - position_openings << { type: 'position_opening', source: 'ng:bloomingtonmn', _timestamp: Date.current.iso8601, external_id: 632865, + position_openings << { type: 'position_opening', source: 'ng:bloomingtonmn', timestamp: Date.current.iso8601, external_id: 632865, locations: [{ city: 'Detroit', state: 'MI' }], tags: %w(city), rate_interval_code: 'PA', position_schedule_type_code: 1, position_offering_type_code: 15317, position_title: 'Computer Specialist', From 8d6ebd6c7b67a1290018f1100a3dd9db3e1404dc Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Thu, 11 Jan 2018 14:56:56 +0800 Subject: [PATCH 13/16] Replace _ttl: Delete expired doc query --- app/models/position_opening.rb | 42 +++++++++++++++- lib/importers/neogov_data.rb | 1 + lib/tasks/position_openings.rake | 7 ++- spec/models/position_opening_spec.rb | 72 ++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+), 3 deletions(-) diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index fdea888..2df7274 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -78,8 +78,7 @@ class PositionOpening indexes :position_schedule_type_code, type: 'integer' indexes :rate_interval_code, type: 'text', analyzer: 'keyword' indexes :id, type: 'keyword', index: false - indexes :timestamp, type: 'date' - indexes :ttl, type: 'date' + indexes :timestamp, type: 'date', null_value: 'NULL' end end @@ -291,6 +290,45 @@ def get_external_ids_by_source(source) external_ids.flatten end + def delete_expired_docs + query = Elasticsearch::DSL::Search.search do + query do + bool do + filter do + range :end_date do + lte Date.current + end + end + end + end + end + + client.delete_by_query(body: query.to_hash, index: INDEX_NAME) + __elasticsearch__.refresh_index! 
index: INDEX_NAME + end + + def delete_invalid_docs + definition = Elasticsearch::DSL::Search.search do + query do + bool do + must_not do + bool do + must do + exists { field 'end_date' } + end + must do + exists { field 'start_date' } + end + end + end + end + end + end + + client.delete_by_query(body: definition.to_hash, index: INDEX_NAME) + __elasticsearch__.refresh_index! index: INDEX_NAME + end + def url_for_position_opening(position_opening) case position_opening.source when 'usajobs' diff --git a/lib/importers/neogov_data.rb b/lib/importers/neogov_data.rb index 1042637..973ac8d 100644 --- a/lib/importers/neogov_data.rb +++ b/lib/importers/neogov_data.rb @@ -62,6 +62,7 @@ def process_job(job_xml) now = DateTime.current.freeze is_continuous = end_date_str =~ /^continuous$/i + if is_continuous end_datetime_utc = now + 7 end_date = end_datetime_utc.to_date diff --git a/lib/tasks/position_openings.rake b/lib/tasks/position_openings.rake index e548871..fd25b93 100644 --- a/lib/tasks/position_openings.rake +++ b/lib/tasks/position_openings.rake @@ -44,4 +44,9 @@ namespace :jobs do PositionOpening.delete_search_index if PositionOpening.search_index_exists? 
PositionOpening.create_search_index end -end \ No newline at end of file + + desc 'Delete expired position openings' + task delete_expired_position_openings: :environment do + + end +end diff --git a/spec/models/position_opening_spec.rb b/spec/models/position_opening_spec.rb index 74d8545..9e5e763 100644 --- a/spec/models/position_opening_spec.rb +++ b/spec/models/position_opening_spec.rb @@ -6,6 +6,78 @@ PositionOpening.create_search_index end + describe '.delete_expired_docs' do + before do + position_openings = [] + # deleted : end date is now + position_openings << { source: 'usajobs', external_id: 8801, type: 'position_opening', position_title: 'Deputy Special Assistant to the Chief Nurse Practitioner', + organization_id: 'AF09', organization_name: 'Air Force Personnel Center', + position_schedule_type_code: 1, position_offering_type_code: 15317, tags: %w(federal), + start_date: Date.current, end_date: Date.current, minimum: 80000, maximum: 100000, rate_interval_code: 'PA', + locations: [{ city: 'Andrews AFB', state: 'MD' }, + { city: 'Pentagon Arlington', state: 'VA' }, + { city: 'Air Force Academy', state: 'CO' }] } + # # deleted: end_date is nil + # position_openings << { source: 'usajobs', external_id: 8802, type: 'position_opening', position_title: 'Physician Assistant', + # position_schedule_type_code: 2, position_offering_type_code: 15318, tags: %w(federal), + # organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', + # start_date: Date.current, end_date: nil, minimum: 17, maximum: 23, rate_interval_code: 'PH', + # locations: [{ city: 'Fulton', state: 'MD' }] } + # not deleted + position_openings << { source: 'usajobs', external_id: 8803, type: 'position_opening', position_title: 'Future Person', + organization_id: 'FUTU', organization_name: 'Future Administration', + position_schedule_type_code: 2, position_offering_type_code: 15327, tags: %w(federal), + start_date: Date.current + 1, end_date: Date.current + 8, 
minimum: 17, maximum: 23, rate_interval_code: 'PH', + locations: [{ city: 'San Francisco', state: 'CA' }] } + # deleted: end_date is less than start date + position_openings << { source: 'usajobs', external_id: 8804, type: 'position_opening', position_title: 'Making No Money', + organization_id: 'FUTU', organization_name: 'Future Administration', + position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), + start_date: Date.current + 10, end_date: Date.current, minimum: 0, maximum: 0, rate_interval_code: 'WC', + locations: [{ city: 'San Francisco', state: 'CA' }] } + PositionOpening.import position_openings + end + + it 'should delete the position openings that are expired (less than today)' do + PositionOpening.delete_expired_docs + res = PositionOpening.search('*', index: 'test:jobs') + expect(res.size).to eq 1 + expect(res.results.first.id).to eq('usajobs:8803') + end + end + + describe '.delete_invalid_docs' do + before do + position_openings = [] + # deleted: end_date is nil + position_openings << { source: 'usajobs', external_id: 8805, type: 'position_opening', position_title: 'Physician Assistant', + position_schedule_type_code: 2, position_offering_type_code: 15318, tags: %w(federal), + organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', + start_date: Date.current, end_date: nil, minimum: 17, maximum: 23, rate_interval_code: 'PH', + locations: [{ city: 'Fulton', state: 'MD' }] } + #not deleted + position_openings << { source: 'usajobs', external_id: 8806, type: 'position_opening', position_title: 'Future Person', + organization_id: 'FUTU', organization_name: 'Future Administration', + position_schedule_type_code: 2, position_offering_type_code: 15327, tags: %w(federal), + start_date: Date.current + 1, end_date: Date.current + 8, minimum: 17, maximum: 23, rate_interval_code: 'PH', + locations: [{ city: 'San Francisco', state: 'CA' }] } + # deleted: start_date is nil + position_openings 
<< { source: 'usajobs', external_id: 8807, type: 'position_opening', position_title: 'Making No Money', + organization_id: 'FUTU', organization_name: 'Future Administration', + position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), + start_date: nil, end_date: Date.current, minimum: 0, maximum: 0, rate_interval_code: 'WC', + locations: [{ city: 'San Francisco', state: 'CA' }] } + PositionOpening.import position_openings + end + + it 'should delete the position openings that are expired (less than today)' do + PositionOpening.delete_invalid_docs + res = PositionOpening.search('*', index: 'test:jobs') + expect(res.size).to eq 1 + expect(res.results.first.id).to eq('usajobs:8806') + end + end + describe '.search_for(options)' do before do position_openings = [] From a8314b007ba88f94015ef2739589704660f4c5f0 Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Thu, 11 Jan 2018 15:17:43 +0800 Subject: [PATCH 14/16] Set up cron job to delete expired docs --- Gemfile | 1 + Gemfile.lock | 4 ++++ config/schedule.rb | 5 +++++ lib/tasks/position_openings.rake | 3 ++- spec/models/position_opening_spec.rb | 8 +------- 5 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 config/schedule.rb diff --git a/Gemfile b/Gemfile index 777b5a8..c6e328e 100644 --- a/Gemfile +++ b/Gemfile @@ -21,6 +21,7 @@ gem 'rake', '~> 11.0' gem 'elasticsearch-model' gem 'elasticsearch-rails' gem 'elasticsearch-dsl' +gem 'whenever' group :development, :test do gem 'puma', '~> 3.7' diff --git a/Gemfile.lock b/Gemfile.lock index 1f9dbd6..49c3acc 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -55,6 +55,7 @@ GEM net-sftp (>= 2.0.0) net-ssh (>= 2.0.14) net-ssh-gateway (>= 1.1.0) + chronic (0.10.2) concurrent-ruby (1.0.5) coveralls (0.7.0) multi_json (~> 1.3) @@ -225,6 +226,8 @@ GEM websocket-driver (0.6.5) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.3) + whenever (0.10.0) + chronic (>= 0.6.3) PLATFORMS ruby @@ -257,6 +260,7 @@ DEPENDENCIES 
spring-watcher-listen (~> 2.0.0) test-unit (~> 3.0) us_states (~> 0.1.1)! + whenever BUNDLED WITH 1.16.0 diff --git a/config/schedule.rb new file mode 100644 index 0000000..eadbd13 --- /dev/null +++ b/config/schedule.rb @@ -0,0 +1,5 @@ +set :output, { error: "log/cron_error.log", standard: "log/cron.log" } + +every 1.day, at: zoned_time("12:00 am") do + rake "jobs:delete_expired_position_openings" +end diff --git a/lib/tasks/position_openings.rake index fd25b93..338df67 100644 --- a/lib/tasks/position_openings.rake +++ b/lib/tasks/position_openings.rake @@ -47,6 +47,7 @@ namespace :jobs do desc 'Delete expired position openings' task delete_expired_position_openings: :environment do - + PositionOpening.delete_expired_docs + PositionOpening.delete_invalid_docs end end diff --git a/spec/models/position_opening_spec.rb index 9e5e763..e336657 100644 --- a/spec/models/position_opening_spec.rb +++ b/spec/models/position_opening_spec.rb @@ -17,12 +17,6 @@ locations: [{ city: 'Andrews AFB', state: 'MD' }, { city: 'Pentagon Arlington', state: 'VA' }, { city: 'Air Force Academy', state: 'CO' }] } - # # deleted: end_date is nil - # position_openings << { source: 'usajobs', external_id: 8802, type: 'position_opening', position_title: 'Physician Assistant', - # position_schedule_type_code: 2, position_offering_type_code: 15318, tags: %w(federal), - # organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', - # start_date: Date.current, end_date: nil, minimum: 17, maximum: 23, rate_interval_code: 'PH', - # locations: [{ city: 'Fulton', state: 'MD' }] } # not deleted position_openings << { source: 'usajobs', external_id: 8803, type: 'position_opening', position_title: 'Future Person', organization_id: 'FUTU', organization_name: 'Future Administration', @@ -55,7 +49,7 @@ organization_id: 'VATA', organization_name: 'Veterans Affairs,
Veterans Health Administration', start_date: Date.current, end_date: nil, minimum: 17, maximum: 23, rate_interval_code: 'PH', locations: [{ city: 'Fulton', state: 'MD' }] } - #not deleted + # not deleted position_openings << { source: 'usajobs', external_id: 8806, type: 'position_opening', position_title: 'Future Person', organization_id: 'FUTU', organization_name: 'Future Administration', position_schedule_type_code: 2, position_offering_type_code: 15327, tags: %w(federal), From 741fc07916f30477392d721411802d6ba9a5abab Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Thu, 11 Jan 2018 15:26:33 +0800 Subject: [PATCH 15/16] Set time zone for cron job --- config/schedule.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/config/schedule.rb b/config/schedule.rb index eadbd13..079df8c 100644 --- a/config/schedule.rb +++ b/config/schedule.rb @@ -1,3 +1,12 @@ +require "active_support" +require "active_support/time" + +Time.zone = "Eastern Time (US & Canada)" + +def zoned_time(time) + Time.zone.parse(time).localtime +end + set :output, { error: "log/cron_error.log", standard: "log/cron.log" } every 1.day, at: zoned_time("12:00 am") do From 9db3f9a3b0b0a735615a3fde7179d26be2d134df Mon Sep 17 00:00:00 2001 From: Yih En Lim Date: Thu, 11 Jan 2018 15:58:34 +0800 Subject: [PATCH 16/16] Combine two delete query into one --- app/models/position_opening.rb | 42 +++++++++-------- lib/tasks/position_openings.rake | 1 - spec/models/position_opening_spec.rb | 67 ++++++++++------------------ 3 files changed, 43 insertions(+), 67 deletions(-) diff --git a/app/models/position_opening.rb b/app/models/position_opening.rb index 2df7274..e95058c 100644 --- a/app/models/position_opening.rb +++ b/app/models/position_opening.rb @@ -295,37 +295,35 @@ def delete_expired_docs query do bool do filter do - range :end_date do - lte Date.current - end - end - end - end - end - - client.delete_by_query(body: query.to_hash, index: INDEX_NAME) - __elasticsearch__.refresh_index! 
index: INDEX_NAME - end - - def delete_invalid_docs - definition = Elasticsearch::DSL::Search.search do - query do - bool do - must_not do bool do - must do - exists { field 'end_date' } + should do + range :end_date do + lte Date.current + end end - must do - exists { field 'start_date' } + + should do + bool do + must_not do + bool do + must do + exists { field 'end_date' } + end + must do + exists { field 'start_date' } + end + end + end + end end + end end end end end - client.delete_by_query(body: definition.to_hash, index: INDEX_NAME) + client.delete_by_query(body: query.to_hash, index: INDEX_NAME) __elasticsearch__.refresh_index! index: INDEX_NAME end diff --git a/lib/tasks/position_openings.rake b/lib/tasks/position_openings.rake index 338df67..5034bf8 100644 --- a/lib/tasks/position_openings.rake +++ b/lib/tasks/position_openings.rake @@ -48,6 +48,5 @@ namespace :jobs do desc 'Delete expired position openings' task delete_expired_position_openings: :environment do PositionOpening.delete_expired_docs - PositionOpening.delete_invalid_docs end end diff --git a/spec/models/position_opening_spec.rb b/spec/models/position_opening_spec.rb index e336657..89c336d 100644 --- a/spec/models/position_opening_spec.rb +++ b/spec/models/position_opening_spec.rb @@ -17,18 +17,29 @@ locations: [{ city: 'Andrews AFB', state: 'MD' }, { city: 'Pentagon Arlington', state: 'VA' }, { city: 'Air Force Academy', state: 'CO' }] } - # not deleted - position_openings << { source: 'usajobs', external_id: 8803, type: 'position_opening', position_title: 'Future Person', - organization_id: 'FUTU', organization_name: 'Future Administration', - position_schedule_type_code: 2, position_offering_type_code: 15327, tags: %w(federal), - start_date: Date.current + 1, end_date: Date.current + 8, minimum: 17, maximum: 23, rate_interval_code: 'PH', - locations: [{ city: 'San Francisco', state: 'CA' }] } - # deleted: end_date is less than start date - position_openings << { source: 'usajobs', 
external_id: 8804, type: 'position_opening', position_title: 'Making No Money', - organization_id: 'FUTU', organization_name: 'Future Administration', - position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), - start_date: Date.current + 10, end_date: Date.current, minimum: 0, maximum: 0, rate_interval_code: 'WC', - locations: [{ city: 'San Francisco', state: 'CA' }] } + # not deleted + position_openings << { source: 'usajobs', external_id: 8803, type: 'position_opening', position_title: 'Future Person', + organization_id: 'FUTU', organization_name: 'Future Administration', + position_schedule_type_code: 2, position_offering_type_code: 15327, tags: %w(federal), + start_date: Date.current + 1, end_date: Date.current + 8, minimum: 17, maximum: 23, rate_interval_code: 'PH', + locations: [{ city: 'San Francisco', state: 'CA' }] } + # deleted: end_date is less than start date + position_openings << { source: 'usajobs', external_id: 8804, type: 'position_opening', position_title: 'Making No Money', + organization_id: 'FUTU', organization_name: 'Future Administration', + position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), + start_date: Date.current + 10, end_date: Date.current, minimum: 0, maximum: 0, rate_interval_code: 'WC', + locations: [{ city: 'San Francisco', state: 'CA' }] } + position_openings << { source: 'usajobs', external_id: 8807, type: 'position_opening', position_title: 'Making No Money', + organization_id: 'FUTU', organization_name: 'Future Administration', + position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), + start_date: nil, end_date: Date.current + 8, minimum: 0, maximum: 0, rate_interval_code: 'WC', + locations: [{ city: 'San Francisco', state: 'CA' }] } + # deleted: end_date is nil + position_openings << { source: 'usajobs', external_id: 8805, type: 'position_opening', position_title: 'Physician Assistant', + position_schedule_type_code: 2, 
position_offering_type_code: 15318, tags: %w(federal), + organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', + start_date: Date.current, end_date: nil, minimum: 17, maximum: 23, rate_interval_code: 'PH', + locations: [{ city: 'Fulton', state: 'MD' }] } PositionOpening.import position_openings end @@ -40,38 +51,6 @@ end end - describe '.delete_invalid_docs' do - before do - position_openings = [] - # deleted: end_date is nil - position_openings << { source: 'usajobs', external_id: 8805, type: 'position_opening', position_title: 'Physician Assistant', - position_schedule_type_code: 2, position_offering_type_code: 15318, tags: %w(federal), - organization_id: 'VATA', organization_name: 'Veterans Affairs, Veterans Health Administration', - start_date: Date.current, end_date: nil, minimum: 17, maximum: 23, rate_interval_code: 'PH', - locations: [{ city: 'Fulton', state: 'MD' }] } - # not deleted - position_openings << { source: 'usajobs', external_id: 8806, type: 'position_opening', position_title: 'Future Person', - organization_id: 'FUTU', organization_name: 'Future Administration', - position_schedule_type_code: 2, position_offering_type_code: 15327, tags: %w(federal), - start_date: Date.current + 1, end_date: Date.current + 8, minimum: 17, maximum: 23, rate_interval_code: 'PH', - locations: [{ city: 'San Francisco', state: 'CA' }] } - # deleted: start_date is nil - position_openings << { source: 'usajobs', external_id: 8807, type: 'position_opening', position_title: 'Making No Money', - organization_id: 'FUTU', organization_name: 'Future Administration', - position_schedule_type_code: 1, position_offering_type_code: 15328, tags: %w(federal), - start_date: nil, end_date: Date.current, minimum: 0, maximum: 0, rate_interval_code: 'WC', - locations: [{ city: 'San Francisco', state: 'CA' }] } - PositionOpening.import position_openings - end - - it 'should delete the position openings that are expired (less than today)' do - 
PositionOpening.delete_invalid_docs - res = PositionOpening.search('*', index: 'test:jobs') - expect(res.size).to eq 1 - expect(res.results.first.id).to eq('usajobs:8806') - end - end - describe '.search_for(options)' do before do position_openings = []