From bb868d578a5b58d4f8981275eac79555ff0357c4 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Tue, 17 Mar 2015 19:17:38 +0100 Subject: [PATCH 01/12] First version of guessing of namedplaces #1989 --- .../importer/lib/importer/content_guesser.rb | 43 +++++++++++++++++++ .../importer/lib/importer/georeferencer.rb | 31 +++++++++++++ 2 files changed, 74 insertions(+) diff --git a/services/importer/lib/importer/content_guesser.rb b/services/importer/lib/importer/content_guesser.rb index 5b08fece0cc0..599769be7762 100644 --- a/services/importer/lib/importer/content_guesser.rb +++ b/services/importer/lib/importer/content_guesser.rb @@ -41,6 +41,14 @@ def country_column nil end + def namedplace_column + return nil if not enabled? + columns.each do |column| + return column[:column_name] if is_namedplace_column? column + end + nil + end + def ip_column return nil if not enabled? columns.each do |column| @@ -73,10 +81,30 @@ def is_country_column?(column) end end + def is_namedplace_column?(column) + return false unless is_text_type? column + entropy = metric_entropy(column, country_name_normalizer) # TODO: optimize + if entropy < minimum_entropy + false + else + proportion = namedplace_proportion(column) + if proportion < threshold + false + else + log_namedplace_guessing_match_metrics(proportion) + true + end + end + end + def log_country_guessing_match_metrics(proportion) @importer_stats.gauge('country_proportion', proportion) end + def log_namedplace_guessing_match_metrics(proportion) + @importer_stats.gauge('namedplace_proportion', proportion) + end + def log_ip_guessing_match_metrics(proportion) @importer_stats.gauge('ip_proportion', proportion) end @@ -137,6 +165,21 @@ def country_proportion(column) country_proportion end + def namedplace_proportion(column) + column_name_sym = column[:column_name].to_sym + matches = count_namedplaces(sample, column_name_sym) + country_proportion = matches.to_f / sample.count + log "namedplace_proportion(#{column[:column_name]}) = #{country_proportion}" + country_proportion + end + + def count_namedplaces(sample, column_name_sym) + sql_array = sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') + query = "WITH geo_function as (SELECT (geocode_namedplace(Array[#{sql_array}])).*) select count(success) FROM geo_function where success = TRUE" + ret = geocoder_sql_api.fetch(query) + ret.first['count'] + end + def log(msg) @job.log msg if @job end diff --git a/services/importer/lib/importer/georeferencer.rb b/services/importer/lib/importer/georeferencer.rb index f80034a26ad0..21b59d42d216 100644 --- a/services/importer/lib/importer/georeferencer.rb +++ b/services/importer/lib/importer/georeferencer.rb @@ -47,6 +47,7 @@ def run create_the_geom_from_latlon || create_the_geom_from_ip_guessing || create_the_geom_from_country_guessing || + create_the_geom_from_namedplaces_guessing || create_the_geom_in(table_name) enable_autovacuum @@ -151,6 +152,31 @@ def create_the_geom_from_country_guessing return false end + def create_the_geom_from_namedplaces_guessing + return false if not @content_guesser.enabled? + job.log 'Trying namedplaces guessing...' + begin + namedplace_column_name = nil + @importer_stats.timing('guessing') do + @tracker.call('guessing') + namedplace_column_name = @content_guesser.namedplace_column + @tracker.call('importing') + end + if namedplace_column_name + job.log "Found namedplace column: #{namedplace_column_name}" + create_the_geom_in table_name + return geocode_namedplaces namedplace_column_name + end + rescue Exception => ex + message = "create_the_geom_from_namedplaces_guessing failed: #{ex.message}" + Rollbar.report_message(message, + 'warning', + {user_id: @job.logger.user_id, backtrace: ex.backtrace}) + job.log "WARNING: #{message}" + end + return false + end + def create_the_geom_from_ip_guessing return false if not @content_guesser.enabled? job.log 'Trying ip guessing...' @@ -180,6 +206,11 @@ def geocode_countries country_column_name geocode(country_column_name, 'polygon', 'admin0') end + def geocode_namedplaces namedplace_column_name + job.log "Geocoding namedplaces..." + geocode(namedplace_column_name, 'point', 'namedplace') + end + def geocode_ips ip_column_name job.log "Geocoding ips..." geocode(ip_column_name, 'point', 'ipaddress') From e8c7e8655cf04945eaf5c6ca1f9bdba79ffc802d Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Tue, 17 Mar 2015 19:42:29 +0100 Subject: [PATCH 02/12] Fix for the query generator #1989 --- .../lib/internal-geocoder/abstract_query_generator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/table-geocoder/lib/internal-geocoder/abstract_query_generator.rb b/services/table-geocoder/lib/internal-geocoder/abstract_query_generator.rb index 03931d427fee..af4e57e582a8 100644 --- a/services/table-geocoder/lib/internal-geocoder/abstract_query_generator.rb +++ b/services/table-geocoder/lib/internal-geocoder/abstract_query_generator.rb @@ -27,7 +27,7 @@ def copy_results_to_table_query def country country = @internal_geocoder.countries - country == %Q{'world'} ? 'null' : country + (country == %Q{'world'} || country.blank?) ? 'null' : country end def dest_table From 1ddb4561dcf353a44742dc52cfe6e2a69b99f04c Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Wed, 25 Mar 2015 12:02:38 +0000 Subject: [PATCH 03/12] Revamp of namedplaces guessing #1989 --- .../importer/lib/importer/content_guesser.rb | 135 ++++++++++++------ .../importer/lib/importer/georeferencer.rb | 25 ++-- 2 files changed, 110 insertions(+), 50 deletions(-) diff --git a/services/importer/lib/importer/content_guesser.rb b/services/importer/lib/importer/content_guesser.rb index 599769be7762..9349134de362 100644 --- a/services/importer/lib/importer/content_guesser.rb +++ b/services/importer/lib/importer/content_guesser.rb @@ -41,12 +41,102 @@ def country_column nil end - def namedplace_column - return nil if not enabled? - columns.each do |column| - return column[:column_name] if is_namedplace_column? column + # This should return enough information to the caller so that it can geocode later on: + # - whether there're nameplaces or not: content_guesser.namedplaces.found? + # - if there's a country column: content_guesser.namedplaces.country_column.present? + # - in that case, which one it is: content_guesser.namedplaces.country_column + # - otherwise, which is the guessed country: content_guesser.namedplaces.country + # - what's the candidate column: content_guesser.namedplaces.column + # TODO: move to a different file + class Namedplaces + + def initialize(guesser) + @run = false + @guesser = guesser end - nil + + def found? + raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run + return !@column.nil? + end + + def column + raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run + @column + end + + def run! + if country_column.present? + guess_with_country_column + else + guess_without_country_column + end + @run = true + self + end + + def country_column + return @country_column if defined?(@country_column) + candidate = text_columns.sort{|a,b| country_proportion(a) <=> country_proportion(b)}.last + @country_column = (country_proportion(candidate) > @guesser.threshold) ? candidate : nil + end + + def country + @country + end + + private + + #TODO this is a hack + #TODO this calculation should be performed just once + def country_proportion(column) + @guesser.country_proportion(column) + end + + def guess_with_country_column + candidate_columns = text_columns.reject{|c| c == country_column} + candidate = candidate_columns.sort{|a,b| proportion(a) <=> proportion(b)}.last + @column = (proportion(candidate) > @guesser.threshold) ? candidate : nil + end + + def guess_without_country_column + text_columns.each do |candidate| + column_name_sym = candidate[:column_name].to_sym + places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') + query = "SELECT namedplace_guess_country(Array[#{places}])" + ret = @guesser.geocoder_sql_api.fetch(query) + if ret + @country = ret + @column = candidate + return @column + end + end + end + + def proportion(column) + column_name_sym = column[:column_name].to_sym + matches = count_namedplaces_with_country_column(column_name_sym) + proportion = matches.to_f / @guesser.sample.count + proportion + end + + def count_namedplaces_with_country_column(column_name_sym) + places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') + country_column_sym = country_column[:column_name].to_sym + countries = @guesser.sample.map{|row| "'" + row[country_column_sym] + "'"}.join(',') + query = "WITH geo_function as (SELECT (geocode_namedplace(Array[#{places}], Array[#{countries}])).*) select count(success) FROM geo_function where success = TRUE" + ret = @guesser.geocoder_sql_api.fetch(query) + ret.first['count'] + end + + def text_columns + @text_columns ||= @guesser.columns.all.select{|c| @guesser.is_text_type?(c)} + end + + end + + def namedplaces + @namedplaces ||= Namedplaces.new(self) end def ip_column @@ -81,30 +171,10 @@ def is_country_column?(column) end end - def is_namedplace_column?(column) - return false unless is_text_type? column - entropy = metric_entropy(column, country_name_normalizer) # TODO: optimize - if entropy < minimum_entropy - false - else - proportion = namedplace_proportion(column) - if proportion < threshold - false - else - log_namedplace_guessing_match_metrics(proportion) - true - end - end - end - def log_country_guessing_match_metrics(proportion) @importer_stats.gauge('country_proportion', proportion) end - def log_namedplace_guessing_match_metrics(proportion) - @importer_stats.gauge('namedplace_proportion', proportion) - end - def log_ip_guessing_match_metrics(proportion) @importer_stats.gauge('ip_proportion', proportion) end @@ -165,21 +235,6 @@ def country_proportion(column) country_proportion end - def namedplace_proportion(column) - column_name_sym = column[:column_name].to_sym - matches = count_namedplaces(sample, column_name_sym) - country_proportion = matches.to_f / sample.count - log "namedplace_proportion(#{column[:column_name]}) = #{country_proportion}" - country_proportion - end - - def count_namedplaces(sample, column_name_sym) - sql_array = sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') - query = "WITH geo_function as (SELECT (geocode_namedplace(Array[#{sql_array}])).*) select count(success) FROM geo_function where success = TRUE" - ret = geocoder_sql_api.fetch(query) - ret.first['count'] - end - def log(msg) @job.log msg if @job end diff --git a/services/importer/lib/importer/georeferencer.rb b/services/importer/lib/importer/georeferencer.rb index 21b59d42d216..1fd5ac15375b 100644 --- a/services/importer/lib/importer/georeferencer.rb +++ b/services/importer/lib/importer/georeferencer.rb @@ -156,16 +156,15 @@ def create_the_geom_from_namedplaces_guessing return false if not @content_guesser.enabled? job.log 'Trying namedplaces guessing...' begin - namedplace_column_name = nil @importer_stats.timing('guessing') do @tracker.call('guessing') - namedplace_column_name = @content_guesser.namedplace_column + @content_guesser.namedplaces.run! @tracker.call('importing') end - if namedplace_column_name - job.log "Found namedplace column: #{namedplace_column_name}" + if @content_guesser.namedplaces.found? + #job.log "Found namedplace column: #{namedplace_column_name}" create_the_geom_in table_name - return geocode_namedplaces namedplace_column_name + return geocode_namedplaces end rescue Exception => ex message = "create_the_geom_from_namedplaces_guessing failed: #{ex.message}" @@ -206,9 +205,13 @@ def geocode_countries country_column_name geocode(country_column_name, 'polygon', 'admin0') end - def geocode_namedplaces namedplace_column_name + def geocode_namedplaces job.log "Geocoding namedplaces..." - geocode(namedplace_column_name, 'point', 'namedplace') + geocode(@content_guesser.namedplaces.column[:column_name], + 'point', + 'namedplace', + @content_guesser.namedplaces.country_column, + @content_guesser.namedplaces.country) end def geocode_ips ip_column_name @@ -216,7 +219,7 @@ def geocode_ips ip_column_name geocode(ip_column_name, 'point', 'ipaddress') end - def geocode(formatter, geometry_type, kind) + def geocode(formatter, geometry_type, kind, country_column=nil, country=nil) geocoder = nil @importer_stats.timing("geocoding.#{kind}") do @tracker.call('geocoding') @@ -230,12 +233,14 @@ def geocode(formatter, geometry_type, kind) geometry_type: geometry_type, kind: kind, max_rows: nil, - country_column: nil + country_column: country_column[:column_name], + countries: country, + country_code: country ) geocoder = CartoDB::InternalGeocoder::Geocoder.new(config) begin - geocoding = Geocoding.new config.slice(:kind, :geometry_type, :formatter, :table_name) + geocoding = Geocoding.new config.slice(:kind, :geometry_type, :formatter, :table_name, :country_column, :country_code) geocoding.force_geocoder(geocoder) geocoding.user = user geocoding.data_import_id = data_import.id unless data_import.nil? From 7c189f9281c7102a96e0c0b7fa22603f0d8b9677 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Wed, 25 Mar 2015 18:18:46 +0100 Subject: [PATCH 04/12] Fix call to postgres function #1989 --- services/importer/lib/importer/content_guesser.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/importer/lib/importer/content_guesser.rb b/services/importer/lib/importer/content_guesser.rb index 9349134de362..655dd4e9d07c 100644 --- a/services/importer/lib/importer/content_guesser.rb +++ b/services/importer/lib/importer/content_guesser.rb @@ -103,10 +103,10 @@ def guess_without_country_column text_columns.each do |candidate| column_name_sym = candidate[:column_name].to_sym places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') - query = "SELECT namedplace_guess_country(Array[#{places}])" - ret = @guesser.geocoder_sql_api.fetch(query) - if ret - @country = ret + query = "SELECT namedplace_guess_country(Array[#{places}]) as country" + country = @guesser.geocoder_sql_api.fetch(query).first['country'] + if country + @country = country @column = candidate return @column end From 2677efaa62f798643a6e48e4b8d37a8b493b4438 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Wed, 25 Mar 2015 18:24:37 +0100 Subject: [PATCH 05/12] Fix params when geocoding from guess #1989 --- services/importer/lib/importer/content_guesser.rb | 4 ++++ services/importer/lib/importer/georeferencer.rb | 9 ++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/services/importer/lib/importer/content_guesser.rb b/services/importer/lib/importer/content_guesser.rb index 655dd4e9d07c..6c28541ef586 100644 --- a/services/importer/lib/importer/content_guesser.rb +++ b/services/importer/lib/importer/content_guesser.rb @@ -81,6 +81,10 @@ def country_column @country_column = (country_proportion(candidate) > @guesser.threshold) ? candidate : nil end + def country_column_name + country_column[:column_name] if country_column + end + def country @country end diff --git a/services/importer/lib/importer/georeferencer.rb b/services/importer/lib/importer/georeferencer.rb index 1fd5ac15375b..97adf2024990 100644 --- a/services/importer/lib/importer/georeferencer.rb +++ b/services/importer/lib/importer/georeferencer.rb @@ -210,7 +210,7 @@ def geocode_namedplaces geocode(@content_guesser.namedplaces.column[:column_name], 'point', 'namedplace', - @content_guesser.namedplaces.country_column, + @content_guesser.namedplaces.country_column_name, @content_guesser.namedplaces.country) end @@ -219,7 +219,7 @@ def geocode_ips ip_column_name geocode(ip_column_name, 'point', 'ipaddress') end - def geocode(formatter, geometry_type, kind, country_column=nil, country=nil) + def geocode(formatter, geometry_type, kind, country_column_name=nil, country=nil) geocoder = nil @importer_stats.timing("geocoding.#{kind}") do @tracker.call('geocoding') @@ -233,9 +233,8 @@ def geocode(formatter, geometry_type, kind, country_column=nil, country=nil) geometry_type: geometry_type, kind: kind, max_rows: nil, - country_column: country_column[:column_name], - countries: country, - country_code: country + country_column: country_column_name, + countries: "'#{country}'" ) geocoder = CartoDB::InternalGeocoder::Geocoder.new(config) From 741a6dd06bb34924c912ee51494536a0bdd2b3a2 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Wed, 25 Mar 2015 19:14:58 +0100 Subject: [PATCH 06/12] Fix small bug in geocode param #1989 --- services/importer/lib/importer/georeferencer.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/importer/lib/importer/georeferencer.rb b/services/importer/lib/importer/georeferencer.rb index 97adf2024990..884981a9a4b8 100644 --- a/services/importer/lib/importer/georeferencer.rb +++ b/services/importer/lib/importer/georeferencer.rb @@ -234,7 +234,7 @@ def geocode(formatter, geometry_type, kind, country_column_name=nil, country=nil kind: kind, max_rows: nil, country_column: country_column_name, - countries: "'#{country}'" + countries: country.present? ? "'#{country}'" : nil ) geocoder = CartoDB::InternalGeocoder::Geocoder.new(config) From 532b14eaff7aa1404bde07ca3a4697fa5f267d17 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Thu, 26 Mar 2015 10:16:55 +0100 Subject: [PATCH 07/12] Move NamedplacesGuesser to a separate file #1989 --- .../importer/lib/importer/content_guesser.rb | 101 +---------------- .../lib/importer/namedplaces_guesser.rb | 103 ++++++++++++++++++ 2 files changed, 105 insertions(+), 99 deletions(-) create mode 100644 services/importer/lib/importer/namedplaces_guesser.rb diff --git a/services/importer/lib/importer/content_guesser.rb b/services/importer/lib/importer/content_guesser.rb index 6c28541ef586..d4f427fefd90 100644 --- a/services/importer/lib/importer/content_guesser.rb +++ b/services/importer/lib/importer/content_guesser.rb @@ -3,6 +3,7 @@ require 'ipaddr' require_relative 'table_sampler' require_relative 'importer_stats' +require_relative 'namedplaces_guesser' module CartoDB module Importer2 @@ -41,106 +42,8 @@ def country_column nil end - # This should return enough information to the caller so that it can geocode later on: - # - whether there're nameplaces or not: content_guesser.namedplaces.found? - # - if there's a country column: content_guesser.namedplaces.country_column.present? - # - in that case, which one it is: content_guesser.namedplaces.country_column - # - otherwise, which is the guessed country: content_guesser.namedplaces.country - # - what's the candidate column: content_guesser.namedplaces.column - # TODO: move to a different file - class Namedplaces - - def initialize(guesser) - @run = false - @guesser = guesser - end - - def found? - raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run - return !@column.nil? - end - - def column - raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run - @column - end - - def run! - if country_column.present? - guess_with_country_column - else - guess_without_country_column - end - @run = true - self - end - - def country_column - return @country_column if defined?(@country_column) - candidate = text_columns.sort{|a,b| country_proportion(a) <=> country_proportion(b)}.last - @country_column = (country_proportion(candidate) > @guesser.threshold) ? candidate : nil - end - - def country_column_name - country_column[:column_name] if country_column - end - - def country - @country - end - - private - - #TODO this is a hack - #TODO this calculation should be performed just once - def country_proportion(column) - @guesser.country_proportion(column) - end - - def guess_with_country_column - candidate_columns = text_columns.reject{|c| c == country_column} - candidate = candidate_columns.sort{|a,b| proportion(a) <=> proportion(b)}.last - @column = (proportion(candidate) > @guesser.threshold) ? candidate : nil - end - - def guess_without_country_column - text_columns.each do |candidate| - column_name_sym = candidate[:column_name].to_sym - places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') - query = "SELECT namedplace_guess_country(Array[#{places}]) as country" - country = @guesser.geocoder_sql_api.fetch(query).first['country'] - if country - @country = country - @column = candidate - return @column - end - end - end - - def proportion(column) - column_name_sym = column[:column_name].to_sym - matches = count_namedplaces_with_country_column(column_name_sym) - proportion = matches.to_f / @guesser.sample.count - proportion - end - - def count_namedplaces_with_country_column(column_name_sym) - places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') - country_column_sym = country_column[:column_name].to_sym - countries = @guesser.sample.map{|row| "'" + row[country_column_sym] + "'"}.join(',') - query = "WITH geo_function as (SELECT (geocode_namedplace(Array[#{places}], Array[#{countries}])).*) select count(success) FROM geo_function where success = TRUE" - ret = @guesser.geocoder_sql_api.fetch(query) - ret.first['count'] - end - - def text_columns - @text_columns ||= @guesser.columns.all.select{|c| @guesser.is_text_type?(c)} - end - - end - def namedplaces - @namedplaces ||= Namedplaces.new(self) + @namedplaces ||= NamedplacesGuesser.new(self) end def ip_column diff --git a/services/importer/lib/importer/namedplaces_guesser.rb b/services/importer/lib/importer/namedplaces_guesser.rb new file mode 100644 index 000000000000..2f8ba6056b9b --- /dev/null +++ b/services/importer/lib/importer/namedplaces_guesser.rb @@ -0,0 +1,103 @@ +# encoding: utf-8 + +require_relative 'content_guesser' + +module CartoDB + module Importer2 + + # This should return enough information to the caller so that it can geocode later on: + # - whether there're nameplaces or not: content_guesser.namedplaces.found? + # - if there's a country column: content_guesser.namedplaces.country_column.present? + # - in that case, which one it is: content_guesser.namedplaces.country_column + # - otherwise, which is the guessed country: content_guesser.namedplaces.country + # - what's the candidate column: content_guesser.namedplaces.column + class NamedplacesGuesser + + def initialize(content_guesser) + @run = false + @guesser = content_guesser + end + + def found? + raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run + return !@column.nil? + end + + def column + raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run + @column + end + + def run! + if country_column.present? + guess_with_country_column + else + guess_without_country_column + end + @run = true + self + end + + def country_column + return @country_column if defined?(@country_column) + candidate = text_columns.sort{|a,b| country_proportion(a) <=> country_proportion(b)}.last + @country_column = (country_proportion(candidate) > @guesser.threshold) ? candidate : nil + end + + def country_column_name + country_column[:column_name] if country_column + end + + def country + @country + end + + private + + def country_proportion(column) + @guesser.country_proportion(column) + end + + def guess_with_country_column + candidate_columns = text_columns.reject{|c| c == country_column} + candidate = candidate_columns.sort{|a,b| proportion(a) <=> proportion(b)}.last + @column = (proportion(candidate) > @guesser.threshold) ? candidate : nil + end + + def guess_without_country_column + text_columns.each do |candidate| + column_name_sym = candidate[:column_name].to_sym + places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') + query = "SELECT namedplace_guess_country(Array[#{places}]) as country" + country = @guesser.geocoder_sql_api.fetch(query).first['country'] + if country + @country = country + @column = candidate + return @column + end + end + end + + def proportion(column) + column_name_sym = column[:column_name].to_sym + matches = count_namedplaces_with_country_column(column_name_sym) + proportion = matches.to_f / @guesser.sample.count + proportion + end + + def count_namedplaces_with_country_column(column_name_sym) + places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') + country_column_sym = country_column[:column_name].to_sym + countries = @guesser.sample.map{|row| "'" + row[country_column_sym] + "'"}.join(',') + query = "WITH geo_function as (SELECT (geocode_namedplace(Array[#{places}], Array[#{countries}])).*) select count(success) FROM geo_function where success = TRUE" + ret = @guesser.geocoder_sql_api.fetch(query) + ret.first['count'] + end + + def text_columns + @text_columns ||= @guesser.columns.all.select{|c| @guesser.is_text_type?(c)} + end + + end + end +end From 5680442066515326e89b04890b40183fb3b01327 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Thu, 26 Mar 2015 12:13:14 +0100 Subject: [PATCH 08/12] Some tests for NamedplacesGuesser #1989 --- Makefile | 1 + .../lib/importer/namedplaces_guesser.rb | 12 +-- .../spec/unit/namedplaces_guesser_spec.rb | 85 +++++++++++++++++++ 3 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 services/importer/spec/unit/namedplaces_guesser_spec.rb diff --git a/Makefile b/Makefile index 7a9aba7b1eba..6be165e9362e 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,7 @@ WORKING_SPECS_3 = \ services/importer/spec/unit/url_translator/osm_spec.rb \ services/importer/spec/unit/source_file_spec.rb \ services/importer/spec/unit/content_guesser_spec.rb \ + services/importer/spec/unit/namedplaces_guesser_spec.rb \ $(NULL) WORKING_SPECS_4 = \ diff --git a/services/importer/lib/importer/namedplaces_guesser.rb b/services/importer/lib/importer/namedplaces_guesser.rb index 2f8ba6056b9b..c1e84a0f1c86 100644 --- a/services/importer/lib/importer/namedplaces_guesser.rb +++ b/services/importer/lib/importer/namedplaces_guesser.rb @@ -19,20 +19,20 @@ def initialize(content_guesser) end def found? - raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run - return !@column.nil? + raise ContentGuesserException, 'not run yet!' unless @run + !column.nil? end def column - raise ContentGuesserException, 'Namedplaces: not run yet!' unless @run + raise ContentGuesserException, 'not run yet!' unless @run @column end def run! - if country_column.present? + if country_column guess_with_country_column else - guess_without_country_column + namedplace_guess_country end @run = true self @@ -64,7 +64,7 @@ def guess_with_country_column @column = (proportion(candidate) > @guesser.threshold) ? candidate : nil end - def guess_without_country_column + def namedplace_guess_country text_columns.each do |candidate| column_name_sym = candidate[:column_name].to_sym places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') diff --git a/services/importer/spec/unit/namedplaces_guesser_spec.rb b/services/importer/spec/unit/namedplaces_guesser_spec.rb new file mode 100644 index 000000000000..316adb08661d --- /dev/null +++ b/services/importer/spec/unit/namedplaces_guesser_spec.rb @@ -0,0 +1,85 @@ +# encoding: utf-8 + +require_relative '../../lib/importer/namedplaces_guesser' + +RSpec.configure do |config| + config.mock_with :mocha +end + +module CartoDB::Importer2 + + describe NamedplacesGuesser do + + describe '#found?' do + it 'raises an exception if not run yet' do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + expect { + namedplaces.found? + }.to raise_error(ContentGuesserException, 'not run yet!') + end + + it 'returns false if there was no namedplaces column found during checks' do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + namedplaces.stubs(:column).returns(nil) + namedplaces.stubs(:country_column).returns(nil) + namedplaces.stubs(:namedplace_guess_country) + + namedplaces.run! + namedplaces.found?.should be_false + end + + it 'returns true if there was a namedplaces column found' do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + namedplaces.stubs(:column).returns(:dummy_column) + namedplaces.stubs(:country_column).returns(nil) + namedplaces.stubs(:namedplace_guess_country) + + namedplaces.run! + namedplaces.found?.should be_true + end + + end + + describe '#run!' do + it "performs a guessing using the country column if there's any" do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + namedplaces.stubs(:country_column).returns(:dummy_column) + namedplaces.expects(:guess_with_country_column).once + namedplaces.expects(:namedplace_guess_country).never + + namedplaces.run! + end + + it "performs a guessing relying on namedplace_guess_country if there's no country column" do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + namedplaces.stubs(:country_column).returns(nil) + namedplaces.expects(:guess_with_country_column).never + namedplaces.expects(:namedplace_guess_country).once + + namedplaces.run! + end + + end + + describe '#country_column' do + it "returns a country column if there's one with a high proportion of countries" do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + namedplaces.stubs(:text_columns).returns([:my_country_column, :another_column]) + country_proportion_stub = Proc.new {|column| puts "hola"; column[:country_proportion]} + content_guesser.stubs(:country_proportion).with(:my_country_column).returns(0.9) + content_guesser.stubs(:country_proportion).with(:another_column).returns(0.1) + content_guesser.stubs(:threshold).returns(0.8) + + namedplaces.country_column.should eq :my_country_column + end + end + + end + +end From 16407312516735b4af84c51e63199f4f8e7e9fa2 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Thu, 26 Mar 2015 12:16:13 +0100 Subject: [PATCH 09/12] remove unused var #1989 --- services/importer/spec/unit/namedplaces_guesser_spec.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/services/importer/spec/unit/namedplaces_guesser_spec.rb b/services/importer/spec/unit/namedplaces_guesser_spec.rb index 316adb08661d..753aab1a2983 100644 --- a/services/importer/spec/unit/namedplaces_guesser_spec.rb +++ b/services/importer/spec/unit/namedplaces_guesser_spec.rb @@ -71,7 +71,6 @@ module CartoDB::Importer2 content_guesser = mock namedplaces = NamedplacesGuesser.new(content_guesser) namedplaces.stubs(:text_columns).returns([:my_country_column, :another_column]) - country_proportion_stub = Proc.new {|column| puts "hola"; column[:country_proportion]} content_guesser.stubs(:country_proportion).with(:my_country_column).returns(0.9) content_guesser.stubs(:country_proportion).with(:another_column).returns(0.1) content_guesser.stubs(:threshold).returns(0.8) From fafc05843992504f9bb30c961346c8c363cfa7e7 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Thu, 26 Mar 2015 14:37:09 +0100 Subject: [PATCH 10/12] More tests for NamedplacesGuesser #1989 --- .../lib/importer/namedplaces_guesser.rb | 10 ++- .../spec/unit/namedplaces_guesser_spec.rb | 81 +++++++++++++++++++ 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/services/importer/lib/importer/namedplaces_guesser.rb b/services/importer/lib/importer/namedplaces_guesser.rb index c1e84a0f1c86..a4608aa39b4c 100644 --- a/services/importer/lib/importer/namedplaces_guesser.rb +++ b/services/importer/lib/importer/namedplaces_guesser.rb @@ -19,12 +19,12 @@ def initialize(content_guesser) end def found? - raise ContentGuesserException, 'not run yet!' unless @run + raise ContentGuesserException, 'not run yet!' unless run? !column.nil? end def column - raise ContentGuesserException, 'not run yet!' unless @run + raise ContentGuesserException, 'not run yet!' unless run? @column end @@ -38,6 +38,10 @@ def run! self end + def run? + @run + end + def country_column return @country_column if defined?(@country_column) candidate = text_columns.sort{|a,b| country_proportion(a) <=> country_proportion(b)}.last @@ -64,7 +68,7 @@ def guess_with_country_column @column = (proportion(candidate) > @guesser.threshold) ? candidate : nil end - def namedplace_guess_country + def namedplaces_guess_country text_columns.each do |candidate| column_name_sym = candidate[:column_name].to_sym places = @guesser.sample.map{|row| "'" + row[column_name_sym] + "'"}.join(',') diff --git a/services/importer/spec/unit/namedplaces_guesser_spec.rb b/services/importer/spec/unit/namedplaces_guesser_spec.rb index 753aab1a2983..3eae86fbadc1 100644 --- a/services/importer/spec/unit/namedplaces_guesser_spec.rb +++ b/services/importer/spec/unit/namedplaces_guesser_spec.rb @@ -79,6 +79,87 @@ module CartoDB::Importer2 end end + + # These methods below are private but worth testing + + describe '#guess_with_country_column' do + it "gets the column with highest proportion of namedplaces, if any" do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + + namedplaces.stubs(:text_columns).returns([:my_country_column, :another_column, :namedplaces_column]) + namedplaces.stubs(:country_column).returns(:my_country_column) + namedplaces.stubs(:proportion).with(:another_column).returns(0.7) + namedplaces.stubs(:proportion).with(:namedplaces_column).returns(0.9) + content_guesser.stubs(:threshold).returns(0.8) + namedplaces.stubs(:run?).returns(true) + + namedplaces.send(:guess_with_country_column) + namedplaces.column.should eq :namedplaces_column + end + end + + describe '#namedplace_guess_country' do + it "checks all candidates for a positive country guess through the geocoder api" do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + + namedplaces.stubs(:text_columns).returns([ + {column_name: 'japanese_cities'}, + {column_name: 'another_column'} + ]) + content_guesser.stubs(:sample).returns([{japanese_cities: 'Tokyo', another_column: 'whatever'}]) + + sql_api_mock = mock + sql_api_mock.expects(:fetch) + .with("SELECT namedplace_guess_country(Array['Tokyo']) as country") + .returns([{'country' => 'JP'}]) + content_guesser.stubs(:geocoder_sql_api).returns(sql_api_mock) + + namedplaces.stubs(:run?).returns(true) + namedplaces.send(:namedplaces_guess_country) + namedplaces.country.should eq 'JP' + namedplaces.column[:column_name].should eq 'japanese_cities' + end + end + + describe '#proportion' do + it 'calculates the proportion of namedplaces given a column and a country column' do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + + cities_column = {column_name: 'cities_column'} + countries_column = {column_name: 'countries'} + content_guesser.stubs(:sample).returns([{cities_column: 'Tokyo'}]) + namedplaces.stubs(:text_columns).returns([cities_column]) + namedplaces.stubs(:country_column).returns(countries_column) + namedplaces.stubs(:count_namedplaces_with_country_column).with(:cities_column).returns(1) + + + namedplaces.send(:proportion, cities_column).should eq 1.0 + end + end + + describe '#count_namedplaces_with_country_column' do + it 'queries the geocoder to get the number of namedplaces from the sample' do + content_guesser = mock + namedplaces = NamedplacesGuesser.new(content_guesser) + + content_guesser.stubs(:sample).returns([{japanese_cities: 'Tokyo', country: 'Japan'}]) + namedplaces.stubs(:country_column).returns({column_name: 'country'}) + namedplaces.stubs(:text_columns).returns([{column_name: 'japanese_cities'}]) + + sql_api_mock = mock + sql_api_mock.expects(:fetch) + .with("WITH geo_function as (SELECT (geocode_namedplace(Array['Tokyo'], Array['Japan'])).*) select count(success) FROM geo_function where success = TRUE") + .returns([{'count' => 1}]) + content_guesser.stubs(:geocoder_sql_api).returns(sql_api_mock) + + + namedplaces.send(:count_namedplaces_with_country_column, :japanese_cities).should eq 1 + end + end + end end From 79318760b15217e7d5fa67325bb84d07ceb53ca6 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Thu, 26 Mar 2015 14:45:15 +0100 Subject: [PATCH 11/12] Uncomment log line #1989 --- services/importer/lib/importer/georeferencer.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/importer/lib/importer/georeferencer.rb b/services/importer/lib/importer/georeferencer.rb index 884981a9a4b8..1a81cc2c70d4 100644 --- a/services/importer/lib/importer/georeferencer.rb +++ b/services/importer/lib/importer/georeferencer.rb @@ -162,7 +162,7 @@ def create_the_geom_from_namedplaces_guessing @tracker.call('importing') end if @content_guesser.namedplaces.found? - #job.log "Found namedplace column: #{namedplace_column_name}" + job.log "Found namedplace column: #{@content_guesser.namedplaces.column}" create_the_geom_in table_name return geocode_namedplaces end From f985d5243822e7ce0ad10b460fb23dd269f162e0 Mon Sep 17 00:00:00 2001 From: Rafa de la Torre Date: Thu, 26 Mar 2015 15:15:34 +0100 Subject: [PATCH 12/12] Fix silly typo with singulars/plurals #1989 --- services/importer/lib/importer/namedplaces_guesser.rb | 2 +- services/importer/spec/unit/namedplaces_guesser_spec.rb | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/services/importer/lib/importer/namedplaces_guesser.rb b/services/importer/lib/importer/namedplaces_guesser.rb index a4608aa39b4c..63e78a641e15 100644 --- a/services/importer/lib/importer/namedplaces_guesser.rb +++ b/services/importer/lib/importer/namedplaces_guesser.rb @@ -32,7 +32,7 @@ def run! if country_column guess_with_country_column else - namedplace_guess_country + namedplaces_guess_country end @run = true self diff --git a/services/importer/spec/unit/namedplaces_guesser_spec.rb b/services/importer/spec/unit/namedplaces_guesser_spec.rb index 3eae86fbadc1..a24aece97e9b 100644 --- a/services/importer/spec/unit/namedplaces_guesser_spec.rb +++ b/services/importer/spec/unit/namedplaces_guesser_spec.rb @@ -24,7 +24,7 @@ module CartoDB::Importer2 namedplaces = NamedplacesGuesser.new(content_guesser) namedplaces.stubs(:column).returns(nil) namedplaces.stubs(:country_column).returns(nil) - namedplaces.stubs(:namedplace_guess_country) + namedplaces.stubs(:namedplaces_guess_country) namedplaces.run! namedplaces.found?.should be_false @@ -35,7 +35,7 @@ module CartoDB::Importer2 namedplaces = NamedplacesGuesser.new(content_guesser) namedplaces.stubs(:column).returns(:dummy_column) namedplaces.stubs(:country_column).returns(nil) - namedplaces.stubs(:namedplace_guess_country) + namedplaces.stubs(:namedplaces_guess_country) namedplaces.run! namedplaces.found?.should be_true @@ -49,7 +49,7 @@ module CartoDB::Importer2 namedplaces = NamedplacesGuesser.new(content_guesser) namedplaces.stubs(:country_column).returns(:dummy_column) namedplaces.expects(:guess_with_country_column).once - namedplaces.expects(:namedplace_guess_country).never + namedplaces.expects(:namedplaces_guess_country).never namedplaces.run! end @@ -59,7 +59,7 @@ module CartoDB::Importer2 namedplaces = NamedplacesGuesser.new(content_guesser) namedplaces.stubs(:country_column).returns(nil) namedplaces.expects(:guess_with_country_column).never - namedplaces.expects(:namedplace_guess_country).once + namedplaces.expects(:namedplaces_guess_country).once namedplaces.run! end