Skip to content

Commit

Permalink
Merge pull request #2809 from CartoDB/1989-guessing-of-namedplaces
Browse files Browse the repository at this point in the history
1989 guessing of namedplaces
  • Loading branch information
Rafa de la Torre committed Mar 30, 2015
2 parents 075d011 + f985d52 commit ac248f4
Show file tree
Hide file tree
Showing 6 changed files with 317 additions and 4 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ WORKING_SPECS_3 = \
services/importer/spec/unit/url_translator/osm_spec.rb \
services/importer/spec/unit/source_file_spec.rb \
services/importer/spec/unit/content_guesser_spec.rb \
services/importer/spec/unit/namedplaces_guesser_spec.rb \
$(NULL)

WORKING_SPECS_4 = \
Expand Down
5 changes: 5 additions & 0 deletions services/importer/lib/importer/content_guesser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require 'ipaddr'
require_relative 'table_sampler'
require_relative 'importer_stats'
require_relative 'namedplaces_guesser'

module CartoDB
module Importer2
Expand Down Expand Up @@ -41,6 +42,10 @@ def country_column
nil
end

# Returns the NamedplacesGuesser bound to this content guesser.
# The instance is built on first access and reused on every later call.
def namedplaces
  return @namedplaces if @namedplaces

  @namedplaces = NamedplacesGuesser.new(self)
end

def ip_column
return nil if not enabled?
columns.each do |column|
Expand Down
41 changes: 38 additions & 3 deletions services/importer/lib/importer/georeferencer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def run
create_the_geom_from_latlon ||
create_the_geom_from_ip_guessing ||
create_the_geom_from_country_guessing ||
create_the_geom_from_namedplaces_guessing ||
create_the_geom_in(table_name)

enable_autovacuum
Expand Down Expand Up @@ -151,6 +152,30 @@ def create_the_geom_from_country_guessing
return false
end

# Tries to georeference the table by guessing a namedplaces column.
#
# Runs the namedplaces guesser (timed under the 'guessing' stats key and
# reported through @tracker). When a column is found, the_geom is created in
# the table and the result of geocode_namedplaces is returned.
#
# Returns false when content guessing is disabled, when no column is found,
# or when guessing/geocoding raises a StandardError (which is logged and
# reported to Rollbar as a warning so the import can carry on).
def create_the_geom_from_namedplaces_guessing
  return false unless @content_guesser.enabled?

  job.log 'Trying namedplaces guessing...'
  begin
    @importer_stats.timing('guessing') do
      @tracker.call('guessing')
      @content_guesser.namedplaces.run!
      @tracker.call('importing')
    end

    namedplaces = @content_guesser.namedplaces
    if namedplaces.found?
      job.log "Found namedplace column: #{namedplaces.column}"
      create_the_geom_in table_name
      return geocode_namedplaces
    end
  rescue StandardError => ex
    # Only StandardError is treated as a recoverable guessing failure.
    # Rescuing Exception would also swallow interrupts, SystemExit and
    # NoMemoryError, which must be allowed to propagate.
    message = "create_the_geom_from_namedplaces_guessing failed: #{ex.message}"
    Rollbar.report_message(message,
                           'warning',
                           {user_id: @job.logger.user_id, backtrace: ex.backtrace})
    job.log "WARNING: #{message}"
  end
  false
end

def create_the_geom_from_ip_guessing
return false if not @content_guesser.enabled?
job.log 'Trying ip guessing...'
Expand Down Expand Up @@ -180,12 +205,21 @@ def geocode_countries country_column_name
geocode(country_column_name, 'polygon', 'admin0')
end

# Geocodes the guessed namedplaces column as points of kind 'namedplace'.
# The guesser's country column name and guessed country are forwarded to
# geocode as its optional fourth and fifth arguments.
# Returns whatever geocode returns.
def geocode_namedplaces
  job.log "Geocoding namedplaces..."
  geocode(
    @content_guesser.namedplaces.column[:column_name],
    'point',
    'namedplace',
    @content_guesser.namedplaces.country_column_name,
    @content_guesser.namedplaces.country
  )
end

# Geocodes the given column as points of kind 'ipaddress'.
# Returns whatever geocode returns.
def geocode_ips(ip_column_name)
  job.log("Geocoding ips...")
  geocode(ip_column_name, 'point', 'ipaddress')
end

def geocode(formatter, geometry_type, kind)
def geocode(formatter, geometry_type, kind, country_column_name=nil, country=nil)
geocoder = nil
@importer_stats.timing("geocoding.#{kind}") do
@tracker.call('geocoding')
Expand All @@ -199,12 +233,13 @@ def geocode(formatter, geometry_type, kind)
geometry_type: geometry_type,
kind: kind,
max_rows: nil,
country_column: nil
country_column: country_column_name,
countries: country.present? ? "'#{country}'" : nil
)
geocoder = CartoDB::InternalGeocoder::Geocoder.new(config)

begin
geocoding = Geocoding.new config.slice(:kind, :geometry_type, :formatter, :table_name)
geocoding = Geocoding.new config.slice(:kind, :geometry_type, :formatter, :table_name, :country_column, :country_code)
geocoding.force_geocoder(geocoder)
geocoding.user = user
geocoding.data_import_id = data_import.id unless data_import.nil?
Expand Down
107 changes: 107 additions & 0 deletions services/importer/lib/importer/namedplaces_guesser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# encoding: utf-8

require_relative 'content_guesser'

module CartoDB
module Importer2

# Guesses which text column of the sampled data holds named places, using the
# geocoder SQL API exposed by the content guesser.
#
# After #run! it gives the caller enough information to geocode later on:
# - whether there are namedplaces or not: content_guesser.namedplaces.found?
# - if there's a country column: content_guesser.namedplaces.country_column.present?
# - in that case, which one it is: content_guesser.namedplaces.country_column
# - otherwise, which is the guessed country: content_guesser.namedplaces.country
# - what's the candidate column: content_guesser.namedplaces.column
class NamedplacesGuesser

  # Country guessed by namedplace_guess_country; nil unless #run! took the
  # "no country column" path and the geocoder returned a country.
  attr_reader :country

  # content_guesser must respond to: sample, columns, is_text_type?,
  # country_proportion, threshold and geocoder_sql_api.
  def initialize(content_guesser)
    @run = false
    @guesser = content_guesser
  end

  # True when #run! selected a namedplaces column.
  # Raises ContentGuesserException if #run! has not been called yet.
  def found?
    raise ContentGuesserException, 'not run yet!' unless run?
    !column.nil?
  end

  # The selected namedplaces column, or nil when none qualified.
  # Raises ContentGuesserException if #run! has not been called yet.
  def column
    raise ContentGuesserException, 'not run yet!' unless run?
    @column
  end

  # Performs the guessing. When a country column exists it is used to
  # validate candidates; otherwise the geocoder is asked to guess a country
  # for each text column. Returns self.
  def run!
    if country_column
      guess_with_country_column
    else
      namedplaces_guess_country
    end
    @run = true
    self
  end

  def run?
    @run
  end

  # The text column with the highest country proportion, provided that
  # proportion exceeds the guesser's threshold; nil otherwise (including when
  # there are no text columns at all). The result is memoized, nil included.
  def country_column
    return @country_column if defined?(@country_column)
    candidate, score = best_scored(text_columns) { |c| country_proportion(c) }
    @country_column = (candidate && score > @guesser.threshold) ? candidate : nil
  end

  def country_column_name
    country_column[:column_name] if country_column
  end

  private

  def country_proportion(column)
    @guesser.country_proportion(column)
  end

  # Picks, among the text columns other than the country column, the one with
  # the highest proportion of geocodable namedplaces and keeps it in @column
  # if that proportion exceeds the guesser's threshold.
  def guess_with_country_column
    candidate_columns = text_columns.reject { |c| c == country_column }
    candidate, score = best_scored(candidate_columns) { |c| proportion(c) }
    @column = (candidate && score > @guesser.threshold) ? candidate : nil
  end

  # Asks the geocoder to guess a country for each text column in turn and
  # stops at the first column for which it returns one, storing both the
  # country and the column. Returns the chosen column or nil.
  def namedplaces_guess_country
    sample = @guesser.sample
    # An empty sample would render "Array[]", so there is nothing to ask.
    return nil if sample.count.zero?

    text_columns.each do |candidate|
      places = quoted_values(sample, candidate[:column_name].to_sym)
      query = "SELECT namedplace_guess_country(Array[#{places}]) as country"
      row = @guesser.geocoder_sql_api.fetch(query).first
      country = row && row['country']
      next unless country

      @country = country
      @column = candidate
      return @column
    end
    nil
  end

  # Fraction of sampled rows whose value in the given column was geocoded
  # successfully together with the country column. 0.0 for an empty sample.
  def proportion(column)
    sample_size = @guesser.sample.count
    return 0.0 if sample_size.zero?

    column_name_sym = column[:column_name].to_sym
    matches = count_namedplaces_with_country_column(column_name_sym)
    matches.to_f / sample_size
  end

  # Number of sampled (place, country) pairs the geocoder resolved.
  def count_namedplaces_with_country_column(column_name_sym)
    sample = @guesser.sample
    places = quoted_values(sample, column_name_sym)
    countries = quoted_values(sample, country_column[:column_name].to_sym)
    query = "WITH geo_function as (SELECT (geocode_namedplace(Array[#{places}], Array[#{countries}])).*) select count(success) FROM geo_function where success = TRUE"
    ret = @guesser.geocoder_sql_api.fetch(query)
    ret.first['count']
  end

  def text_columns
    @text_columns ||= @guesser.columns.all.select { |c| @guesser.is_text_type?(c) }
  end

  # Renders one column of the sample as a comma separated list of SQL string
  # literals. Embedded single quotes are doubled so values such as
  # "L'Hospitalet" cannot break (or inject into) the query, and nil cells
  # become '' so that parallel arrays stay aligned row by row.
  def quoted_values(sample, column_name_sym)
    sample.map { |row| "'" + row[column_name_sym].to_s.gsub("'", "''") + "'" }.join(',')
  end

  # Scores every column exactly once with the given block and returns the
  # [column, score] pair with the highest score, or nil for an empty list.
  # Scoring once matters because a score may cost a geocoder round trip.
  def best_scored(columns)
    columns.map { |c| [c, yield(c)] }.max_by(&:last)
  end

end
end
end
165 changes: 165 additions & 0 deletions services/importer/spec/unit/namedplaces_guesser_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# encoding: utf-8

require_relative '../../lib/importer/namedplaces_guesser'

# Configure RSpec to use mocha as its mocking framework.
RSpec.configure do |config|
config.mock_with :mocha
end

module CartoDB::Importer2

describe NamedplacesGuesser do

# #found? must refuse to answer before #run! and, afterwards, report whether
# #column is non-nil.
describe '#found?' do
it 'raises an exception if not run yet' do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)
expect {
namedplaces.found?
}.to raise_error(ContentGuesserException, 'not run yet!')
end

it 'returns false if there was no namedplaces column found during checks' do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)
# #column is stubbed, so only the decision made by #found? is under test.
namedplaces.stubs(:column).returns(nil)
# A nil country column makes run! take the namedplaces_guess_country branch,
# which is stubbed out so no geocoder call is attempted.
namedplaces.stubs(:country_column).returns(nil)
namedplaces.stubs(:namedplaces_guess_country)

namedplaces.run!
namedplaces.found?.should be_false
end

it 'returns true if there was a namedplaces column found' do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)
namedplaces.stubs(:column).returns(:dummy_column)
namedplaces.stubs(:country_column).returns(nil)
namedplaces.stubs(:namedplaces_guess_country)

namedplaces.run!
namedplaces.found?.should be_true
end

end

# #run! must pick exactly one of the two guessing strategies depending on
# whether a country column is available.
describe '#run!' do
it "performs a guessing using the country column if there's any" do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)
# Any truthy value is enough to select the country-column strategy.
namedplaces.stubs(:country_column).returns(:dummy_column)
namedplaces.expects(:guess_with_country_column).once
namedplaces.expects(:namedplaces_guess_country).never

namedplaces.run!
end

it "performs a guessing relying on namedplace_guess_country if there's no country column" do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)
namedplaces.stubs(:country_column).returns(nil)
namedplaces.expects(:guess_with_country_column).never
namedplaces.expects(:namedplaces_guess_country).once

namedplaces.run!
end

end

# #country_column selects the text column whose country proportion is the
# highest, as long as it is above the guesser's threshold.
describe '#country_column' do
it "returns a country column if there's one with a high proportion of countries" do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)
# Columns are plain symbols here because the guesser's scoring is stubbed.
namedplaces.stubs(:text_columns).returns([:my_country_column, :another_column])
content_guesser.stubs(:country_proportion).with(:my_country_column).returns(0.9)
content_guesser.stubs(:country_proportion).with(:another_column).returns(0.1)
content_guesser.stubs(:threshold).returns(0.8)

namedplaces.country_column.should eq :my_country_column
end
end


# These methods below are private but worth testing

# Private method, invoked through #send: among the non-country text columns
# the one with the highest proportion above the threshold becomes #column.
describe '#guess_with_country_column' do
it "gets the column with highest proportion of namedplaces, if any" do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)

namedplaces.stubs(:text_columns).returns([:my_country_column, :another_column, :namedplaces_column])
namedplaces.stubs(:country_column).returns(:my_country_column)
# No proportion is stubbed for :my_country_column: the country column is
# expected to be excluded from the candidates.
namedplaces.stubs(:proportion).with(:another_column).returns(0.7)
namedplaces.stubs(:proportion).with(:namedplaces_column).returns(0.9)
content_guesser.stubs(:threshold).returns(0.8)
# #column raises unless the guesser reports it has been run.
namedplaces.stubs(:run?).returns(true)

namedplaces.send(:guess_with_country_column)
namedplaces.column.should eq :namedplaces_column
end
end

# Private method, invoked through #send: the first text column for which the
# geocoder returns a country is kept, together with that country.
# NOTE(review): the label below reads '#namedplace_guess_country' while the
# method exercised is namedplaces_guess_country - confirm and align the label.
describe '#namedplace_guess_country' do
it "checks all candidates for a positive country guess through the geocoder api" do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)

namedplaces.stubs(:text_columns).returns([
{column_name: 'japanese_cities'},
{column_name: 'another_column'}
])
content_guesser.stubs(:sample).returns([{japanese_cities: 'Tokyo', another_column: 'whatever'}])

# Only the query for the first column is expected: a positive guess for
# 'japanese_cities' must stop the search before 'another_column' is tried.
sql_api_mock = mock
sql_api_mock.expects(:fetch)
.with("SELECT namedplace_guess_country(Array['Tokyo']) as country")
.returns([{'country' => 'JP'}])
content_guesser.stubs(:geocoder_sql_api).returns(sql_api_mock)

# #column raises unless the guesser reports it has been run.
namedplaces.stubs(:run?).returns(true)
namedplaces.send(:namedplaces_guess_country)
namedplaces.country.should eq 'JP'
namedplaces.column[:column_name].should eq 'japanese_cities'
end
end

# Private method, invoked through #send: proportion is the number of geocoded
# namedplaces divided by the sample size.
describe '#proportion' do
it 'calculates the proportion of namedplaces given a column and a country column' do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)

cities_column = {column_name: 'cities_column'}
countries_column = {column_name: 'countries'}
content_guesser.stubs(:sample).returns([{cities_column: 'Tokyo'}])
namedplaces.stubs(:text_columns).returns([cities_column])
namedplaces.stubs(:country_column).returns(countries_column)
# The geocoder round trip is stubbed: 1 match out of a 1-row sample.
namedplaces.stubs(:count_namedplaces_with_country_column).with(:cities_column).returns(1)


namedplaces.send(:proportion, cities_column).should eq 1.0
end
end

# Private method, invoked through #send: builds a geocode_namedplace query
# from the sampled places and countries and returns the reported count.
describe '#count_namedplaces_with_country_column' do
it 'queries the geocoder to get the number of namedplaces from the sample' do
content_guesser = mock
namedplaces = NamedplacesGuesser.new(content_guesser)

content_guesser.stubs(:sample).returns([{japanese_cities: 'Tokyo', country: 'Japan'}])
namedplaces.stubs(:country_column).returns({column_name: 'country'})
namedplaces.stubs(:text_columns).returns([{column_name: 'japanese_cities'}])

# The exact SQL text is part of the expectation, so any change to the query
# built by the method must be mirrored here.
sql_api_mock = mock
sql_api_mock.expects(:fetch)
.with("WITH geo_function as (SELECT (geocode_namedplace(Array['Tokyo'], Array['Japan'])).*) select count(success) FROM geo_function where success = TRUE")
.returns([{'count' => 1}])
content_guesser.stubs(:geocoder_sql_api).returns(sql_api_mock)


namedplaces.send(:count_namedplaces_with_country_column, :japanese_cities).should eq 1
end
end

end

end
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def copy_results_to_table_query

# Returns the internal geocoder's countries value, or the literal string
# 'null' when that value is the quoted 'world' marker or blank (nil/empty).
def country
  country = @internal_geocoder.countries
  (country == %Q{'world'} || country.blank?) ? 'null' : country
end

def dest_table
Expand Down

0 comments on commit ac248f4

Please sign in to comment.