-
Notifications
You must be signed in to change notification settings - Fork 654
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2809 from CartoDB/1989-guessing-of-namedplaces
1989 guessing of namedplaces
- Loading branch information
Showing
6 changed files
with
317 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# encoding: utf-8 | ||
|
||
require_relative 'content_guesser' | ||
|
||
module CartoDB
  module Importer2

    # Guesses which text column of a sampled table (if any) contains named
    # places, asking the geocoder SQL API exposed by the content guesser.
    #
    # This should return enough information to the caller so that it can
    # geocode later on:
    # - whether there are namedplaces or not: content_guesser.namedplaces.found?
    # - if there's a country column: content_guesser.namedplaces.country_column.present?
    # - in that case, which one it is: content_guesser.namedplaces.country_column
    # - otherwise, which is the guessed country: content_guesser.namedplaces.country
    # - what's the candidate column: content_guesser.namedplaces.column
    class NamedplacesGuesser

      # Guessed country, as returned by the geocoder when no country column
      # exists. Stays nil when a country column was used or nothing was found.
      attr_reader :country

      # @param content_guesser the object providing #columns, #sample,
      #   #threshold, #country_proportion, #is_text_type? and #geocoder_sql_api
      def initialize(content_guesser)
        @run = false
        @guesser = content_guesser
        @column = nil
        @country = nil
        # NOTE: @country_column is deliberately left undefined; #country_column
        # relies on defined? so that a nil result is memoized as well.
      end

      # @return [Boolean] whether a namedplaces column was found
      # @raise [ContentGuesserException] if #run! has not been called yet
      def found?
        raise ContentGuesserException, 'not run yet!' unless run?
        !column.nil?
      end

      # @return the namedplaces candidate column, or nil if none was found
      # @raise [ContentGuesserException] if #run! has not been called yet
      def column
        raise ContentGuesserException, 'not run yet!' unless run?
        @column
      end

      # Performs the guessing: with the country column when there is one,
      # otherwise by asking the geocoder to guess a country per text column.
      #
      # @return [NamedplacesGuesser] self, so calls can be chained
      def run!
        if country_column
          guess_with_country_column
        else
          namedplaces_guess_country
        end
        @run = true
        self
      end

      # @return [Boolean] whether #run! has completed
      def run?
        @run
      end

      # @return the text column with the highest country proportion, provided
      #   it is above the guesser threshold; nil otherwise (memoized)
      def country_column
        return @country_column if defined?(@country_column)
        @country_column = best_above_threshold(text_columns) { |c| country_proportion(c) }
      end

      # @return the :column_name of the country column, or nil if there is none
      def country_column_name
        country_column[:column_name] if country_column
      end

      private

      def country_proportion(column)
        @guesser.country_proportion(column)
      end

      # Picks, among the text columns other than the country column, the one
      # with the highest proportion of namedplaces (if above the threshold).
      def guess_with_country_column
        candidates = text_columns.reject { |c| c == country_column }
        @column = best_above_threshold(candidates) { |c| proportion(c) }
      end

      # Tries every text column in order and keeps the first one for which
      # the geocoder returns a country.
      def namedplaces_guess_country
        text_columns.each do |candidate|
          column_name_sym = candidate[:column_name].to_sym
          places = @guesser.sample.map { |row| row[column_name_sym] }.compact
          # Nothing to ask about; an empty Array[] literal would not be a useful query.
          next if places.empty?

          query = "SELECT namedplace_guess_country(#{sql_array(places)}) as country"
          result = @guesser.geocoder_sql_api.fetch(query).first
          guessed = result && result['country']
          next unless guessed

          @country = guessed
          @column = candidate
          return @column
        end
        nil
      end

      # Fraction of sample rows of +column+ that the geocoder recognizes as
      # namedplaces, given the country column.
      def proportion(column)
        sample_size = @guesser.sample.count
        return 0.0 if sample_size.zero?
        matches = count_namedplaces_with_country_column(column[:column_name].to_sym)
        matches.to_f / sample_size
      end

      # Asks the geocoder how many (place, country) pairs from the sample
      # geocode successfully.
      def count_namedplaces_with_country_column(column_name_sym)
        country_column_sym = country_column[:column_name].to_sym
        pairs = @guesser.sample.map { |row| [row[column_name_sym], row[country_column_sym]] }
        # Drop incomplete rows as a pair, so both arrays stay aligned.
        pairs = pairs.reject { |place, country_name| place.nil? || country_name.nil? }
        return 0 if pairs.empty?

        places = sql_array(pairs.map(&:first))
        countries = sql_array(pairs.map(&:last))
        query = "WITH geo_function as (SELECT (geocode_namedplace(#{places}, #{countries})).*) select count(success) FROM geo_function where success = TRUE"
        result = @guesser.geocoder_sql_api.fetch(query).first
        result ? result['count'] : 0
      end

      def text_columns
        @text_columns ||= @guesser.columns.all.select { |c| @guesser.is_text_type?(c) }
      end

      # Scores every candidate exactly once (scoring may hit the geocoder API)
      # and returns the best one if its score is above the guesser threshold.
      # Returns nil when there are no candidates.
      def best_above_threshold(candidates)
        scored = candidates.map { |candidate| [candidate, yield(candidate)] }
        best, score = scored.max_by { |_, value| value }
        (best && score > @guesser.threshold) ? best : nil
      end

      # Renders values as a SQL array of string literals: Array['a','b'].
      def sql_array(values)
        "Array[#{values.map { |value| sql_quote(value) }.join(',')}]"
      end

      # Single-quoted SQL string literal with embedded quotes doubled, so that
      # values such as "L'Aquila" neither break nor alter the query.
      # NOTE(review): assumes the backend treats backslashes literally
      # (PostgreSQL standard_conforming_strings = on) -- confirm.
      def sql_quote(value)
        "'" + value.to_s.gsub("'", "''") + "'"
      end

    end
  end
end
165 changes: 165 additions & 0 deletions
165
services/importer/spec/unit/namedplaces_guesser_spec.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
# encoding: utf-8 | ||
|
||
require_relative '../../lib/importer/namedplaces_guesser' | ||
|
||
# Use Mocha as the mocking framework for the examples in this spec.
RSpec.configure { |rspec| rspec.mock_with :mocha }
|
||
module CartoDB::Importer2

  # Unit examples for NamedplacesGuesser. The content guesser collaborator is
  # always a Mocha mock, so no geocoder or database is contacted.
  describe NamedplacesGuesser do

    # Shared setup: a bare mock guesser and the object under test built on it.
    let(:content_guesser) { mock }
    let(:namedplaces) { NamedplacesGuesser.new(content_guesser) }

    describe '#found?' do
      it 'raises an exception if not run yet' do
        expect {
          namedplaces.found?
        }.to raise_error(ContentGuesserException, 'not run yet!')
      end

      it 'returns false if there was no namedplaces column found during checks' do
        namedplaces.stubs(:column).returns(nil)
        namedplaces.stubs(:country_column).returns(nil)
        namedplaces.stubs(:namedplaces_guess_country)

        namedplaces.run!
        namedplaces.found?.should be_false
      end

      it 'returns true if there was a namedplaces column found' do
        namedplaces.stubs(:column).returns(:dummy_column)
        namedplaces.stubs(:country_column).returns(nil)
        namedplaces.stubs(:namedplaces_guess_country)

        namedplaces.run!
        namedplaces.found?.should be_true
      end

    end

    describe '#run!' do
      it "performs a guessing using the country column if there's any" do
        namedplaces.stubs(:country_column).returns(:dummy_column)
        namedplaces.expects(:guess_with_country_column).once
        namedplaces.expects(:namedplaces_guess_country).never

        namedplaces.run!
      end

      it "performs a guessing relying on namedplace_guess_country if there's no country column" do
        namedplaces.stubs(:country_column).returns(nil)
        namedplaces.expects(:guess_with_country_column).never
        namedplaces.expects(:namedplaces_guess_country).once

        namedplaces.run!
      end

    end

    describe '#country_column' do
      it "returns a country column if there's one with a high proportion of countries" do
        namedplaces.stubs(:text_columns).returns([:my_country_column, :another_column])
        content_guesser.stubs(:country_proportion).with(:my_country_column).returns(0.9)
        content_guesser.stubs(:country_proportion).with(:another_column).returns(0.1)
        content_guesser.stubs(:threshold).returns(0.8)

        namedplaces.country_column.should eq :my_country_column
      end
    end


    # These methods below are private but worth testing

    describe '#guess_with_country_column' do
      it "gets the column with highest proportion of namedplaces, if any" do
        namedplaces.stubs(:text_columns).returns([:my_country_column, :another_column, :namedplaces_column])
        namedplaces.stubs(:country_column).returns(:my_country_column)
        namedplaces.stubs(:proportion).with(:another_column).returns(0.7)
        namedplaces.stubs(:proportion).with(:namedplaces_column).returns(0.9)
        content_guesser.stubs(:threshold).returns(0.8)
        # #column raises unless the guesser reports it has run
        namedplaces.stubs(:run?).returns(true)

        namedplaces.send(:guess_with_country_column)
        namedplaces.column.should eq :namedplaces_column
      end
    end

    describe '#namedplaces_guess_country' do
      it "checks all candidates for a positive country guess through the geocoder api" do
        namedplaces.stubs(:text_columns).returns([
          {column_name: 'japanese_cities'},
          {column_name: 'another_column'}
        ])
        content_guesser.stubs(:sample).returns([{japanese_cities: 'Tokyo', another_column: 'whatever'}])

        sql_api_mock = mock
        sql_api_mock.expects(:fetch)
          .with("SELECT namedplace_guess_country(Array['Tokyo']) as country")
          .returns([{'country' => 'JP'}])
        content_guesser.stubs(:geocoder_sql_api).returns(sql_api_mock)

        # #column raises unless the guesser reports it has run
        namedplaces.stubs(:run?).returns(true)
        namedplaces.send(:namedplaces_guess_country)
        namedplaces.country.should eq 'JP'
        namedplaces.column[:column_name].should eq 'japanese_cities'
      end
    end

    describe '#proportion' do
      it 'calculates the proportion of namedplaces given a column and a country column' do
        cities_column = {column_name: 'cities_column'}
        countries_column = {column_name: 'countries'}
        content_guesser.stubs(:sample).returns([{cities_column: 'Tokyo'}])
        namedplaces.stubs(:text_columns).returns([cities_column])
        namedplaces.stubs(:country_column).returns(countries_column)
        namedplaces.stubs(:count_namedplaces_with_country_column).with(:cities_column).returns(1)

        namedplaces.send(:proportion, cities_column).should eq 1.0
      end
    end

    describe '#count_namedplaces_with_country_column' do
      it 'queries the geocoder to get the number of namedplaces from the sample' do
        content_guesser.stubs(:sample).returns([{japanese_cities: 'Tokyo', country: 'Japan'}])
        namedplaces.stubs(:country_column).returns({column_name: 'country'})
        namedplaces.stubs(:text_columns).returns([{column_name: 'japanese_cities'}])

        sql_api_mock = mock
        sql_api_mock.expects(:fetch)
          .with("WITH geo_function as (SELECT (geocode_namedplace(Array['Tokyo'], Array['Japan'])).*) select count(success) FROM geo_function where success = TRUE")
          .returns([{'count' => 1}])
        content_guesser.stubs(:geocoder_sql_api).returns(sql_api_mock)

        namedplaces.send(:count_namedplaces_with_country_column, :japanese_cities).should eq 1
      end
    end

  end

end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters