Skip to content

Commit

Permalink
Merge ab3fcb8 into d43b466
Browse files Browse the repository at this point in the history
  • Loading branch information
jgoizueta committed Oct 2, 2019
2 parents d43b466 + ab3fcb8 commit 229db56
Show file tree
Hide file tree
Showing 5 changed files with 288 additions and 388 deletions.
4 changes: 2 additions & 2 deletions cartoframes/data/services/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from __future__ import absolute_import

from .geocode import Geocode
from .geocoding import Geocoding
from .isolines import Isolines

__all__ = [
'Geocode',
'Geocoding',
'Isolines'
]
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,12 @@ def _hash_as_big_int(text):
def _set_pre_summary_info(summary, output):
logging.debug(summary)
output['total_rows'] = sum(summary.values())
output['required_quota'] = sum([summary[s] for s in ['new_geocoded', 'new_nongeocoded', 'changed_geocoded', 'changed_nongeocoded']])
output['required_quota'] = sum(
[summary[s] for s in ['new_geocoded', 'new_nongeocoded', 'changed_geocoded', 'changed_nongeocoded']])
output['previously_geocoded'] = summary.get('previously_geocoded', 0)
output['previously_failed'] = summary.get('previously_nongeocoded', 0)
output['records_with_geometry'] = sum([summary[s] for s in ['new_geocoded', 'changed_geocoded', 'previously_geocoded']])
# output['records_without_geometry'] = sum([summary[s] for s in ['new_nongeocoded', 'changed_nongeocoded', 'previously_nongeocoded']])
output['records_with_geometry'] = sum(
[summary[s] for s in ['new_geocoded', 'changed_geocoded', 'previously_geocoded']])


def _set_post_summary_info(summary, result, output):
Expand All @@ -203,7 +204,8 @@ def _set_post_summary_info(summary, result, output):
output['final_records_with_geometry'] = geom_count
# output['final_records_without_geometry'] = null_geom_count
output['geocoded_increment'] = output['final_records_with_geometry'] - output['records_with_geometry']
output['successfully_geocoded'] = output['geocoded_increment'] + sum([summary[s] for s in ['new_geocoded', 'changed_geocoded']])
new_or_changed = sum([summary[s] for s in ['new_geocoded', 'changed_geocoded']])
output['successfully_geocoded'] = output['geocoded_increment'] + new_or_changed
output['failed_geocodings'] = output['required_quota'] - output['successfully_geocoded']


Expand Down Expand Up @@ -238,7 +240,10 @@ def _column_or_value_arg(arg, valid_columns=None):
if any(invalid_keys):
invalid_keys_list = ', '.join(list(invalid_keys))
valid_keys_list = ', '.join(VALID_GEOCODE_KEYS)
raise ValueError("Invalid key for argument {} valid keys are: {}".format(invalid_keys_list, valid_keys_list))
raise ValueError("Invalid key for argument {} valid keys are: {}".format(
invalid_keys_list,
valid_keys_list)
)
if len(arg.keys()) != 1:
valid_keys_list = ', '.join(VALID_GEOCODE_KEYS)
raise ValueError("Exactly one key of {} must be present in argument".format(valid_keys_list))
Expand All @@ -256,8 +261,8 @@ def _column_or_value_arg(arg, valid_columns=None):
return arg


class Geocode(Service):
"""Geocode using CARTO data services.
class Geocoding(Service):
"""Geocoding using CARTO data services.
This requires a CARTO account with an API key that allows for using geocoding services
(through explicit argument in constructor or via the default credentials).
Use of these methods will incur geocoding credit consumption for the provided account.
Expand All @@ -268,10 +273,10 @@ class Geocode(Service):
.. code::
from data.services import Geocode
from data.services import Geocoding
from cartoframes.auth import set_default_credentials
set_default_credentials('YOUR_USER_NAME', 'YOUR_API_KEY')
gc = Geocode()
gc = Geocoding()
_, info = gc.geocode(dataset, street='address', dry_run=True)
print(info['required_quota'])
Expand All @@ -280,13 +285,13 @@ class Geocode(Service):
.. code::
import pandas
from data.services import Geocode
from data.services import Geocoding
from cartoframes.data import Dataset
from cartoframes.auth import set_default_credentials
set_default_credentials('YOUR_USER_NAME', 'YOUR_API_KEY')
dataframe = pandas.DataFrame([['Gran Vía 46', 'Madrid'], ['Ebro 1', 'Sevilla']], columns=['address','city'])
gc = Geocode()
gc = Geocoding()
geocoded_dataframe, info = gc.geocode(dataframe, street='address', city='city', country={'value': 'Spain'})
print(geocoded_dataframe)
Expand All @@ -295,13 +300,13 @@ class Geocode(Service):
.. code::
import pandas
from data.services import Geocode
from data.services import Geocoding
from cartoframes.data import Dataset
from cartoframes.auth import set_default_credentials
set_default_credentials('YOUR_USER_NAME', 'YOUR_API_KEY')
dataset = Dataset('YOUR_TABLE_NAME')
gc = Geocode()
gc = Geocoding()
geocoded_dataset, info = gc.geocode(dataset, street='address', city='city', country={'value': 'Spain'})
print(geocoded_dataset.download())
Expand All @@ -310,21 +315,21 @@ class Geocode(Service):
.. code::
import pandas
from data.services import Geocode
from data.services import Geocoding
from cartoframes.data import Dataset
from cartoframes.auth import set_default_credentials
set_default_credentials('YOUR_USER_NAME', 'YOUR_API_KEY')
df = pandas.DataFrame([['Gran Vía 46', 'Madrid'], ['Ebro 1', 'Sevilla']], columns=['address','city'])
gc = Geocode()
gc = Geocoding()
df, info = gc.geocode(df, street='address', city='city', country={'value': 'Spain'}, metadata='meta')
# show rows with relevance greater than 0.7:
print(df[df.apply(lambda x: json.loads(x['meta'])['relevance']>0.7, axis=1)])
"""

def __init__(self, credentials=None):
super(Geocode, self).__init__(credentials=credentials, quota_service=QUOTA_SERVICE)
super(Geocoding, self).__init__(credentials=credentials, quota_service=QUOTA_SERVICE)

def geocode(self, dataset, street,
city=None, state=None, country=None,
Expand Down Expand Up @@ -359,8 +364,14 @@ def geocode(self, dataset, street,
check the needed quota)
Returns:
Result: (Dataset, info_dict)
A named-tuple ``(data, metadata)`` containing either a ``data`` Dataset or DataFrame
(same type as the input) and a ``metadata`` dictionary with global information
about the geocoding process (not to be confused with the optional per-row
geocoding metadata enabled by the ``metadata`` parameter).
The data contains a ``the_geom`` column with point locations for the geocoded addresses
and also a ``carto_geocode_hash`` that, if preserved, can avoid re-geocoding
unchanged data in future calls to geocode.
"""

input_dataframe = None
Expand Down Expand Up @@ -408,7 +419,7 @@ def geocode(self, dataset, street,
def _table_for_geocoding(self, dataset, table_name, if_exists):
temporary_table = False
input_dataset = dataset
if input_dataset.is_remote() and input_dataset.table_name: # FIXME: more robust to check first for query (hasattr(input_dataset, 'query'))
if input_dataset.is_remote() and input_dataset.table_name:
# input dataset is a table
if table_name:
# Copy input dataset into a new table
Expand Down Expand Up @@ -498,7 +509,8 @@ def _geocode(self, table_name, street, city=None, state=None, country=None, meta
logging.info("Adding columns {} if needed".format(', '.join([c[0] for c in add_columns])))
alter_sql = "ALTER TABLE {table} {add_columns};".format(
table=table_name,
add_columns=','.join(['ADD COLUMN IF NOT EXISTS {} {}'.format(name, type) for name, type in add_columns]))
add_columns=','.join([
'ADD COLUMN IF NOT EXISTS {} {}'.format(name, type) for name, type in add_columns]))
self._execute_query(alter_sql)

sql = _geocode_query(table_name, street, city, state, country, metadata)
Expand Down
75 changes: 54 additions & 21 deletions docs/geocode.rst
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
Geocode
=======

The ``cartoframes.data.dataservices.Geocode`` class provides geocoding using `CARTO Location Data Services (LDS) <https://carto.com/location-data-services/>`_
This process requires you to have a CARTO account with a geocoding provider and geocoding quota assigned, and its use will incur in the expense of geocoding credits.
The ``cartoframes.data.services.Geocoding`` class provides geocoding using
`CARTO Location Data Services (LDS) <https://carto.com/location-data-services/>`_.
This process requires you to have a CARTO account with a geocoding provider and geocoding quota assigned,
and its use will incur the expense of geocoding credits.
In the case of accounts with soft geocoding limits, additional charges may apply if the monthly quota is exceeded.

The ``Geocode.geocode`` instance method provides the interface to geocoding; input data to be geocoded must be provided through a ``Dataset`` or ``DataFrame`` object as the first argument to this method.
The ``Geocoding.geocode`` instance method provides the interface to geocoding; input data to be geocoded must be
provided through a ``Dataset`` or ``DataFrame`` object as the first argument to this method.

A second mandatory argument, ``street`` defines the name of the data column that contains the street address.

Additional optional arguments can be used to define the ``city``, ``state`` and ``country``. These arguments can be used to either
pass the name of a column that contains the corresponding attribute; e.g. ``city={'column': 'column_name_of_the_city'}``, which can
be shortened as ``city='column_name_of_the_city'``,
or, when all the dataset corresponds to a single value of the attribute, a literal text, e.g. ``city={'value': 'London}'``.
Additional optional arguments can be used to define the ``city``, ``state`` and ``country``. These arguments can be
used to either pass the name of a column that contains the corresponding attribute;
e.g. ``city={'column': 'column_name_of_the_city'}``, which can be shortened as ``city='column_name_of_the_city'``,
or, when the whole dataset corresponds to a single value of the attribute, a literal text,
e.g. ``city={'value': 'London'}``.

Another optional argument, ``metadata`` can define the name of a result column that will contain additional metadata about each gecododed row
as a JSON structure. The entries in this structure, as described in https://carto.com/developers/data-services-api/reference/ are:
Another optional argument, ``metadata``, can define the name of a result column that will contain additional metadata
about each geocoded row as a JSON structure.
The entries in this structure, as described in https://carto.com/developers/data-services-api/reference/ are:


+-------------+--------+------------------------------------------------------------+
Expand All @@ -30,8 +36,9 @@ as a JSON structure. The entries in this structure, as described in https://cart
+-------------+--------+------------------------------------------------------------+


The result of the ``geocode`` method is a tuple containing both a result Dataset
(or a Dataframe, in case the input was a Dataframe) and a dictionary with general information about the process.
The result of the ``geocode`` method is a named tuple containing both a result ``data``
(of the same class as the input, ``Dataset`` or ``DataFrame``) and a ``metadata`` dictionary with general
information about the process.

Dry run
-------
Expand All @@ -40,23 +47,30 @@ To find out the number of quota credits that will be spent when geocoding a data

.. code:: python
from cartoframes.data.services import Geocode
from cartoframes.data.services import Geocoding
from cartoframes.data import Dataset
from cartoframes.auth import set_default_credentials
set_default_credentials(
username='YOUR_USERNAME',
api_key='YOUR_APIKEY'
)
gc = Geocode()
gc = Geocoding()
dataset = Dataset('YOUR_DATA')
_, info = gc.geocode(dataset, street='address', city='city', country={'value': 'Spain'}, dry_run=True)
info = gc.geocode(dataset, street='address', city='city', country={'value': 'Spain'}, dry_run=True).metadata
print(info.get('required_quota'))
When ``dry_run`` is True no changes will be made to the data and no quota will be consumed.
The returned dataset will simply be a reference to the input dataset, unmodified.

To know the quota available in the account used, the method ``available_quota`` can be used:

.. code:: python
print(gc.available_quota())
Geocoding Dataframes
--------------------

Expand All @@ -65,15 +79,15 @@ A Dataframe can be geocoded like this:
.. code:: python
import pandas
from cartoframes.data.services import Geocode
from cartoframes.data.services import Geocoding
from cartoframes.data import Dataset
from cartoframes.auth import set_default_credentials
set_default_credentials(
username='YOUR_USERNAME',
api_key='YOUR_APIKEY'
)
gc = Geocode()
gc = Geocoding()
df = pandas.DataFrame([['Gran Vía 46', 'Madrid'], ['Ebro 1', 'Sevilla']], columns=['address', 'city'])
Expand All @@ -100,15 +114,15 @@ When the Dataset to be geocoded corresponds to a CARTO table, it will by default
.. code:: python
import pandas
from cartoframes.data.services import Geocode
from cartoframes.data.services import Geocoding
from cartoframes.data import Dataset
from cartoframes.auth import set_default_credentials
set_default_credentials(
username='YOUR_USERNAME',
api_key='YOUR_APIKEY'
)
gc = Geocode()
gc = Geocoding()
dataset = Dataset('YOUR_DATA')
dataset, info = gc.geocode(dataset, street='address', country={'value': 'Spain'})
Expand All @@ -133,18 +147,17 @@ When the Dataset to be geocoded corresponds to a query, it will by default be ge
.. code:: python
import pandas
from cartoframes.data.services import Geocode
from cartoframes.data.services import Geocoding
from cartoframes.data import Dataset
from cartoframes.auth import set_default_credentials
set_default_credentials(
username='YOUR_USERNAME',
api_key='YOUR_APIKEY'
)
gc = Geocode()
gc = Geocoding()
dataset = Dataset('SELECT * FROM YOUR_DATA WHERE value>1000')
ds, info = gc.geocode(dataset, street='address', city='city', country={'value': 'Spain'})
geocoded_dataset, info = gc.geocode(dataset, street='address', city='city', country={'value': 'Spain'})
print(info)
print(geocoded_dataset.dataframe)
Expand All @@ -158,3 +171,23 @@ Again, the results can be stored in a new table using the `table_name` argument:
new_dataset, info = gc.geocode(dataset, street='address', country={'value': 'Spain'}, table_name='new_table')
print(info)
print(new_dataset.download())
Saving Quota
------------

To prevent having to geocode records that have been previously geocoded, and thus spend quota unnecessarily,
you should always preserve the ``the_geom`` and ``carto_geocode_hash`` columns generated by the
geocoding process. This will happen automatically if your input is a table Dataset processed in place
(i.e. without a ``table_name`` parameter) or if you save your results in a CARTO table using the ``table_name``
parameter, and only use the resulting table for any further geocoding.

In case you're geocoding local data from a ``DataFrame`` that you plan to re-geocode (e.g. because
you're making your work reproducible by saving all the data preparation steps in a notebook),
we advise saving the geocoding results immediately to the same store from where the data was originally taken,
for example:

.. code:: python
dataframe = pandas.read_csv('my_data')
dataframe = Geocoding().geocode(dataframe, 'address').data
dataframe.to_csv('my_data')

0 comments on commit 229db56

Please sign in to comment.