COPY TO for data obs queries - Part 2 use fetch #592

Merged
merged 15 commits on Apr 11, 2019
90 changes: 43 additions & 47 deletions cartoframes/context.py
@@ -387,22 +387,6 @@ def delete(self, table_name):
err=err))
return None

def _table_exists(self, table_name):
"""Checks to see if table exists"""
try:
self.sql_client.send(
'EXPLAIN SELECT * FROM "{table_name}"'.format(
table_name=table_name),
do_post=False)
raise NameError(
'Table `{table_name}` already exists. '
'Run with `overwrite=True` if you wish to replace the '
'table.'.format(table_name=table_name))
except CartoException as err:
# If table doesn't exist, we get an error from the SQL API
self._debug_print(err=err)
return False

def _check_import(self, import_id):
"""Check the status of an Import API job"""

@@ -1099,19 +1083,7 @@ def data_boundaries(self, boundary=None, region=None, decode_geom=False,
boundaries in `region` (or the world if `region` is ``None``)
"""
# TODO: create a function out of this?
if (isinstance(region, collections.Iterable)
and not isinstance(region, str)):
if len(region) != 4:
raise ValueError(
'`region` should be a list of the geographic bounds of a '
'region in the following order: western longitude, '
'southern latitude, eastern longitude, and northern '
'latitude. For example, Switzerland fits in '
'``[5.9559111595,45.8179931641,10.4920501709,'
'47.808380127]``.')
bounds = ('ST_MakeEnvelope({0}, {1}, {2}, {3}, 4326)').format(
*region)
elif isinstance(region, str):
if isinstance(region, str):
# see if it's a table
try:
geom_type = self._geom_type(region)
@@ -1126,7 +1098,17 @@ def data_boundaries(self, boundary=None, region=None, decode_geom=False,
regionsearch = '"geom_tags"::text ilike \'%{}%\''.format(
get_countrytag(region))
bounds = 'ST_MakeEnvelope(-180.0, -85.0, 180.0, 85.0, 4326)'

elif isinstance(region, collections.Iterable):
if len(region) != 4:
raise ValueError(
'`region` should be a list of the geographic bounds of a '
'region in the following order: western longitude, '
'southern latitude, eastern longitude, and northern '
'latitude. For example, Switzerland fits in '
'``[5.9559111595,45.8179931641,10.4920501709,'
'47.808380127]``.')
bounds = ('ST_MakeEnvelope({0}, {1}, {2}, {3}, 4326)').format(
*region)
elif region is None:
bounds = 'ST_MakeEnvelope(-180.0, -85.0, 180.0, 85.0, 4326)'
else:
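To make the envelope construction concrete, here is how the four-element region list from the error message above maps into ST_MakeEnvelope (a minimal sketch using the Switzerland bounds; not part of the diff):

    region = [5.9559111595, 45.8179931641, 10.4920501709, 47.808380127]
    bounds = 'ST_MakeEnvelope({0}, {1}, {2}, {3}, 4326)'.format(*region)
    # bounds == 'ST_MakeEnvelope(5.9559111595, 45.8179931641,
    #            10.4920501709, 47.808380127, 4326)'
    # argument order: western lon (xmin), southern lat (ymin),
    # eastern lon (xmax), northern lat (ymax); 4326 is the WGS84 SRID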
@@ -1149,9 +1131,8 @@ def data_boundaries(self, boundary=None, region=None, decode_geom=False,
'{filters}')).format(
bounds=bounds,
timespan=utils.pgquote(timespan),
filters='WHERE {}'.format(filters) if filters else ''
)
return self.query(query)
filters='WHERE {}'.format(filters) if filters else '')
return self.fetch(query, decode_geom=True)

query = utils.minify_sql((
'SELECT the_geom, geom_refs',
@@ -1162,7 +1143,7 @@ def data_boundaries(self, boundary=None, region=None, decode_geom=False,
bounds=bounds,
boundary=utils.pgquote(boundary),
time=utils.pgquote(timespan))
return self.query(query, decode_geom=decode_geom)
return self.fetch(query, decode_geom=decode_geom)

def data_discovery(self, region, keywords=None, regex=None, time=None,
boundaries=None, include_quantiles=False):
@@ -1425,8 +1406,7 @@ def data_discovery(self, region, keywords=None, regex=None, time=None,
numers=numers,
quantiles=quantiles).strip()
self._debug_print(query=query)
resp = self.sql_client.send(query)
return pd.DataFrame(resp['rows'])
return self.fetch(query, decode_geom=True)

def data(self, table_name, metadata, persist_as=None, how='the_geom'):
"""Get an augmented CARTO dataset with `Data Observatory
@@ -1536,8 +1516,7 @@ def data(self, table_name, metadata, persist_as=None, how='the_geom'):
' numeric, timespan_rownum numeric)',
)).format(table_name=table_name,
meta=json.dumps(metadata).replace('\'', '\'\''))
resp = self.sql_client.send(query)
_meta = pd.DataFrame(resp['rows'])
_meta = self.fetch(query)

if _meta.shape[0] == 0:
raise ValueError('There are no valid metadata entries. Check '
@@ -1550,30 +1529,40 @@ def data(self, table_name, metadata, persist_as=None, how='the_geom'):
'combine resulting DataFrames using '
'`pandas.concat`')

tablecols = self.sql_client.send(
'SELECT * FROM {table_name} LIMIT 0'.format(table_name=table_name),
**DEFAULT_SQL_ARGS
)['fields'].keys()
# get column names except the_geom_webmercator
dataset = Dataset(self, table_name)
table_columns = dataset.get_table_column_names(exclude=['the_geom_webmercator'])
Contributor: FYI, the DEFAULT_SQL_ARGS are the way to get the cartoframes calls to show up in logs so we can monitor performance, etc. They're fitted in so that each method gets registered once despite having multiple calls.

Contributor (author): Hmm, is this still true? I see DEFAULT_SQL_ARGS just has a do_post attribute.
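For reference, a minimal sketch of what this thread implies DEFAULT_SQL_ARGS contains; the exact value is an assumption, not taken from the codebase:

    # Hypothetical reconstruction: per the comment above, DEFAULT_SQL_ARGS
    # appears to carry only a do_post flag, which controls whether the SQL
    # client sends the query as a GET or a POST request.
    DEFAULT_SQL_ARGS = {'do_post': False}

    # Typical call site, spread into the SQL client call:
    # self.sql_client.send(query, **DEFAULT_SQL_ARGS)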

Contributor: Good question. Are POST request args searchable in Kibana?

Contributor: The POST arguments are not saved by default. The only things we can check in Kibana about a POST request to the SQL API are the queries.

Contributor: Two options:

  • Use the user agent header, which is designed to do exactly this (and it's how we do it in carto-python)
  • Add a SQL comment to the queries (seems hacky)

I'd do the user agent thing, which should not be hard to do, and it makes it searchable everywhere: Kibana, Rollbar, etc.
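A rough illustration of the user-agent approach proposed above (not the actual patch; the function name, client string, and endpoint wiring are assumptions for the sketch):

    import requests

    CLIENT_ID = 'cartoframes/x.y.z'  # assumed identifier; the real one would carry the package version

    def send_sql(base_url, api_key, query):
        # Send a query to the SQL API with an identifying User-Agent header
        # so the calls can be filtered in Kibana, Rollbar, etc.
        resp = requests.post(
            '{0}/api/v2/sql'.format(base_url.rstrip('/')),
            data={'q': query, 'api_key': api_key},
            headers={'User-Agent': CLIENT_ID},  # searchable in server logs
        )
        resp.raise_for_status()
        return resp.json()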

Contributor: In case it gets solved with a user agent header, I created this issue to handle it: #601

Contributor (author): I added the user-agent thing here

Contributor: Also (CartoDB/carto-python#111).

Nice, well solved 👍

Contributor: 👏


names = {}
for suggested in _meta['suggested_name']:
if suggested in tablecols:
names[suggested] = utils.unique_colname(suggested, tablecols)
if suggested in table_columns:
names[suggested] = utils.unique_colname(suggested, table_columns)
warn(
'{s0} was augmented as {s1} because of name '
'collision'.format(s0=suggested, s1=names[suggested])
)
else:
names[suggested] = suggested

# drop description columns to lighten the query
Contributor: ❤️ It's also worth exploring what the minimum set of metadata is that we need to send. My guess is that only half of the metadata columns are needed to uniquely define the measure.

# FIXME https://github.com/CartoDB/cartoframes/issues/593
meta_columns = _meta.columns.values
drop_columns = []
for meta_column in meta_columns:
if meta_column.endswith('_description'):
drop_columns.append(meta_column)

if len(drop_columns) > 0:
_meta.drop(drop_columns, axis=1, inplace=True)

cols = ', '.join(
'(data->{n}->>\'value\')::{pgtype} AS {col}'.format(
n=row[0],
pgtype=row[1]['numer_type'],
col=names[row[1]['suggested_name']])
for row in _meta.iterrows())
query = utils.minify_sql((
'SELECT t.*, {cols}',
'SELECT {table_cols}, {cols}',
' FROM OBS_GetData(',
' (SELECT array_agg({how})',
' FROM "{tablename}"),',
@@ -1585,9 +1574,16 @@ def data(self, table_name, metadata, persist_as=None, how='the_geom'):
tablename=table_name,
rowid='cartodb_id' if how == 'the_geom' else how,
cols=cols,
table_cols=','.join('t.{}'.format(c) for c in table_columns),
meta=_meta.to_json(orient='records').replace('\'', '\'\''))
return self.query(query,
table_name=persist_as)

if persist_as:
dataset = Dataset.from_query(self, query, persist_as)
result = dataset.download(decode_geom=True)
else:
result = self.fetch(query, decode_geom=True)

return result

def _auth_send(self, relative_path, http_method, **kwargs):
self._debug_print(relative_path=relative_path,
36 changes: 32 additions & 4 deletions cartoframes/datasets.py
@@ -27,6 +27,18 @@ def __init__(self, carto_context, table_name, schema='public', df=None):
self.df = df
warn('Table will be named `{}`'.format(table_name))

@staticmethod
def from_query(cart_context, query, table_name):
dataset = Dataset(cart_context, table_name)
dataset.cc.batch_sql_client \
.create_and_wait_for_completion(
'''BEGIN; {drop}; {create}; {cartodbfy}; COMMIT;'''
.format(drop=dataset._drop_table_query(),
create=dataset._create_table_from_query(query),
cartodbfy=dataset._cartodbfy_query()))

return dataset

def upload(self, with_lonlat=None, if_exists='fail'):
if self.df is None:
raise ValueError('You have to create a `Dataset` with a pandas DataFrame in order to upload it to CARTO')
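As a usage sketch for the new Dataset.from_query above, mirroring how context.data() calls it when persist_as is set (cc stands in for an authenticated CartoContext; the table and query names are made up):

    # Materialize a query as a new CARTO table, then download it as a DataFrame.
    dataset = Dataset.from_query(cc, 'SELECT * FROM source_table LIMIT 10', 'my_copy')
    df = dataset.download(decode_geom=True)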
@@ -46,7 +58,7 @@ def upload(self, with_lonlat=None, if_exists='fail'):
return self

def download(self, limit=None, decode_geom=False, retry_times=DEFAULT_RETRY_TIMES):
table_columns = self._get_table_columns()
table_columns = self.get_table_columns()
query = self._get_read_query(table_columns, limit)

return self.cc.fetch(query, decode_geom=decode_geom)
@@ -123,6 +135,10 @@ def _rows(self, df, cols, with_lonlat, geom_col):
def _drop_table_query(self):
return '''DROP TABLE IF EXISTS {table_name}'''.format(table_name=self.table_name)

def _create_table_from_query(self, query):
create_query = '''CREATE TABLE {table_name} AS ({query})'''.format(table_name=self.table_name, query=query)
return create_query

def _create_table_query(self, with_lonlat=None):
if with_lonlat is None:
geom_type = _get_geom_col_type(self.df)
@@ -158,11 +174,21 @@ def _get_read_query(self, table_columns, limit=None):

return query

def _get_table_columns(self):
def get_table_columns(self):
"""Get column names and types from a table"""
query = 'SELECT * FROM "{schema}"."{table}" limit 0'.format(table=self.table_name, schema=self.schema)
return get_columns(self.cc, query)

def get_table_column_names(self, exclude=None):
"""Get column names and types from a table"""
query = 'SELECT * FROM "{schema}"."{table}" limit 0'.format(table=self.table_name, schema=self.schema)
columns = get_columns(self.cc, query).keys()

if exclude and isinstance(exclude, list):
columns = list(set(columns) - set(exclude))
Contributor: We should ensure that exclude is a list or tuple and not a str because:

>>> exclude = 'the_geom_webmercator'
>>> set(exclude)
{'_', 'a', 'b', 'c', 'e', 'g', 'h', 'm', 'o', 'r', 't', 'w'}

Contributor (author): Yep, we are already checking that it is an instance of list. Anyway, we'll add proper docs once all these new methods are consolidated.

Contributor: Ha, I missed the isinstance(exclude, list) above 🤦‍♂️ Sorry for the noise!

Contributor (author): np 😅
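As a standalone sketch of the guard this thread settles on (the helper name is illustrative; the PR itself just checks isinstance(exclude, list) inline):

    def _normalize_exclude(exclude):
        # set() would split a bare string into characters:
        # set('the_geom_webmercator') yields {'t', 'h', 'e', ...},
        # so accept only lists/tuples of column names.
        if exclude is None:
            return []
        if isinstance(exclude, str):
            raise TypeError('`exclude` should be a list or tuple of column names')
        return list(exclude)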


return columns


def get_columns(context, query):
"""Get column names and types from a query"""
@@ -314,7 +340,7 @@ def _decode_geom(ewkb):
def postprocess_dataframe(df, table_columns, decode_geom=False):
"""Clean a DataFrame with a dataset from CARTO:
- use cartodb_id as DataFrame index
- process date columns
- process date and bool columns
- (optionally) decode geom as a `Shapely <https://github.com/Toblerity/Shapely>`__ object

Args:
@@ -334,8 +360,10 @@
for column_name in table_columns:
if table_columns[column_name]['type'] == 'date':
df[column_name] = pd.to_datetime(df[column_name], errors='ignore')
elif table_columns[column_name]['type'] == 'boolean':
Contributor: Good catch, I wasn't aware that bools weren't properly converted. Maybe the second bullet item in the docstring should be expanded to include "process date and boolean columns".

df[column_name] = df[column_name].eq('t')

if decode_geom:
if decode_geom and 'the_geom' in df.columns:
df['geometry'] = df.the_geom.apply(_decode_geom)

return df
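To see the boolean fix in isolation: columns fetched as CSV arrive as text, with booleans encoded as 't'/'f', and .eq('t') maps them back to real booleans (a self-contained demo with made-up data):

    import pandas as pd

    df = pd.DataFrame({'is_active': ['t', 'f', 't']})  # sample values as returned by the API
    df['is_active'] = df['is_active'].eq('t')  # 't' -> True, 'f' -> False
    print(df['is_active'].tolist())  # [True, False, True]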
97 changes: 70 additions & 27 deletions test/test_context.py
@@ -1102,7 +1102,7 @@ def test_data(self):
time=('2010 - 2014', ))
data = cc.data(self.test_data_table, meta)
anscols = set(meta['suggested_name'])
origcols = set(cc.read(self.test_data_table, limit=1).columns)
origcols = set(cc.read(self.test_data_table, limit=1, decode_geom=True).columns)
self.assertSetEqual(anscols, set(data.columns) - origcols)

meta = [{'numer_id': 'us.census.acs.B19013001',
@@ -1125,36 +1125,79 @@
keywords='education')
cc.data(self.test_read_table, meta)

def test_column_name_collision_do_enrichement(self):
"""context.CartoContext.data column collision"""
dup_col = 'female_third_level_studies_2011_by_female_pop'
self.sql_client.send(
"""
create table {table} as (
select cdb_latlng(40.4165,-3.70256) the_geom,
1 {dup_col})
""".format(
dup_col=dup_col,
table=self.test_write_table
)
)
self.sql_client.send(
"select cdb_cartodbfytable('public', '{table}')".format(
table=self.test_write_table
)
)

def test_data_with_persist_as(self):
"""context.CartoContext.data"""
cc = cartoframes.CartoContext(base_url=self.baseurl,
api_key=self.apikey)
meta = cc.data_discovery(region=self.test_write_table,
keywords='female')
meta = meta[meta.suggested_name == dup_col]
data = cc.data(
self.test_write_table,
meta[meta.suggested_name == dup_col]

meta = cc.data_discovery(self.test_read_table,
keywords=('poverty', ),
time=('2010 - 2014', ))
data = cc.data(self.test_data_table, meta)
anscols = set(meta['suggested_name'])
origcols = set(cc.read(self.test_data_table, limit=1, decode_geom=True).columns)
self.assertSetEqual(anscols, set(data.columns) - origcols)

meta = [{'numer_id': 'us.census.acs.B19013001',
'geom_id': 'us.census.tiger.block_group',
'numer_timespan': '2011 - 2015'}, ]
data = cc.data(self.test_data_table, meta, persist_as=self.test_write_table)
self.assertSetEqual(set(('median_income_2011_2015', )),
set(data.columns) - origcols)

df = cc.read(self.test_write_table, decode_geom=True)

# same number of rows
self.assertEqual(len(df), len(data),
msg='Expected number of rows')

# same type of object
self.assertIsInstance(df, pd.DataFrame,
'Should be a pandas DataFrame')
# same column names
self.assertSetEqual(set(data.columns.values),
set(df.columns.values),
msg='Should have the columns requested')

# should have expected schema
self.assertEqual(
sorted(tuple(str(d) for d in df.dtypes)),
sorted(tuple(str(d) for d in data.dtypes)),
msg='Should have same schema/types'
)

self.assertIn('_' + dup_col, data.keys())
# FIXME: https://github.com/CartoDB/cartoframes/issues/594
alrocar marked this conversation as resolved.
# def test_column_name_collision_do_enrichement(self):
# """context.CartoContext.data column collision"""
# import ipdb; ipdb.set_trace(context=30)
# dup_col = 'female_third_level_studies_2011_by_female_pop'
# self.sql_client.send(
# """
# create table {table} as (
# select cdb_latlng(40.4165,-3.70256) the_geom,
# 1 {dup_col})
# """.format(
# dup_col=dup_col,
# table=self.test_write_table
# )
# )
# self.sql_client.send(
# "select cdb_cartodbfytable('public', '{table}')".format(
# table=self.test_write_table
# )
# )

# cc = cartoframes.CartoContext(base_url=self.baseurl,
# api_key=self.apikey)
# meta = cc.data_discovery(region=self.test_write_table,
# keywords='female')
# meta = meta[meta.suggested_name == dup_col]
# data = cc.data(
# self.test_write_table,
# meta[meta.suggested_name == dup_col]
# )

# self.assertIn('_' + dup_col, data.keys())

def test_tables(self):
"""context.CartoContext.tables normal usage"""