Skip to content

Commit

Permalink
Merge pull request #1107 from CartoDB/feature/describe
Browse files Browse the repository at this point in the history
Feature/describe
  • Loading branch information
Jesus89 committed Oct 22, 2019
2 parents 6a43dba + 03774c9 commit 35a0439
Show file tree
Hide file tree
Showing 16 changed files with 196 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Added
- Add nested filters for catalog search (#1038, #1069)
- Get list of catalog entities by list of ids or slugs (#1089)
- Add describe methods for CatalogDataset and Variable (#1107)

## Changed
- Remove pandas extension in catalog classes (#1038, #1044)
Expand Down
26 changes: 24 additions & 2 deletions cartoframes/data/observatory/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
from .repository.variable_repo import get_variable_repo
from .repository.variable_group_repo import get_variable_group_repo
from .repository.constants import DATASET_FILTER
from .summary import dataset_describe, head, tail, counts, fields_by_type, geom_coverage
from . import subscription_info
from . import subscriptions
from . import utils

DATASET_TYPE = 'dataset'


class CatalogDataset(CatalogEntity):
entity_repo = get_dataset_repo()

Expand Down Expand Up @@ -78,7 +78,29 @@ def is_public_data(self):

@property
def summary(self):
return self.data['summary_jsonb']
return self.data['summary_json']

def head(self):
    """Return the head glimpse kept in this dataset's ``summary_json``."""
    return head(self.__class__, self.data['summary_json'])

def tail(self):
    """Return the tail glimpse kept in this dataset's ``summary_json``."""
    return tail(self.__class__, self.data['summary_json'])

def counts(self):
    """Return the ``counts`` entry of this dataset's ``summary_json``."""
    return counts(self.data['summary_json'])

def fields_by_type(self):
    """Return the ``fields_by_type`` entry of this dataset's ``summary_json``."""
    return fields_by_type(self.data['summary_json'])

def geom_coverage(self):
    """Return the coverage map built from this dataset's geography."""
    geography_id = self.geography
    return geom_coverage(geography_id)

def describe(self):
    """Return the combined description of this dataset's variables."""
    dataset_variables = self.variables
    return dataset_describe(dataset_variables)

@classmethod
def get_all(cls, filters=None, credentials=None):
Expand Down
2 changes: 1 addition & 1 deletion cartoframes/data/observatory/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CatalogEntity(ABC):

id_field = 'id'
entity_repo = None
export_excluded_fields = ['summary_jsonb']
export_excluded_fields = ['summary_json']

def __init__(self, data):
self.data = data
Expand Down
2 changes: 1 addition & 1 deletion cartoframes/data/observatory/geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def is_public_data(self):

@property
def summary(self):
return self.data['summary_jsonb']
return self.data['summary_json']

@classmethod
def get_all(cls, filters=None, credentials=None):
Expand Down
3 changes: 2 additions & 1 deletion cartoframes/data/observatory/repository/dataset_repo.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import


from .constants import CATEGORY_FILTER, COUNTRY_FILTER, GEOGRAPHY_FILTER, PROVIDER_FILTER, VARIABLE_FILTER
from .entity_repo import EntityRepository

Expand Down Expand Up @@ -49,7 +50,7 @@ def _map_row(self, row):
'update_frequency': self._normalize_field(row, 'update_frequency'),
'version': self._normalize_field(row, 'version'),
'is_public_data': self._normalize_field(row, 'is_public_data'),
'summary_jsonb': self._normalize_field(row, 'summary_jsonb')
'summary_json': self._normalize_field(row, 'summary_json')
}


Expand Down
2 changes: 1 addition & 1 deletion cartoframes/data/observatory/repository/geography_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def _map_row(self, row):
'update_frequency': self._normalize_field(row, 'update_frequency'),
'version': self._normalize_field(row, 'version'),
'is_public_data': self._normalize_field(row, 'is_public_data'),
'summary_jsonb': self._normalize_field(row, 'summary_jsonb')
'summary_json': self._normalize_field(row, 'summary_json')
}


Expand Down
2 changes: 1 addition & 1 deletion cartoframes/data/observatory/repository/variable_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _map_row(self, row):
'agg_method': self._normalize_field(row, 'agg_method'),
'variable_group_id': self._normalize_field(row, 'variable_group_id'),
'starred': self._normalize_field(row, 'starred'),
'summary_jsonb': self._normalize_field(row, 'summary_jsonb')
'summary_json': self._normalize_field(row, 'summary_json')
}


Expand Down
120 changes: 120 additions & 0 deletions cartoframes/data/observatory/summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from shapely import wkb


def variable_describe(data):
    """Build a flat statistical description of a variable from its summary.

    Args:
        data (dict): summary of the variable. The ``'stats'`` and
            ``'quantiles'`` entries are read; both are expected to be
            mappings.

    Returns:
        pandas.Series: the ``'stats'`` entries merged with the
        ``'quantiles'`` entries (quantiles win on duplicate keys), or
        ``None`` when ``data`` is empty or carries no stats.
    """
    if not data or not data.get('stats'):
        return None

    # Work on a copy so the caller's summary is never mutated by the merge.
    stats = dict(data['stats'])
    # 'quantiles' may be absent or None; dict.update(None) raises TypeError.
    stats.update(data.get('quantiles') or {})

    return pd.Series(stats)


def dataset_describe(variables):
    """Combine the descriptions of several variables into one table.

    Args:
        variables (iterable): objects exposing a ``describe()`` method and a
            ``column_name`` attribute.

    Returns:
        pandas.DataFrame: one column per variable, keyed by ``column_name``.
        Variables whose ``describe()`` returns ``None`` are left out.
    """
    describe = {}

    for variable in variables:
        # Describe each variable only once: the result is needed both for
        # the None check and as the column value.
        description = variable.describe()
        if description is None:
            continue

        describe[variable.column_name] = description

    return pd.DataFrame.from_dict(describe)


def head(cls, data):
    """Return the first rows stored in a catalog entity summary.

    Args:
        cls (type): class of the entity the summary belongs to. ``Variable``
            and ``CatalogDataset`` (and their subclasses) are supported.
        data (dict): summary of the entity.

    Returns:
        pandas.Series built from ``data['head']`` for a variable,
        pandas.DataFrame built from ``data['glimpses']['head']`` for a
        dataset, or ``None`` when ``data`` is empty.

    Raises:
        TypeError: if ``cls`` is neither a ``Variable`` nor a
            ``CatalogDataset`` class.
    """
    if not data:
        return None

    # Imported lazily: dataset.py and variable.py both import this module,
    # so a top-level import here would presumably be circular.
    from .dataset import CatalogDataset
    from .variable import Variable

    if issubclass(cls, Variable):
        return pd.Series(data['head'])
    if issubclass(cls, CatalogDataset):
        return pd.DataFrame(data['glimpses']['head'])

    # Previously this fell through to an UnboundLocalError.
    raise TypeError('head is not available for {}'.format(cls))


def tail(cls, data):
    """Return the last rows stored in a catalog entity summary.

    Args:
        cls (type): class of the entity the summary belongs to. ``Variable``
            and ``CatalogDataset`` (and their subclasses) are supported.
        data (dict): summary of the entity.

    Returns:
        pandas.Series built from ``data['tail']`` for a variable,
        pandas.DataFrame built from ``data['glimpses']['tail']`` for a
        dataset, or ``None`` when ``data`` is empty.

    Raises:
        TypeError: if ``cls`` is neither a ``Variable`` nor a
            ``CatalogDataset`` class.
    """
    if not data:
        return None

    # Imported lazily: dataset.py and variable.py both import this module,
    # so a top-level import here would presumably be circular.
    from .dataset import CatalogDataset
    from .variable import Variable

    if issubclass(cls, Variable):
        return pd.Series(data['tail'])
    if issubclass(cls, CatalogDataset):
        return pd.DataFrame(data['glimpses']['tail'])

    # Previously this fell through to an UnboundLocalError.
    raise TypeError('tail is not available for {}'.format(cls))


def counts(data):
    """Wrap the ``'counts'`` entry of a summary in a pandas Series.

    Returns ``None`` when ``data`` is empty.
    """
    return pd.Series(data['counts']) if data else None


def quantiles(data):
    """Wrap the ``'quantiles'`` entry of a summary in a pandas Series.

    Returns ``None`` when ``data`` is empty.
    """
    return pd.Series(data['quantiles']) if data else None


def top_values(data):
    """Show a horizontal bar chart of the ``'top_values'`` summary entry.

    The entry is loaded into a DataFrame whose ``'count'`` column gives the
    bar lengths and whose ``'value'`` column gives the tick labels. Nothing
    is drawn (and ``None`` is returned) when ``data`` is empty.
    """
    if not data:
        return

    frame = pd.DataFrame(data['top_values'])

    # Descending positions, so the first row ends up as the topmost bar.
    row_count = frame.shape[0]
    ticks = list(range(row_count - 1, -1, -1))

    plt.barh(ticks, frame['count'], align='center', alpha=0.5)
    plt.yticks(ticks, frame['value'])
    plt.xlabel('Count')
    plt.ylabel('Value')
    plt.title('Top values')

    plt.show()


def fields_by_type(data):
    """Wrap the ``'fields_by_type'`` entry of a summary in a pandas Series.

    Returns ``None`` when ``data`` is empty.
    """
    return pd.Series(data['fields_by_type']) if data else None


def geom_coverage(geography_id):
    """Build a map showing the coverage geometry of a geography.

    The geography is fetched with ``Geography.get``, its ``geom_coverage``
    attribute is decoded as hex-encoded WKB, and the resulting geometry is
    wrapped in a one-row GeoDataFrame rendered as a single map layer.
    """
    from .geography import Geography
    from ...viz import Map, Layer

    entity = Geography.get(geography_id)
    shape = wkb.loads(entity.geom_coverage, hex=True)
    coverage_frame = gpd.GeoDataFrame({'geometry': [shape]}, geometry='geometry')

    coverage_layer = Layer(coverage_frame)
    return Map(coverage_layer)


def histogram(data):
    """Show a bar chart of the normalized ``'histogram'`` summary entry.

    Args:
        data (dict): summary whose ``'histogram'`` entry is a sequence of
            bins, each with a ``'min_range'`` and a ``'count'`` key.

    Returns:
        None. Nothing is drawn when ``data`` is empty or has no bins.
    """
    # Same empty-summary guard as the sibling helpers; it also covers an
    # empty bin list, which has nothing to plot.
    if not data or not data.get('histogram'):
        return None

    bins = data['histogram']
    range_element = [round(element['min_range'], 2) for element in bins]
    count = [element['count'] for element in bins]

    # Sum once instead of once per bin. The float keeps true division on
    # Python 2, and the zero check avoids ZeroDivisionError when every bin
    # is empty.
    total = float(sum(count))
    count_normalized = [element / total if total else 0.0 for element in count]

    position = list(range(len(range_element)))
    plt.figure(figsize=(12, 7))
    # Bars sit on consecutive integer positions, so the gap is always 1;
    # computing it as position[1] - position[0] failed with a single bin.
    plt.bar(position, count_normalized, align='center', alpha=0.5, width=1)
    plt.xticks(position, range_element)

    plt.title('Histogram')
    plt.xticks(rotation=60)
    plt.show()
31 changes: 30 additions & 1 deletion cartoframes/data/observatory/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .repository.dataset_repo import get_dataset_repo
from .repository.variable_repo import get_variable_repo
from .repository.constants import VARIABLE_FILTER
from .summary import variable_describe, head, tail, counts, quantiles, top_values, histogram


_DESCRIPTION_LENGTH_LIMIT = 30
Expand Down Expand Up @@ -51,7 +52,7 @@ def starred(self):

@property
def summary(self):
return self.data['summary_jsonb']
return self.data['summary_json']

@property
def project_name(self):
Expand All @@ -68,6 +69,34 @@ def dataset_name(self):
_, _, dataset, _ = self.id.split('.')
return dataset

def describe(self):
    """Return the description derived from this variable's ``summary_json``."""
    return variable_describe(self.data['summary_json'])

def head(self):
    """Return the head values kept in this variable's ``summary_json``."""
    return head(self.__class__, self.data['summary_json'])

def tail(self):
    """Return the tail values kept in this variable's ``summary_json``."""
    return tail(self.__class__, self.data['summary_json'])

def counts(self):
    """Return the ``counts`` entry of this variable's ``summary_json``."""
    return counts(self.data['summary_json'])

def quantiles(self):
    """Return the ``quantiles`` entry of this variable's ``summary_json``."""
    return quantiles(self.data['summary_json'])

def top_values(self):
    """Plot the top values recorded in this variable's ``summary_json``."""
    return top_values(self.data['summary_json'])

def histogram(self):
    """Plot the histogram recorded in this variable's ``summary_json``."""
    return histogram(self.data['summary_json'])

def __repr__(self):
descr = self.description

Expand Down
12 changes: 6 additions & 6 deletions test/data/observatory/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
'update_frequency': 'monthly',
'version': '20190203',
'is_public_data': True,
'summary_jsonb': {}
'summary_json': {}
}
db_geography2 = {
'id': 'carto-do-public.tiger.geography_esp_municipalities_2019',
Expand All @@ -51,7 +51,7 @@
'update_frequency': 'monthly',
'version': '20190203',
'is_public_data': False,
'summary_jsonb': {}
'summary_json': {}
}
test_geography1 = Geography(db_geography1)
test_geography2 = Geography(db_geography2)
Expand All @@ -73,7 +73,7 @@
'update_frequency': 'monthly',
'version': '20190203',
'is_public_data': True,
'summary_jsonb': {}
'summary_json': {}
}
db_dataset2 = {
'id': 'carto-do-public.project.basicstats-municipalities',
Expand All @@ -91,7 +91,7 @@
'update_frequency': 'monthly',
'version': '20190203',
'is_public_data': False,
'summary_jsonb': {}
'summary_json': {}
}
test_dataset1 = CatalogDataset(db_dataset1)
test_dataset2 = CatalogDataset(db_dataset2)
Expand All @@ -108,7 +108,7 @@
'agg_method': '',
'variable_group_id': 'vargroup1',
'starred': True,
'summary_jsonb': {}
'summary_json': {}
}
db_variable2 = {
'id': 'carto-do.variable.var2',
Expand All @@ -121,7 +121,7 @@
'agg_method': '',
'variable_group_id': 'vargroup1',
'starred': False,
'summary_jsonb': {}
'summary_json': {}
}
test_variable1 = Variable(db_variable1)
test_variable2 = Variable(db_variable2)
Expand Down
2 changes: 1 addition & 1 deletion test/data/observatory/repository/test_dataset_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def test_missing_fields_are_mapped_as_None(self, mocked_repo):
'update_frequency': None,
'version': None,
'is_public_data': None,
'summary_jsonb': None
'summary_json': None
})])

# When
Expand Down
2 changes: 1 addition & 1 deletion test/data/observatory/repository/test_geography_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def test_missing_fields_are_mapped_as_None(self, mocked_repo):
'update_frequency': None,
'version': None,
'is_public_data': None,
'summary_jsonb': None
'summary_json': None
})])

# When
Expand Down
2 changes: 1 addition & 1 deletion test/data/observatory/repository/test_variable_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def test_missing_fields_are_mapped_as_None(self, mocked_repo):
'agg_method': None,
'variable_group_id': None,
'starred': None,
'summary_jsonb': None
'summary_json': None
})])

# When
Expand Down
4 changes: 2 additions & 2 deletions test/data/observatory/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def test_dataset_properties(self):
assert update_frequency == db_dataset1['update_frequency']
assert version == db_dataset1['version']
assert is_public_data == db_dataset1['is_public_data']
assert summary == db_dataset1['summary_jsonb']
assert summary == db_dataset1['summary_json']

def test_dataset_is_exported_as_series(self):
# Given
Expand All @@ -144,7 +144,7 @@ def test_dataset_is_exported_as_series(self):
def test_dataset_is_exported_as_dict(self):
# Given
dataset = CatalogDataset(db_dataset1)
expected_dict = {key: value for key, value in db_dataset1.items() if key is not 'summary_jsonb'}
expected_dict = {key: value for key, value in db_dataset1.items() if key is not 'summary_json'}

# When
dataset_dict = dataset.to_dict()
Expand Down
4 changes: 2 additions & 2 deletions test/data/observatory/test_geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_geography_properties(self):
assert update_frequency == db_geography1['update_frequency']
assert version == db_geography1['version']
assert is_public_data == db_geography1['is_public_data']
assert summary == db_geography1['summary_jsonb']
assert summary == db_geography1['summary_json']

def test_geography_is_exported_as_series(self):
# Given
Expand All @@ -121,7 +121,7 @@ def test_geography_is_exported_as_series(self):
def test_geography_is_exported_as_dict(self):
# Given
geography = Geography(db_geography1)
expected_dict = {key: value for key, value in db_geography1.items() if key is not 'summary_jsonb'}
expected_dict = {key: value for key, value in db_geography1.items() if key is not 'summary_json'}

# When
geography_dict = geography.to_dict()
Expand Down

0 comments on commit 35a0439

Please sign in to comment.