From 63fb1827b51d4f839259079631b1b36f9ca29ddd Mon Sep 17 00:00:00 2001 From: Ben Yetton Date: Fri, 19 Jun 2015 11:37:45 -0400 Subject: [PATCH 1/3] fixed pyhton 3 bug --- sharepa/analysis.py | 7 +++++-- tests/test_analysis.py | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sharepa/analysis.py b/sharepa/analysis.py index 8041ad6..94d629d 100644 --- a/sharepa/analysis.py +++ b/sharepa/analysis.py @@ -17,7 +17,8 @@ def bucket_to_dataframe(name, buckets, append_name=None): single_dict = item.to_dict() single_dict[name] = single_dict.pop('doc_count') if append_name: - for key in single_dict.keys(): + persistance_dict = single_dict.copy() + for key in persistance_dict.keys(): single_dict[append_name + '.' + key] = single_dict.pop(key) expanded_buckets.append(single_dict) return pd.DataFrame(expanded_buckets) @@ -36,7 +37,9 @@ def agg_to_two_dim_dataframe(agg): if dict not in [type(item) for item in bucket_as_dict.values()]: return bucket_to_dataframe('doc_count', agg.buckets) else: - name_of_lower_level = bucket_as_dict.keys()[0] + lower_level_dict = [item for item in bucket_as_dict.keys() if type(bucket_as_dict[item]) is dict] + assert len(lower_level_dict) == 1 + name_of_lower_level = lower_level_dict[0] single_level_dataframe = bucket_to_dataframe(bucket.key, bucket[name_of_lower_level]['buckets'], name_of_lower_level) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index f325983..f130232 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -3,7 +3,7 @@ from elasticsearch_dsl.utils import AttrDict import pandas as pd from mock import Mock -from sharepa.analysis import +from sharepa.analysis import agg_to_two_dim_dataframe def test_basic_aggregation(): search_with_basic_aggs = ShareSearch() @@ -112,7 +112,7 @@ def test_two_dim_aggregation(): search_with_two_dim_aggs.execute = Mock(return_value=search_mock) search_results = search_with_two_dim_aggs.execute() two_dim_dataframe = agg_to_two_dim_dataframe(search_results.testing_two_dim_aggs) - print two_dim_dataframe + print(two_dim_dataframe) assert isinstance(two_dim_dataframe, pd.DataFrame) assert two_dim_dataframe.shape[1] == 4 assert two_dim_dataframe.shape[0] == 18 @@ -131,3 +131,5 @@ def test_merge_dataframes(): assert isinstance(family, pd.core.frame.DataFrame) assert family.columns.values.tolist() == ['Rhodes', 'Rhodes'] assert family.index.item() == 'Rhodes' + + From 980b85b13a103d8e01896a8bcb326f7a540a9290 Mon Sep 17 00:00:00 2001 From: Ben Yetton Date: Fri, 19 Jun 2015 14:37:10 -0400 Subject: [PATCH 2/3] added testing of eroor checking --- tests/test_analysis.py | 246 ++++++++++++++++++++++++++++++++--------- 1 file changed, 191 insertions(+), 55 deletions(-) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 0f7435d..9b74b0d 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -4,6 +4,7 @@ import pandas as pd from mock import Mock from sharepa.analysis import agg_to_two_dim_dataframe +import pytest def test_basic_aggregation(): search_with_basic_aggs = ShareSearch() @@ -44,7 +45,7 @@ def test_basic_aggregation(): def test_two_dim_aggregation(): - num_cols = 5; + num_cols = 5 search_with_two_dim_aggs = ShareSearch() search_with_two_dim_aggs.aggs.bucket( 'testing_two_dim_agg', @@ -52,62 +53,98 @@ def test_two_dim_aggregation(): field='subjects', size=num_cols, min_doc_count=0 - ).metric( - 'dates', - 'date_histogram', - field='providerUpdatedDateTime', - interval='1M', - format='yyyy-MM-dd', - extended_bounds={ - "min": "2014-01-01", - "max": "2014-12-31"}, - min_doc_count=0 + ).metric( + 'dates', + 'date_histogram', + field='providerUpdatedDateTime', + interval='1M', + format='yyyy-MM-dd', + extended_bounds={ + "min": "2014-01-01", + "max": "2014-12-31"}, + min_doc_count=0 ) # Create Mock object for test so we are not actually hitting server: search_mock = AttrDict({u'testing_two_dim_aggs': - {u'buckets': - [{u'dates': {u'buckets': - [{u'key': 1388534400000, u'key_as_string': u'2014-01-01', u'doc_count': 0}, - {u'key': 1391212800000, u'key_as_string': u'2014-02-01', u'doc_count': 0}, - {u'key': 1393632000000, u'key_as_string': u'2014-03-01', u'doc_count': 0}, - {u'key': 1396310400000, u'key_as_string': u'2014-04-01', u'doc_count': 0}, - {u'key': 1398902400000, u'key_as_string': u'2014-05-01', u'doc_count': 0}, - {u'key': 1401580800000, u'key_as_string': u'2014-06-01', u'doc_count': 0}, - {u'key': 1404172800000, u'key_as_string': u'2014-07-01', u'doc_count': 0}, - {u'key': 1406851200000, u'key_as_string': u'2014-08-01', u'doc_count': 0}, - {u'key': 1409529600000, u'key_as_string': u'2014-09-01', u'doc_count': 0}, - {u'key': 1412121600000, u'key_as_string': u'2014-10-01', u'doc_count': 4286}, - {u'key': 1414800000000, u'key_as_string': u'2014-11-01', u'doc_count': 7415}, - {u'key': 1417392000000, u'key_as_string': u'2014-12-01', u'doc_count': 4732}, - {u'key': 1420070400000, u'key_as_string': u'2015-01-01', u'doc_count': 4623}, - {u'key': 1422748800000, u'key_as_string': u'2015-02-01', u'doc_count': 8219}, - {u'key': 1425168000000, u'key_as_string': u'2015-03-01', u'doc_count': 10152}, - {u'key': 1427846400000, u'key_as_string': u'2015-04-01', u'doc_count': 10658}, - {u'key': 1430438400000, u'key_as_string': u'2015-05-01', u'doc_count': 22954}, - {u'key': 1433116800000, u'key_as_string': u'2015-06-01', u'doc_count': 17333}]}, - u'key': u'and', u'doc_count': 90372}, - {u'dates': {u'buckets': - [{u'key': 1388534400000, u'doc_count': 0, u'key_as_string': u'2014-01-01'}, - {u'key': 1391212800000, u'doc_count': 0, u'key_as_string': u'2014-02-01'}, - {u'key': 1393632000000, u'doc_count': 0, u'key_as_string': u'2014-03-01'}, - {u'key': 1396310400000, u'doc_count': 0, u'key_as_string': u'2014-04-01'}, - {u'key': 1398902400000, u'doc_count': 0, u'key_as_string': u'2014-05-01'}, - {u'key': 1401580800000, u'doc_count': 0, u'key_as_string': u'2014-06-01'}, - {u'key': 1404172800000, u'doc_count': 0, u'key_as_string': u'2014-07-01'}, - {u'key': 1406851200000, u'doc_count': 0, u'key_as_string': u'2014-08-01'}, - {u'key': 1409529600000, u'doc_count': 0, u'key_as_string': u'2014-09-01'}, - {u'key': 1412121600000, u'doc_count': 0, u'key_as_string': u'2014-10-01'}, - {u'key': 1414800000000, u'doc_count': 0, u'key_as_string': u'2014-11-01'}, - {u'key': 1417392000000, u'doc_count': 1685, u'key_as_string': u'2014-12-01'}, - {u'key': 1420070400000, u'doc_count': 13871, u'key_as_string': u'2015-01-01'}, - {u'key': 1422748800000, u'doc_count': 10175, u'key_as_string': u'2015-02-01'}, - {u'key': 1425168000000, u'doc_count': 14745, u'key_as_string': u'2015-03-01'}, - {u'key': 1427846400000, u'doc_count': 13881, u'key_as_string': u'2015-04-01'}, - {u'key': 1430438400000, u'doc_count': 16696, u'key_as_string': u'2015-05-01'}, - {u'key': 1433116800000, u'doc_count': 10784, u'key_as_string': u'2015-06-01'}]}, - u'key': u'article', u'doc_count': 81837}] - }}) + {u'buckets': + [{u'dates': {u'buckets': + [{u'key': 1388534400000, u'key_as_string': u'2014-01-01', + u'doc_count': 0}, + {u'key': 1391212800000, u'key_as_string': u'2014-02-01', + u'doc_count': 0}, + {u'key': 1393632000000, u'key_as_string': u'2014-03-01', + u'doc_count': 0}, + {u'key': 1396310400000, u'key_as_string': u'2014-04-01', + u'doc_count': 0}, + {u'key': 1398902400000, u'key_as_string': u'2014-05-01', + u'doc_count': 0}, + {u'key': 1401580800000, u'key_as_string': u'2014-06-01', + u'doc_count': 0}, + {u'key': 1404172800000, u'key_as_string': u'2014-07-01', + u'doc_count': 0}, + {u'key': 1406851200000, u'key_as_string': u'2014-08-01', + u'doc_count': 0}, + {u'key': 1409529600000, u'key_as_string': u'2014-09-01', + u'doc_count': 0}, + {u'key': 1412121600000, u'key_as_string': u'2014-10-01', + u'doc_count': 4286}, + {u'key': 1414800000000, u'key_as_string': u'2014-11-01', + u'doc_count': 7415}, + {u'key': 1417392000000, u'key_as_string': u'2014-12-01', + u'doc_count': 4732}, + {u'key': 1420070400000, u'key_as_string': u'2015-01-01', + u'doc_count': 4623}, + {u'key': 1422748800000, u'key_as_string': u'2015-02-01', + u'doc_count': 8219}, + {u'key': 1425168000000, u'key_as_string': u'2015-03-01', + u'doc_count': 10152}, + {u'key': 1427846400000, u'key_as_string': u'2015-04-01', + u'doc_count': 10658}, + {u'key': 1430438400000, u'key_as_string': u'2015-05-01', + u'doc_count': 22954}, + {u'key': 1433116800000, u'key_as_string': u'2015-06-01', + u'doc_count': 17333}]}, + u'key': u'and', u'doc_count': 90372}, + {u'dates': {u'buckets': + [{u'key': 1388534400000, u'doc_count': 0, + u'key_as_string': u'2014-01-01'}, + {u'key': 1391212800000, u'doc_count': 0, + u'key_as_string': u'2014-02-01'}, + {u'key': 1393632000000, u'doc_count': 0, + u'key_as_string': u'2014-03-01'}, + {u'key': 1396310400000, u'doc_count': 0, + u'key_as_string': u'2014-04-01'}, + {u'key': 1398902400000, u'doc_count': 0, + u'key_as_string': u'2014-05-01'}, + {u'key': 1401580800000, u'doc_count': 0, + u'key_as_string': u'2014-06-01'}, + {u'key': 1404172800000, u'doc_count': 0, + u'key_as_string': u'2014-07-01'}, + {u'key': 1406851200000, u'doc_count': 0, + u'key_as_string': u'2014-08-01'}, + {u'key': 1409529600000, u'doc_count': 0, + u'key_as_string': u'2014-09-01'}, + {u'key': 1412121600000, u'doc_count': 0, + u'key_as_string': u'2014-10-01'}, + {u'key': 1414800000000, u'doc_count': 0, + u'key_as_string': u'2014-11-01'}, + {u'key': 1417392000000, u'doc_count': 1685, + u'key_as_string': u'2014-12-01'}, + {u'key': 1420070400000, u'doc_count': 13871, + u'key_as_string': u'2015-01-01'}, + {u'key': 1422748800000, u'doc_count': 10175, + u'key_as_string': u'2015-02-01'}, + {u'key': 1425168000000, u'doc_count': 14745, + u'key_as_string': u'2015-03-01'}, + {u'key': 1427846400000, u'doc_count': 13881, + u'key_as_string': u'2015-04-01'}, + {u'key': 1430438400000, u'doc_count': 16696, + u'key_as_string': u'2015-05-01'}, + {u'key': 1433116800000, u'doc_count': 10784, + u'key_as_string': u'2015-06-01'}]}, + u'key': u'article', u'doc_count': 81837}] + }}) search_with_two_dim_aggs.execute = Mock(return_value=search_mock) search_results = search_with_two_dim_aggs.execute() @@ -115,12 +152,111 @@ def test_two_dim_aggregation(): assert isinstance(two_dim_dataframe, pd.DataFrame) assert two_dim_dataframe.shape[1] == 4 assert two_dim_dataframe.shape[0] == 18 - for index,bucket in enumerate(search_results.testing_two_dim_aggs.buckets): + for index, bucket in enumerate(search_results.testing_two_dim_aggs.buckets): for index_2nd_level, raw_date in enumerate(two_dim_dataframe['dates.key']): - assert raw_date == search_results.testing_two_dim_aggs.buckets[index].dates.buckets[index_2nd_level]['dates.key'] + assert raw_date == search_results.testing_two_dim_aggs.buckets[index].dates.buckets[index_2nd_level][ + 'dates.key'] for index_2nd_level, count_num in enumerate(two_dim_dataframe['dates.and']): assert count_num == search_results.testing_two_dim_aggs.buckets[0].dates.buckets[index_2nd_level]['dates.and'] + +def test_throw_error_in_agg_convert_when_too_many_levels(): + num_cols = 5 + search_with_multilevel = ShareSearch() + search_with_multilevel.aggs.bucket( + 'testing_two_dim_agg', + 'terms', + field='subjects', + size=num_cols, + min_doc_count=0 + ).metric( + 'source', + 'terms', + field='source', + size=10, + min_doc_count=0 + ).metric( + 'dates', + 'date_histogram', + field='providerUpdatedDateTime', + interval='1M', + format='yyyy-MM-dd', + extended_bounds={ + "min": "2014-01-01", + "max": "2014-12-31"}, + min_doc_count=0 + ) + + # Create Mock object for test so we are not actually hitting server: + search_mock = AttrDict({u'testing_multilevel': + {u'buckets': + [ + { + u"source": {u"buckets": + [{u"key": u"dataone", u"doc_count": 15760}, + {u"key": u"clinicaltrials", u"doc_count": 21}] + }, + u'dates': {u'buckets': + [{u'key': 1388534400000, u'key_as_string': u'2014-01-01', u'doc_count': 0}, + {u'key': 1391212800000, u'key_as_string': u'2014-02-01', u'doc_count': 0}, + {u'key': 1393632000000, u'key_as_string': u'2014-03-01', u'doc_count': 0}, + {u'key': 1396310400000, u'key_as_string': u'2014-04-01', u'doc_count': 0}, + {u'key': 1398902400000, u'key_as_string': u'2014-05-01', u'doc_count': 0}, + {u'key': 1401580800000, u'key_as_string': u'2014-06-01', u'doc_count': 0}, + {u'key': 1404172800000, u'key_as_string': u'2014-07-01', u'doc_count': 0}, + {u'key': 1406851200000, u'key_as_string': u'2014-08-01', u'doc_count': 0}, + {u'key': 1409529600000, u'key_as_string': u'2014-09-01', u'doc_count': 0}, + {u'key': 1412121600000, u'key_as_string': u'2014-10-01', u'doc_count': 4286}, + {u'key': 1414800000000, u'key_as_string': u'2014-11-01', u'doc_count': 7415}, + {u'key': 1417392000000, u'key_as_string': u'2014-12-01', u'doc_count': 4732}, + {u'key': 1420070400000, u'key_as_string': u'2015-01-01', u'doc_count': 4623}, + {u'key': 1422748800000, u'key_as_string': u'2015-02-01', u'doc_count': 8219}, + {u'key': 1425168000000, u'key_as_string': u'2015-03-01', u'doc_count': 10152}, + {u'key': 1427846400000, u'key_as_string': u'2015-04-01', u'doc_count': 10658}, + {u'key': 1430438400000, u'key_as_string': u'2015-05-01', u'doc_count': 22954}, + {u'key': 1433116800000, u'key_as_string': u'2015-06-01', u'doc_count': 17333}] + }, + u'key': u'and', u'doc_count': 90372 + }, + { + u"source":{u"buckets": + [ + {u"key": u"dataone", u"doc_count": 100}, + {u"key": u"clinicaltrials", u"doc_count": 4} + ] + }, + u'dates': {u'buckets': + [{u'key': 1388534400000, u'doc_count': 0, u'key_as_string': u'2014-01-01'}, + {u'key': 1391212800000, u'doc_count': 0, u'key_as_string': u'2014-02-01'}, + {u'key': 1393632000000, u'doc_count': 0, u'key_as_string': u'2014-03-01'}, + {u'key': 1396310400000, u'doc_count': 0, u'key_as_string': u'2014-04-01'}, + {u'key': 1398902400000, u'doc_count': 0, u'key_as_string': u'2014-05-01'}, + {u'key': 1401580800000, u'doc_count': 0, u'key_as_string': u'2014-06-01'}, + {u'key': 1404172800000, u'doc_count': 0, u'key_as_string': u'2014-07-01'}, + {u'key': 1406851200000, u'doc_count': 0, u'key_as_string': u'2014-08-01'}, + {u'key': 1409529600000, u'doc_count': 0, u'key_as_string': u'2014-09-01'}, + {u'key': 1412121600000, u'doc_count': 0, u'key_as_string': u'2014-10-01'}, + {u'key': 1414800000000, u'doc_count': 0, u'key_as_string': u'2014-11-01'}, + {u'key': 1417392000000, u'doc_count': 1685, u'key_as_string': u'2014-12-01'}, + {u'key': 1420070400000, u'doc_count': 13871, u'key_as_string': u'2015-01-01'}, + {u'key': 1422748800000, u'doc_count': 10175, u'key_as_string': u'2015-02-01'}, + {u'key': 1425168000000, u'doc_count': 14745, u'key_as_string': u'2015-03-01'}, + {u'key': 1427846400000, u'doc_count': 13881, u'key_as_string': u'2015-04-01'}, + {u'key': 1430438400000, u'doc_count': 16696, u'key_as_string': u'2015-05-01'}, + {u'key': 1433116800000, u'doc_count': 10784, u'key_as_string': u'2015-06-01'}]}, + u'key': u'article', u'doc_count': 81837 + } + ] + } + }) + + search_with_multilevel.execute = Mock(return_value=search_mock) + search_results = search_with_multilevel.execute() + with pytest.raises(ValueError): + multilevel_dataframe = agg_to_two_dim_dataframe(search_results.testing_multilevel) + +test_throw_error_in_agg_convert_when_too_many_levels() + def test_merge_dataframes(): dream = pd.DataFrame({'Rhodes': 'Dusty'}, index=['Rhodes']) stardust = pd.DataFrame({'Rhodes': 'Cody'}, index=['Rhodes']) From fa9b08df3fe3ccc99451f32698d7409da621c262 Mon Sep 17 00:00:00 2001 From: Ben Yetton Date: Fri, 19 Jun 2015 14:51:12 -0400 Subject: [PATCH 3/3] added testing of backet_to_df --- tests/test_analysis.py | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 9b74b0d..5c3a877 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -1,4 +1,4 @@ -from sharepa.analysis import agg_to_two_dim_dataframe, merge_dataframes +from sharepa.analysis import agg_to_two_dim_dataframe, merge_dataframes, bucket_to_dataframe from sharepa.search import ShareSearch from elasticsearch_dsl.utils import AttrDict import pandas as pd @@ -6,6 +6,43 @@ from sharepa.analysis import agg_to_two_dim_dataframe import pytest +def test_bucket_to_data_frame(): + search_basic = ShareSearch() + search_basic.aggs.bucket( + 'agg_by_subs', # Every aggregation needs a name + 'terms', + field='subjects', + size=10, + min_doc_count=0 + ) + + # Create Mock object for test so we are not actually hitting server: + search_mock = AttrDict({u'agg_by_subs': + {u'buckets': + [ + {u'key': u'and', u'doc_count': 90372}, + {u'key': u'article', u'doc_count': 81837}, + {u'key': u'physics', u'doc_count': 74792}, + {u'key': u'astrophysics', u'doc_count': 45266}, + {u'key': u'energy', u'doc_count': 41402}, + {u'key': u'high', u'doc_count': 41272}, + {u'key': u'mathematics', u'doc_count': 37734}, + {u'key': u'research', u'doc_count': 36819}, + {u'key': u'theory', u'doc_count': 34277}, + {u'key': u'matter', u'doc_count': 33688} + ] + } + }) + search_basic.execute = Mock(return_value=search_mock) + search_results = search_basic.execute() + basic_dataframe = bucket_to_dataframe('ten_top_subjects', search_results.agg_by_subs.buckets) + assert isinstance(basic_dataframe, pd.DataFrame) + assert basic_dataframe.shape[0] == 10 + for index, row_count in enumerate(basic_dataframe.ten_top_subjects): + assert row_count == search_results.agg_by_subs.buckets[index].ten_top_subjects + for index, row_name in enumerate(basic_dataframe.key): + assert row_name == search_results.agg_by_subs.buckets[index].key + def test_basic_aggregation(): search_with_basic_aggs = ShareSearch() search_with_basic_aggs.aggs.bucket( @@ -44,6 +81,7 @@ def test_basic_aggregation(): assert row_name == search_results.testing_basic_agg.buckets[index].key + def test_two_dim_aggregation(): num_cols = 5 search_with_two_dim_aggs = ShareSearch() @@ -255,8 +293,6 @@ def test_throw_error_in_agg_convert_when_too_many_levels(): with pytest.raises(ValueError): multilevel_dataframe = agg_to_two_dim_dataframe(search_results.testing_multilevel) -test_throw_error_in_agg_convert_when_too_many_levels() - def test_merge_dataframes(): dream = pd.DataFrame({'Rhodes': 'Dusty'}, index=['Rhodes']) stardust = pd.DataFrame({'Rhodes': 'Cody'}, index=['Rhodes'])