From 842dce7a8019cdda0e44f3dabcf24e40c575e88e Mon Sep 17 00:00:00 2001 From: Ben Yetton Date: Fri, 19 Jun 2015 20:35:09 -0400 Subject: [PATCH 1/4] [WIP] Added experemental data analysis functions for dealing with nested data --- sharepa/experimental_analysis_functions.py | 72 ++++++++++++++++++++++ tests/test_experimental_functions.py | 36 +++++++++++ 2 files changed, 108 insertions(+) create mode 100644 sharepa/experimental_analysis_functions.py create mode 100644 tests/test_experimental_functions.py diff --git a/sharepa/experimental_analysis_functions.py b/sharepa/experimental_analysis_functions.py new file mode 100644 index 0000000..69cd2f4 --- /dev/null +++ b/sharepa/experimental_analysis_functions.py @@ -0,0 +1,72 @@ + +import pandas as pd + +def convert_nested_to_dataframe(agg, dates_as_key=True): + '''A function that takes nested elasticsearch response with aggregation and returns a nested dataframe + Warning: This is a recursive function, and rather non-intuitive to understand + + This function takes nested and crossed aggregations and converts them to an easy to manipulates pandas dataframe + e.g. Here we have a gender aggregation nested in year which is nested in state + + the output we want: + + state year gender doc_count + CA 2000 male 2 + CA 2000 female 5 + CA 2001 male 5 + CA 2001 female 5 + CA 2002 male 5 + CA 2002 female 5 + MN 2000 male 2 + MN 2000 female 5 + MN 2001 male 5 + MN 2001 female 5 + MN 2002 male 5 + MN 2002 female 5 + NY 2000 male 2 + NY 2000 female 5 + NY 2001 male 5 + NY 2001 female 5 + NY 2002 male 5 + NY 2002 female 5 + + What we want to do is step down through all the layers of nested data (recursively) until we reach the end, + and from the end, start creating pandas dataframes that get merged back into one giant dataframe + + this function is in an experimental state, and currently only tested on 2 nested levels, crossed date does not work + :param agg: an aggregation from elasticsearch results + :type agg: elasticsearch response.aggregation.agg_name object + :returns: pandas data frame of one or two dimetions depending on input data + ''' + agg_as_dict = agg.to_dict() + cat_names = [item for item in agg_as_dict.keys() if type(agg_as_dict[item]) is dict] + for cat_name in cat_names: #FIXME deal with multiple aggergations at the same level + expanded_buckets = [] + merge_vert = False + for bucket in getattr(agg, cat_name).buckets: + bucket_as_dict = bucket.to_dict() + if dict not in [type(item) for item in bucket_as_dict.values()]: + # we are at lowest level, begin return + if ('key_as_string' in bucket_as_dict.keys()) and dates_as_key: #change dates to readble format + bucket_as_dict['key'] = bucket['key_as_string'] + + bucket_as_dict[cat_name] = bucket_as_dict.pop('key') #change the name of the key to something meaningful + expanded_buckets.append(bucket_as_dict) #combine each dict at the lowest level + + else: + #We are at some level other than th lowest + level_name = str(bucket.key) #save the name of this level + lower_level_return = convert_nested_to_dataframe(bucket) #and drop down into the next level + cat_name_dataframe = pd.DataFrame([level_name for i in range(0,lower_level_return.shape[0])]) #create a cat name column + cat_name_dataframe.columns = [cat_name] #name the column something meaningful + merged_names_dataframe = pd.concat([cat_name_dataframe, lower_level_return], axis=1) #add return dataframes from lower levels, and attach the cat name coloum + expanded_buckets.append(merged_names_dataframe) #combine each cat and its data + 
merge_vert = True + # if merge_vert: + # dataframe_out = pd.concat(expanded_buckets, axis=0) + # return dataframe_out + if not merge_vert: + dataframe_out = pd.DataFrame(expanded_buckets) + dataframe_out.rename(columns=lambda x: x.replace('key', cat_name)) + return dataframe_out # FIXME this return here means we cannot add in other catogories at the same level (cant deal with corssing) + return pd.concat(expanded_buckets, axis=0) diff --git a/tests/test_experimental_functions.py b/tests/test_experimental_functions.py new file mode 100644 index 0000000..a42c960 --- /dev/null +++ b/tests/test_experimental_functions.py @@ -0,0 +1,36 @@ +import json +from sharepa.search import ShareSearch +from sharepa.experimental_analysis_functions import convert_nested_to_dataframe + +def pretty_print(d): + print(json.dumps(d, indent=4)) + +my_search = ShareSearch() +my_search_two_level_bins = ShareSearch() +my_search_two_level_bins.aggs.bucket( + 'tags', # Every aggregation needs a name + 'terms', + field='tags', + # We store the source of a document in its type, so this will aggregate by source #BYNOTE so this looks at the type feild and agregates by that? + size=10, # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs + min_doc_count=0, +).metric( + 'source', + 'terms', + field='source', + size=10, + min_doc_count=0 +).metric( + 'dates', + 'date_histogram', + field='providerUpdatedDateTime', + interval='1M', + format='yyyy-MM-dd', + extended_bounds={ + "min": "2014-01-01", + "max": "2015-06-01"}, + min_doc_count=0 +) + +my_results_two_level_bins = my_search_two_level_bins.execute() +print(convert_nested_to_dataframe(my_results_two_level_bins.aggregations)) From 1130a1623bc58118609ce04cc4983dadc7ee9402 Mon Sep 17 00:00:00 2001 From: Ben Yetton Date: Mon, 22 Jun 2015 14:49:03 -0400 Subject: [PATCH 2/4] added testing and fixed crossing and 3 level nesting --- sharepa/experimental_analysis_functions.py | 69 +++-- tests/test_experimental_functions.py | 291 +++++++++++++++++++-- 2 files changed, 308 insertions(+), 52 deletions(-) diff --git a/sharepa/experimental_analysis_functions.py b/sharepa/experimental_analysis_functions.py index 69cd2f4..0544146 100644 --- a/sharepa/experimental_analysis_functions.py +++ b/sharepa/experimental_analysis_functions.py @@ -1,6 +1,6 @@ - import pandas as pd + def convert_nested_to_dataframe(agg, dates_as_key=True): '''A function that takes nested elasticsearch response with aggregation and returns a nested dataframe Warning: This is a recursive function, and rather non-intuitive to understand @@ -30,43 +30,66 @@ def convert_nested_to_dataframe(agg, dates_as_key=True): NY 2002 male 5 NY 2002 female 5 - What we want to do is step down through all the layers of nested data (recursively) until we reach the end, + What we do is step down through all the layers of nested data (recursively) until we reach the end, and from the end, start creating pandas dataframes that get merged back into one giant dataframe - this function is in an experimental state, and currently only tested on 2 nested levels, crossed date does not work - :param agg: an aggregation from elasticsearch results - :type agg: elasticsearch response.aggregation.agg_name object - :returns: pandas data frame of one or two dimetions depending on input data + this function is in an experimental state, and currently only tested on 3 nested levels, + TODO crossed data does not work + :param agg: an aggregation from elasticsearch results with nesting + :type agg: 
elasticsearch response.aggregation object
+    :returns: pandas dataframe like the example above, with nested data
     '''
+    crossed_cats_expanded = []
+    high_level_returning = False
     agg_as_dict = agg.to_dict()
     cat_names = [item for item in agg_as_dict.keys() if type(agg_as_dict[item]) is dict]
-    for cat_name in cat_names: #FIXME deal with multiple aggergations at the same level
+    for cat_name in cat_names:  # TODO deal with multiple aggregations at the same level (Crossing)
         expanded_buckets = []
         merge_vert = False
+        if not len(getattr(agg, cat_name).buckets):
+            raise ValueError('There is no count data in the lowest level of nesting. Is your search set up correctly?')
+
         for bucket in getattr(agg, cat_name).buckets:
             bucket_as_dict = bucket.to_dict()
             if dict not in [type(item) for item in bucket_as_dict.values()]:
                 # we are at lowest level, begin return
-                if ('key_as_string' in bucket_as_dict.keys()) and dates_as_key: #change dates to readble format
+                if ('key_as_string' in bucket_as_dict.keys()) and dates_as_key:  # change dates to readable format
                     bucket_as_dict['key'] = bucket['key_as_string']
+                    bucket_as_dict.pop('key_as_string')
 
-                bucket_as_dict[cat_name] = bucket_as_dict.pop('key') #change the name of the key to something meaningful
-                expanded_buckets.append(bucket_as_dict) #combine each dict at the lowest level
-
+                bucket_as_dict[cat_name] = bucket_as_dict.pop(
+                    'key')  # change the name of the key to something meaningful
+                expanded_buckets.append(bucket_as_dict)  # combine each dict at the lowest level
             else:
-                #We are at some level other than th lowest
-                level_name = str(bucket.key) #save the name of this level
-                lower_level_return = convert_nested_to_dataframe(bucket) #and drop down into the next level
-                cat_name_dataframe = pd.DataFrame([level_name for i in range(0,lower_level_return.shape[0])]) #create a cat name column
-                cat_name_dataframe.columns = [cat_name] #name the column something meaningful
-                merged_names_dataframe = pd.concat([cat_name_dataframe, lower_level_return], axis=1) #add return dataframes from lower levels, and attach the cat name coloum
-                expanded_buckets.append(merged_names_dataframe) #combine each cat and its data
+                # We are at some level other than the lowest
+                level_name = str(bucket.key)  # save the name of this level
+                lower_level_return = convert_nested_to_dataframe(bucket)  # and drop down into the next level
+                expanded_buckets.append(add_category_labels(level_name,cat_name,lower_level_return))
                 merge_vert = True
-        # if merge_vert:
-        #     dataframe_out = pd.concat(expanded_buckets, axis=0)
-        #     return dataframe_out
         if not merge_vert:
             dataframe_out = pd.DataFrame(expanded_buckets)
             dataframe_out.rename(columns=lambda x: x.replace('key', cat_name))
-            return dataframe_out # FIXME this return here means we cannot add in other catogories at the same level (cant deal with corssing)
-    return pd.concat(expanded_buckets, axis=0)
+            crossed_cats_expanded.append(dataframe_out.reset_index(drop=True))
+            high_level_returning = True
+
+    if high_level_returning:
+        return pd.concat(crossed_cats_expanded, axis=1).reset_index(drop=True)
+    else:
+        return pd.concat(expanded_buckets, axis=0).reset_index(drop=True)
+
+
+def add_category_labels(level_name,cat_name,dataframe_needing_cat):
+    '''A function that adds a category name column to a pandas dataframe
+
+    :param level_name: the bucket key to use as the category value for every row
+    :type level_name: str
+    :param cat_name: the name of the aggregation, used as the new column name
+    :type cat_name: str
+    :param dataframe_needing_cat: a pandas dataframe to add the category name column to
+    :type dataframe_needing_cat: pandas.DataFrame
+    :returns: the input dataframe with the category name column prepended
+    '''
+    cat_name_dataframe = pd.DataFrame(
+        [level_name for i in range(0, dataframe_needing_cat.shape[0])])  # create a cat name column
+    cat_name_dataframe.columns = [cat_name]  # name the column something meaningful
+    return pd.concat([cat_name_dataframe, dataframe_needing_cat], axis=1)
diff --git a/tests/test_experimental_functions.py b/tests/test_experimental_functions.py
index a42c960..ad769e7 100644
--- a/tests/test_experimental_functions.py
+++ b/tests/test_experimental_functions.py
@@ -1,36 +1,269 @@
 import json
+import sys
 from sharepa.search import ShareSearch
 from sharepa.experimental_analysis_functions import convert_nested_to_dataframe
+from elasticsearch_dsl.utils import AttrDict
+from mock import Mock
+
 
 def pretty_print(d):
     print(json.dumps(d, indent=4))
 
-my_search = ShareSearch()
-my_search_two_level_bins = ShareSearch()
-my_search_two_level_bins.aggs.bucket(
-    'tags', # Every aggregation needs a name
-    'terms',
-    field='tags',
-    # We store the source of a document in its type, so this will aggregate by source #BYNOTE so this looks at the type feild and agregates by that?
-    size=10, # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs
-    min_doc_count=0,
-).metric(
-    'source',
-    'terms',
-    field='source',
-    size=10,
-    min_doc_count=0
-).metric(
-    'dates',
-    'date_histogram',
-    field='providerUpdatedDateTime',
-    interval='1M',
-    format='yyyy-MM-dd',
-    extended_bounds={
-        "min": "2014-01-01",
-        "max": "2015-06-01"},
-    min_doc_count=0
-)
-
-my_results_two_level_bins = my_search_two_level_bins.execute()
-print(convert_nested_to_dataframe(my_results_two_level_bins.aggregations))
+def test_convert_nested_to_dataframe_crossed():
+    my_search = ShareSearch()  # BASE_URL='https://staging.osf.io/api/v1/share/search/')
+
+    # first we test crossed data
+    my_search.aggs.bucket(
+        'tags',  # Every aggregation needs a name
+        'terms',
+        field='tags',
+        # We store the source of a document in its type, so this will aggregate by source #BYNOTE so this looks at the type field and aggregates by that?
+ size=3, # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs + min_doc_count=0, + ).metric( + 'source', + 'terms', + field='source', + size=3, + min_doc_count=0 + ).metric( + 'dates', + 'date_histogram', + field='providerUpdatedDateTime', + interval='1y', + format='yyyy-MM-dd', + extended_bounds={ + "min": "2014-01-01", + "max": "2015-01-01"}, + min_doc_count=0 + ) + + search_mock = AttrDict({u'aggregations':{u'tags': {u'buckets': + [{u'dates': {u'buckets': [{u'doc_count': 5, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 15776, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 15781, + u'key': u'water', + u'source': {u'buckets': [{u'doc_count': 15760, u'key': u'dataone'}, + {u'doc_count': 21, u'key': u'clinicaltrials'}, + {u'doc_count': 0, u'key': u'arxiv_oai'}], + u'doc_count_error_upper_bound': 0, + u'sum_other_doc_count': 0}}, + {u'dates': {u'buckets': [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 15505, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 15505, + u'key': u'california', + u'source': {u'buckets': [{u'doc_count': 15505, u'key': u'dataone'}, + {u'doc_count': 0, u'key': u'arxiv_oai'}, + {u'doc_count': 0, u'key': u'asu'}], + u'doc_count_error_upper_bound': 0, + u'sum_other_doc_count': 0}}, + {u'dates': {u'buckets': [{u'doc_count': 1, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 14825, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 14826, + u'key': u'county', + u'source': {u'buckets': [{u'doc_count': 14825, u'key': u'dataone'}, + {u'doc_count': 1, u'key': u'clinicaltrials'}, + {u'doc_count': 0, u'key': u'arxiv_oai'}], + u'doc_count_error_upper_bound': 0, + u'sum_other_doc_count': 0}}], + u'doc_count_error_upper_bound': 5860, + u'sum_other_doc_count': 706643}}}) + + my_search.execute = Mock(return_value=search_mock) + my_results = my_search.execute() + my_dataframe = convert_nested_to_dataframe(my_results.aggregations) + + assert my_dataframe.shape == (9, 5) + for tag_buckets in my_results.aggregations.tags.buckets: + assert tag_buckets.key in my_dataframe['tags'].values.tolist() + for source_buckets in tag_buckets.source.buckets: + assert source_buckets.source in my_dataframe['source'].values.tolist() or (dates_buckets.dates is 'NaN') + for dates_buckets in tag_buckets.dates.buckets: + assert (dates_buckets.dates in my_dataframe['dates'].values.tolist()) or (dates_buckets.dates is 'NaN') + +def test_convert_nested_to_dataframe_nested(): + my_search = ShareSearch() + my_search.aggs.bucket( + 'tags', # Every aggregation needs a name + 'terms', + field='tags', + # We store the source of a document in its type, so this will aggregate by source #BYNOTE so this looks at the type feild and agregates by that? 
+ size=3, # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs + min_doc_count=0, + ).bucket( + 'source', + 'terms', + field='source', + size=3, + min_doc_count=0 + ).bucket( + 'dates', + 'date_histogram', + field='providerUpdatedDateTime', + interval='1y', + format='yyyy-MM-dd', + extended_bounds={ + "min": "2014-11-01", + "max": "2015-01-01"}, + min_doc_count=0 + ) + + search_mock = AttrDict({u'aggregations': + {u'tags': {u'buckets': [{u'doc_count': 15781, + u'key': u'water', + u'source': {u'buckets': [ + {u'dates': {u'buckets': + [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 15760, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'} + ]}, + u'doc_count': 15760, + u'key': u'dataone'}, + {u'dates': {u'buckets': + [{u'doc_count': 5, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 16, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'} + ]}, + u'doc_count': 21, + u'key': u'clinicaltrials'}, + {u'dates': {u'buckets': + [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 0, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'} + ]}, + u'doc_count': 0, + u'key': u'arxiv_oai'}], + u'doc_count_error_upper_bound': 0, + u'sum_other_doc_count': 0}}, + {u'doc_count': 15505, + u'key': u'california', + u'source': {u'buckets': [{u'dates': {u'buckets': [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 15505, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 15505, + u'key': u'dataone'}, + {u'dates': {u'buckets': [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 0, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 0, + u'key': u'arxiv_oai'}, + {u'dates': {u'buckets': [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 0, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 0, + u'key': u'asu'}], + u'doc_count_error_upper_bound': 0, + u'sum_other_doc_count': 0}}, + {u'doc_count': 14826, + u'key': u'county', + u'source': {u'buckets': [{u'dates': {u'buckets': [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 14825, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 14825, + u'key': u'dataone'}, + {u'dates': {u'buckets': [{u'doc_count': 1, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 0, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 1, + u'key': u'clinicaltrials'}, + {u'dates': {u'buckets': [{u'doc_count': 0, + u'key': 1388534400000, + u'key_as_string': u'2014-01-01'}, + {u'doc_count': 0, + u'key': 1420070400000, + u'key_as_string': u'2015-01-01'}]}, + u'doc_count': 0, + u'key': u'arxiv_oai'}], + u'doc_count_error_upper_bound': 0, + u'sum_other_doc_count': 0}}], + u'doc_count_error_upper_bound': 5860, + u'sum_other_doc_count': 706643} + }}) + + my_search.execute = Mock(return_value=search_mock) + my_results = my_search.execute() + my_dataframe = convert_nested_to_dataframe(my_results.aggregations) + assert my_dataframe.shape == (18, 4) + for tag_buckets in my_results.aggregations.tags.buckets: + assert tag_buckets.key in my_dataframe['tags'].values.tolist() + for source_buckets in tag_buckets.source.buckets: + assert source_buckets.key in 
my_dataframe['source'].values.tolist() + for dates_buckets in source_buckets.dates.buckets: + assert dates_buckets.dates in my_dataframe['dates'].values.tolist() + + # TODO create test to throw ValueError exception + +test_convert_nested_to_dataframe_crossed() +test_convert_nested_to_dataframe_nested() + +# The search the broke Share? +# my_search = ShareSearch() +# +# my_search.aggs.bucket( +# 'tags', # Every aggregation needs a name +# 'terms', +# field='tags', +# # We store the source of a document in its type, so this will aggregate by source #BYNOTE so this looks at the type feild and agregates by that? +# size=10, # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs +# min_doc_count=0, +# ).bucket( +# 'source', +# 'terms', +# field='source', +# size=10, +# min_doc_count=0 +# ).bucket( +# 'tags2', +# 'terms', +# field='tags', +# size=10, +# min_doc_count=0 +# ).bucket( +# 'dates', +# 'date_histogram', +# field='providerUpdatedDateTime', +# interval='1M', +# format='yyyy-MM-dd', +# extended_bounds={ +# "min": "2014-10-01", +# "max": "2015-01-01"}, +# min_doc_count=0 +# ) + From e9b762f08f445d14c96e4f85f8444b7b80df608d Mon Sep 17 00:00:00 2001 From: Ben Yetton Date: Mon, 29 Jun 2015 10:08:13 -0400 Subject: [PATCH 3/4] Fixed Flake8 issues --- sharepa/experimental_analysis_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sharepa/experimental_analysis_functions.py b/sharepa/experimental_analysis_functions.py index 0544146..525401e 100644 --- a/sharepa/experimental_analysis_functions.py +++ b/sharepa/experimental_analysis_functions.py @@ -64,7 +64,7 @@ def convert_nested_to_dataframe(agg, dates_as_key=True): # We are at some level other than the lowest level_name = str(bucket.key) # save the name of this level lower_level_return = convert_nested_to_dataframe(bucket) # and drop down into the next level - expanded_buckets.append(add_category_labels(level_name,cat_name,lower_level_return)) + expanded_buckets.append(add_category_labels(level_name, cat_name, lower_level_return)) merge_vert = True if not merge_vert: dataframe_out = pd.DataFrame(expanded_buckets) @@ -78,7 +78,7 @@ def convert_nested_to_dataframe(agg, dates_as_key=True): return pd.concat(expanded_buckets, axis=0).reset_index(drop=True) -def add_category_labels(level_name,cat_name,dataframe_needing_cat): +def add_category_labels(level_name, cat_name, dataframe_needing_cat): '''A function that adds a category name column to a pandas dataframe :param level_name: an aggregation from elasticsearch results with nesting From 675d54d7d2cb10b3b2c9153ca78e7300effe8196 Mon Sep 17 00:00:00 2001 From: Ben Yetton Date: Mon, 29 Jun 2015 10:44:12 -0400 Subject: [PATCH 4/4] added barebones of valueerror test, cannot be completed due to sharepa issues --- tests/test_experimental_functions.py | 78 +++++++++++++++------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/tests/test_experimental_functions.py b/tests/test_experimental_functions.py index ad769e7..4cc819c 100644 --- a/tests/test_experimental_functions.py +++ b/tests/test_experimental_functions.py @@ -228,42 +228,48 @@ def test_convert_nested_to_dataframe_nested(): for dates_buckets in source_buckets.dates.buckets: assert dates_buckets.dates in my_dataframe['dates'].values.tolist() - # TODO create test to throw ValueError exception -test_convert_nested_to_dataframe_crossed() -test_convert_nested_to_dataframe_nested() +def test_convert_nested_to_dataframe_raise_ValueError(): -# The search 
the broke Share? -# my_search = ShareSearch() -# -# my_search.aggs.bucket( -# 'tags', # Every aggregation needs a name -# 'terms', -# field='tags', -# # We store the source of a document in its type, so this will aggregate by source #BYNOTE so this looks at the type feild and agregates by that? -# size=10, # These are just to make sure we get numbers for all the sources, to make it easier to combine graphs -# min_doc_count=0, -# ).bucket( -# 'source', -# 'terms', -# field='source', -# size=10, -# min_doc_count=0 -# ).bucket( -# 'tags2', -# 'terms', -# field='tags', -# size=10, -# min_doc_count=0 -# ).bucket( -# 'dates', -# 'date_histogram', -# field='providerUpdatedDateTime', -# interval='1M', -# format='yyyy-MM-dd', -# extended_bounds={ -# "min": "2014-10-01", -# "max": "2015-01-01"}, -# min_doc_count=0 -# ) + return + + #FIXME currently this search breaks sharepa, no sure why, but needed to raise the value error + + my_search = ShareSearch() # BASE_URL='https://staging.osf.io/api/v1/share/search/') + + # first we test crossed data + my_search.aggs.bucket( + 'tags', # Every aggregation needs a name + 'terms', + field='tags', + size=3, + min_doc_count=0, + ).bucket( + 'source', + 'terms', + field='source', + size=3, + min_doc_count=0 + ).bucket( + 'tags2', + 'terms', + field='tags', + size=10, + min_doc_count=0 + ).bucket( + 'dates', + 'date_histogram', + field='providerUpdatedDateTime', + interval='1y', + format='yyyy-MM-dd', + extended_bounds={ + "min": "2014-01-01", + "max": "2015-01-01"}, + min_doc_count=0 + ) + + #TODO create Mock return object for my_search.execute() here + my_results = my_search.execute() + my_dataframe = convert_nested_to_dataframe(my_results.aggregations) + print(my_dataframe)
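
For reference, a minimal usage sketch of the helper these patches add, assuming the series above is applied and a reachable SHARE endpoint. The aggregation names ('sources', 'dates'), the interval, and the final groupby are illustrative only and are not taken from the patches:

    from sharepa.search import ShareSearch
    from sharepa.experimental_analysis_functions import convert_nested_to_dataframe

    # Two-level nested aggregation: documents bucketed by source,
    # then by month within each source.
    my_search = ShareSearch()
    my_search.aggs.bucket(
        'sources',  # outer bucket name (illustrative)
        'terms',
        field='source',
        size=10,
        min_doc_count=0
    ).bucket(
        'dates',  # inner bucket name (illustrative)
        'date_histogram',
        field='providerUpdatedDateTime',
        interval='1M',
        format='yyyy-MM-dd',
        min_doc_count=0
    )

    my_results = my_search.execute()

    # Flatten the nested buckets into one long dataframe with
    # 'sources', 'dates' and 'doc_count' columns, one row per (source, month) pair.
    my_dataframe = convert_nested_to_dataframe(my_results.aggregations)
    print(my_dataframe.head())

    # From here it is plain pandas, e.g. total documents per source:
    print(my_dataframe.groupby('sources')['doc_count'].sum())

Because every nesting level becomes an ordinary column, the flattened result can be filtered, pivoted, or plotted directly with pandas.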