In [None]:
import requests
import pandas as pd

In [None]:
## Base path of the active DataARC API
BASE_URL = "https://api.data-arc.org"


## A basic function to generate a path based on the
## documentation located at https://api.data-arc.org/documentation
## example: MAKE_PATH("datasets")
def MAKE_PATH(path: str) -> str:
    """
    Build the full URL of an API end point

    params
    ------
    path: str
      the end point path relative to BASE_URL; a leading "/" is
      tolerated, so "datasets" and "/datasets" give the same URL

    returns
    -------
    str
      the absolute URL of the end point
    """
    # Strip leading slashes so the joined URL never contains "//"
    return f"{BASE_URL}/{path.lstrip('/')}"


# End point URIs
DATASET_END_POINT = "datasets"
DATASET_FEATURES_END_POINT = "features"

# Utility Functions

def get_dataset_by_name(name: str) -> dict:
    """
    Simple method for quickly querying and filtering a dataset
    based on a name

    params
    ------
    name: str
      The string name as it is in the database

    returns
    -------
    dict
      The returned JSON data from the API for the first dataset
      whose `name` equals the requested one

    raises
    ------
    IndexError
      when none of the returned datasets carries the requested name
    """
    datasets = requests.get(MAKE_PATH(DATASET_END_POINT)).json()
    # Stop at the first match instead of filtering the whole list
    match = next((d for d in datasets if d.get('name') == name), None)
    if match is None:
        # Keep the exception type of the former `[0]` lookup, but say
        # which dataset was missing instead of "list index out of range"
        raise IndexError(f"No dataset named {name!r} was returned by the API")
    return match


def map_feature_to_nested_property(feature, prop='properties'):
    """
    This is a basic function to extract nested dictionaries from features
    returned from the DataARC database. This is not the same as flattening
    as only the specified nested dictionary is extracted.
    The original feature id is kept as `_id` in case future REST queries
    require this database specific id to be in place

    params
    ------
    feature: dict
      the feature returned from the DataARC API; it is left unmodified

    prop: str
      the nested property to be returned - usually a dictionary value

    returns
    -------
    dict
      a copy of the extracted nested property with the `_id` in place;
      a missing or empty (e.g. None) property yields just the `_id`
    """
    # `or {}` also covers a property that is present but set to None
    nested = feature.get(prop) or {}
    ## Capture the database ID to run additional feature queries.
    ## Build a new dict so the caller's feature is not altered
    return {**nested, '_id': feature.get('id')}


def get_combinators_from_features(features: list) -> dict:
    """
    Collect combinators based on a unique list of feature _id values

    params
    ------
    features: List[str]
      a unique list of str _id values resulted from the /features api
      (any iterable of ids, e.g. a set, is accepted)

    returns
    -------
    dict
      a dictionary containing both the found related and contextual unique
      ids as properties; both are empty sets when no feature ids are given
    """
    # Materialise once so sets/iterators can be tested for emptiness
    feature_ids = list(features)

    related = set()
    contextual = set()

    if not feature_ids:
        # requests encodes an empty list as no query parameter at all, so
        # the request would go out without any `features_in` filter
        return {'related': related, 'contextual': contextual}

    # Use the original feature _id to collect concepts through their combinators
    data = requests.get(MAKE_PATH("combinators"), {"features_in": feature_ids}).json()

    # Each combinator carries a `concepts` list of dictionaries, each of
    # which may hold `related` and `contextual` id lists
    for combinator in data:
        for concept in combinator.get('concepts') or []:
            # `or []` skips entries where the key is missing or null
            related.update(concept.get('related') or [])
            contextual.update(concept.get('contextual') or [])

    return {
        'related': related,
        'contextual': contextual
    }


def make_concepts_request(concept_ids: list) -> pd.DataFrame:
    """
    Collect concepts from their _id values

    params
    ------
    concept_ids: List[str]
      The id returned from the API end point for /concepts
      (any iterable of ids, e.g. a set, is accepted)

    returns
    -------
    pd.DataFrame
      A precompiled DataFrame with the concept results; an empty
      DataFrame when no ids are given
    """
    # Materialise once so sets/iterators can be tested for emptiness
    ids = list(concept_ids)
    if not ids:
        # requests encodes an empty list as no query parameter at all, so
        # the request would go out without any `id_in` filter
        return pd.DataFrame()

    data = requests.get(MAKE_PATH("concepts"), {
        "id_in": ids
    }).json()
    return pd.DataFrame(data)


def get_features_by_dataset_id(dataset_id: str) -> pd.DataFrame:
    """
    Fetch the features of one dataset and return them as a DataFrame

    params
    ------
    dataset_id: str
      the dataset id (or _id) returned from the /datasets endpoint

    returns
    -------
    pd.DataFrame
      one row per feature, built from each feature's nested
      properties as produced by map_feature_to_nested_property
    """
    ## `_limit` of -1 retrieves all records without a limit
    query = {"dataset": dataset_id, "_limit": -1}
    response = requests.get(MAKE_PATH(DATASET_FEATURES_END_POINT), params=query)

    rows = [map_feature_to_nested_property(feature) for feature in response.json()]
    return pd.DataFrame(rows)



### Use case 1: 
I'm interested in different communities' reliance on marine vs. terrestrial resources for food. I want to find places where there's a fairly balanced reliance on these. As a ballpark figure, I'd start by looking at places where the ratio is something between 40/60 and 60/40 marine to terrestrial.

Step 1: I know from exploring on the front end UI that nabonosead has some of the data I want... I've seen indicators of both marine and terrestrial resources in that data when I searched on a concept about 'livestock'. I head to the 'advanced user interface' aka this jupyter notebook

Step 2: I make an api call to get the list of datasets (so I know what the nabonosead dataset is called in the API) and another API call to get the list of field names in the nabonosead data

In [None]:
# Collect the nabonosead dataset dictionary
nabonosead_dataset = get_dataset_by_name('nabonosead')
# The dataset has all fields in the initial return - this can be extracted from the dict.
# Default to an empty list (as the tephrabase cell does) when `fields` is absent
nabonosead_dataset_fields = nabonosead_dataset.get('fields', [])
# Create a DataFrame from the extracted fields
nabonosead_fields_df = pd.DataFrame(nabonosead_dataset_fields)
# Display the fields so the column names can be read off for the next steps
nabonosead_fields_df

Step 3: I can see that Indicators Freshwater Fish, Indicators Marine Fish, Indicators Domestic and Indicators Wild seem like likely candidate fields to point at sites I'd want to investigate.

Step 4: I want to construct my query to get sites with a balanced ratio... So I start by trying an API query that gets all the sites where Indicators Freshwater Fish OR Indicators Marine Fish OR Indicators Domestic OR Indicators Wild >0 and put them into a dataframe.

In [None]:
# Fetch every nabonosead feature through the helper defined above,
# using the `id` stored in the dataset dictionary
features_df = get_features_by_dataset_id(nabonosead_dataset.get('id'))

# The indicator columns of interest
indicator_columns = [
    'indicators_wild',
    'indicators_marine_fish',
    'indicators_domestic',
    'indicators_freshwater_fish',
]

# Keep the rows where at least one of the indicators is above zero
# (the row-wise `any` is the OR of the per-column conditions)
sub_features = features_df[(features_df[indicator_columns] > 0).any(axis=1)]

Step 5: That's given me more sites than what I really want (balanced and unbalanced ratios are included). But I can filter the dataframe using the magic of pandas. I do 

```
df.mydataset = df.loc[
    ((Indicators Freshwater Fish + Indicators Marine Fish) / (Indicators Domestic + Indicators Wild) >= 40/60) & 
    ((Indicators Freshwater Fish + Indicators Marine Fish) / (Indicators Domestic + Indicators Wild) <= 60/40)
]
``` 


In [None]:
# Marine and terrestrial indicator totals for every candidate feature
marine = sub_features['indicators_freshwater_fish'] + sub_features['indicators_marine_fish']
terrestrial = sub_features['indicators_domestic'] + sub_features['indicators_wild']

# Create a new column in a new DataFrame with the marine-to-terrestrial `ratio` applied
features_with_ratio_df = sub_features.assign(ratio=marine / terrestrial)
# Drop NaN ratios (e.g. 0 / 0 or a missing indicator value). A non-zero marine
# total over a zero terrestrial total gives `inf`, which the bounds below exclude
features_with_a_valid_ratio = features_with_ratio_df.dropna(subset=["ratio"])

# Collect the balanced features remaining: a marine-to-terrestrial ratio
# between 40/60 and 60/40 (bounds included). Testing the ratio against
# 0.4 and 0.6 would instead select terrestrial-dominated features
balanced = features_with_a_valid_ratio[
    features_with_a_valid_ratio['ratio'].between(40 / 60, 60 / 40)
]
balanced

Step 6: I am happy and maybe make myself some nice charts showing the different ratios in my subset of sites with balanced ratios...

In [None]:
# Report how many rows the `balanced` selection holds
f"{len(balanced)} features found"

Step 7: I wonder what other data might be connected via concepts and also relevant to these sites with balanced resource use. I run an API call against the ids of these items from nabonosead to get the set of concepts that are 'related' and 'contextual' to their concepts <-- this part I'm not sure how you have it set up in the API.

In [None]:
# Reduce the balanced features to their distinct `_id` values,
# which is what the combinators helper expects
unique_feature_ids = set(balanced['_id'])

# Concepts are reached through combinators, so fetch those first
combinators = get_combinators_from_features(unique_feature_ids)

# Pull out the `related` and `contextual` entries in one step
related, contextual = combinators.get('related'), combinators.get('contextual')

In [None]:
# Turn both id collections into concept DataFrames with the same helper;
# `related` is requested first, then `contextual`
related_df, contextual_df = map(make_concepts_request, (related, contextual))

Step 8: I could go back to the UI and search on these concepts, or I could search for the datasets linked to one of the related concepts from here...

### Use case 2: 
I'm interested in looking at places with lots of data from short well defined timespans. From looking around in the UI, I can see the tephrabase data should let me find places with well defined chronologies and then I could see what other data is available in these places and how much of it there is.

Step 1: I do an API query to get the list of dataset names and another to get the list of field names in the tephrabase dataset

In [None]:
# Collect the tephrabase dictionary
tephrabase_dataset = get_dataset_by_name('tephrabase')
# Extract the fields from the dictionary directly
tephrabase_dataset_fields = tephrabase_dataset.get('fields', [])
# Construct the DataFrame to inspect the fields
tephrabase_dataset_fields_df = pd.DataFrame(tephrabase_dataset_fields)
# Display it, as the nabonosead cell does, so the field names can actually be read
tephrabase_dataset_fields_df

Step 2: I can see the earliest date and latest date fields in the tephrabase dataset. I construct an API query to get the tephrabase data into a dataframe.

In [None]:
# Read the `id` out of the dataset dictionary ...
tephrabase_dataset_id = tephrabase_dataset.get('id')
# ... and hand it to the helper that collects all features into a DataFrame
tephrabase_features_df = get_features_by_dataset_id(tephrabase_dataset_id)

Step 3: I filter my dataframe to select rows where df.mydataset = df.loc[tephrabase(latest date - earliest date)<100] to get sequences of less than 100 years

In [None]:
# Span covered by each feature, from its `earliestyear` and `latestyear` columns
year_span = tephrabase_features_df['latestyear'] - tephrabase_features_df['earliestyear']

# Keep only the features whose span is below 100
tephrabase_less_than_100 = tephrabase_features_df[year_span < 100]
tephrabase_less_than_100

Step 4: Assuming this gives me a reasonable set of sites, I do another API query to see what concepts are mapped to these sites.

In [None]:
# Reduce the short-span tephrabase features to their distinct `_id` values,
# which is what the combinators helper expects
unique_feature_ids = set(tephrabase_less_than_100['_id'])

# Concepts are reached through combinators, so fetch those first
combinators = get_combinators_from_features(unique_feature_ids)

# Pull out the `related` and `contextual` entries in one step
related, contextual = combinators.get('related'), combinators.get('contextual')

In [None]:
# Turn both id collections into concept DataFrames with the same helper;
# `related` is requested first, then `contextual`
related_df, contextual_df = map(make_concepts_request, (related, contextual))

In [None]:
# Display the DataFrame built from the `related` ids
related_df

In [None]:
# Display the DataFrame built from the `contextual` ids
contextual_df

Step 5: I go back to the UI and apply a spatial filter to look at the specific sites I've identified here and see what other data is related to them AND is mapped to the same concepts (by filtering by concept).