In [None]:
#| default_exp discovery

In [None]:
#| export
from istatapi.base import ISTAT
from istatapi.utils import make_tree, strip_ns
import pandas as pd
from fastcore.test import *

<!-- leftover merge-conflict marker removed (was: "<<<<<<< HEAD") -->

In [None]:
#| hide
from nbdev.showdoc import *

<!-- leftover merge-conflict marker removed (was: "=======") -->

In [None]:
#| hide
from istatapi.base import ISTAT
from istatapi.utils import make_tree, strip_ns
import pandas as pd
from fastcore.test import *

In [None]:
#| export
from istatapi.base import ISTAT
from istatapi.utils import make_tree, strip_ns
import pandas as pd
from fastcore.test import *

<!-- leftover merge-conflict marker removed (was: ">>>>>>> c2fb413 (update fastai and fastcore)") -->

# Discovery

> Functions used to discover and explore the data exposed by ISTAT webservice.

This module implements functions to discover the data exposed by ISTAT. To do so, `istatapi` makes metadata requests to the API endpoints. The `Discovery` module provides useful methods to parse and analyze API metadata responses. It makes use of the library `pandas` and returns data in the `DataFrame` format, making it convenient for interactive and exploratory analysis in Jupyter Notebooks.

The main class implemented in the `Discovery` module is `DataSet`.

def parse_dataflows(response):
    """Parse the `response` listing all the available datasets and return a list of dataflows.

    Each dataflow is returned as a dictionary with the keys `df_id`,
    `version`, `df_description` and `df_structure_id`.

    `df_description` is the text of the `Name` child tagged `xml:lang="en"`.
    When a dataflow has no English name, the first `Name` in any other
    language is used instead, and `None` when it has no `Name` at all.
    `df_structure_id` is the `id` of the first `Ref` descendant, or `None`
    when the dataflow has none.
    """
    tree = make_tree(response)
    strip_ns(tree)
    root = tree.root

    # The `xml:lang` attribute is looked up through its namespace-qualified key.
    lang_attr = "{http://www.w3.org/XML/1998/namespace}lang"

    dataflows_l = []
    for dataflow in root.iter("Dataflow"):
        ref = next(dataflow.iter("Ref"), None)
        structure_id = ref.get("id") if ref is not None else None

        # Reset for every dataflow: otherwise a dataflow without an English
        # name would reuse the description of the previous one (or raise
        # NameError if it is the first dataflow of the response).
        description_en = None
        fallback_description = None
        for name in dataflow.findall("Name"):
            if name.get(lang_attr) == "en":
                description_en = name.text
            elif fallback_description is None:
                fallback_description = name.text
        if description_en is None:
            description_en = fallback_description

        dataflows_l.append(
            {
                "df_id": dataflow.get("id"),
                "version": dataflow.get("version"),
                "df_description": description_en,
                "df_structure_id": structure_id,
            }
        )

    return dataflows_l


def all_available(dataframe=True):
    """Return all available dataflows.

    Requests the `dataflow/IT1` resource through an `ISTAT` client and parses
    the response with `parse_dataflows`.

    With `dataframe` truthy (the default) the dataflows are returned as a
    pandas `DataFrame`; otherwise the list of dictionaries produced by
    `parse_dataflows` is returned unchanged.
    """
    client = ISTAT()
    response = client._request(path="dataflow/IT1")
    dataflows = parse_dataflows(response)

    # Plain truthiness test instead of `== True`.
    return pd.DataFrame(dataflows) if dataframe else dataflows


def search_dataset(keyword):
    """Search available dataflows that contain `keyword`. Return these dataflows in a DataFrame.

    The match is case-insensitive and is performed on the `df_description`
    column with `Series.str.contains`, so `keyword` is interpreted as a
    regular expression (the pandas default).

    Raises `ValueError` when no description matches.
    """
    # Fetch the listing once: every call to `all_available()` performs a request.
    available = all_available()

    # `na=False` treats dataflows without a description as non-matching
    # instead of producing a mask that cannot be used for indexing.
    matches = available["df_description"].str.contains(keyword, case=False, na=False)
    dataflows = available[matches]

    if dataflows.empty:
        raise ValueError(f"No dataset matching `{keyword}`")

    return dataflows

In [None]:
#| export
def parse_dataflows(response):
    """Parse the `response` listing all the available datasets and return a list of dataflows.

    Each dataflow is returned as a dictionary with the keys `df_id`,
    `version`, `df_description` and `df_structure_id`.

    `df_description` is the text of the `Name` child tagged `xml:lang="en"`.
    When a dataflow has no English name, the first `Name` in any other
    language is used instead, and `None` when it has no `Name` at all.
    `df_structure_id` is the `id` of the first `Ref` descendant, or `None`
    when the dataflow has none.
    """
    tree = make_tree(response)
    strip_ns(tree)
    root = tree.root

    # The `xml:lang` attribute is looked up through its namespace-qualified key.
    lang_attr = "{http://www.w3.org/XML/1998/namespace}lang"

    dataflows_l = []
    for dataflow in root.iter("Dataflow"):
        ref = next(dataflow.iter("Ref"), None)
        structure_id = ref.get("id") if ref is not None else None

        # Reset for every dataflow: otherwise a dataflow without an English
        # name would reuse the description of the previous one (or raise
        # NameError if it is the first dataflow of the response).
        description_en = None
        fallback_description = None
        for name in dataflow.findall("Name"):
            if name.get(lang_attr) == "en":
                description_en = name.text
            elif fallback_description is None:
                fallback_description = name.text
        if description_en is None:
            description_en = fallback_description

        dataflows_l.append(
            {
                "df_id": dataflow.get("id"),
                "version": dataflow.get("version"),
                "df_description": description_en,
                "df_structure_id": structure_id,
            }
        )

    return dataflows_l


def all_available(dataframe=True):
    """Return all available dataflows.

    Requests the `dataflow/IT1` resource through an `ISTAT` client and parses
    the response with `parse_dataflows`.

    With `dataframe` truthy (the default) the dataflows are returned as a
    pandas `DataFrame`; otherwise the list of dictionaries produced by
    `parse_dataflows` is returned unchanged.
    """
    client = ISTAT()
    response = client._request(path="dataflow/IT1")
    dataflows = parse_dataflows(response)

    # Plain truthiness test instead of `== True`.
    return pd.DataFrame(dataflows) if dataframe else dataflows


def search_dataset(keyword):
    """Search available dataflows that contain `keyword`. Return these dataflows in a DataFrame.

    The match is case-insensitive and is performed on the `df_description`
    column with `Series.str.contains`, so `keyword` is interpreted as a
    regular expression (the pandas default).

    Raises `ValueError` when no description matches.
    """
    # Fetch the listing once: every call to `all_available()` performs a request.
    available = all_available()

    # `na=False` treats dataflows without a description as non-matching
    # instead of producing a mask that cannot be used for indexing.
    matches = available["df_description"].str.contains(keyword, case=False, na=False)
    dataflows = available[matches]

    if dataflows.empty:
        raise ValueError(f"No dataset matching `{keyword}`")

    return dataflows

The simplest way to get a full list of the dataflows provided by ISTAT is to call the function `all_available()`, which by default returns a `DataFrame` of all the explorable dataflows, together with their IDs and descriptions (pass `dataframe=False` to get a plain list of dictionaries instead).

In [None]:
show_doc(all_available)

---

### all_available

>      all_available (dataframe=True)

Return all available dataflows

In [None]:
# Fetch the full dataflow listing (a DataFrame by default) and preview its first rows.
available_datasets = all_available()
available_datasets.head()

Unnamed: 0,df_id,version,df_description,df_structure_id
0,101_1015,1.3,Crops,DCSP_COLTIVAZIONI
1,101_1030,1.0,"PDO, PGI and TSG quality products",DCSP_DOPIGP
2,101_1033,1.0,slaughtering,DCSP_MACELLAZIONI
3,101_1039,1.2,Agritourism - municipalities,DCSP_AGRITURISMO_COM
4,101_1077,1.0,"PDO, PGI and TSG products: operators - munici...",DCSP_DOPIGP_COM


In [None]:
print(f'number of available datasets: {len(available_datasets)}')

number of available datasets: 501


In [None]:
# The listing must expose exactly these four columns, in this order.
test_eq(available_datasets.columns, ['df_id', 'version', 'df_description', 'df_structure_id'])

In [None]:
show_doc(search_dataset)

---

### search_dataset

>      search_dataset (keyword)

Search available dataflows that contain `keyword`. Return these dataflows in a DataFrame

This function looks for `keyword` inside all dataset descriptions, ignoring case. The descriptions are in English, so `keyword` needs to be an English word.

In [None]:
# Case-insensitive search: "Tax" also matches lower-case occurrences such as "tax rates".
df = search_dataset(keyword="Tax")
df.head()

Unnamed: 0,df_id,version,df_description,df_structure_id
168,168_261,1.1,Hicp - at constant tax rates annual data(base ...,DCSP_IPCATC2
169,168_306,1.2,Hicp - at constant tax rates monthly data (bas...,DCSP_IPCATC1
172,168_756,1.4,Hicp - at constant tax rates monthly data (bas...,DCSP_IPCATC1B2015
173,168_757,1.1,Hicp- at constant tax rates annual data (base ...,DCSP_IPCATC2B2015
265,30_1008,1.1,Irpef taxable incomes (Ipef) - municipalities,MEF_REDDITIIRPEF_COM


In [None]:
# An Italian keyword is expected to match no (English) description, so the search must raise.
test_fail(lambda: search_dataset(keyword="disoccupazione"))

## Data Structures and Information about available Datasets

class DataSet(ISTAT):
    """Class that implements methods to retrieve information (metadata) about a Dataset.

    A `DataSet` is built from any one of the identifiers of a dataflow
    (`df_id`, `df_structure_id` or `df_description`) as listed by
    `all_available()`.
    """

    # Namespace-qualified key under which the `xml:lang` attribute is looked up.
    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, dataflow_identifier):
        super().__init__()
        self.resource = "datastructure"
        # DataFrame with all the available dataflows, used to resolve identifiers.
        self.all_available = all_available()
        # Dictionary with the record of the selected dataflow.
        self.identifiers = self.set_identifiers(dataflow_identifier)
        # {dimension name: {"values_ids": [...], "values_description": [...]}}
        self.available_values = self.get_available_values()
        # Names of the dimensions of the dataflow.
        self.dimensions = list(self.dimensions_info(description=False).dimension)
        # {dimension name: filter}; "." means no filter, i.e. all values.
        self.filters = self.default_filters()

        # TODO: returning all metadata related to the dataflow contained in 'Header'

    def _find_dataflow(self, column, value):
        """Return the first record of `self.all_available` whose `column` equals `value`, or `None`."""
        rows = self.all_available[self.all_available[column] == value]
        if rows.empty:
            return None
        return rows.to_dict(orient="records")[0]

    def _require_dataflow(self, column, value):
        """Like `_find_dataflow`, but raise `ValueError` when nothing matches."""
        record = self._find_dataflow(column, value)
        if record is None:
            raise ValueError(f"No available dataflow with {column} equal to {value!r}")
        return record

    def _english_name(self, element):
        """Return the text of the English `Name` child of `element`.

        The child tagged `xml:lang="en"` is preferred. When no child carries
        that tag, the second `Name` child is returned, which is the position
        the English name was previously always read from.
        """
        names = element.findall("Name")
        for name in names:
            if name.get(self._XML_LANG) == "en":
                return name.text
        return names[1].text

    def set_identifiers(self, dataflow_identifier):
        """Take any type of `dataflow_identifier` and return all identifiers in a dictionary.

        The identifier is looked up as a `df_id`, then as a `df_structure_id`
        and finally as a `df_description`. The kind of identifier is not
        guessed from the position of the underscore, because real ids do not
        share one layout (e.g. `30_1008`, `151_914`, `MEF_REDDITIIRPEF_COM`).

        Raises `ValueError` if `dataflow_identifier` is not a string or does
        not match any available dataflow.
        """
        if not isinstance(dataflow_identifier, str):
            raise ValueError(dataflow_identifier)

        for column in ("df_id", "df_structure_id", "df_description"):
            record = self._find_dataflow(column, dataflow_identifier)
            if record is not None:
                return record

        raise ValueError(f"No available dataflow matches {dataflow_identifier!r}")

    def set_from_id(self, df_id):
        """Return the identifiers of the dataflow whose `df_id` is `df_id`."""
        return self._require_dataflow("df_id", df_id)

    def set_from_structure_id(self, df_structure_id):
        """Return the identifiers of the first dataflow whose `df_structure_id` is `df_structure_id`."""
        return self._require_dataflow("df_structure_id", df_structure_id)

    def set_from_description(self, description):
        """Return the identifiers of the first dataflow whose `df_description` is exactly `description`."""
        return self._require_dataflow("df_description", description)

    def parse_dimensions(self, response):
        """Parse the `response` containing a dataflow's dimensions and return them in a list.

        Each item is a dictionary with the name of the dimension (`dimension`)
        and the id referenced by its first `Enumeration` (`dimension_ID`).
        """
        tree = make_tree(response)
        strip_ns(tree)
        root = tree.root

        dimensions_l = []
        for dimension in root.iter("Dimension"):
            enumeration_ids = [
                enumeration.find("Ref").get("id")
                for enumeration in dimension.iter("Enumeration")
            ]
            dimensions_l.append(
                {"dimension": dimension.attrib["id"], "dimension_ID": enumeration_ids[0]}
            )

        return dimensions_l

    def dimensions_info(self, dataframe=True, description=True):
        """Return the dimensions of a specific dataflow and their `descriptions`.

        With `description` truthy a `description` column/key is added to every
        dimension (one extra request per distinct `dimension_ID`). With
        `dataframe` truthy the result is a pandas `DataFrame`, otherwise a
        list of dictionaries.
        """
        df_structure_id = self.identifiers["df_structure_id"]
        path = "/".join([self.resource, self.agencyID, df_structure_id])
        response = self._request(path=path)
        records = self.parse_dimensions(response)

        if not dataframe and not description:
            return records

        # The merge needs a DataFrame even when a list is requested, so build
        # it first and convert back at the end.
        dimensions = pd.DataFrame(records, columns=["dimension", "dimension_ID"])
        if description:
            dimensions_description = self.dimensions_description(dimensions)
            dimensions = dimensions.merge(dimensions_description, on="dimension_ID")

        return dimensions if dataframe else dimensions.to_dict(orient="records")

    def dimensions_description(self, dimensions):
        """Return a dataframe with the descriptions of `dimensions`.

        `dimensions` is a DataFrame with a `dimension_ID` column. The result
        has one row per distinct `dimension_ID`, with the columns
        `dimension_ID` and `description`.
        """
        resource = "codelist"
        descriptions_l = []

        # Distinct ids only: a repeated id would cost a redundant request and
        # multiply the rows when the result is merged back on `dimension_ID`.
        for dimension_id in dimensions.dimension_ID.drop_duplicates().tolist():
            path = "/".join([resource, self.agencyID, dimension_id])
            response = self._request(path=path)
            tree = make_tree(response)
            strip_ns(tree)

            codelist = list(tree.root.iter("Codelist"))[0]
            descriptions_l.append(
                {
                    "dimension_ID": dimension_id,
                    "description": self._english_name(codelist),
                }
            )

        # Explicit columns keep the frame mergeable when there are no dimensions.
        return pd.DataFrame(descriptions_l, columns=["dimension_ID", "description"])

    def get_available_values(self):
        """Return a dictionary with available values for each dimension in the DataSet instance.

        Keys are dimension names. Every value is a dictionary with the
        parallel lists `values_ids` and `values_description`. A codelist that
        is not referenced by any dimension is kept under its own id.
        """
        resource = "availableconstraint"
        df_id = self.identifiers["df_id"]
        path_parts = [
            resource,
            df_id,
            "?references=all&detail=full",
        ]  # TODO: pass them as parameters
        path = "/".join(path_parts)
        response = self._request(path=path)
        tree = make_tree(response)
        strip_ns(tree)
        root = tree.root

        # Download the data structure once and map codelist id -> dimension
        # name, rather than requesting it again for every codelist. When two
        # dimensions share a codelist the first one wins.
        dimension_names = {}
        for record in self.dimensions_info(dataframe=False, description=False):
            dimension_names.setdefault(record["dimension_ID"], record["dimension"])

        dimensions_values = {}
        for codelist in root.iter("Codelist"):
            codelist_id = codelist.get("id")
            codes = list(codelist.iter("Code"))
            values = {
                "values_ids": [code.get("id") for code in codes],
                "values_description": [self._english_name(code) for code in codes],
            }
            dimensions_values[dimension_names.get(codelist_id, codelist_id)] = values

        return dimensions_values

    def get_dimension_values(self, dimension, dataframe=True):
        """Return the available values of a single `dimension` in the dataset.

        Raises `KeyError` if `dimension` is not a key of `self.available_values`.
        """
        dimension_dict = self.available_values[dimension]
        # Build the DataFrame only when it is actually requested.
        return pd.DataFrame.from_dict(dimension_dict) if dataframe else dimension_dict

    def get_dimension_name(self, dimension_id):
        """Convert `dimension_id` to `dimension`.

        Raises `ValueError` if no dimension of the dataflow uses `dimension_id`.
        """
        dimensions_df = self.dimensions_info(description=False)
        names = dimensions_df.loc[dimensions_df["dimension_ID"] == dimension_id, "dimension"]
        if names.empty:
            raise ValueError(f"No dimension uses the codelist {dimension_id!r}")
        return names.values[0]

    def default_filters(self):
        """Initiate self.filters with default values."""
        # no filter equals all values (default)
        return {dimension: "." for dimension in self.dimensions}

    def set_filters(self, **kwargs):
        """set filters for the dimensions of the dataset by passing dimension_name=value"""
        # Dimension names are stored in upper case, whatever case they are passed in.
        for arg, arg_value in kwargs.items():
            self.filters[arg.upper()] = arg_value

In [None]:
#| export
class DataSet(ISTAT):
    """Class that implements methods to retrieve information (metadata) about a Dataset.

    A `DataSet` is built from any one of the identifiers of a dataflow
    (`df_id`, `df_structure_id` or `df_description`) as listed by
    `all_available()`.
    """

    # Namespace-qualified key under which the `xml:lang` attribute is looked up.
    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, dataflow_identifier):
        super().__init__()
        self.resource = "datastructure"
        # DataFrame with all the available dataflows, used to resolve identifiers.
        self.all_available = all_available()
        # Dictionary with the record of the selected dataflow.
        self.identifiers = self.set_identifiers(dataflow_identifier)
        # {dimension name: {"values_ids": [...], "values_description": [...]}}
        self.available_values = self.get_available_values()
        # Names of the dimensions of the dataflow.
        self.dimensions = list(self.dimensions_info(description=False).dimension)
        # {dimension name: filter}; "." means no filter, i.e. all values.
        self.filters = self.default_filters()

        # TODO: returning all metadata related to the dataflow contained in 'Header'

    def _find_dataflow(self, column, value):
        """Return the first record of `self.all_available` whose `column` equals `value`, or `None`."""
        rows = self.all_available[self.all_available[column] == value]
        if rows.empty:
            return None
        return rows.to_dict(orient="records")[0]

    def _require_dataflow(self, column, value):
        """Like `_find_dataflow`, but raise `ValueError` when nothing matches."""
        record = self._find_dataflow(column, value)
        if record is None:
            raise ValueError(f"No available dataflow with {column} equal to {value!r}")
        return record

    def _english_name(self, element):
        """Return the text of the English `Name` child of `element`.

        The child tagged `xml:lang="en"` is preferred. When no child carries
        that tag, the second `Name` child is returned, which is the position
        the English name was previously always read from.
        """
        names = element.findall("Name")
        for name in names:
            if name.get(self._XML_LANG) == "en":
                return name.text
        return names[1].text

    def set_identifiers(self, dataflow_identifier):
        """Take any type of `dataflow_identifier` and return all identifiers in a dictionary.

        The identifier is looked up as a `df_id`, then as a `df_structure_id`
        and finally as a `df_description`. The kind of identifier is not
        guessed from the position of the underscore, because real ids do not
        share one layout (e.g. `30_1008`, `151_914`, `MEF_REDDITIIRPEF_COM`).

        Raises `ValueError` if `dataflow_identifier` is not a string or does
        not match any available dataflow.
        """
        if not isinstance(dataflow_identifier, str):
            raise ValueError(dataflow_identifier)

        for column in ("df_id", "df_structure_id", "df_description"):
            record = self._find_dataflow(column, dataflow_identifier)
            if record is not None:
                return record

        raise ValueError(f"No available dataflow matches {dataflow_identifier!r}")

    def set_from_id(self, df_id):
        """Return the identifiers of the dataflow whose `df_id` is `df_id`."""
        return self._require_dataflow("df_id", df_id)

    def set_from_structure_id(self, df_structure_id):
        """Return the identifiers of the first dataflow whose `df_structure_id` is `df_structure_id`."""
        return self._require_dataflow("df_structure_id", df_structure_id)

    def set_from_description(self, description):
        """Return the identifiers of the first dataflow whose `df_description` is exactly `description`."""
        return self._require_dataflow("df_description", description)

    def parse_dimensions(self, response):
        """Parse the `response` containing a dataflow's dimensions and return them in a list.

        Each item is a dictionary with the name of the dimension (`dimension`)
        and the id referenced by its first `Enumeration` (`dimension_ID`).
        """
        tree = make_tree(response)
        strip_ns(tree)
        root = tree.root

        dimensions_l = []
        for dimension in root.iter("Dimension"):
            enumeration_ids = [
                enumeration.find("Ref").get("id")
                for enumeration in dimension.iter("Enumeration")
            ]
            dimensions_l.append(
                {"dimension": dimension.attrib["id"], "dimension_ID": enumeration_ids[0]}
            )

        return dimensions_l

    def dimensions_info(self, dataframe=True, description=True):
        """Return the dimensions of a specific dataflow and their `descriptions`.

        With `description` truthy a `description` column/key is added to every
        dimension (one extra request per distinct `dimension_ID`). With
        `dataframe` truthy the result is a pandas `DataFrame`, otherwise a
        list of dictionaries.
        """
        df_structure_id = self.identifiers["df_structure_id"]
        path = "/".join([self.resource, self.agencyID, df_structure_id])
        response = self._request(path=path)
        records = self.parse_dimensions(response)

        if not dataframe and not description:
            return records

        # The merge needs a DataFrame even when a list is requested, so build
        # it first and convert back at the end.
        dimensions = pd.DataFrame(records, columns=["dimension", "dimension_ID"])
        if description:
            dimensions_description = self.dimensions_description(dimensions)
            dimensions = dimensions.merge(dimensions_description, on="dimension_ID")

        return dimensions if dataframe else dimensions.to_dict(orient="records")

    def dimensions_description(self, dimensions):
        """Return a dataframe with the descriptions of `dimensions`.

        `dimensions` is a DataFrame with a `dimension_ID` column. The result
        has one row per distinct `dimension_ID`, with the columns
        `dimension_ID` and `description`.
        """
        resource = "codelist"
        descriptions_l = []

        # Distinct ids only: a repeated id would cost a redundant request and
        # multiply the rows when the result is merged back on `dimension_ID`.
        for dimension_id in dimensions.dimension_ID.drop_duplicates().tolist():
            path = "/".join([resource, self.agencyID, dimension_id])
            response = self._request(path=path)
            tree = make_tree(response)
            strip_ns(tree)

            codelist = list(tree.root.iter("Codelist"))[0]
            descriptions_l.append(
                {
                    "dimension_ID": dimension_id,
                    "description": self._english_name(codelist),
                }
            )

        # Explicit columns keep the frame mergeable when there are no dimensions.
        return pd.DataFrame(descriptions_l, columns=["dimension_ID", "description"])

    def get_available_values(self):
        """Return a dictionary with available values for each dimension in the DataSet instance.

        Keys are dimension names. Every value is a dictionary with the
        parallel lists `values_ids` and `values_description`. A codelist that
        is not referenced by any dimension is kept under its own id.
        """
        resource = "availableconstraint"
        df_id = self.identifiers["df_id"]
        path_parts = [
            resource,
            df_id,
            "?references=all&detail=full",
        ]  # TODO: pass them as parameters
        path = "/".join(path_parts)
        response = self._request(path=path)
        tree = make_tree(response)
        strip_ns(tree)
        root = tree.root

        # Download the data structure once and map codelist id -> dimension
        # name, rather than requesting it again for every codelist. When two
        # dimensions share a codelist the first one wins.
        dimension_names = {}
        for record in self.dimensions_info(dataframe=False, description=False):
            dimension_names.setdefault(record["dimension_ID"], record["dimension"])

        dimensions_values = {}
        for codelist in root.iter("Codelist"):
            codelist_id = codelist.get("id")
            codes = list(codelist.iter("Code"))
            values = {
                "values_ids": [code.get("id") for code in codes],
                "values_description": [self._english_name(code) for code in codes],
            }
            dimensions_values[dimension_names.get(codelist_id, codelist_id)] = values

        return dimensions_values

    def get_dimension_values(self, dimension, dataframe=True):
        """Return the available values of a single `dimension` in the dataset.

        Raises `KeyError` if `dimension` is not a key of `self.available_values`.
        """
        dimension_dict = self.available_values[dimension]
        # Build the DataFrame only when it is actually requested.
        return pd.DataFrame.from_dict(dimension_dict) if dataframe else dimension_dict

    def get_dimension_name(self, dimension_id):
        """Convert `dimension_id` to `dimension`.

        Raises `ValueError` if no dimension of the dataflow uses `dimension_id`.
        """
        dimensions_df = self.dimensions_info(description=False)
        names = dimensions_df.loc[dimensions_df["dimension_ID"] == dimension_id, "dimension"]
        if names.empty:
            raise ValueError(f"No dimension uses the codelist {dimension_id!r}")
        return names.values[0]

    def default_filters(self):
        """Initiate self.filters with default values."""
        # no filter equals all values (default)
        return {dimension: "." for dimension in self.dimensions}

    def set_filters(self, **kwargs):
        """set filters for the dimensions of the dataset by passing dimension_name=value"""
        # Dimension names are stored in upper case, whatever case they are passed in.
        for arg, arg_value in kwargs.items():
            self.filters[arg.upper()] = arg_value

The class takes `df_id`, `df_structure_id` or `df_description` as inputs. These 3 values can be found by using the `all_available()` function.

In [None]:
# Build a DataSet from a `df_id` and check that all three identifiers are resolved.
ds = DataSet(dataflow_identifier="151_914")
test_eq(ds.identifiers['df_id'], '151_914')
test_eq(ds.identifiers['df_description'], 'Unemployment  rate')
test_eq(ds.identifiers['df_structure_id'], 'DCCV_TAXDISOCCU1')

We can look at the dimensions of a dataflow by simply accessing its attribute `dimensions`. However, we won't have the dimensions' descriptions here.

In [None]:
show_doc(DataSet.dimensions_info)

---

### DataSet.dimensions_info

>      DataSet.dimensions_info (dataframe=True, description=True)

Return the dimensions of a specific dataflow and their `descriptions`.

To have a look at the dimensions together with their meaning/description, we can use the `dimensions_info` method. It will return an easy-to-read pandas DataFrame.

In [None]:
# With the default arguments the result is a DataFrame that includes the `description` column.
dimensions_df = ds.dimensions_info()
test_eq(dimensions_df.columns, ['dimension', 'dimension_ID', 'description'])
dimensions_df

Unnamed: 0,dimension,dimension_ID,description
0,FREQ,CL_FREQ,Frequency
1,CITTADINANZA,CL_CITTADINANZA,Citizenship
2,DURATA_DISOCCUPAZ,CL_DURATA,Duration
3,CLASSE_ETA,CL_ETA1,Age class
4,ITTER107,CL_ITTER107,Territory
5,SESSO,CL_SEXISTAT1,Gender
6,TIPO_DATO,CL_TIPO_DATO_FOL,Data type FOL
7,TITOLO_STUDIO,CL_TITOLO_STUDIO,Level of education


The values that the different dimensions can take can also be explored. The `available_values` attribute contains a dictionary with the dimensions of the dataset as keys. The values of the dictionary are themselves dictionaries which can be accessed through the `values_ids` and `values_description` keys. The former key returns an ID of the dimension's values, the latter a description of these values.

In [None]:
values_dict = ds.available_values
test_eq(isinstance(values_dict, dict), True)
# `list.sort()` sorts in place and returns None, so comparing its results only
# compared None with None (and reordered `ds.dimensions` as a side effect).
# `sorted()` returns new lists, so the key sets are really compared.
test_eq(sorted(values_dict), sorted(ds.dimensions))
test_eq(values_dict['DURATA_DISOCCUPAZ']['values_ids'], ['TOTAL', 'M_GE12'])
test_eq(values_dict['DURATA_DISOCCUPAZ']['values_description'], ['total', '12 months and over'])

In [None]:
show_doc(DataSet.get_dimension_values)

---

### DataSet.get_dimension_values

>      DataSet.get_dimension_values (dimension, dataframe=True)

Return the available values of a single `dimension` in the dataset

In [None]:
ds.get_dimension_values('DURATA_DISOCCUPAZ')

Unnamed: 0,values_ids,values_description
0,TOTAL,total
1,M_GE12,12 months and over


In [None]:
show_doc(DataSet.set_filters)

---

### DataSet.set_filters

>      DataSet.set_filters (**kwargs)

set filters for the dimensions of the dataset by passing dimension_name=value

With `DataSet.set_filters()` we can filter the dimensions of the dataset by passing the values that we want to filter for. The dataset will then only return data containing our filters. A dictionary with the selected filters is contained in the attribute `DataSet.filters`.

**Note** that the arguments of `DataSet.set_filters` can be written in lower case, but in `DataSet.filters` they are converted to upper case to be consistent with the dimension names used by the ISTAT API.

In [None]:
# Filters are passed with lower-case names; a filter value can be a string or a list.
dz = DataSet(dataflow_identifier="139_176")
dz.set_filters(freq="M", tipo_dato=["ISAV", "ESAV"], paese_partner="WORLD")

# The filters are stored under the upper-case dimension names.
test_eq(dz.filters['FREQ'], 'M')
test_eq(dz.filters['TIPO_DATO'], ["ISAV", "ESAV"])
test_fail(lambda: dz.filters['freq']) #the filter is not saved in lower case