In [1]:
from kiara import KiaraAPI
import os
import networkx
from networkx.readwrite import json_graph

Process based on kiara version 0.4.21

In [3]:
# Folder holding the input CSV files. Set the KIARA_DATA_FOLDER environment variable to
# point the notebook at another location; the original local path remains the default.
data_folder = os.environ.get("KIARA_DATA_FOLDER", "/Users/mariella.decrouychan/Desktop/data")

# Input files: the personnel records and the geocoded place names.
personnel_csv = os.path.join(data_folder, "personnel.csv")
coords_csv = os.path.join(data_folder, 'anommedicalgeocoded-edited.csv')

# kiara API handle; the jobs, value look-ups and rendering calls in the following cells go through it.
api = KiaraAPI.instance()

In [4]:
def kiara_to_pandas(table_value):
    """Convert a kiara table value to a pandas DataFrame for previewing.

    Reads ``table_value.data.arrow_table`` and returns the result of calling
    ``to_pandas()`` on it.
    """
    return table_value.data.arrow_table.to_pandas()

### 1. CSV file onboarding

In [5]:
! kiara operation explain import.table.from.csv_file


╭─ Operation: [1;3mimport.table.from.csv_file[0m ──────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Import a table from a csv file.                            │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mpath     [0m[3m [0m  string   The        [1myes

In [6]:
# Import the personnel CSV file as a kiara table.
onboarding_result = api.run_job(
    operation="import.table.from.csv_file",
    inputs={"path": personnel_csv},
)

In [7]:
# Preview: fetch the imported table by its value id and convert it to a DataFrame.
table_value = api.get_value(
    onboarding_result["table"].value_id
)
df = kiara_to_pandas(table_value)

In [8]:
df.head()

Unnamed: 0,archref,name,placesdisplay,placesnormalized,corp,startdate,enddate,daterange,alldates,fullLink,fullRecord
0,"<a class=""archref"" href=""/ark:/61561/up424lfky...",Abadie,['Sainte-Lucie'],"['Sainte-Lucie, Île (Antilles)']",,1787.0,1787.0,1787-1787,1787,/ark:/61561/up424lfky,"Abadie, huissier à Sainte-Lucie 1787"
1,"<a class=""archref"" href=""/ark:/61561/up424tntx...","Abbadie, d'",['Canada'],"['Canada, Colonie française']",Régiment de Carignan-Salières,1665.0,1708.0,1665-1708,"1665, 1708",/ark:/61561/up424tntx,"Abbadie, d', enseigne dans le régiment de Cari..."
2,"<a class=""archref"" href=""/ark:/61561/up424icjr...","Abeille, Jean",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1788.0,1789.0,1788-1789,"1788, 1789",/ark:/61561/up424icjr,"Abeille, Jean, lieutenant de milices du batail..."
3,"<a class=""archref"" href=""/ark:/61561/up424tnvz...","Abeille, Jean Joseph",['Pondichéry'],['Pondichéry (Inde)'],Conseil supérieur (Pondichéry ; Inde),1769.0,1772.0,1769-1772,"1769, 1772",/ark:/61561/up424tnvz,"Abeille, Jean Joseph, conseiller au Conseil su..."
4,"<a class=""archref"" href=""/ark:/61561/up424ezxy...","Abeille, Pierre",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1774.0,1787.0,1774-1787,"1774, 1787",/ark:/61561/up424ezxy,"Abeille, Pierre, négociant à Port-au-Prince à ..."


In [9]:
df.tail()

Unnamed: 0,archref,name,placesdisplay,placesnormalized,corp,startdate,enddate,daterange,alldates,fullLink,fullRecord
19333,"<a class=""archref"" href=""/ark:/61561/up424tnvp...","Zemard, Ambroise",['île Royale'],"['Royale, Île (Canada)']",,1741.0,1741.0,1741-1741,1741,/ark:/61561/up424tnvpsny,"Zemard, Ambroise, habitant de l'île Royale, co..."
19334,"<a class=""archref"" href=""/ark:/61561/up424nhpj...","Zévallos, de",['Sainte-Anne'],"['Sainte-Anne (Guadeloupe, Île de la)']",,1770.0,1770.0,1770-1770,1770,/ark:/61561/up424nhpjmip,"Zévallos, de, commandant des milices du quarti..."
19335,"<a class=""archref"" href=""/ark:/61561/up424tnvp...","Zévallos, Edouard de",,,Régiment de la Guadeloupe,1786.0,1791.0,1786-1791,"1786, 1790, 1791",/ark:/61561/up424tnvpspa,"Zévallos, Edouard de, sous-lieutenant au régim..."
19336,"<a class=""archref"" href=""/ark:/61561/up424wqys...",Zunbergen,['Guyane'],['Guyane française'],,1784.0,1784.0,1784-1784,1784,/ark:/61561/up424wqysvtt,"Zunbergen, accusé de malversations en Guyane 1784"
19337,"<a class=""archref"" href=""/ark:/61561/up424vpxr...","Zweerts, Constantin Christian",['Tabago'],"['Tabago, Île (Antilles)']",,1782.0,1787.0,1782-1787,"1782, 1787",/ark:/61561/up424vpxruto,"Zweerts, Constantin Christian, interprète de l..."


In [10]:
# Check for missing values: compare each column's non-null count with the number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19338 entries, 0 to 19337
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   archref           19338 non-null  object 
 1   name              19338 non-null  object 
 2   placesdisplay     19338 non-null  object 
 3   placesnormalized  19338 non-null  object 
 4   corp              19338 non-null  object 
 5   startdate         19251 non-null  float64
 6   enddate           19251 non-null  float64
 7   daterange         19338 non-null  object 
 8   alldates          19338 non-null  object 
 9   fullLink          19338 non-null  object 
 10  fullRecord        19338 non-null  object 
dtypes: float64(2), object(9)
memory usage: 1.6+ MB


### 2. Columns rename

In [11]:
! kiara operation explain anom_processing.column_names_replace


╭─ Operation: [1;3manom_processing.column_names_replace[0m ────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Replace columns names.                                     │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m       [0m [1m           [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1mdescript…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mtable    [0m[3m [0m  table   The table   [1myes

In [12]:
# Mapping from the original column names to the names they are replaced with.
col_map = {
    "name": "foaf:name",
    "placesdisplay": "gn:name",
    "daterange": "dc:date",
    "fullLink": "dc:identifier",
}
# Second name bound to the very same dict object.
column_map = col_map

In [13]:
# Replace the column names of the imported table according to `col_map`.
rename_columns_result = api.run_job(
    operation="anom_processing.column_names_replace",
    inputs={
        "table": table_value,
        "columns_map": col_map,
    },
)

In [14]:
rename_columns_result

In [15]:
# Fetch the renamed table by the value id of the job's 'table' output.
rename_columns_value = api.get_value(
    rename_columns_result["table"].value_id
)

### 3. Check if all location data available in dataset and all coordinates present in coordinates data

If the objective is to be able to display the dataset on a map, all rows must contain a normalized location name, and
each location name needs to have a latitude and a longitude

#### 3.1 Remove rows that contain NaNs in the normalized location column

In [16]:
# Run the remove_nans operation on the 'placesnormalized' column of the renamed table.
with_loc_result = api.run_job(
    operation="anom_processing.remove_nans",
    inputs={
        "table": rename_columns_value,
        "column": "placesnormalized",
    },
)

At the moment, the module does not output the list of rows that contain NaNs in the normalized location name column.

In [17]:
# Fetch the table produced by the remove_nans job.
with_loc_value = api.get_value(
    with_loc_result["table"].value_id
)

#### 3.1 Access coordinates CSV

In [18]:
# Import the geocoded places CSV file as a kiara table.
coords_onboarding_result = api.run_job(
    operation="import.table.from.csv_file",
    inputs={"path": coords_csv},
)

In [19]:
# Fetch the imported coordinates table by its value id.
coords_table_value = api.get_value(
    coords_onboarding_result["table"].value_id
)

#### 3.2 Rename latitude and longitude

I'm temporarily leaving this step out, since there are two place-name columns. I'm wondering whether we should keep the standardised one for the normalized place name column, as it is common across tables.

#### 3.2 Check if all coordinates present

In [20]:
! kiara operation explain anom_processing.coords_check


╭─ Operation: [1;3manom_processing.coords_check[0m ────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module aims at comparing two tables: one table that   │
│  [3m               [0m  provides observations with a place name, and one table     │
│  [3m               [0m  that                                                       │
│  [3m               [0m                                                             │
│  [3m               [0m  includes latitudes, longitudes and place names.            │
│  [3m               [0m  A verification is performed to see if all place names of   │
│  [3m               [0m  the first dataset are included in the second dataset.      │
│  [3m               [0m  In this specific example, the first table includes         │
│  [3m               [0m  several place names per row.                               │
│  [3m   

In [21]:
# Check whether all place names of the personnel table ('placesnormalized')
# are included in the coordinates table ('normalized').
coords_check_result = api.run_job(
    operation="anom_processing.coords_check",
    inputs={
        "table1": with_loc_value,
        "table2": coords_table_value,
        "column1": "placesnormalized",
        "column2": "normalized",
    },
)

In [22]:
coords_check_result['result'].data

'Missing coordinates.'

In [23]:
# uncomment to see locations that are not present in the geocoded places file
# coords_check_result['places_list'].data

### 4. Filter out some columns

In [24]:
! kiara operation explain table_filter.select_columns


╭─ Operation: [1;3mtable_filter.select_columns[0m ─────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  -- n/a --                                                  │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield   [0m[1m [0m [1m         [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname    [0m[1m [0m [1m [0m[1mtype   [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mvalue   [0m[3m [0m  table     A value    [1myes

In [25]:
# Columns to keep (these are the renamed column names).
to_include = [
    "foaf:name",
    "dc:date",
    "dc:identifier",
]

In [26]:
# Keep only the columns listed in `to_include`.
# NOTE(review): the input is the renamed table, not the NaN-filtered `with_loc_value` -
# confirm that this is intended.
filter_columns_result = api.run_job(
    operation="table_filter.select_columns",
    inputs={
        "value": rename_columns_value,
        "columns": to_include,
    },
)

In [27]:
filter_columns_result

In [28]:
# Preview: fetch the column-filtered table and convert it to a DataFrame.
# From here on `table_value` and `df` refer to the filtered table.
table_value = api.get_value(
    filter_columns_result["value"].value_id
)
df = kiara_to_pandas(table_value)

In [29]:
df.head()

Unnamed: 0,foaf:name,dc:date,dc:identifier
0,Abadie,1787-1787,/ark:/61561/up424lfky
1,"Abbadie, d'",1665-1708,/ark:/61561/up424tntx
2,"Abeille, Jean",1788-1789,/ark:/61561/up424icjr
3,"Abeille, Jean Joseph",1769-1772,/ark:/61561/up424tnvz
4,"Abeille, Pierre",1774-1787,/ark:/61561/up424ezxy


### 5. Save table in data registry

In [30]:
# Alias under which the table is stored in the data registry.
table_alias = "20221103_test"

In [31]:
# Store the current `table_value` under `table_alias`; the returned result is displayed as the cell output.
api.store_value(table_value, table_alias)

StoreValueResult(value=Value(id=24bdbb44-5830-48cb-b211-1032dc0f9657, type=table, status=set, initialized=True optional=False), aliases=['20221103_test'], persisted_data=PersistedData(model_id=zdpuAu5iFaJ6x2GQ7UZe6rQ7psUsFKcJAkS3FrwMzGdJwVX62, category=instance.persisted_data, fields=[data_type, data_type_config, serialization_profile, metadata, hash_codec, archive_id, chunk_id_map]), error=None)

In [32]:
# Check the data registry: list the available aliases.
! kiara data list


╭─ Available aliases ──────────────────────────────────────────────────────────╮
│                                                                              │
│  [1m [0m[1malias        [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1m   size[0m[1m [0m                                           │
│  ─────────────────────────────────                                           │
│   20221103_test   table   1.21 MB                                            │
│                                                                              │
╰──────────────────────────────────────────────────────────────────────────────╯


### 6. Explore data lineage

In [35]:
table_value.lineage

In [33]:
# Take the module graph from the table's lineage ...
graph = table_value.lineage.module_graph
# ... and convert it to a node-link dict ('nodes', 'directed', ...) that can be inspected directly.
result = json_graph.node_link_data(graph)

In [34]:
result

{'directed': True,
 'multigraph': False,
 'graph': {},
 'nodes': [{'data_type': 'table',
   'label': '[this value]',
   'node_type': 'value',
   'data_type_config': {},
   'level': 1,
   'id': 'value:24bdbb44-5830-48cb-b211-1032dc0f9657'},
  {'module_type': 'table.filters',
   'module_config': {'constants': {},
    'defaults': {},
    'filter_name': 'select_columns'},
   'label': 'table.filters',
   'node_type': 'operation',
   'level': 3,
   'id': 'module:zdpuAvfzLhF9DnDgV5XB9Zw58jzzHpc3CK3tQrf8cywbMhM77'},
  {'label': 'columns (list)',
   'node_type': 'value',
   'data_type': 'list',
   'data_type_config': {},
   'level': 4,
   'id': 'value:34fb98a4-6cc9-4046-a763-716067504005'},
  {'label': 'ignore_invalid_column_names (boolean)',
   'node_type': 'value',
   'data_type': 'boolean',
   'data_type_config': {},
   'level': 4,
   'id': 'value:7b7ea8d9-a281-447c-954b-7f3be27d7c09'},
  {'module_type': 'anom_processing.column_names_replace',
   'module_config': {'constants': {}, 'defaults'

In [36]:
# NOTE(review): same call as in the cell that first builds `result` from `graph`;
# this recomputes the node-link dict and looks redundant - confirm before removing.
result = json_graph.node_link_data(graph)

In [37]:
# Accumulator for the per-node records, keyed by node position.
augmented_nodes = {}
# (node_id, attributes) pairs of the lineage graph.
nodes = graph.nodes.data()

In [39]:
def get_info(node):
    """Collect extra display information for one lineage-graph node.

    Parameters
    ----------
    node : tuple
        A ``(node_id, attributes)`` pair as yielded by ``graph.nodes.data()``.
        ``attributes["node_type"]`` decides how the information is gathered.

    Returns
    -------
    dict
        * ``"operation"`` nodes: the module type info for
          ``attributes["module_type"]``, converted with ``.dict()``;
        * ``"value"`` nodes: ``{"preview": <value rendered as a string>}``;
        * any other (or missing) node type: an empty dict, instead of failing
          on an unassigned result.

    Notes
    -----
    Every operation/value node triggers calls on the module-level ``api``
    object, so running this for each node of a graph is inefficient.
    """
    node_id, attributes = node[0], node[1]
    node_type = attributes.get("node_type")

    if node_type == "operation":
        return api.retrieve_module_type_info(attributes["module_type"]).dict()

    if node_type == "value":
        # Value node ids have the form "value:<id>"; drop the prefix to get the value id.
        value_id = node_id[len("value:"):]
        value = api.get_value(value_id)
        rendered = api.render_value(value=value, target_format="string").rendered
        return {"preview": rendered}

    # Unknown node type: nothing to look up.
    return {}

# Build one augmented record per lineage node, keyed by the node's position in `nodes`.
for idx, node in enumerate(nodes):
    node_id, node_attrs = node
    node_dict = dict(
        id=node_id,
        desc=node_attrs,
        # ids of the nodes that have an edge pointing to this node
        parentIds=list(graph.predecessors(node_id)),
        info=get_info(node),
    )
    augmented_nodes[idx] = node_dict

In [40]:
# NOTE(review): this cell repeats the augmentation loop of the previous cell and
# overwrites the same `augmented_nodes` entries - confirm whether it is still needed.
for idx, node in enumerate(nodes):
    node_id, node_attrs = node
    node_dict = dict(
        id=node_id,
        desc=node_attrs,
        parentIds=list(graph.predecessors(node_id)),
        info=get_info(node),
    )
    augmented_nodes[idx] = node_dict