This notebook is a preparatory step to assess which modules to use or to create, and how to assemble them, in the context of data processing for a specific dataset for a map.
<b>This won't be the notebook used ultimately</b> but it was useful to get started to prep the different foreseen steps.

In [1]:
from kiara import KiaraAPI, Kiara
import networkx
from networkx.readwrite import json_graph

Process based on kiara version 0.4.21

In [2]:
kiara = Kiara.instance()
api = KiaraAPI(kiara=kiara)

#### I. Test foreseen steps

In [3]:
# optional, choose alias for process, that will be added as a prefix to data saved in data registry
process_alias = 'map_20221020'

In [4]:
def kiara_to_pandas(table_value):
    table_obj = table_value.data
    arrow_table = table_obj.arrow_table
    df = arrow_table.to_pandas()
    return df

### 1. CSV file onboarding

In [5]:
! kiara operation explain import.table.from.csv_file


╭─ Operation: [1;3mimport.table.from.csv_file[0m ──────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Import a table from a csv file.                            │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ────────────────────���─────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mpath     [0m[3m [0m  string   The        [1my

In [6]:
file_path = '/Users/mariella.decrouychan/Desktop/personnel.csv'

# alternate path
# file_path = '/Users/stakats/Development/data/medicalagents.csv'

In [7]:
onboarding_result = api.run_job(operation="import.table.from.csv_file", inputs={'path':file_path})

In [8]:
# preview data
table_value = kiara.data_registry.get_value(onboarding_result['table'].value_id)
df = kiara_to_pandas(table_value)

In [9]:
df.head()

Unnamed: 0,archref,name,placesdisplay,placesnormalized,corp,startdate,enddate,daterange,alldates,fullLink,fullRecord
0,"<a class=""archref"" href=""/ark:/61561/up424lfky...",Abadie,['Sainte-Lucie'],"['Sainte-Lucie, Île (Antilles)']",,1787.0,1787.0,1787-1787,1787,/ark:/61561/up424lfky,"Abadie, huissier à Sainte-Lucie 1787"
1,"<a class=""archref"" href=""/ark:/61561/up424tntx...","Abbadie, d'",['Canada'],"['Canada, Colonie française']",Régiment de Carignan-Salières,1665.0,1708.0,1665-1708,"1665, 1708",/ark:/61561/up424tntx,"Abbadie, d', enseigne dans le régiment de Cari..."
2,"<a class=""archref"" href=""/ark:/61561/up424icjr...","Abeille, Jean",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1788.0,1789.0,1788-1789,"1788, 1789",/ark:/61561/up424icjr,"Abeille, Jean, lieutenant de milices du batail..."
3,"<a class=""archref"" href=""/ark:/61561/up424tnvz...","Abeille, Jean Joseph",['Pondichéry'],['Pondichéry (Inde)'],Conseil supérieur (Pondichéry ; Inde),1769.0,1772.0,1769-1772,"1769, 1772",/ark:/61561/up424tnvz,"Abeille, Jean Joseph, conseiller au Conseil su..."
4,"<a class=""archref"" href=""/ark:/61561/up424ezxy...","Abeille, Pierre",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1774.0,1787.0,1774-1787,"1774, 1787",/ark:/61561/up424ezxy,"Abeille, Pierre, négociant à Port-au-Prince à ..."


In [10]:
df.tail()

Unnamed: 0,archref,name,placesdisplay,placesnormalized,corp,startdate,enddate,daterange,alldates,fullLink,fullRecord
19333,"<a class=""archref"" href=""/ark:/61561/up424tnvp...","Zemard, Ambroise",['île Royale'],"['Royale, Île (Canada)']",,1741.0,1741.0,1741-1741,1741,/ark:/61561/up424tnvpsny,"Zemard, Ambroise, habitant de l'île Royale, co..."
19334,"<a class=""archref"" href=""/ark:/61561/up424nhpj...","Zévallos, de",['Sainte-Anne'],"['Sainte-Anne (Guadeloupe, Île de la)']",,1770.0,1770.0,1770-1770,1770,/ark:/61561/up424nhpjmip,"Zévallos, de, commandant des milices du quarti..."
19335,"<a class=""archref"" href=""/ark:/61561/up424tnvp...","Zévallos, Edouard de",,,Régiment de la Guadeloupe,1786.0,1791.0,1786-1791,"1786, 1790, 1791",/ark:/61561/up424tnvpspa,"Zévallos, Edouard de, sous-lieutenant au régim..."
19336,"<a class=""archref"" href=""/ark:/61561/up424wqys...",Zunbergen,['Guyane'],['Guyane française'],,1784.0,1784.0,1784-1784,1784,/ark:/61561/up424wqysvtt,"Zunbergen, accusé de malversations en Guyane 1784"
19337,"<a class=""archref"" href=""/ark:/61561/up424vpxr...","Zweerts, Constantin Christian",['Tabago'],"['Tabago, Île (Antilles)']",,1782.0,1787.0,1782-1787,"1782, 1787",/ark:/61561/up424vpxruto,"Zweerts, Constantin Christian, interprète de l..."


In [11]:
# checking if missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19338 entries, 0 to 19337
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   archref           19338 non-null  object 
 1   name              19338 non-null  object 
 2   placesdisplay     19338 non-null  object 
 3   placesnormalized  19338 non-null  object 
 4   corp              19338 non-null  object 
 5   startdate         19251 non-null  float64
 6   enddate           19251 non-null  float64
 7   daterange         19338 non-null  object 
 8   alldates          19338 non-null  object 
 9   fullLink          19338 non-null  object 
 10  fullRecord        19338 non-null  object 
dtypes: float64(2), object(9)
memory usage: 1.6+ MB


### 2. Columns rename

In [12]:
! kiara operation explain anom_processing.column_names_replace


╭─ Operation: [1;3manom_processing.column_names_replace[0m ────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Replace columns names.                                     │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m       [0m [1m           [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1mdescript…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mtable    [0m[3m [0m  table   The table   [1myes

In [13]:
col_map = column_map = {
    "name": "foaf:name",
    "placesdisplay": "gn:name",
    "daterange": "dc:date",
    "fullLink": "dc:identifier"
}

In [14]:
rename_columns_result = api.run_job(operation="anom_processing.column_names_replace", inputs={'table':table_value, 'columns_map':col_map})

In [15]:
rename_columns_result

In [16]:
rename_columns_value = kiara.data_registry.get_value(rename_columns_result['table'].value_id)

### 3. Check if all location data available in dataset and all coordinates present in coordinates data

If the objective is to be able to display the dataset on a map, all rows must contain a normalized location name, and
each location name needs to have a latitude and a longitude

#### 3.1 Remove rows that contain nans in location place

In [17]:
with_loc_result = api.run_job(operation="anom_processing.remove_nans", inputs={'table':rename_columns_value, 'column':'placesnormalized'})

At the moment the module doesn't output the list of rows containing nans in normalized location name col

In [18]:
with_loc_value = kiara.data_registry.get_value(with_loc_result['table'].value_id)

#### 3.1 Access coordinates CSV

In [19]:
coords_file_path = '/Users/mariella.decrouychan/Downloads/anommedicalgeocoded-edited.csv'

In [20]:
coords_onboarding_result = api.run_job(operation="import.table.from.csv_file", inputs={'path':coords_file_path})

In [21]:
coords_table_value = kiara.data_registry.get_value(coords_onboarding_result['table'].value_id)

#### 3.2 Rename latitude and longitude

I'm temporary leaving this step out for now since there are 2 place names, I'm wondering if we shouldn't keep the standardised one for the normalized place name col as it is common accross tables.

#### 3.2 Check if all coordinates present

In [22]:
! kiara operation explain anom_processing.coords_check


╭─ Operation: [1;3manom_processing.coords_check[0m ────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module aims at comparing two tables: one table that   │
│  [3m               [0m  provides observations with a place name, and one table     │
│  [3m               [0m  that                                                       │
│  [3m               [0m                                                             │
│  [3m               [0m  includes latitudes, longitudes and place names.            │
│  [3m               [0m  A verification is performed to see if all place names of   │
│  [3m               [0m  the first dataset are included in the second dataset.      │
│  [3m               [0m  In this specific example, the first table includes         │
│  [3m               [0m  several place names per row.                               │
│  [3m   

In [23]:
# better to use a sample by using the "sample_nr" input
coords_check_result = api.run_job(operation="anom_processing.coords_check", inputs={'table1':with_loc_value, 'table2':coords_table_value, "column1":'placesnormalized',"column2":'normalized',"sample_nr":300})

 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated
 not evaluated


In [24]:
coords_check_result['result'].data

'Missing coordinates.'

In [25]:
coords_check_result['places_list'].data

ListModel(list_data=['Nantes (Loire-Atlantique ; France)', 'Saint-Christophe, Île (Antilles)', 'Le Havre (Seine-Maritime ; France)', 'Navacelles (Gard ; France)', 'Ploemeur (Morbihan ; France)', 'Strasbourg (Bas-Rhin ; France)', 'Cap-Tiburon (Saint-Domingue, Île de)', 'Tonnay-Charente (Charente-Maritime ; France)', 'Ouanaminthe (Saint-Domingue, Île de)', 'Nouvelle-France', 'Beaujolais, Région géographique (France)', 'Grand-Cul-de-Sac (Guadeloupe, Île de la)', 'Sens (Yonne ; France)', 'Lyon (Rhône ; France)', 'Afrique, Côtes occidentales', 'Tulle (Corrèze ; France)', 'Providence (Rhode Island ; États-Unis)', 'Saint-Denis (Seine-Saint-Denis ; France)', 'Saint-Domingue, Colonie espagnole (Saint-Domingue, Île de)', 'Vent, Îles du (Antilles)', 'Saint-Étienne (Loire ; France)', 'Niort (Deux-Sèvres ; France)', 'Flandre, Région géographique (Europe)', 'Nippes (Saint-Domingue, Île de)', 'Lorraine, Région géographique (France)', 'La Flèche (Sarthe ; France)', 'Anjou, Région géographique (France)

### 3. Filter out some columns

In [26]:
! kiara operation explain table_filter.select_columns


╭─ Operation: [1;3mtable_filter.select_columns[0m ─────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  -- n/a --                                                  │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield   [0m[1m [0m [1m         [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname    [0m[1m [0m [1m [0m[1mtype   [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mvalue   [0m[3m [0m  table     A value    [1myes

In [27]:
# col_map = column_map = {
    "name": "",
    "placesdisplay": "gn:name",
    "daterange": "",
    "fullLink": ""
}

IndentationError: unexpected indent (1740952404.py, line 2)

In [26]:
to_include = ['foaf:name','dc:date','dc:identifier']

In [27]:
filter_columns_result = api.run_job(operation="table_filter.select_columns", inputs={'value':table_value, 'columns':to_include})

In [28]:
# preview data
table_value = kiara.data_registry.get_value(filter_columns_result['value'].value_id)
df = kiara_to_pandas(table_value)

In [29]:
df.head()

Unnamed: 0,name,placesdisplay,placesnormalized,corp,daterange,fullLink
0,Abadie,['Sainte-Lucie'],"['Sainte-Lucie, Île (Antilles)']",,1787-1787,/ark:/61561/up424lfky
1,"Abbadie, d'",['Canada'],"['Canada, Colonie française']",Régiment de Carignan-Salières,1665-1708,/ark:/61561/up424tntx
2,"Abeille, Jean",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1788-1789,/ark:/61561/up424icjr
3,"Abeille, Jean Joseph",['Pondichéry'],['Pondichéry (Inde)'],Conseil supérieur (Pondichéry ; Inde),1769-1772,/ark:/61561/up424tnvz
4,"Abeille, Pierre",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1774-1787,/ark:/61561/up424ezxy


#### II. Test pipeline and modules that are being created

In [3]:
from kiara import KiaraAPI, Kiara

In [4]:
kiara = Kiara.instance()
api = KiaraAPI(kiara=kiara)

Pipeline

In [6]:
file_path = '/Users/mariella.decrouychan/Desktop/personnel.csv'

In [4]:
pipeline_result = api.run_job(operation="anom_preprocessing", inputs={'import_table__path':file_path})

2022-11-02 11:23.13 [error    ] error.queue_job                inputs={'filter_columns__columns': 'b66213d9-1636-4abf-acb6-fd7bc1566e17', 'filter_columns__ignore_invalid_column_names': '7b822142-1e62-4eeb-9e2f-fec0678d5022', 'import_table__path': '610a49cd-6f6a-4e2f-9dca-5c3c98bd2b63', 'preprocess_strings__column_name': '003b6b7c-5d47-4117-bc36-9872d5368661'} job_hash=zdpuApR5WYqP4Hxo9kVYgkMsMSnQyq6vZ5iXiQMQe5acQGL7J job_id=UUID('ae5df087-d4f0-481a-bf0a-0f0d9af8092b') module_config={'constants': {}, 'defaults': {'import_table__path': '/Users/mariella.decrouychan/Desktop/personnel.csv', 'filter_columns__columns': ['name', 'placesdisplay', 'placesnormalized', 'corp', 'daterange', 'fullLink'], 'preprocess_strings__column_name': 'placesdisplay'}, 'pipeline_name': 'anom_preprocessing', 'steps': [{'module_type': 'pipeline', 'module_config': {'constants': {}, 'defaults': {}, 'pipeline_name': 'import.table.from.csv_file', 'steps': [{'module_type': 'import.file', 'module_config': {}, 'is_resolv

Exception: Can't retrieve active job with id 'ae5df087-d4f0-481a-bf0a-0f0d9af8092b', no such job registered.

In [16]:
pipeline_result['filter_columns__value']

KeyError: "Field 'filter_columns__value' not available in value set. Available fields: import_table__imported_file, import_table__table"

bug in module anom_processing.strings_preprocess

In [3]:
from kiara import KiaraAPI, Kiara
kiara = Kiara.instance()
api = KiaraAPI(kiara=kiara)

In [4]:
file_path = '/Users/mariella.decrouychan/Desktop/personnel.csv'

In [5]:
onboarding_result = api.run_job(operation="import.table.from.csv_file", inputs={'path':file_path})

In [6]:
table_value = kiara.data_registry.get_value(onboarding_result['table'].value_id)

In [7]:
preprocess_col = api.run_job(operation="anom_processing.strings_preprocess", inputs={'table':table_value, 'column_name':"placesdisplay"})

In [8]:
preprocess_col