This notebook is a preparatory step for assessing which modules to use or create, and how to assemble them, in order to process a specific dataset for a map.
<b>This is not the notebook that will ultimately be used</b>, but it was a useful starting point for preparing the different steps foreseen.

In [1]:
from pathlib import Path

from kiara import KiaraAPI, Kiara
import networkx
from networkx.readwrite import json_graph

Process based on kiara version 0.4.21

In [2]:
# Get a Kiara instance and wrap it in the KiaraAPI front end; the cells below
# use `api` to run operations and `kiara` to read values from the data registry.
kiara = Kiara.instance()
api = KiaraAPI(kiara=kiara)

In [6]:
# Optional: alias for this process, meant to be added as a prefix to data saved in the data registry.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
process_alias = 'map_20221020'

In [36]:
def kiara_to_pandas(table_value):
    """Convert a kiara table value into a pandas DataFrame.

    Reads ``table_value.data.arrow_table`` and returns whatever its
    ``to_pandas()`` method produces.
    """
    return table_value.data.arrow_table.to_pandas()

### I. CSV file onboarding

In [10]:
# Show the documentation and input fields of the CSV import operation before using it.
! kiara operation explain import.table.from.csv_file


╭─ Operation: import.table.from.csv_file ──────────────────────────────────────╮
│                                                                              │
│   Documentation   Import a table from a csv file.                            │
│                                                                              │
│   Inputs                                                                     │
│                     field                                                    │
│                     name        type     descrip…   Required   Default       │
│                    ──────────────────────────────────────────────────────    │
│                     path        string   The        yes

In [42]:
# Source CSV, resolved against the current user's home directory so the
# notebook does not embed one person's absolute path. Kept as a plain string
# because it is passed as the 'path' input of the CSV import operation.
file_path = str(Path.home() / 'Desktop' / 'personnel.csv')

In [43]:
# Run the CSV import operation on the file located at `file_path`.
onboarding_result = api.run_job(
    operation="import.table.from.csv_file",
    inputs={'path': file_path},
)

In [44]:
# Preview the imported data: fetch the 'table' output from the data registry by
# its value_id, then convert it to a pandas DataFrame for inspection.
table_value = kiara.data_registry.get_value(onboarding_result['table'].value_id)
df = kiara_to_pandas(table_value)

In [45]:
# First rows of the imported table.
df.head()

Unnamed: 0,archref,name,placesdisplay,placesnormalized,corp,startdate,enddate,daterange,alldates,fullLink,fullRecord
0,"<a class=""archref"" href=""/ark:/61561/up424lfky...",Abadie,['Sainte-Lucie'],"['Sainte-Lucie, Île (Antilles)']",,1787.0,1787.0,1787-1787,1787,/ark:/61561/up424lfky,"Abadie, huissier à Sainte-Lucie 1787"
1,"<a class=""archref"" href=""/ark:/61561/up424tntx...","Abbadie, d'",['Canada'],"['Canada, Colonie française']",Régiment de Carignan-Salières,1665.0,1708.0,1665-1708,"1665, 1708",/ark:/61561/up424tntx,"Abbadie, d', enseigne dans le régiment de Cari..."
2,"<a class=""archref"" href=""/ark:/61561/up424icjr...","Abeille, Jean",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1788.0,1789.0,1788-1789,"1788, 1789",/ark:/61561/up424icjr,"Abeille, Jean, lieutenant de milices du batail..."
3,"<a class=""archref"" href=""/ark:/61561/up424tnvz...","Abeille, Jean Joseph",['Pondichéry'],['Pondichéry (Inde)'],Conseil supérieur (Pondichéry ; Inde),1769.0,1772.0,1769-1772,"1769, 1772",/ark:/61561/up424tnvz,"Abeille, Jean Joseph, conseiller au Conseil su..."
4,"<a class=""archref"" href=""/ark:/61561/up424ezxy...","Abeille, Pierre",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1774.0,1787.0,1774-1787,"1774, 1787",/ark:/61561/up424ezxy,"Abeille, Pierre, négociant à Port-au-Prince à ..."


In [46]:
# Last rows of the imported table.
df.tail()

Unnamed: 0,archref,name,placesdisplay,placesnormalized,corp,startdate,enddate,daterange,alldates,fullLink,fullRecord
19333,"<a class=""archref"" href=""/ark:/61561/up424tnvp...","Zemard, Ambroise",['île Royale'],"['Royale, Île (Canada)']",,1741.0,1741.0,1741-1741,1741,/ark:/61561/up424tnvpsny,"Zemard, Ambroise, habitant de l'île Royale, co..."
19334,"<a class=""archref"" href=""/ark:/61561/up424nhpj...","Zévallos, de",['Sainte-Anne'],"['Sainte-Anne (Guadeloupe, Île de la)']",,1770.0,1770.0,1770-1770,1770,/ark:/61561/up424nhpjmip,"Zévallos, de, commandant des milices du quarti..."
19335,"<a class=""archref"" href=""/ark:/61561/up424tnvp...","Zévallos, Edouard de",,,Régiment de la Guadeloupe,1786.0,1791.0,1786-1791,"1786, 1790, 1791",/ark:/61561/up424tnvpspa,"Zévallos, Edouard de, sous-lieutenant au régim..."
19336,"<a class=""archref"" href=""/ark:/61561/up424wqys...",Zunbergen,['Guyane'],['Guyane française'],,1784.0,1784.0,1784-1784,1784,/ark:/61561/up424wqysvtt,"Zunbergen, accusé de malversations en Guyane 1784"
19337,"<a class=""archref"" href=""/ark:/61561/up424vpxr...","Zweerts, Constantin Christian",['Tabago'],"['Tabago, Île (Antilles)']",,1782.0,1787.0,1782-1787,"1782, 1787",/ark:/61561/up424vpxruto,"Zweerts, Constantin Christian, interprète de l..."


In [47]:
# Check for missing values: compare each column's non-null count with the total row count.
# NOTE(review): info() only counts nulls; columns such as `corp` appear blank in the
# previews yet are reported as fully non-null, so blanks may be stored as empty strings — confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19338 entries, 0 to 19337
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   archref           19338 non-null  object 
 1   name              19338 non-null  object 
 2   placesdisplay     19338 non-null  object 
 3   placesnormalized  19338 non-null  object 
 4   corp              19338 non-null  object 
 5   startdate         19251 non-null  float64
 6   enddate           19251 non-null  float64
 7   daterange         19338 non-null  object 
 8   alldates          19338 non-null  object 
 9   fullLink          19338 non-null  object 
 10  fullRecord        19338 non-null  object 
dtypes: float64(2), object(9)
memory usage: 1.6+ MB


In [49]:
# Show the documentation and input fields of the column-selection operation before using it.
! kiara operation explain table_filter.select_columns


╭─ Operation: table_filter.select_columns ─────────────────────────────────────╮
│                                                                              │
│   Documentation   -- n/a --                                                  │
│                                                                              │
│   Inputs                                                                     │
│                     field                                                    │
│                     name       type      descrip…   Required   Default       │
│                    ──────────────────────────────────────────────────────    │
│                     value      table     A value    yes

In [50]:
# Names of the columns passed as the 'columns' input of the column-selection step.
to_include = [
    'name',
    'placesdisplay',
    'placesnormalized',
    'corp',
    'daterange',
    'fullLink',
]

In [52]:
# Run the column-selection operation on the current table, passing the
# column names listed in `to_include`.
filter_columns_result = api.run_job(
    operation="table_filter.select_columns",
    inputs={'value': table_value, 'columns': to_include},
)

In [53]:
# Preview the filtered data: fetch the 'value' output from the data registry by
# its value_id and convert it to a pandas DataFrame.
# NOTE: this rebinds `table_value` and `df`, so from here on both names refer
# to the column-filtered table rather than the full import.
table_value = kiara.data_registry.get_value(filter_columns_result['value'].value_id)
df = kiara_to_pandas(table_value)

In [54]:
# First rows of the column-filtered table.
df.head()

Unnamed: 0,name,placesdisplay,placesnormalized,corp,daterange,fullLink
0,Abadie,['Sainte-Lucie'],"['Sainte-Lucie, Île (Antilles)']",,1787-1787,/ark:/61561/up424lfky
1,"Abbadie, d'",['Canada'],"['Canada, Colonie française']",Régiment de Carignan-Salières,1665-1708,/ark:/61561/up424tntx
2,"Abeille, Jean",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1788-1789,/ark:/61561/up424icjr
3,"Abeille, Jean Joseph",['Pondichéry'],['Pondichéry (Inde)'],Conseil supérieur (Pondichéry ; Inde),1769-1772,/ark:/61561/up424tnvz
4,"Abeille, Pierre",['Port-au-Prince'],"['Port-au-Prince (Saint-Domingue, Île de)']",,1774-1787,/ark:/61561/up424ezxy
