In [1]:
# Install the plugin in your virtual environment before running this notebook:
#     pip install git+https://github.com/DHARPA-Project/kiara_plugin.topic_modelling
# Alternatively, uncomment and run the line below (%pip installs into the environment of the running kernel).
# %pip install git+https://github.com/DHARPA-Project/kiara_plugin.topic_modelling

In [1]:
# Import kiara's Python API and obtain the API instance; the cells below
# call `kiara.run_job(...)` on it to execute the topic-modelling operations.
from kiara.api import KiaraAPI
kiara = KiaraAPI.instance()

## 1. Data onboarding

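Choose one of the two options below (1.1 or 1.2) to onboard data. Section 2.1 uses the table from option 1.1 (`corpus_table_zenodo`) by default; if you choose option 1.2, use the commented-out variant cell in section 2.1 instead.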

### 1.1. Get files from zenodo

In [2]:
# Print kiara's description of the operation used in this section.
! kiara operation explain topic_modelling.create_table_from_zenodo


╭─ Operation: [1;3mtopic_modelling.create_table_from_zenodo[0m ────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module retrieves text files from a specified folder   │
│  [3m               [0m  hosted on Zenodo.                                          │
│  [3m               [0m                                                             │
│  [3m               [0m  It takes the DOI and the name of the file as inputs.       │
│  [3m               [0m  It outputs a table with two columns: one for the file      │
│  [3m               [0m  names and the other for the content of these files.        │
│  [3m               [0m                                                             │
│  [3m               [0m  Dependencies:                                              │
│  [3m               [0m  - urllib.request:                                          │
│  [3m   

In [3]:
# Inputs for `topic_modelling.create_table_from_zenodo`: per the operation's
# documentation it takes the DOI and the name of the file to retrieve.
create_table_from_zenodo_inputs = dict(
    doi="4596345",
    file_name="ChroniclItaly_3.0_original.zip",
)

In [4]:
# Run the Zenodo onboarding operation with the inputs defined above.
create_table_from_zenodo_results = kiara.run_job(
    "topic_modelling.create_table_from_zenodo",
    inputs=create_table_from_zenodo_inputs,
)

In [5]:
# Display the job results; the table itself is read from the
# 'corpus_table' key in the next cell.
create_table_from_zenodo_results

In [6]:
# Keep a reference to the resulting table; it is the `corpus_table` input
# of the get_lccn_metadata step in section 2.1.
corpus_table_zenodo = create_table_from_zenodo_results['corpus_table']

### 1.2. Get files from .zip url

In [7]:
# Print kiara's description of the operation used in this section.
! kiara operation explain topic_modelling.create_table_from_url


╭─ Operation: [1;3mtopic_modelling.create_table_from_url[0m ───────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module retrieves text files from a specified          │
│  [3m               [0m  sub-path within a zip file hosted at a given URL.          │
│  [3m               [0m                                                             │
│  [3m               [0m  It outputs a table with two columns: one for the file      │
│  [3m               [0m  names and the other for the content of these files.        │
│  [3m               [0m                                                             │
│  [3m               [0m  Dependencies:                                              │
│  [3m               [0m  - urllib.request:                                          │
│  [3m               [0m  https://docs.python.org/3/library/urllib.request.html      │
│  [3m   

In [8]:
# Inputs for `topic_modelling.create_table_from_url`: the URL of a zip
# archive and the sub-path inside it from which text files are retrieved.
create_table_from_url_inputs = dict(
    url="https://github.com/DHARPA-Project/kiara.examples/archive/refs/heads/main.zip",
    sub_path="kiara.examples-main/examples/workshops/dh_benelux_2023/data",
)

In [9]:
# Run the zip-URL onboarding operation with the inputs defined above.
create_table_from_url_results = kiara.run_job(
    "topic_modelling.create_table_from_url",
    inputs=create_table_from_url_inputs,
)

In [10]:
# Display the job results; the table itself is read from the
# 'corpus_table' key in the next cell.
create_table_from_url_results

In [11]:
# Keep a reference to the resulting table; it is the `corpus_table` input
# of the commented-out get_lccn_metadata variant in section 2.1.
corpus_table_url = create_table_from_url_results['corpus_table']

## 2. Corpus preparation

### 2.1. Get LCCN Metadata

In [12]:
# Print kiara's description of the operation used in this section.
! kiara operation explain topic_modelling.get_lccn_metadata


╭─ Operation: [1;3mtopic_modelling.get_lccn_metadata[0m ───────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module will get metadata from strings that comply     │
│  [3m               [0m  with LCCN pattern: '/sn86069873/1900-01-05/' to get the    │
│  [3m               [0m  publication references and the dates and add those         │
│  [3m               [0m  informations as two new columns.                           │
│  [3m               [0m                                                             │
│  [3m               [0m  In addition, if a mapping scheme is provided between       │
│  [3m               [0m  publication references and publication names, it will      │
│  [3m               [0m  add a column with the publication names.                   │
│  [3m               [0m  Such a map is provided in the form of a list of lists      │
│  [3m   

In [13]:
# (publication reference, publication name) pairs, one per newspaper title.
# Keeping each pair on one line makes it hard to misalign a reference and
# its name.
lccn_publication_names = [
    ("2012271201", "Cronaca_Sovversiva"),
    ("sn85054967", "Il_Patriota"),
    ("sn93053873", "L'Indipendente"),
    ("sn85066408", "L'Italia"),
    ("sn85055164", "La_Libera_Parola"),
    ("sn84037024", "La_Ragione"),
    ("sn84037025", "La_Rassegna"),
    ("sn84020351", "La_Sentinella"),
    ("sn86092310", "La_Sentinella_del_West"),
    ("sn92051386", "La_Tribuna_del_Connecticut"),
]

# Uses the table onboarded from Zenodo (option 1.1).
get_lccn_metadata_inputs = dict(
    corpus_table=corpus_table_zenodo,
    column_name="file_name",
    # Transpose the pairs into two parallel lists:
    # [[references...], [names...]].
    map=[list(column) for column in zip(*lccn_publication_names)],
)

In [14]:
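# Variant for option 1.2: if you onboarded the data from the .zip URL instead of Zenodo,
# uncomment the lines below and run this cell in place of the previous one.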
# get_lccn_metadata_inputs = {
#     "corpus_table": corpus_table_url,
#     "column_name": "file_name",
#     "map": [['2012271201','sn85054967','sn93053873','sn85066408','sn85055164','sn84037024','sn84037025','sn84020351','sn86092310','sn92051386'],['Cronaca_Sovversiva','Il_Patriota','L\'Indipendente','L\'Italia','La_Libera_Parola','La_Ragione','La_Rassegna','La_Sentinella','La_Sentinella_del_West','La_Tribuna_del_Connecticut']],
# }

In [15]:
# Run the LCCN metadata extraction with the inputs defined above.
get_lccn_metadata_results = kiara.run_job(
    "topic_modelling.get_lccn_metadata",
    inputs=get_lccn_metadata_inputs,
)

In [16]:
# Display the job results; their 'corpus_table' entry is the input of the
# time_dist step below.
get_lccn_metadata_results

### 2.2. Examine corpus distribution

In [17]:
# Print kiara's description of the operation used in this section.
! kiara operation explain topic_modelling.time_dist 


╭─ Operation: [1;3mtopic_modelling.time_dist[0m ───────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module aggregates a table by day, month or year       │
│  [3m               [0m  from a corpus table that contains a date column. It        │
│  [3m               [0m  returns the distribution over time, which can be used      │
│  [3m               [0m  for display purposes, such as visualization.               │
│  [3m               [0m                                                             │
│  [3m               [0m  Dependencies:                                              │
│  [3m               [0m  - polars: https://www.pola.rs/                             │
│  [3m               [0m  - pyarrow: https://arrow.apache.org/docs/python/           │
│  [3m               [0m  - duckdb: https://duckdb.org/                              │
│         

In [20]:
# Inputs for `topic_modelling.time_dist`, which aggregates the corpus table
# by day, month or year (here: month).
# NOTE(review): "date" and "pub_ref" are presumably the columns added by the
# get_lccn_metadata step - confirm against its output table.
time_dist_inputs = dict(
    periodicity="month",
    date_col_name="date",
    title_col_name="pub_ref",
    corpus_table=get_lccn_metadata_results["corpus_table"],
)

In [21]:
# Run the time-distribution aggregation with the inputs defined above.
time_dist_inputs_results = kiara.run_job(
    "topic_modelling.time_dist",
    inputs=time_dist_inputs,
)

In [22]:
# Display the results of the time_dist job.
time_dist_inputs_results