In [75]:
import requests
import pandas as pd
import duckdb
import os
import os, json
from uuid import uuid4
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)

In [48]:
# the database is available on the following endpoint;
# the cells below send SQL queries to it as JSON via HTTP POST
api_url = "https://ccs-lab.zcu.cz/grela-api/api/query"

The database is publicly available online from the following endpoint:

```bash
https://ccs-lab.zcu.cz/grela-api/api/query
```

Through the API, you can query it using the same queries as the local version.

For instance, to retrieve the first 10 rows from the `works` table, your SQL query would be:
```
SELECT * FROM works LIMIT 10;
```

To execute the query from the command line, you could use the `curl` command, e.g.:
```bash
curl -X POST https://ccs-lab.zcu.cz/grela-api/api/query \
  -H "Content-Type: application/json" \
  -d '{"query": "SELECT * FROM works", "format": "json"}'
```



The API returns the response as a JSON object.

The query output is stored as a downloadable Parquet file. Its location is given in the `download_url` field of the response, and the file can be loaded directly into Python as a pandas DataFrame object.

In Python, the whole pipeline consists of several steps:

In [49]:
# (1) define the database connection
api_url = "https://ccs-lab.zcu.cz/grela-api/api/query"
# (2) define the query, e.g.:
query = "SELECT * FROM works"
# (3) execute the query via requests
response = requests.post(api_url, json={"query": query})
# (4) check for HTTP errors, then retrieve the download URL from the response
#     (raise_for_status surfaces 4xx/5xx responses here instead of failing on the JSON lookup below)
response.raise_for_status()
download_url = response.json()["download_url"]
print(download_url)

https://ccs-lab.zcu.cz/grela-api-out/a4209096-606a-4633-b03c-508a3c67830e.parquet


In [50]:
# (5) load the Parquet file behind the download URL into a pandas DataFrame
df = pd.read_parquet(download_url)
# number of rows returned by the query
len(df)

11117

## Working with the `works` table

The `works` table contains all the metadata about the works in the database.

In [51]:
# you can easily obtain the entire `works` table as a pandas DataFrame
# (one-liner version of the pipeline: POST the query, read `download_url`, load the Parquet file)
query = "SELECT * FROM works"
df = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])

In [52]:
# you can also select specific works, for instance, based on the date columns:
# here, works whose `not_before` or `not_after` lies strictly between 100 and 200
query = """
SELECT * FROM works as w
WHERE (w.not_before > 100 AND w.not_before < 200)
   OR (w.not_after > 100 AND w.not_after < 200);
"""

df = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])
# preview the first rows of the selection
df.head(5)

Unnamed: 0,grela_source,grela_id,author,title,not_before,not_after,lagt_tlg_epithet,lagt_genre,lagt_provenience,noscemus_place,noscemus_genre,noscemus_discipline,title_short,emlap_noscemus_id,place_publication,place_geonames,author_viaf,title_viaf,date_random,token_count
0,lagt,lagt_ogl0001.ogl001,Pinytus,De Epistola Pinyti ad Dionysium,101.0,200.0,[],[],christian,,,,,,,,,,135.0,109
1,lagt,lagt_tlg0007.tlg001,Plutarch,Θησεύς,96.0,116.0,['Biographi' 'Philosophici/-ae'],[],pagan,,,,,,,,,,110.0,8514
2,lagt,lagt_tlg0007.tlg002,Plutarch,Ῥωμύλος,96.0,116.0,['Biographi' 'Philosophici/-ae'],[],pagan,,,,,,,,,,98.0,10668
3,lagt,lagt_tlg0007.tlg003,Plutarch,Θησέως καὶ Ῥωμύλου σύγκρισις,96.0,116.0,['Biographi' 'Philosophici/-ae'],[],pagan,,,,,,,,,,106.0,1315
4,lagt,lagt_tlg0007.tlg004,Plutarch,Λυκοῦργος,96.0,120.0,['Biographi' 'Philosophici/-ae'],[],pagan,,,,,,,,,,105.0,10933


In [53]:
# to get a quick overview of the subcorpora, including the work counts, sentence counts and token counts
# - the subcorpus name is the part of `grela_id` before the first underscore (e.g. "cc", "lagt")
# - all counts are computed from the `tokens` table; tokens_N is its row count per subcorpus
# NOTE(review): the prefix is cut with SUBSTR(grela_id, 0, INSTR(grela_id, '_')), i.e. with a start
#               index of 0; the displayed result shows this yields the name without the underscore
query = """
WITH subcorpora AS (
    SELECT
        SUBSTR(grela_id, 0, INSTR(grela_id, '_')) AS subcorpus,
        COUNT(DISTINCT grela_id) AS works_N,
        COUNT(DISTINCT sentence_id) AS sentences_N,
        COUNT(*) AS tokens_N
    FROM tokens
    GROUP BY subcorpus
)
SELECT * FROM subcorpora
ORDER BY subcorpus;
"""

# Execute the query via the API and load the result as a pandas DataFrame
subcorpus_stats_df = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])
subcorpus_stats_df

Unnamed: 0,subcorpus,works_N,sentences_N,tokens_N
0,cc,7819,11834607,201909639
1,emlap,73,220846,3495212
2,lagt,1957,2703678,35808742
3,noscemus,996,11802783,139401899
4,vulgate,73,35254,603091


## Working with the `sentences` table

In [54]:
# extract a subset of sentences with work-level metadata, selecting on a specific grela_id pattern
# NOTE: `w.*` also contains `grela_id`, so the result carries that key twice
#       (the second copy shows up as `grela_id_1` in the DataFrame)
query = """
        SELECT s.grela_id,
               s.sentence_id,
               s.text,
               w.*
        FROM sentences s
                 JOIN works w ON s.grela_id = w.grela_id
        WHERE w.grela_id LIKE 'vulgate_%'
        """

vulgate_sentences = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])

In [55]:
# preview the first rows of the joined sentence/work data
vulgate_sentences.head(5)

Unnamed: 0,grela_id,sentence_id,text,grela_source,grela_id_1,author,title,not_before,not_after,lagt_tlg_epithet,lagt_genre,lagt_provenience,noscemus_place,noscemus_genre,noscemus_discipline,title_short,emlap_noscemus_id,place_publication,place_geonames,author_viaf,title_viaf,date_random,token_count
0,vulgate_tlg0031.tlg001.obi-lat,vulgate_tlg0031.tlg001.obi-lat:1.1,liber generationis Iesu Christi filii David fi...,vulgate,vulgate_tlg0031.tlg001.obi-lat,,Vulgate - Matthew,,,,,,,,,,,,,,,,0
1,vulgate_tlg0031.tlg001.obi-lat,vulgate_tlg0031.tlg001.obi-lat:1.2,Abraham genuit Isaac Isaac autem genuit Iacob ...,vulgate,vulgate_tlg0031.tlg001.obi-lat,,Vulgate - Matthew,,,,,,,,,,,,,,,,0
2,vulgate_tlg0031.tlg001.obi-lat,vulgate_tlg0031.tlg001.obi-lat:1.3,Iudas autem genuit Phares et Zara de Thamar Ph...,vulgate,vulgate_tlg0031.tlg001.obi-lat,,Vulgate - Matthew,,,,,,,,,,,,,,,,0
3,vulgate_tlg0031.tlg001.obi-lat,vulgate_tlg0031.tlg001.obi-lat:1.4,Aram autem genuit Aminadab Aminadab autem genu...,vulgate,vulgate_tlg0031.tlg001.obi-lat,,Vulgate - Matthew,,,,,,,,,,,,,,,,0
4,vulgate_tlg0031.tlg001.obi-lat,vulgate_tlg0031.tlg001.obi-lat:1.5,Salmon autem genuit Booz de Rachab Booz autem ...,vulgate,vulgate_tlg0031.tlg001.obi-lat,,Vulgate - Matthew,,,,,,,,,,,,,,,,0


## Working with the `tokens` table

We can get all tokens data based on a specific work attribute, e.g. author name pattern

In the example below, we select all tokens belonging to works by Plato.

The tokens are in the order in which they appear in the text, so you can use this output to reconstruct the text.

You can also filter the tokens based on their POS tags, e.g. nouns, adjectives, etc.

We also keep track of the `sentence_id`, so you can at any time retrieve the raw text of the sentence from the `sentences` table.

In [56]:
# all token rows of works whose author is 'Plato', each joined with its work-level metadata
# NOTE: this LIKE pattern contains no wildcard, so it only matches the exact author string;
#       use e.g. '%Plato%' for a looser name pattern
query = """
    SELECT t.*, w.*
    FROM tokens t
    JOIN works w ON t.grela_id = w.grela_id
    WHERE w.author LIKE 'Plato'
"""

plato_tokens = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])
# number of token rows retrieved
len(plato_tokens)

1379783

In [58]:
# below we retrieve the raw text of all sentences containing the lemma "labyrinthus",
# together with the character offsets of the matched token and the work-level metadata
# (the query selects from `tokens`, so there is one row per matching token)
query = """
SELECT
    t.sentence_id,
    t.grela_id,
    t.token_id AS target_token_id,
    t.char_start AS target_char_start,
    t.char_end AS target_char_end,
    s.text AS sentence_text,
    w.author,
    w.title,
    w.not_before,
    w.not_after
FROM tokens t
JOIN sentences s ON t.sentence_id = s.sentence_id
JOIN works w ON t.grela_id = w.grela_id
WHERE t.lemma = 'labyrinthus'
"""

target_sentences  = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])
target_sentences.head(5)

Unnamed: 0,sentence_id,grela_id,target_token_id,target_char_start,target_char_end,sentence_text,author,title,not_before,not_after
0,noscemus_605285_6,noscemus_605285,34285288,91,101,sine istarum cognitione nec uerbum quidem inte...,"Cappeller, Moritz Anton",Prodromus crystallographiae: de crystallis imp...,1723.0,1723.0
1,noscemus_914304_6629,noscemus_914304,34503897,1253,1264,Nidus formicarum niger ex siluestribus de men ...,"Rumpf, Georg Eberhard","Herbarium Amboinense, plurimas complectens arb...",1741.0,1750.0
2,noscemus_786344_21148,noscemus_786344,73757811,137,147,Uerum quidem est multos inesse maeandros obliq...,"L'Ecluse, Charles de","Exoticorum libri decem: quibus animalium, plan...",1605.0,1605.0
3,noscemus_786344_25714,noscemus_786344,73832879,70,80,"Istic uespertiliones erant a nostris, & ab iis...","L'Ecluse, Charles de","Exoticorum libri decem: quibus animalium, plan...",1605.0,1605.0
4,cc_12765_1477,cc_12765,236733163,48,58,Quid faciamus homines miserrimi et noui generi...,Petronius,"Satyricon, Fragmenta, et Poemata",14.0,66.0


In [60]:
# Extract token data from all sentences containing the lemma "liber" with the POS tag "NOUN"
# Step 1 (CTE `target_matches`): collect the sentence_id of every matching token.
# Step 2: for each of those sentences, pack all of its tokens into one list of structs
#         ordered by token_id, giving one row per sentence.

query = """
WITH target_matches AS (
    SELECT sentence_id
    FROM tokens t
    WHERE t.lemma = 'liber' AND t.pos IN ('NOUN')
)

SELECT
    sentence_id,
    LIST(
        STRUCT_PACK(
            token_id    := token_id,
            token_text  := token_text,
            lemma       := lemma,
            pos         := pos,
            char_start  := char_start,
            char_end    := char_end,
            sentence_id := sentence_id
        )
        ORDER BY token_id
    ) AS tokens
FROM tokens
WHERE sentence_id IN (SELECT DISTINCT sentence_id FROM target_matches)
GROUP BY sentence_id;
"""

target_sentences_token_data  = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])
target_sentences_token_data.head(5)


In [61]:
# preview: one row per sentence, with its tokens stored as a list of dicts
target_sentences_token_data.head(5)

Unnamed: 0,sentence_id,tokens
0,emlap_100061_88,"[{'char_end': 11, 'char_start': 0, 'lemma': 'h..."
1,emlap_100061_458,"[{'char_end': 3, 'char_start': 0, 'lemma': 'li..."
2,emlap_100061_539,"[{'char_end': 3, 'char_start': 0, 'lemma': 'li..."
3,emlap_100061_1287,"[{'char_end': 1, 'char_start': 0, 'lemma': '&'..."
4,emlap_100061_2320,"[{'char_end': 3, 'char_start': 0, 'lemma': 'li..."


You can also retrieve the token data for a subset of works, e.g. for all Christian works from the first three centuries.

In [36]:
### all token data, grouped by sentence, from works with lagt_provenience "christian"
### whose not_before and not_after both fall within the first three centuries (0 <= year < 300)

query = """
SELECT
    t.sentence_id,
    w.author,
    w.title,
    w.not_before,
    w.not_after,
    LIST(
        STRUCT_PACK(
            token_id    := t.token_id,
            token_text  := t.token_text,
            lemma       := t.lemma,
            pos         := t.pos,
            char_start  := t.char_start,
            char_end    := t.char_end,
            sentence_id := t.sentence_id
        )
        ORDER BY t.token_id
    ) AS tokens
FROM tokens t
JOIN works w ON t.grela_id = w.grela_id
WHERE w.lagt_provenience = 'christian'
  AND w.not_before >= 0 AND w.not_before < 300
  AND w.not_after  >= 0 AND w.not_after  < 300
GROUP BY t.sentence_id, w.author, w.title, w.not_before, w.not_after
"""

EC_sentence_tokens = pd.read_parquet(requests.post(api_url, json={"query": query}).json()["download_url"])
# number of rows, i.e. of (sentence, work metadata) groups returned by the query
len(EC_sentence_tokens)

156571

From this output you can easily get to the data format we used previously in the LAGT dataset and elsewhere for training distributional semantic models.

In [42]:
# reduce every sentence to the lemmata of its tokens tagged "n", "v" or "a"
EC_sentence_tokens["lemmatized_sentence"] = EC_sentence_tokens["tokens"].apply(
    lambda sentence_tokens: [
        token["lemma"]
        for token in sentence_tokens
        if token["pos"] in ["n", "v", "a"]
    ]
)
# show the first ten lemmatized sentences
EC_sentence_tokens["lemmatized_sentence"].tolist()[:10]

[['σύνειμι', 'λογισμός', 'οἶδα', 'ἀρχή', 'λαμβάνω'],
 ['τίς', 'νοέω', 'δυνατός'],
 ['ὠχριακότα', 'τήκομαι'],
 ['βούλομαι', 'φθάνω', 'λέγω'],
 ['λέγω'],
 [],
 ['συνεβέλευέν', 'οὗτς', 'τολμάω', 'πολύς', 'τρόπες'],
 ['λογισμός',
  'πρᾶγμα',
  'φήμη',
  'τιβερίς',
  'καῖσαρ',
  'βασιλεία',
  'ἐαρινός',
  'τροπή',
  'ἀρχή',
  'λαμβάνεσα',
  'ἤυξανεν',
  'ἀγαθός',
  'θεός',
  'ἄγγελος',
  'διέρχομαι',
  'κόσμος',
  'θεός',
  'βούλημα',
  'σιγάω',
  'στέγω',
  'δύναμαι'],
 ['ἀπειθήω', 'ψυχή', 'σῶμα', 'λύσις', 'τόπος', 'πῦρ', 'βληθήσω'],
 []]

You can ultimately run very complex queries using the API.

In the example below, we retrieve the contextual data for all tokens matching the lemma "liber" with the POS tag "NOUN".

For each token, we retrieve:
- (1) the raw sentence in which the (lemmatized) token appears (`sentence_text`),
- (2) the token data from the sentence in which the token appears (`tokens`),
- (3) a broader context, including also 1 raw sentence before and 1 raw sentence after the target token (`context_3sents`)
- (4) concordance from the 10 tokens preceding and 10 tokens following the target token (i.e. 10+1+10 tokens) (`concordance_tokens`).

All these co-occurrence data are then returned within a single table and can be used for different kinds of co-occurrence and semantic analyses.

In [63]:


def get_grouped_token_data_with_metadata_api(api_url: str, lemma: str, pos_tags: list) -> pd.DataFrame:
    """
    Retrieve contextual data for every token matching a lemma and POS tag(s) via the API.

    The SQL query built here selects one row per matching token, with:
      - work metadata (author, title, date and genre/provenience/discipline columns),
      - `sentence_text`: raw text of the sentence containing the match,
      - `context_3sents`: the texts of the sentences of the same work at positions
        -1, 0 and +1 relative to the matched sentence, joined with ' | ',
      - `tokens`: all tokens of the matched sentence as a list of structs,
      - `concordance_tokens`: tokens of the same work whose `token_id` lies within
        +/- 10 of the target `token_id`
        (NOTE(review): this presumes token ids are consecutive within a work - confirm),
      - `target_token_id`, `target_char_start`, `target_char_end`.

    Parameters
    ----------
    api_url : str
        URL of the query endpoint; the query is sent as JSON via HTTP POST.
    lemma : str
        Lemma to match exactly against `tokens.lemma`.
    pos_tags : list of str
        POS tags to match against `tokens.pos`; a single string is treated as one tag.

    Returns
    -------
    pd.DataFrame
        The query result, loaded from the Parquet file the API points to.

    Raises
    ------
    ValueError
        If `pos_tags` is empty (the query would otherwise contain an invalid `IN ()`).
    requests.HTTPError
        If the API responds with an HTTP error status.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(pos_tags, str):
        pos_tags = [pos_tags]
    pos_tags = list(pos_tags)
    if not pos_tags:
        raise ValueError("pos_tags must contain at least one POS tag")

    def _sql_literal(value) -> str:
        """Quote a value as an SQL string literal, doubling embedded single quotes."""
        return "'" + str(value).replace("'", "''") + "'"

    # Escape the POS tags the same way as the lemma, since both are interpolated into the SQL text.
    pos_placeholder = ", ".join(_sql_literal(tag) for tag in pos_tags)
    lemma_literal = _sql_literal(lemma)

    query = f"""
    WITH target_matches AS (
        SELECT
            t.sentence_id,
            t.grela_id,
            s.position AS sentence_position,
            t.token_id AS target_token_id,
            t.char_start AS target_char_start,
            t.char_end AS target_char_end
        FROM tokens t
        JOIN sentences s USING (sentence_id)
        WHERE t.lemma = {lemma_literal}
        AND t.pos IN ({pos_placeholder})
    ),

    context_3sents AS (
        SELECT
            ts.sentence_id,
            STRING_AGG(s.text, ' | ' ORDER BY s.position) AS context_3sents
        FROM (
            -- one row per matched sentence: target_matches has one row per matched
            -- token, which would otherwise repeat the context for multi-match sentences
            SELECT DISTINCT sentence_id, grela_id, sentence_position
            FROM target_matches
        ) ts
        JOIN sentences s
          ON s.grela_id = ts.grela_id
         AND s.position BETWEEN ts.sentence_position - 1
                            AND ts.sentence_position + 1
        GROUP BY ts.sentence_id
    ),

    sentence_tokens AS (
        SELECT
            sentence_id,
            LIST(
              STRUCT_PACK(
                 token_id := token_id,
                 token_text := token_text,
                 lemma := lemma,
                 pos := pos,
                 char_start := char_start,
                 char_end := char_end,
                 sentence_id := sentence_id
              )
              ORDER BY token_id
            ) AS tokens
        FROM tokens
        WHERE sentence_id IN (SELECT DISTINCT sentence_id FROM target_matches)
        GROUP BY sentence_id
    ),

    concordance_tokens AS (
        SELECT
            tm.sentence_id,
            tm.target_token_id,
            LIST(
              STRUCT_PACK(
                 token_id := ct.token_id,
                 token_text := ct.token_text,
                 lemma := ct.lemma,
                 pos := ct.pos,
                 char_start := ct.char_start,
                 char_end := ct.char_end,
                 sentence_id := ct.sentence_id
              )
              ORDER BY ct.token_id
            ) AS concordance_tokens
        FROM target_matches tm
        JOIN tokens ct
          ON ct.grela_id = tm.grela_id
         AND ct.token_id BETWEEN tm.target_token_id - 10
                            AND tm.target_token_id + 10
        GROUP BY tm.sentence_id, tm.target_token_id
    )

    SELECT
        w.author,
        w.title,
        tm.grela_id,
        tm.sentence_id,
        s.text AS sentence_text,
        c3.context_3sents,
        st.tokens,
        ct.concordance_tokens,
        w.not_before,
        w.not_after,
        w.date_random,
        w.lagt_genre,
        w.lagt_provenience,
        w.noscemus_genre,
        w.noscemus_discipline,
        tm.target_token_id,
        tm.target_char_start,
        tm.target_char_end
    FROM target_matches tm
    JOIN sentences s USING (sentence_id)
    JOIN works w ON w.grela_id = tm.grela_id
    LEFT JOIN context_3sents c3 USING (sentence_id)
    LEFT JOIN sentence_tokens st USING (sentence_id)
    LEFT JOIN concordance_tokens ct
           ON ct.sentence_id = tm.sentence_id
          AND ct.target_token_id = tm.target_token_id
    ORDER BY tm.sentence_id, tm.target_token_id;
    """

    # Send the query to the API and fail early on HTTP errors
    response = requests.post(api_url, json={"query": query})
    response.raise_for_status()

    # The response points to a Parquet file holding the result; load it into a DataFrame
    download_url = response.json()["download_url"]
    return pd.read_parquet(download_url)

In [72]:

# Example usage: contextual data for all tokens with the lemma "liber" tagged as NOUN
liber_contextual_data = get_grouped_token_data_with_metadata_api(
    api_url="https://ccs-lab.zcu.cz/grela-api/api/query",
    lemma="liber",
    pos_tags=["NOUN"])

In [73]:
# number of rows returned (the query selects from one row per matching token)
len(liber_contextual_data)

303829

In [76]:
# preview the first rows of the contextual data
liber_contextual_data.head()

Unnamed: 0,author,title,grela_id,sentence_id,sentence_text,context_3sents,tokens,concordance_tokens,not_before,not_after,date_random,lagt_genre,lagt_provenience,noscemus_genre,noscemus_discipline,target_token_id,target_char_start,target_char_end
0,Franco Leodiensis,De quadratura circuli,cc_10000,cc_10000_12,"En autem partem prologi ad primum librum, item...",unde satis liquet Franconis opus intra hoc tem...,"[{'char_end': 2, 'char_start': 0, 'lemma': 'in...","[{'char_end': 59, 'char_start': 52, 'lemma': '...",,,,,,,,248829267,34,40
1,Franco Leodiensis,De quadratura circuli,cc_10000,cc_10000_12,"En autem partem prologi ad primum librum, item...",unde satis liquet Franconis opus intra hoc tem...,"[{'char_end': 2, 'char_start': 0, 'lemma': 'in...","[{'char_end': 26, 'char_start': 24, 'lemma': '...",,,,,,,,248829275,73,78
2,Franco Leodiensis,De quadratura circuli,cc_10000,cc_10000_14,Incipit Prologus In Primum Librum Domni Franco...,"hos enim locos ad specimen praebendum delegi, ...","[{'char_end': 7, 'char_start': 0, 'lemma': 'in...","[{'char_end': 50, 'char_start': 46, 'lemma': '...",,,,,,,,248829298,27,33
3,Franco Leodiensis,De quadratura circuli,cc_10000,cc_10000_38,Uirgilius cupiens a parentibus magnificare Aug...,"Si tu is esses, praesul eximie, cujus suae lau...","[{'char_end': 9, 'char_start': 0, 'lemma': 'ui...","[{'char_end': 437, 'char_start': 436, 'lemma':...",,,,,,,,248829889,65,71
4,Franco Leodiensis,De quadratura circuli,cc_10000,cc_10000_4,Amatores scientiae saecularis taxent ejus scie...,"Sic igitur Sigebertus, cap. 164. << Franco Sch...","[{'char_end': 8, 'char_start': 0, 'lemma': 'am...","[{'char_end': 158, 'char_start': 147, 'lemma':...",,,,,,,,248829141,55,60
