In [4]:
import pandas as pd
import duckdb
import os
import os, json
from uuid import uuid4
import pandas as pd
import numpy as np

In [6]:
#works_df = pd.read_parquet("../data/grela_works_metadata.parquet")

In [7]:
conn = duckdb.connect("/srv/data/greek/grela.duckdb")
#conn.execute("CREATE TABLE works AS SELECT * FROM works_df")

In [8]:
def process_sentences_from_dir(dir_path, grela_prefix):
    sentences = []
    tokens = []

    # Iterate through items in the directory
    for fn in os.listdir(dir_path):
        # Ensure it's a JSON file (not a directory or other file type)
        if fn.endswith(".json") and os.path.isfile(os.path.join(dir_path, fn)):
            grela_id = grela_prefix + "_" + fn.replace(".json", "")
            sents_data = json.load(open(os.path.join(dir_path, fn), "rb"))

            # Process sentences and tokens
            for sent in sents_data:
                work_id, pos, text, token_data = sent
                sentence_id = grela_id + "_" + str(pos)  # Create sentence ID
                sentences.append([sentence_id, grela_id, pos, text])

                # Process tokens (with extra field handling)
                for token in token_data:
                    tok_text, lemma, pos_tag, char_span, *extra = token
                    page_idx, textblock_idx = (extra + [None, None])[:2]
                    tokens.append([sentence_id, grela_id, tok_text, lemma, pos_tag, char_span[0], char_span[1], page_idx, textblock_idx])

    return pd.DataFrame(sentences, columns=["sentence_id", "grela_id", "position", "text"]), \
           pd.DataFrame(tokens, columns=["sentence_id", "grela_id", "token_text", "lemma", "pos", "char_start", "char_end", "page_idx", "textblock_idx"])

In [25]:
def process_and_store_in_duckdb(dir_path, grela_prefix, conn, sentences_table="sentences", tokens_table="tokens"):
    """
    Processes large JSON sentence and token datasets, storing them incrementally into DuckDB.

    Args:
        dir_path (str): Path to the directory with JSON files.
        grela_prefix (str): Prefix for generating unique `grela_id` for each document.
        conn (duckdb.DuckDBPyConnection): Active DuckDB connection.
        sentences_table (str): Name of the table for storing sentence-level data.
        tokens_table (str): Name of the table for storing token-level data.

    Returns:
        None
    """
    for fn in os.listdir(dir_path):
        if not fn.endswith(".json"):  # Process only JSON files
            continue

        grela_id = f"{grela_prefix}_{fn.replace('.json', '')}"

        # Load JSON data
        with open(os.path.join(dir_path, fn), "r", encoding="utf-8") as f:
            sents_data = json.load(f)

        sentences, tokens = [], []

        for sent in sents_data:
            work_id, pos, text, token_data = sent
            sentence_id = f"{grela_id}_{pos}"

            # Store sentence data
            sentences.append([sentence_id, grela_id, pos, text])

            # Process tokens (exclude `page_idx` and `textblock_idx`)
            for token in token_data:
                tok_text, lemma, pos_tag, char_span = token[:4]
                tokens.append([
                    sentence_id,  # Sentence ID
                    grela_id,     # Document ID
                    tok_text,     # Token text
                    lemma,        # Lemma
                    pos_tag,      # Part of Speech
                    char_span[0],  # Start character position
                    char_span[1],  # End character position
                ])

        # Convert to DataFrames
        sentences_df = pd.DataFrame(
            sentences,
            columns=["sentence_id", "grela_id", "position", "text"]
        )
        tokens_df = pd.DataFrame(
            tokens,
            columns=[
                "sentence_id", "grela_id", "token_text", "lemma", "pos", 
                "char_start", "char_end"
            ]
        )

        # Create database tables (remove `page_idx` and `textblock_idx` from schema)
        conn.execute(f"""
            CREATE TABLE IF NOT EXISTS {sentences_table} (
                sentence_id STRING,
                grela_id STRING,
                position INT,
                text STRING
            )
        """)
        conn.execute(f"""
            CREATE TABLE IF NOT EXISTS {tokens_table} (
                sentence_id STRING,
                grela_id STRING,
                token_text STRING,
                lemma STRING,
                pos STRING,
                char_start INT,
                char_end INT
            )
        """)

        # Register DataFrames as temporary tables
        conn.register("temp_sentences", sentences_df)
        conn.register("temp_tokens", tokens_df)

        # Insert into DuckDB
        conn.execute(f"""
            INSERT INTO {sentences_table}
            SELECT * FROM temp_sentences
        """)
        conn.execute(f"""
            INSERT INTO {tokens_table}
            SELECT * FROM temp_tokens
        """)

        # Unregister temporary tables
        conn.unregister("temp_sentences")
        conn.unregister("temp_tokens")

In [10]:
lagt_sents_data_dir = "/home/jupyter-vojta/notebooks/LAGT/data/large_files/sents_data_jsons"
emlap_sents_data_dir = "/srv/data/tome/tome-corpus/sents_data_id_jsons_v3-0/"
noscemus_sents_data_dir = "/srv/data/tome/noscemus/sents_data_jsons/"
cc_sents_data_dir = "/srv/data/corpus-corporum/cc_sents_jsons/"

In [30]:
# Drop old tables if they exist
tokens_table = "tokens"
sentences_table = "sentences"
conn.execute(f"DROP TABLE IF EXISTS {sentences_table}")
conn.execute(f"DROP TABLE IF EXISTS {tokens_table}")

<duckdb.duckdb.DuckDBPyConnection at 0x7443a6f43270>

In [31]:
# Process each corpus and store into DuckDB
process_and_store_in_duckdb(emlap_sents_data_dir, "emlap", conn)

In [None]:
process_and_store_in_duckdb(noscemus_sents_data_dir, "noscemus", conn)
process_and_store_in_duckdb(lagt_sents_data_dir, "lagt", conn)
process_and_store_in_duckdb(cc_sents_data_dir, "cc", conn)

In [7]:
#sents_emlap, tokens_emlap = process_sentences_from_dir(emlap_sents_data_dir, "emlap")

In [8]:
# look at emlap data for testing
#tokens_emlap.sample(10)

Unnamed: 0,sentence_id,grela_id,token_text,lemma,pos,char_start,char_end,page_idx,textblock_idx
1349552,emlap_100059_3060,emlap_100059,tria,tres,NUM,24,28,[241],[5]
2049492,emlap_100061_2489,emlap_100061,ibid,ibid,ADV,0,4,[108],[5]
2198084,emlap_100063_99,emlap_100063,perpetuitare,perpetuito,VERB,153,165,[16],[19]
3141712,emlap_100039_141,emlap_100039,Mercurio,Mercurius,PROPN,34,42,[20],"[15, 16]"
1937786,emlap_100062_690,emlap_100062,negemus,nego,VERB,95,102,[64],[19]
1504414,emlap_100032_2012,emlap_100032,creationibus,creatio,NOUN,33,45,[146],[21]
1292954,emlap_100053_878,emlap_100053,copia,copia,NOUN,405,410,[133],[19]
3049342,emlap_100015_1257,emlap_100015,theriacalis,theriacalis,ADJ,43,54,[114],[18]
1539254,emlap_100032_4003,emlap_100032,argento,argentum,NOUN,114,121,[324],[23]
979396,emlap_100038_7507,emlap_100038,in,in,ADP,30,32,[575],[25]


In [9]:
# look
sents_emlap.sample(10)

Unnamed: 0,sentence_id,grela_id,position,text
110291,emlap_100051_3639,emlap_100051,3639,"Ad uenarum humores erassos, lentos, ac pituito..."
49304,emlap_100070_5892,emlap_100070,5892,"Figulina pinguis, sanguis draconis paucus, ter..."
131811,emlap_100061_612,emlap_100061,612,ibid. 91.
124958,emlap_100046_1953,emlap_100046,1953,"Uerum eiusmodi pascua quandoque peiora sunt, q..."
77197,emlap_100042_1078,emlap_100042,1078,"Nec est quod ita mireris, medicinam aliquam in..."
198900,emlap_100015_2627,emlap_100015,2627,"Prima aqua erit clara, & ualet podagrae humidae:"
95195,emlap_100032_1298,emlap_100032,1298,"quia ad formam oui dispositum est, & dicitur s..."
34064,emlap_100013_2815,emlap_100013,2815,"Omni mane & sero de hac aqua impones ad aures,..."
216215,emlap_100011_399,emlap_100011,399,Promissio & diuisio dicendorum de operationib.
94951,emlap_100032_1054,emlap_100032,1054,Hec est huius rei radix ut qui eam addiscere u...


In [None]:
%%time
sents_nos, tokens_nos = process_sentences_from_dir(noscemus_sents_data_dir, "noscemus")

In [15]:
sents_nos.sample(10)

Unnamed: 0,sentence_id,grela_id,position,text
743250,noscemus_668522_21910,noscemus_668522,21910,Elacatena quoque salsamento idoneus piscis est...
2565541,noscemus_914304_9018,noscemus_914304,9018,pag. 212.
2661058,noscemus_928147_13394,noscemus_928147,13394,Secunda ibi.
3149822,noscemus_732628_8192,noscemus_732628,8192,Idem inquietis & stolidis ingemus euenit:
9961985,noscemus_901145_78964,noscemus_901145,78964,fiat confectio in rotulis ponderis. 3 8.
3466975,noscemus_631363_13372,noscemus_631363,13372,Calceolario Ueronensi Pharmacopol Ioannis Bapt...
4135971,noscemus_929375_5561,noscemus_929375,5561,Inter sinistrum humerum & Tertiam 2.
8949356,noscemus_906961_21555,noscemus_906961,21555,Harduini in 410.
8237679,noscemus_664561_4548,noscemus_664561,4548,"Pitem attenuat, resoluit, Matt."
2789886,noscemus_835557_24530,noscemus_835557,24530,b. 50.


In [16]:
tokens_nos.sample(10)

Unnamed: 0,sentence_id,grela_id,token_text,lemma,pos,char_start,char_end,page_idx,textblock_idx
95331451,noscemus_767766_1405,noscemus_767766,camelorum,camelus,NOUN,13,22,,
5493374,noscemus_704336_15313,noscemus_704336,impressit,imprimo,VERB,53,62,,
99530955,noscemus_918511_10291,noscemus_918511,In,in,ADP,0,2,,
91258461,noscemus_906964_7937,noscemus_906964,De,de,ADP,26,28,,
92824347,noscemus_655273_3785,noscemus_655273,circulos,circulus,NOUN,82,90,,
31477593,noscemus_756874_21222,noscemus_756874,headed,,VERB,35,41,,
108428896,noscemus_756878_20567,noscemus_756878,Les,Les,PROPN,0,3,,
100898942,noscemus_888138_57149,noscemus_888138,corporis,corpus,NOUN,38,46,,
115204905,noscemus_744031_4973,noscemus_744031,",",",",PUNCT,84,85,,
106678555,noscemus_900765_6687,noscemus_900765,a,ab,ADP,88,89,,


In [17]:
sents_lagt, tokens_lagt = process_sentences_from_dir(lagt_sents_data_dir, "lagt")

In [None]:
sents_cc, tokens_cc = process_sentences_from_dir(cc_sents_data_dir, "cc")

In [None]:
# Merge
all_sents = pd.concat([sents_cc, sents_nos, sents_emlap, sents_lagt])
all_tokens = pd.concat([tokens_cc, tokens_nos, tokens_emlap, tokens_lagt])

## Add tokens and sentences into the database

In [None]:
# add table with sentence data
conn.execute("CREATE TABLE IF NOT EXISTS sentences AS SELECT * FROM all_sents")

In [None]:
# Store in batches if necessary:
for i, chunk in enumerate(np.array_split(all_tokens, 100)):
    conn.register("chunk", chunk)
    conn.execute("INSERT INTO tokens SELECT * FROM chunk")

In [None]:
lemmata_df = all_tokens.groupby(["lemma", "pos"]).size().reset_index(name="count")
conn.execute("CREATE TABLE lemmata AS SELECT * FROM lemmata_df")