In [17]:
import pandas as pd
import duckdb
import os
import os, json
from uuid import uuid4
import pandas as pd
import numpy as np

In [3]:
works_df = pd.read_parquet("../data/grela_works_metadata.parquet")

In [15]:
conn = duckdb.connect("/srv/data/greek/grela.duckdb")
conn.execute("CREATE TABLE works AS SELECT * FROM works_df")

<duckdb.duckdb.DuckDBPyConnection at 0x72e3cd945ff0>

In [9]:
def process_sentences_from_dir(dir_path, grela_prefix):
    sentences = []
    tokens = []

    for fn in os.listdir(dir_path):
        grela_id = grela_prefix + "_" + fn.replace(".json", "")
        sents_data = json.load(open(os.path.join(dir_path, fn), "rb"))

        for sent in sents_data:
            work_id, pos, text, token_data = sent
            sentence_id = grela_id + "_" + str(pos) # str(uuid4())  # or use work_id + pos
            sentences.append([sentence_id, grela_id, pos, text])

            for token in token_data:
                tok_text, lemma, pos_tag, char_span, *extra = token
                page_idx, textblock_idx = (extra + [None, None])[:2]
                tokens.append([sentence_id, grela_id, tok_text, lemma, pos_tag, char_span[0], char_span[1], page_idx, textblock_idx])

    return pd.DataFrame(sentences, columns=["sentence_id", "grela_id", "position", "text"]), \
           pd.DataFrame(tokens, columns=["sentence_id", "grela_id", "token_text", "lemma", "pos", "char_start", "char_end", "page_idx", "textblock_idx"])

In [16]:
lagt_sents_data_dir = "/home/jupyter-vojta/notebooks/LAGT/data/large_files/sents_data_jsons"
emlap_sents_data_dir = "/srv/data/tome/tome-corpus/sents_data_id_jsons_v3-0/"
noscemus_sents_data_dir = "/srv/data/tome/noscemus/sents_data_jsons/"
cc_sents_data_dir = "/srv/data/corpus-corporum/cc_sents_jsons/"

In [12]:
sents_emlap, tokens_emlap = process_sentences_from_dir(emlap_sents_data_dir, "emlap")

In [13]:
# look at emlap data for testing
tokens_emlap.sample(10)

Unnamed: 0,sentence_id,grela_id,token_text,lemma,pos,char_start,char_end,page_idx,textblock_idx
19099,emlap_100044_2257,emlap_100044,fimus,fio,VERB,25,30,[92],[12]
1304734,emlap_100059_112,emlap_100059,quidem,quidem,PART,4,10,[18],[9]
316431,emlap_100012_1274,emlap_100012,genere,genus,NOUN,61,67,[138],[12]
681111,emlap_100070_1997,emlap_100070,.,.,PUNCT,33,34,[86],[11]
910438,emlap_100038_3334,emlap_100038,affinius,affinius,ADJ,10,18,[262],[12]
2764682,emlap_100030_385,emlap_100030,Solis,Soli,NOUN,55,60,[53],[8]
511034,emlap_100013_1838,emlap_100013,simplicem,simplex,ADJ,58,67,[81],"[5, 6]"
817257,emlap_100070_12032,emlap_100070,",",",",PUNCT,192,193,[392],[39]
1213528,emlap_100071_1163,emlap_100071,per,per,ADP,51,54,[207],[4]
2435019,emlap_100067_11810,emlap_100067,sit,sum,AUX,95,98,[683],[22]


In [14]:
# look
sents_emlap.sample(10)

Unnamed: 0,sentence_id,grela_id,position,text
40777,emlap_100028_6443,emlap_100028,6443,"Quo facto, siccentur ad solem, deinde uirga al..."
8228,emlap_100034_179,emlap_100034,179,"Sapientiam nempe longe a nobis peccatum fecit,..."
88763,emlap_100002_235,emlap_100002,235,P Facta per multos dies in uase circulatione i...
47413,emlap_100070_4001,emlap_100070,4001,Ita metalla in se transeunt mutuo.
25215,emlap_100012_4684,emlap_100012,4684,"Argentum (uiuum) praecipitatum sic fit, ut scr..."
129662,emlap_100006_524,emlap_100006,524,"Sique eorum sententia, qui omnes in hanc conue..."
98884,emlap_100032_4987,emlap_100032,4987,cumque caetera in metallis reperta igni perfic...
104665,emlap_100045_2401,emlap_100045,2401,has illarum abiectarum iam loco si substitueri...
92731,emlap_100049_3137,emlap_100049,3137,Ambrae drach. 2.
216710,emlap_100011_894,emlap_100011,894,"Concedimus eis utique, sed non propter hoc nos..."


In [None]:
sents_nos, tokens_nos = process_sentences_from_dir(noscemus_sents_data_dir, "noscemus")
sents_lagt, tokens_lagt = process_sentences_from_dir(lagt_sents_data_dir, "lagt")
sents_cc, tokens_cc = process_sentences_from_dir(cc_sents_data_dir, "cc")

In [None]:
# Merge
all_sents = pd.concat([sents_cc, sents_nos, sents_emlap, sents_lagt])
all_tokens = pd.concat([tokens_cc, tokens_nos, tokens_emlap, tokens_lagt])

## Add tokens and sentences into the database

In [None]:
# add table with sentence data
conn.execute("CREATE TABLE IF NOT EXISTS sentences AS SELECT * FROM all_sents")

In [None]:
# Store in batches if necessary:
for i, chunk in enumerate(np.array_split(all_tokens, 100)):
    conn.register("chunk", chunk)
    conn.execute("INSERT INTO tokens SELECT * FROM chunk")

In [None]:
lemmata_df = all_tokens.groupby(["lemma", "pos"]).size().reset_index(name="count")
conn.execute("CREATE TABLE lemmata AS SELECT * FROM lemmata_df")