In [23]:
import requests
import pickle
import pandas as pd

In [3]:

# Define the GitHub API endpoint for the target folder
api_url = "https://api.github.com/repos/lascivaroma/latin-lemmatized-texts/contents/lemmatized/xml"

# Ask for the v3 JSON representation. This header only selects the response
# format; it does not raise the rate limit (that would need an
# "Authorization" token header).
# NOTE(review): the contents endpoint is documented to cap directory listings
# at 1,000 entries -- confirm the folder stays below that.
headers = {"Accept": "application/vnd.github.v3+json"}

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(api_url, headers=headers, timeout=30)

# Fail loudly on an HTTP error (e.g. 403 when rate-limited) instead of
# silently leaving `xml_files` undefined for the cells below.
response.raise_for_status()

# Keep plain files only; sub-directories are reported with another "type".
files = response.json()
xml_files = [file["name"] for file in files if file["type"] == "file"]

In [8]:
# Inspect the file names collected from the directory listing.
xml_files

['urn:cts:greekLit:tlg0031.tlg001.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg002.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg003.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg004.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg005.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg006.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg007.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg008.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg009.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg010.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg011.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg012.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg013.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg014.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg015.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg016.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg017.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg018.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg019.obi-lat1.xml',
 'urn:cts:greekLit:tlg0031.tlg020.obi-lat1.xml',
 'urn:cts:greekLit:t

In [9]:
# Keep only the files whose name contains one of the two URN markers
# ":tlg0031" or ":tlg0527"; the cell then displays how many matched.
filenames_vulgate = [
    name
    for name in xml_files
    if any(marker in name for marker in (":tlg0031", ":tlg0527"))
]
len(filenames_vulgate)

73

In [14]:
import requests
import re
from bs4 import BeautifulSoup

# Full tags that need a label different from what their three-letter prefix
# alone would give; checked before the prefix table.
_POS_MAP_EXACT = {
    "NOMpro": "PROPN",
    "ADJcar": "NUM",
    "CONsub": "SCONJ",
    "CONcoo": "CCONJ",
}

# Fallback table keyed on the first three characters of the tag.
_POS_MAP_3LETTER = {
    "VER": "VERB",
    "NOM": "NOUN",
    "ADJ": "ADJ",
    "ADV": "ADV",
    "PRE": "ADP",
    "CON": "CCONJ",
    "PRO": "PRON",
}


def map_tei_pos(pos_tag):
    """Map a POS tag from the TEI files to a coarse part-of-speech label.

    Parameters
    ----------
    pos_tag : str or None
        Value of a token's ``pos`` attribute (e.g. ``"NOMpro"``, ``"VER"``).
        ``None`` or an empty string is accepted and treated as unknown.

    Returns
    -------
    str
        One of ``"PROPN"``, ``"NUM"``, ``"SCONJ"``, ``"CCONJ"``, ``"VERB"``,
        ``"NOUN"``, ``"ADJ"``, ``"ADV"``, ``"ADP"``, ``"PRON"``, or ``"X"``
        when the tag is missing or not recognised.
    """
    # A token without a `pos` attribute arrives here as None; slicing None
    # would raise TypeError, so report it as unknown instead.
    if not pos_tag:
        return "X"

    exact = _POS_MAP_EXACT.get(pos_tag)
    if exact is not None:
        return exact

    # Only the first three characters decide the label for all other tags.
    return _POS_MAP_3LETTER.get(pos_tag[:3], "X")

def parse_tei_verses(file_name, timeout=30):
    """Download one lemmatized TEI file and split it into verses and tokens.

    Parameters
    ----------
    file_name : str
        Name of an XML file under ``lemmatized/xml`` on the ``main`` branch of
        the ``lascivaroma/latin-lemmatized-texts`` repository.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    tuple
        ``(work_id, title, verses)``. ``verses`` is a list with one
        ``(work_id, chapter_verse, verse_text, token_data)`` tuple per
        ``<ab type="verse">`` element, and ``token_data`` is a list of
        ``(word, lemma, mapped_pos, ref_string)`` tuples, one per non-empty
        ``<w>`` element of that verse.

    Raises
    ------
    requests.HTTPError
        If the download answers with an HTTP error status.
    """
    url = f"https://raw.githubusercontent.com/lascivaroma/latin-lemmatized-texts/main/lemmatized/xml/{file_name}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "xml")

    # Extract CTS URN and construct work_id
    urn = soup.find("text")["n"]
    work_id = urn.replace("urn:cts:greekLit:", "").replace(".xml", "").replace(".obi-lat1", ".obi-lat")

    # Extract book title from header
    title_tag = soup.find("title")
    title = title_tag.text.strip() if title_tag else "Unknown_Book"

    # Normalize book title to create reference token string
    title_ref_part = title.replace(" ", "_")

    verses = []

    for ab in soup.find_all("ab", {"type": "verse"}):
        verse_id = ab.get("n")  # e.g., urn:cts:greekLit:tlg0031.tlg003.obi-lat1:1.13
        chapter_verse = verse_id.split(":")[-1]  # e.g., 1.13
        ref_string = f"{title_ref_part}_{chapter_verse}"

        verse_tokens = []
        token_data = []

        for w in ab.find_all("w"):
            word = w.text.strip()
            if not word:
                # Tokens with no surface text are left out of the verse.
                continue

            # Drop a trailing run of digits from the lemma
            # (presumably a homograph index -- TODO confirm).
            lemma = re.sub(r"\d+$", "", w.get("lemma") or "")
            # A missing `pos` attribute comes back as None; pass an empty
            # string so the token is tagged "X" rather than raising.
            mapped_pos = map_tei_pos(w.get("pos") or "")

            token_data.append((word, lemma, mapped_pos, ref_string))
            verse_tokens.append(word)

        verse_text = " ".join(verse_tokens)
        verses.append((work_id, chapter_verse, verse_text, token_data))

    return work_id, title, verses

In [15]:
# Try the parser on the first selected file; the cell displays the returned
# (work_id, title, verses) tuple.
parse_tei_verses(filenames_vulgate[0])

('tlg0031.tlg001.obi-lat',
 'Matthew',
 [('tlg0031.tlg001.obi-lat',
   '1.1',
   'liber generationis Iesu Christi filii David filii Abraham',
   [('liber', 'liber', 'ADJ', 'Matthew_1.1'),
    ('generationis', 'generatio', 'NOUN', 'Matthew_1.1'),
    ('Iesu', 'Iesus', 'PROPN', 'Matthew_1.1'),
    ('Christi', 'Christus', 'PROPN', 'Matthew_1.1'),
    ('filii', 'filius', 'NOUN', 'Matthew_1.1'),
    ('David', 'Dauid', 'PROPN', 'Matthew_1.1'),
    ('filii', 'filius', 'NOUN', 'Matthew_1.1'),
    ('Abraham', 'Abraham', 'PROPN', 'Matthew_1.1')]),
  ('tlg0031.tlg001.obi-lat',
   '1.2',
   'Abraham genuit Isaac Isaac autem genuit Iacob Iacob autem genuit Iudam et fratres eius',
   [('Abraham', 'Abraham', 'PROPN', 'Matthew_1.2'),
    ('genuit', 'gigno', 'VERB', 'Matthew_1.2'),
    ('Isaac', 'Isaac', 'PROPN', 'Matthew_1.2'),
    ('Isaac', 'Isaac', 'PROPN', 'Matthew_1.2'),
    ('autem', 'autem', 'CCONJ', 'Matthew_1.2'),
    ('genuit', 'gigno', 'VERB', 'Matthew_1.2'),
    ('Iacob', 'Iacob', 'PROPN', 

In [16]:
# Parse every selected file once, gathering one metadata record per work and
# a single flat list holding the verse tuples of all works.
vulgate_works, sentences_data = [], []
for filename in filenames_vulgate:
    work_id, title, verses = parse_tei_verses(filename)
    vulgate_works.append(
        {
            "grela_id": f"vulgate_{work_id}",
            "title": f"Vulgate - {title}",
        }
    )
    sentences_data += verses

In [19]:
# Inspect the collected per-work metadata records (grela_id and title).
vulgate_works

[{'grela_id': 'vulgate_tlg0031.tlg001.obi-lat', 'title': 'Vulgate - Matthew'},
 {'grela_id': 'vulgate_tlg0031.tlg002.obi-lat', 'title': 'Vulgate - Mark'},
 {'grela_id': 'vulgate_tlg0031.tlg003.obi-lat', 'title': 'Vulgate - Luke'},
 {'grela_id': 'vulgate_tlg0031.tlg004.obi-lat', 'title': 'Vulgate - John'},
 {'grela_id': 'vulgate_tlg0031.tlg005.obi-lat', 'title': 'Vulgate - Acts'},
 {'grela_id': 'vulgate_tlg0031.tlg006.obi-lat', 'title': 'Vulgate - Romans'},
 {'grela_id': 'vulgate_tlg0031.tlg007.obi-lat',
  'title': 'Vulgate - 1 Corinthians'},
 {'grela_id': 'vulgate_tlg0031.tlg008.obi-lat',
  'title': 'Vulgate - 2 Corinthians'},
 {'grela_id': 'vulgate_tlg0031.tlg009.obi-lat',
  'title': 'Vulgate - Galatians'},
 {'grela_id': 'vulgate_tlg0031.tlg010.obi-lat',
  'title': 'Vulgate - Ephesians'},
 {'grela_id': 'vulgate_tlg0031.tlg011.obi-lat',
  'title': 'Vulgate - Philippians'},
 {'grela_id': 'vulgate_tlg0031.tlg012.obi-lat',
  'title': 'Vulgate - Colossians'},
 {'grela_id': 'vulgate_tlg0031

In [22]:
# Pickle the work metadata and the verse data into ../data, one file each.
for pickle_path, payload in (
    ("../data/vulgate_works.pkl", vulgate_works),
    ("../data/vulgate_sentences.pkl", sentences_data),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

In [24]:
# Turn the list of work records into a table and preview its first five rows.
vulgate_works_df = pd.DataFrame(data=vulgate_works)
vulgate_works_df.head()

Unnamed: 0,grela_id,title
0,vulgate_tlg0031.tlg001.obi-lat,Vulgate - Matthew
1,vulgate_tlg0031.tlg002.obi-lat,Vulgate - Mark
2,vulgate_tlg0031.tlg003.obi-lat,Vulgate - Luke
3,vulgate_tlg0031.tlg004.obi-lat,Vulgate - John
4,vulgate_tlg0031.tlg005.obi-lat,Vulgate - Acts


In [25]:
# Also store the works table as a Parquet file in ../data, next to the pickles.
vulgate_works_df.to_parquet("../data/vulgate_works_df.parquet")