<a href="https://colab.research.google.com/github/EzraBrand/DOM-manipulation-manipulating-text/blob/master/Parsing_Sefaria_Jastrow_Dictionary_XML_File_for_Greek_loanword_entries_11_Feb_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
import re
import unicodedata
import xml.etree.ElementTree as ET

import pandas as pd
import requests

# Download the Jastrow dictionary XML (pinned to a specific commit) and parse it.
url = "https://raw.githubusercontent.com/Sefaria/Sefaria-Data/947c1b91684df9f8b92f14cf0d281b5d4f29bfc7/dictionaries/Jastrow/data/01-Merged%20XML/Jastrow-full.xml"
# Bound the wait so a stalled connection cannot hang the notebook forever.
response = requests.get(url, timeout=60)
# Surface HTTP errors here instead of handing an error page to the XML parser,
# where it would fail with a confusing ParseError.
response.raise_for_status()
xml_content = response.content
# Parse from bytes so the parser honours the encoding declared in the XML itself.
tree = ET.parse(io.BytesIO(xml_content))
root = tree.getroot()

def has_greek_content(text):
    """Return True when *text* shows any sign of Greek material.

    The indicators, all matched case-insensitively, are: a basic Greek
    letter, the words "greek" / "gr.", a handful of Greek word endings
    (inside parentheses or as a stand-alone word), and a few
    region/culture stems (thrac-, hellenic, byzantin-, europ-, macedon-).
    """
    search_flags = re.IGNORECASE | re.UNICODE
    indicator_patterns = (
        r'[αβγδεζηθικλμνξοπρστυφχψω]',
        r'(?i)greek',
        r'(?i)gr\.',
        r'\([^)]*(?:ος|ον|ης|ευ|ιος|ια|ιον)[^)]*\)',
        r'\b(?:ος|ον|ης|ευ|ιος|ια|ιον)\b',
        r'(?i)(?:thrac|hellenic|byzantin)',
        r'(?i)(?:europ|macedon)',
    )
    # Stop at the first indicator that matches.
    for indicator in indicator_patterns:
        if re.search(indicator, text, search_flags) is not None:
            return True
    return False

def clean_text(text):
    """Normalise whitespace in *text* for tab-separated output.

    ``None`` becomes the empty string. Every run of whitespace (tabs and
    newlines included) collapses to a single space and the ends are
    trimmed. A result that begins with ``=`` gets a leading apostrophe
    (presumably so a spreadsheet does not read the cell as a formula).
    """
    if text is None:
        return ""
    # str.split() with no argument already splits on tabs and drops empty runs.
    collapsed = ' '.join(text.split())
    return "'" + collapsed if collapsed.startswith('=') else collapsed

def extract_greek_form(text):
    """Collect candidate Greek forms found in *text*.

    Four patterns are tried in turn: a parenthetical containing a Greek
    letter, the text following "Greek"/"Gr.", a word with a typical Greek
    ending, and any token containing a Greek letter. Each captured string
    is stripped and kept only if it contains at least one basic lowercase
    Greek letter.

    Returns a list in pattern order; the same form may appear more than
    once when several patterns capture it.
    """
    greek_letters = set('αβγδεζηθικλμνξοπρστυφχψω')
    candidate_patterns = (
        r'\(([^)]*?(?:α|β|γ|δ|ε|ζ|η|θ|ι|κ|λ|μ|ν|ξ|ο|π|ρ|σ|τ|υ|φ|χ|ψ|ω)[^)]*?)\)',
        r'(?:Greek|Gr\.)[:\s]+([^.,;\[\]]+)',
        r'\b(\w+(?:ος|ον|ης|ευ|ιος|ια|ιον))\b',
        r'([^\s(),;]+?(?:α|β|γ|δ|ε|ζ|η|θ|ι|κ|λ|μ|ν|ξ|ο|π|ρ|σ|τ|υ|φ|χ|ψ|ω)[^\s(),;]*)',
    )

    found = []
    for regex in candidate_patterns:
        # Every pattern has exactly one capturing group, so findall yields
        # that group's text for each match.
        for captured in re.findall(regex, text, re.UNICODE):
            candidate = captured.strip()
            if not greek_letters.isdisjoint(candidate):
                found.append(candidate)
    return found

def remove_greek_content(text, greek_forms):
    """Remove Greek forms and their parenthetical contexts from text.

    For each form in *greek_forms*, in order, any parenthetical that
    contains the form is deleted, then any remaining bare occurrence of
    the form. Afterwards empty parentheses are dropped and whitespace is
    collapsed.

    Args:
        text: The string to clean.
        greek_forms: Iterable of literal strings to remove. Empty strings
            are ignored.

    Returns:
        The cleaned, stripped string.
    """
    cleaned_text = text
    for greek_form in greek_forms:
        # An empty form would turn the pattern below into "any parenthetical"
        # and wipe unrelated content, so skip it.
        if not greek_form:
            continue
        # Remove the Greek form together with its surrounding parentheses
        cleaned_text = re.sub(
            r'\([^)]*' + re.escape(greek_form) + r'[^)]*\)', '', cleaned_text
        )
        # Remove any remaining bare occurrence of the form
        cleaned_text = cleaned_text.replace(greek_form, '')

    # Clean up any resulting empty parentheses and repeated whitespace
    cleaned_text = re.sub(r'\(\s*\)', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text.strip()

def extract_greek_loanwords():
    """Build one output row per dictionary entry that shows Greek content.

    Walks every ``entry`` element under the module-level ``root``. Entries
    whose full text passes ``has_greek_content`` are kept. For each kept
    entry, the ``definition`` and ``notes`` of every ``senses/sense`` child
    are cleaned, scanned for Greek forms, and stripped of those forms.

    Returns:
        A list of rows, each ``[id, headword, greek_forms, definition,
        notes]`` where every item is a string. ``greek_forms`` is the
        de-duplicated forms joined with ``'; '`` in first-seen order, so
        the output is identical from run to run.
    """
    entries = []

    for entry in root.findall('.//entry'):
        entry_text = ' '.join(entry.itertext()).strip()
        if not has_greek_content(entry_text):
            continue

        headword_elem = entry.find('head-word')
        headword = clean_text(headword_elem.text) if headword_elem is not None else ''

        definition_parts = []
        notes_parts = []
        # A dict used as an ordered set: a plain set would make the joined
        # "Greek" column order vary between interpreter runs.
        all_greek_forms = {}

        senses_elem = entry.find('senses')
        if senses_elem is not None:
            for sense in senses_elem.findall('sense'):
                # Definition first, then notes, for every sense.
                for tag, parts in (('definition', definition_parts),
                                   ('notes', notes_parts)):
                    field_elem = sense.find(tag)
                    if field_elem is None:
                        continue
                    field_text = clean_text(' '.join(field_elem.itertext()))
                    greek_forms = extract_greek_form(field_text)
                    all_greek_forms.update(dict.fromkeys(greek_forms))
                    parts.append(remove_greek_content(field_text, greek_forms))

        entries.append([
            entry.get('id', ''),
            headword,
            '; '.join(all_greek_forms),
            clean_text(' '.join(definition_parts)),
            clean_text(' '.join(notes_parts)),
        ])

    return entries

# Run the extraction over the parsed dictionary
entries = extract_greek_loanwords()

# Report the match count, then print a header row followed by a
# tab-separated preview of the rows
print(f"Total entries found: {len(entries)}\n")
print("ID\tHebrew/Aramaic\tGreek\tDefinition\tNotes")
preview_rows = entries[:2000]  # the preview is capped at the first 2000 rows
for entry in preview_rows:
    print(*entry, sep='\t')

In [None]:
# Latin equivalents of the unaccented lowercase Greek letters. Accented and
# capital letters are reduced to these via Unicode decomposition, so every
# polytonic variant is covered without listing each one.
_GREEK_BASE_TO_LATIN = {
    'α': 'a', 'β': 'b', 'γ': 'g', 'δ': 'd', 'ε': 'e', 'ζ': 'z', 'η': 'ē',
    'θ': 'th', 'ι': 'i', 'κ': 'k', 'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x',
    'ο': 'o', 'π': 'p', 'ρ': 'r', 'σ': 's', 'ς': 's', 'τ': 't', 'υ': 'y',
    'φ': 'ph', 'χ': 'ch', 'ψ': 'ps', 'ω': 'ō',
}
_ROUGH_BREATHING = '\u0314'  # combining reversed comma above (dasia)
_DIAERESIS = '\u0308'        # combining diaeresis: marks a vowel as NOT part of a diphthong
_DIPHTHONG_FIRST = 'αεο'     # unmarked first vowels that form a diphthong
_DIPHTHONG_SECOND = {'υ': 'u', 'ι': 'i'}  # second vowel -> its Latin form inside a diphthong


def _add_rough_breathing(latin):
    """Prefix *latin* with h, moving an initial capital onto the H."""
    if latin[:1].isupper():
        return 'H' + latin.lower()
    return 'h' + latin


def _transliterate(text):
    """Transliterate the Greek letters in *text*, passing everything else through."""
    pieces = []
    # True when the previous letter was an unmarked α/ε/ο, i.e. a following
    # ι/υ completes a diphthong.
    prev_bare_vowel = False
    i = 0
    length = len(text)
    while i < length:
        char = text[i]
        i += 1
        decomposed = unicodedata.normalize('NFD', char)
        base = decomposed[0]
        lower_base = base.lower()
        latin = _GREEK_BASE_TO_LATIN.get(lower_base)
        if latin is None:
            # Not a Greek letter (Hebrew, Latin, punctuation, ...): keep as is.
            pieces.append(char)
            prev_bare_vowel = False
            continue

        # Diacritics come from the decomposition and, for input that is
        # already decomposed, from the combining characters that follow.
        marks = decomposed[1:]
        while i < length and unicodedata.combining(text[i]):
            marks += text[i]
            i += 1

        is_upper = base != lower_base
        rough = _ROUGH_BREATHING in marks

        if (prev_bare_vowel and not is_upper
                and lower_base in _DIPHTHONG_SECOND
                and _DIAERESIS not in marks):
            # Second vowel of a diphthong: υ is written "u" rather than "y".
            piece = _DIPHTHONG_SECOND[lower_base]
            if rough:
                # The breathing sits on the second vowel but is pronounced
                # before the whole diphthong (αὑ -> "hau").
                pieces[-1] = _add_rough_breathing(pieces[-1])
        else:
            piece = latin.capitalize() if is_upper else latin
            if rough:
                # Rho with rough breathing is written "rh"; vowels get a leading h.
                piece = piece + 'h' if lower_base == 'ρ' else _add_rough_breathing(piece)

        pieces.append(piece)
        prev_bare_vowel = not marks and lower_base in _DIPHTHONG_FIRST

    return ''.join(pieces)


def greek_to_latin(greek_text):
    """Transliterate Greek to standard Latin characters.

    Greek letters, with or without accents and breathings, precomposed or
    decomposed, are mapped to Latin; η and ω become ē and ō, and a rough
    breathing adds "h". The pairs α/ε/ο + υ are written au/eu/ou. Characters
    that are not Greek letters (whitespace, punctuation, Hebrew, Latin) are
    passed through unchanged.

    Args:
        greek_text: Text to transliterate. If it starts with ``=`` (ignoring
            leading whitespace), that first ``=`` and the whitespace around
            it are removed first.

    Returns:
        The transliterated string; ``''`` for empty or ``None`` input.
    """
    if not greek_text:
        return ''

    # Handle equal sign at start of string
    if greek_text.lstrip().startswith('='):
        greek_text = greek_text.replace('=', '', 1).lstrip()

    return _transliterate(greek_text)

def strip_nikud(text):
    """Remove Hebrew vowel points and cantillation marks from *text*.

    Only the combining marks of the Hebrew block are removed; letters and
    punctuation such as maqaf (U+05BE) are kept.
    """
    return re.sub(r'[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]', '', text)


# Process the entries
processed_entries = []
for entry in entries:
    # The entry id is unpacked but not carried into the processed rows.
    id_, hebrew, greek, definition, notes = entry

    # Strip nikud from Hebrew/Aramaic
    clean_hebrew = strip_nikud(hebrew)

    # Transliterate first (greek_to_latin drops a leading '='), then
    # normalise the Greek column itself with clean_text.
    transliterated = greek_to_latin(greek) if greek else ''
    greek = clean_text(greek) if greek else ''

    processed_entries.append([
        clean_hebrew,
        transliterated,
        greek,
        definition,
        notes
    ])

# Print headers and data in tab-separated format
# NOTE(review): the fifth column holds the notes text although its header
# reads "Entry" — confirm which label is intended.
print("Hebrew/Aramaic\tTransliterated\tGreek\tDefinition\tEntry")
for entry in processed_entries:
    print('\t'.join(entry))

Hebrew/Aramaic	Transliterated	Greek	Definition	Entry
אב־	origin answering to au-; eugenēs.; automatos	origin answering to αὐ-; εὐγενής.; αὐτόματος		a prefix of words of Greek , au-, e.g. אבטומטוס = ; or to εὐ, e.g. אבגינוס =
אבגינוס	נֵיס—eugenēs; corr. נֵיס—eugenēs	נֵיס—εὐγενής; corr. נֵיס—εὐγενής	of noble descent .	Koh. R. beg.; a. fr. (Midr. Till. to Ps. I אביינוס; Cant. R. beg. אווגיטוס, corr. acc.).— Pl. Yalk. Ps. 863 שהוא אֶוְגְּנֵסִין (read בן א') he is the son of nobles; Midr. Till. to Ps. CV בן גנסין (corr. acc.) cmp. גְּנִיס.
אבגניסטי	eugenestatoi	εὐγενέστατοι	(read סְטָטֵי—), אֶוְוגִּינִיסְטָאטֵי m. pl. most noble .	Ruth R. to I, 2. Midr. Sam. ch. I.
אבדוקוס	Eutokos	Εὔτοκος	Ebdocus (Eutocus).	Y. Meg. III, 74 a bot. rendered in a secret political letter טוב ילד Good-Child . (Ed. קום—, קין—corr. acc.)
אבדימוס	Eudēmos	Εὔδημος	Ebdimos, Eudemus .	Y. Keth. XI, 34 b ; mostly abbrev. אבדימא, אבדימי (corrupt. אבדומי, אבודמא, אבודמי), name of several Amoraim , the most prominent: Eb. o