In [31]:
import random
import requests
import csv
import os
import pandas as pd
from bs4 import BeautifulSoup as bs
from io import StringIO
from urllib.parse import quote
import re
import unicodedata

In [32]:
def strip_accents(s):
    """Return ``s`` with its combining marks removed.

    The string is decomposed with NFD and every code point whose Unicode
    category is ``'Mn'`` is dropped. The result is not recomposed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def generate_morph_code(part_of_speech, person, number, tense, mood, voice, gender, case, degree):
    """
    Concatenate the nine grammatical features into a single morph code.

    The features are joined without separators, in the order of the
    parameters: part of speech, person, number, tense, mood, voice, gender,
    case, degree.
    """
    positions = (part_of_speech, person, number, tense, mood, voice, gender, case, degree)
    return ''.join(format(value, '') for value in positions)

def normalize_word(word):
    """Return the accent-free, lowercase form of a Greek word."""
    unaccented = strip_accents(word)
    return unaccented.lower()



In [33]:
def parse_noun_table(df, base_word, gender, sequence_num_start, definition):
    """Extract inflected forms and their morph codes from a noun declension table.

    The first column of ``df`` is read as the case label of each row and the
    remaining column headers as grammatical numbers. Only columns whose header
    is exactly 'singular', 'dual' or 'plural' (case-insensitive, stripped) are
    read. Rows labelled 'notes:' or with any other excluded text are skipped;
    rows with an unrecognised label are kept and get '-' in the case position.

    Args:
        df: DataFrame holding the table. It is only read, never modified.
        base_word: Headword the forms belong to.
        gender: Gender code placed in the morph code.
        sequence_num_start: Number assigned to the first extracted form;
            later forms count up from it.
        definition: Definition text copied into every entry.

    Returns:
        A list of dicts with the keys 'id', 'text', 'bare_text',
        'sequence_num', 'morph_code', 'base_form', 'bare_base_form' and
        'definition', in row-major table order.
    """
    case_map = {
        'nominative': 'n',
        'genitive': 'g',
        'dative': 'd',
        'accusative': 'a',
        'vocative': 'v',
        'ablative': 'b',
        'locative': 'l',
        'instrumental': 'i',
        'notes:': ''
    }
    number_map = {
        'singular': 's',
        'dual': 'd',
        'plural': 'p'
    }
    exclude_texts = {
        'adverb', 'comparative', 'superlative', '-', '—', '', ' ', 'notes:'
    }

    forms = []
    if df.shape[1] == 0:
        # Nothing to read: there is not even a case-label column.
        return forms

    sequence_num = sequence_num_start
    bare_base_form = strip_accents(base_word)  # identical for every form

    # Normalised labels are kept in locals; every lookup below is positional,
    # so the caller's column headers do not need to be rewritten.
    cases = df.iloc[:, 0].astype(str).str.lower().str.strip()
    number_columns = []
    for position, col in enumerate(df.columns[1:], start=1):
        header = str(col).lower().strip()
        number_key = number_map.get(header, '-')
        if number_key == '-' or header in exclude_texts:
            continue
        number_columns.append((position, number_key))

    for row_idx, case in enumerate(cases):
        case_key = case_map.get(case, '-')
        if case_key == '' or case in exclude_texts:
            continue  # notes or other non-form rows

        for position, number_key in number_columns:
            form = df.iloc[row_idx, position]
            if not isinstance(form, str):
                continue
            form = form.strip()
            if form == '' or form in exclude_texts:
                continue

            morph_code = generate_morph_code(
                part_of_speech='n',
                person='-',
                number=number_key,
                tense='-',
                mood='-',
                voice='-',
                gender=gender,
                case=case_key,
                degree='-'
            )
            forms.append({
                'id': sequence_num,
                'text': form,
                'bare_text': strip_accents(form),
                'sequence_num': sequence_num,
                'morph_code': morph_code,
                'base_form': base_word,
                'bare_base_form': bare_base_form,
                'definition': definition
            })
            sequence_num += 1
    return forms

def parse_adjective_table(df, base_word, sequence_num_start, definition):
    """
    Parse a multi-level Ancient Greek adjective table.

    Column headers are flattened to lowercase strings; each column whose
    flattened name mentions both a number (singular/dual/plural) and a gender
    is read. The first column is taken as the row label and must contain a
    case name for the row to be used. Cells holding several forms separated
    by '/' yield one entry per form. Every entry is tagged as positive degree.

    If the table's header isn't properly detected as multirow (for example,
    if the first column reads "number"), the first row is assumed to contain
    the actual header and is promoted.

    IMPORTANT:
      When calling pd.read_html on Wiktionary adjective tables, consider using:
          header=[0,1,2]
      to see if you obtain a proper MultiIndex. If not, this function will try to adjust.

    Args:
        df: DataFrame holding the table. A copy is processed, so the caller's
            frame and its column labels are left untouched.
        base_word: Headword the forms belong to.
        sequence_num_start: Number assigned to the first extracted form.
        definition: Definition text copied into every entry.

    Returns:
        A list of form dicts (keys 'id', 'text', 'bare_text', 'sequence_num',
        'morph_code', 'base_form', 'bare_base_form', 'definition'); empty when
        no usable column or form is found.

    Progress is reported through ``=== DEBUG`` lines printed to stdout.
    """
    # Mapping definitions.
    case_map = {
        'nominative': 'n',
        'genitive': 'g',
        'dative': 'd',
        'accusative': 'a',
        'vocative': 'v',
        'ablative': 'b',
        'locative': 'l',
        'instrumental': 'i',
    }
    number_map = {
        'singular': 's',
        'dual': 'd',
        'plural': 'p'
    }
    gender_map = {
        'masculine': 'm',
        'feminine': 'f',
        'masculine / feminine': 'mf',
        'masculine/feminine': 'mf',
        'neuter': 'n',
    }
    # Skip only these keywords.
    skip_keywords = {'derived forms', 'notes', 'adverb', 'comparative', 'superlative', '—', '-'}

    forms = []
    sequence_num = sequence_num_start
    bare_base_form = strip_accents(base_word)  # identical for every form

    # Relabelling happens on a private copy so the caller's frame is not altered.
    df = df.copy()

    # === Step 1: Examine and Flatten Headers ===
    print("=== DEBUG: Original DF.columns ===")
    print(df.columns)
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            " ".join(str(x).strip().lower() for x in col if str(x).lower() != 'nan')
            for col in df.columns
        ]
    else:
        df.columns = [str(col).strip().lower() for col in df.columns]
        # If the first header is "number" (which indicates the header wasn't read properly),
        # use the first row as the header and drop it.
        if len(df.columns) > 0 and len(df) > 0 and df.columns[0] == "number":
            print("=== DEBUG: Detected 'number' as first header. Using first row as new header.")
            # str() keeps missing header cells (NaN) usable as labels instead
            # of leaving floats that would break the string handling below.
            new_header = [str(value).lower() for value in df.iloc[0]]
            df = df.iloc[1:].reset_index(drop=True)
            df.columns = new_header

    print("=== DEBUG: Processed columns ===")
    print(df.columns.tolist())

    # === Step 2: Identify Columns for (Number, Gender) ===
    # Assume the first column now is the row label (which contains the case name).
    col_info = {}
    for idx, col_name in enumerate(df.columns):
        col_name = str(col_name).strip()
        if idx == 0:
            continue  # skip row label column

        # Skip columns with skip keywords.
        if any(kw in col_name for kw in skip_keywords):
            print(f"=== DEBUG: Skipping column '{col_name}' due to skip keyword")
            continue

        # Check for the number (singular, dual, plural) in the column name.
        number_val = None
        for pat, nm_code in number_map.items():
            if pat in col_name:
                number_val = nm_code
                break

        # Check for gender information. A header naming both masculine and
        # feminine is a shared column, whatever separator is used.
        gender_val = None
        if 'masculine' in col_name and 'feminine' in col_name:
            gender_val = 'mf'
        else:
            for pat, g_code in gender_map.items():
                if pat in col_name:
                    gender_val = g_code
                    break

        if number_val and gender_val:
            col_info[idx] = (number_val, gender_val)
            print(f"=== DEBUG: Column index {idx} '{col_name}' mapped to ({number_val}, {gender_val})")
        else:
            print(f"=== DEBUG: Column index {idx} '{col_name}' did not match number/gender criteria.")

    if not col_info:
        print("=== DEBUG: No columns with both number and gender info were found. Check header reading.")
        return forms  # return empty if nothing found

    # === Step 3: Process Each Row to Extract Forms ===
    for row_idx in range(len(df)):
        row_label = str(df.iloc[row_idx, 0]).lower().strip()
        if not row_label or row_label in skip_keywords:
            print(f"=== DEBUG: Skipping row {row_idx} with label '{row_label}'")
            continue

        # Determine the morphological case from the row label.
        case_found = None
        for cm_key, cm_val in case_map.items():
            if cm_key in row_label:
                case_found = cm_val
                break

        if not case_found:
            print(f"=== DEBUG: Row {row_idx} with label '{row_label}' did not match any expected case.")
            continue

        print(f"=== DEBUG: Row {row_idx} with case '{row_label}' mapped to case code '{case_found}'")

        # Process each (number, gender) column.
        for col_idx, (num_code, gen_code) in col_info.items():
            cell_val = df.iat[row_idx, col_idx]
            if not isinstance(cell_val, str):
                print(f"=== DEBUG: Cell at row {row_idx}, col {col_idx} is not a string; skipping.")
                continue
            cell_val = cell_val.strip()
            if not cell_val or cell_val in skip_keywords:
                print(f"=== DEBUG: Cell at row {row_idx}, col {col_idx} is empty or skipped.")
                continue

            # If there are multiple forms in one cell, separate them.
            sub_forms = [x.strip() for x in cell_val.split('/') if x.strip()]
            for sf in sub_forms:
                degree_key = 'p'  # assuming positive degree by default
                morph_code = generate_morph_code(
                    part_of_speech='a',   # adjective
                    person='-',
                    number=num_code,
                    tense='-',
                    mood='-',
                    voice='-',
                    gender=gen_code,
                    case=case_found,
                    degree=degree_key
                )
                form_entry = {
                    'id': sequence_num,
                    'text': sf,
                    'bare_text': strip_accents(sf),
                    'sequence_num': sequence_num,
                    'morph_code': morph_code,
                    'base_form': base_word,
                    'bare_base_form': bare_base_form,
                    'definition': definition
                }
                print(f"=== DEBUG: Adding form: {form_entry}")
                forms.append(form_entry)
                sequence_num += 1

    if not forms:
        print("=== DEBUG: No adjective forms were parsed from the table.")
    else:
        print(f"=== DEBUG: Parsed {len(forms)} adjective forms.")
    return forms



def parse_verb_table(df, base_word, sequence_num_start, definition):
    """
    Parse the verb inflection table and extract forms along with their grammatical features.
    Handles tables with multiple header levels.

    Column headers are flattened to lowercase strings. A column named
    'person', 'number' or 'person / number' is treated as the row-label column
    and supplies person and number; every other column is a form column whose
    mood, voice and tense are read from the words of its header. Features that
    cannot be determined are written as '-'.

    NOTE: header words are matched exactly against the feature maps, so a
    combined label such as 'middle/passive' is not recognised as a voice.

    Args:
        df: DataFrame holding the table. A copy is processed, so the caller's
            frame is left untouched.
        base_word: Headword the forms belong to.
        sequence_num_start: Number assigned to the first extracted form.
        definition: Definition text copied into every entry.

    Returns:
        A list of form dicts (keys 'id', 'text', 'bare_text', 'sequence_num',
        'morph_code', 'base_form', 'bare_base_form', 'definition').
    """
    # Maps for grammatical features
    person_map = {
        'first person': '1',
        'second person': '2',
        'third person': '3',
    }

    number_map = {
        'singular': 's',
        'dual': 'd',
        'plural': 'p'
    }

    mood_map = {
        'indicative': 'i',
        'subjunctive': 's',
        'optative': 'o',
        'imperative': 'm',
        'infinitive': 'n',
        'participle': 'p'
    }

    voice_map = {
        'active': 'a',
        'middle': 'm',
        'passive': 'p',
        'mediopassive': 'e'
    }

    tense_map = {
        'present': 'p',
        'imperfect': 'i',
        'future': 'f',
        'aorist': 'a',
        'perfect': 'r',
        'pluperfect': 'l',
        'future perfect': 't'
    }
    # Bare ordinals, used only when the row label names neither a
    # "<ordinal> person" nor a number.
    ordinal_map = (('first', '1'), ('second', '2'), ('third', '3'))
    row_label_names = ('person', 'number', 'person / number')

    forms = []
    sequence_num = sequence_num_start
    bare_base_form = strip_accents(base_word)  # identical for every form

    # Relabelling happens on a private copy so the caller's frame is not altered.
    df = df.copy()

    # Flatten MultiIndex columns; str() also covers integer labels produced
    # when a table is read without a header row.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [' '.join([str(s).strip().lower() for s in col if str(s) != 'nan']) for col in df.columns]
    else:
        df.columns = [str(col).lower().strip() for col in df.columns]

    # The first column carrying one of the row-label names becomes 'person_number'.
    for col in df.columns:
        if col in row_label_names:
            df = df.rename(columns={col: 'person_number'})
            break
    has_label_column = 'person_number' in df.columns

    # Read mood, voice and tense for each form column from its header words.
    columns_info = []
    for col in df.columns:
        if col == 'person_number':
            columns_info.append({'column': col})
            continue
        features = {'column': col}
        for token in col.split():
            if token in mood_map:
                features['mood'] = mood_map[token]
            elif token in voice_map:
                features['voice'] = voice_map[token]
            elif token in tense_map:
                features['tense'] = tense_map[token]
        # 'future perfect' spans two words, so word-by-word matching would
        # leave 'perfect' as the tense; check the whole phrase last.
        if 'future perfect' in col:
            features['tense'] = tense_map['future perfect']
        columns_info.append(features)

    # Iterate over the rows to extract forms
    for _, row in df.iterrows():
        person_number = str(row.get('person_number', '')).lower()
        person = next((code for label, code in person_map.items() if label in person_number), '-')
        number = next((code for label, code in number_map.items() if label in person_number), '-')
        if person == '-' and number == '-' and has_label_column:
            person = next((code for word, code in ordinal_map if word in person_number), '-')

        for col_info in columns_info:
            col = col_info['column']
            if col == 'person_number':
                continue
            form = row[col]
            if not isinstance(form, str):
                continue
            form = form.strip()
            if form == '':
                continue

            morph_code = generate_morph_code(
                part_of_speech='v',
                person=person,
                number=number,
                tense=col_info.get('tense', '-'),
                mood=col_info.get('mood', '-'),
                voice=col_info.get('voice', '-'),
                gender='-',
                case='-',
                degree='-'
            )
            forms.append({
                'id': sequence_num,
                'text': form,
                'bare_text': strip_accents(form),
                'sequence_num': sequence_num,
                'morph_code': morph_code,
                'base_form': base_word,
                'bare_base_form': bare_base_form,
                'definition': definition
            })
            sequence_num += 1

    return forms

def parse_verb_table_specific(table, base_word, sequence_num_start, definition):
    """
    Parse the specific 'Present (uncontracted)' verb table for ἀάω.

    The table is walked row by row. A row whose first cell is a voice or mood
    name updates the current voice or mood and is otherwise not read, except
    that an 'infinitive' row also contributes the form in its second cell.
    Rows labelled 'm', 'f' or 'n' contribute a participle of that gender from
    their second cell. Any other row is read as finite forms when the current
    mood is indicative, subjunctive, optative or imperative: the cells after
    the label are mapped by position to 1s, 2s, 3s, 2d, 3d, 1p, 2p, 3p.
    All forms are tagged as present tense.

    Args:
        table: Parsed HTML table element providing ``find_all``
            (a BeautifulSoup tag).
        base_word: Headword the forms belong to.
        sequence_num_start: Number assigned to the first extracted form.
        definition: Definition text copied into every entry.

    Returns:
        A list of form dicts (keys 'id', 'text', 'bare_text', 'sequence_num',
        'morph_code', 'base_form', 'bare_base_form', 'definition').
    """
    forms = []
    placeholders = ('', '-', '—')

    # Person/number of each finite-form cell, by position after the row label.
    person_number_list = [
        ('1', 's'),  # First person singular
        ('2', 's'),  # Second person singular
        ('3', 's'),  # Third person singular
        ('2', 'd'),  # Second person dual
        ('3', 'd'),  # Third person dual
        ('1', 'p'),  # First person plural
        ('2', 'p'),  # Second person plural
        ('3', 'p'),  # Third person plural
    ]

    mood_map = {
        'indicative': 'i',
        'subjunctive': 's',
        'optative': 'o',
        'imperative': 'm',
        'infinitive': 'n',
        'participle': 'p'
    }

    voice_map = {
        'active': 'a',
        'middle': 'm',
        'passive': 'p',
        'mediopassive': 'e'
    }

    finite_moods = ('i', 's', 'o', 'm')  # Indicative, Subjunctive, Optative, Imperative

    current_voice = '-'
    current_mood = '-'
    current_tense = 'p'  # Present tense
    bare_base_form = strip_accents(base_word)  # identical for every form

    def add_form(form, person, number, mood, gender):
        """Append one entry, numbered consecutively from sequence_num_start."""
        sequence_num = sequence_num_start + len(forms)
        morph_code = generate_morph_code(
            part_of_speech='v',
            person=person,
            number=number,
            tense=current_tense,
            mood=mood,
            voice=current_voice,
            gender=gender,
            case='-',
            degree='-'
        )
        forms.append({
            'id': sequence_num,
            'text': form,
            'bare_text': strip_accents(form),
            'sequence_num': sequence_num,
            'morph_code': morph_code,
            'base_form': base_word,
            'bare_base_form': bare_base_form,
            'definition': definition
        })

    for row in table.find_all('tr'):
        cells = row.find_all(['th', 'td'])
        if not cells:
            continue
        texts = [cell.get_text(strip=True) for cell in cells]
        label = texts[0].lower()

        # Voice label: switch the current voice.
        if label in voice_map:
            current_voice = voice_map[label]
            continue

        # Infinitive: must be tested before the generic mood labels, because
        # 'infinitive' is a mood name too and would otherwise never be read.
        if label == 'infinitive':
            current_mood = mood_map[label]
            if len(texts) > 1 and texts[1] not in placeholders:
                add_form(texts[1], person='-', number='-', mood='n', gender='-')
            continue

        # Mood label: switch the current mood.
        if label in mood_map:
            current_mood = mood_map[label]
            continue

        # Participle rows are labelled with their gender.
        if label in ('m', 'f', 'n'):
            if len(texts) > 1 and texts[1] not in placeholders:
                add_form(texts[1], person='-', number='-', mood='p', gender=label)
            continue

        # Skip notes and empty rows
        if label in ('notes:', 'note:', ''):
            continue

        # Skip rows that hold nothing beyond a label.
        if sum(1 for text in texts if text not in placeholders) < 2:
            continue

        if current_mood not in finite_moods:
            continue

        # Positions are taken from the unfiltered cells so that a blank or
        # dash cell (e.g. a missing first person) keeps its slot and does not
        # shift the following forms onto the wrong person/number.
        for position, form in enumerate(texts[1:]):
            if form in placeholders or position >= len(person_number_list):
                continue
            person, number = person_number_list[position]
            add_form(form, person=person, number=number, mood=current_mood, gender='-')

    return forms


In [34]:
def strip_accents(s):
    """Strip combining marks (Unicode category 'Mn') from ``s`` after NFD decomposition."""
    # Same behavior as the strip_accents defined in an earlier cell of this notebook.
    nfd_text = unicodedata.normalize('NFD', s)
    return ''.join(filter(lambda ch: unicodedata.category(ch) != 'Mn', nfd_text))

def clean_dataframe(df):
    """Clean the DataFrame by removing unwanted characters and standardizing the data.

    Steps, in order:
      1. drop rows in which every cell is missing;
      2. delete the symbols ➤ • [ ] { } and turn an em dash into a space;
      3. delete parenthesised text;
      4. for every string cell: lowercase it, keep only the text before the
         first ',', '-', '+', '/' or backslash, and strip surrounding
         whitespace;
      5. collapse runs of whitespace into a single space.

    Non-string cells are passed through unchanged.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    df = df.dropna(how='all', axis=0)  # Remove empty rows

    # Raw strings: the keys are regular expressions, and '\[' in a normal
    # string literal is an invalid escape sequence.
    df = df.replace(
        {'➤': '', '•': '', r'\[': '', r'\]': '', r'\{': '', r'\}': '', '—': ' '},
        regex=True,
    )
    df = df.replace(r"\([^)]*\)", "", regex=True)  # Remove content within parentheses

    def _tidy(value):
        """Lowercase a string cell, cut it at the first separator and strip it."""
        if not isinstance(value, str):
            return value
        value = value.lower()
        for separator in (',', '-', '+', '/', '\\'):
            value = value.split(separator, 1)[0]
        return value.strip()

    # DataFrame.map only exists in newer pandas; older releases call it applymap.
    elementwise = df.map if hasattr(df, 'map') else df.applymap
    df = elementwise(_tidy)

    df = df.replace(r"\s\s+", " ", regex=True)  # Replace multiple spaces with a single space

    return df

def generate_variations(word, show=False, definition_map=None, timeout=30):
    """Scrape the English Wiktionary page of ``word`` and return its inflected forms.

    Every ``<table>`` whose class attribute contains "inflection" is read. Its
    part of speech is guessed from the nearest preceding ``<h3>`` heading and
    from the table caption, and the table is handed to the matching parser
    (verb, adjective or noun). Tables of any other kind are ignored. A table
    that fails to parse is reported on stdout and skipped.

    Args:
        word: Headword to look up.
        show: Unused; kept so existing calls remain valid.
        definition_map: Optional dict mapping accent-stripped, lowercased
            headwords to definitions. The matching definition (or '') is
            copied into every returned row.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with one row per extracted form, numbered consecutively
        from 1 across all tables of the page. The DataFrame is empty when the
        page cannot be fetched or holds no inflection table.
    """
    # 1. Fetch the page. A timeout keeps one stalled request from blocking a
    #    whole scraping run; failed requests are reported, not parsed.
    url = f"https://en.wiktionary.org/wiki/{quote(word)}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page for {word}: {e}")
        return pd.DataFrame([])
    soup = bs(response.text, 'html.parser')

    # 2. Clean up: transliterations, audio tables and superscripts would
    #    otherwise end up inside the extracted cell text.
    for tag in soup.find_all('span', {'class': 'tr Latn'}):
        tag.decompose()
    for tag in soup.find_all('table', {'class': 'audiotable'}):
        tag.decompose()
    for tag in soup.find_all('sup'):
        tag.decompose()

    # 3. Definition from map
    if definition_map is not None:
        normalized_word = strip_accents(word).lower()
        definition = definition_map.get(normalized_word, '')
    else:
        definition = ''

    forms = []
    sequence_num = 1

    # 4. Find all tables whose class contains "inflection"
    #    (since "inflection-table" might not be exact)
    all_inflect_tables = [
        tbl for tbl in soup.find_all("table")
        if "inflection" in " ".join(tbl.get("class", [])).lower()
    ]

    if not all_inflect_tables:
        print(f"No inflection tables found on page for {word}")
        return pd.DataFrame(forms)

    # "verb" must not be recognised inside "adverb".
    verb_pattern = re.compile(r'(?<!ad)verb')

    # 5. For each table, guess the part of speech from preceding <h3> or the table's caption
    for table in all_inflect_tables:
        heading = table.find_previous('h3')
        pos_text = heading.get_text(strip=True).lower() if heading else ""

        caption_tag = table.find('caption')
        caption_text = caption_tag.get_text(strip=True).lower() if caption_tag else ""

        if verb_pattern.search(pos_text) or verb_pattern.search(caption_text):
            kind = 'verb'
        elif 'adjective' in pos_text or 'adjective' in caption_text:
            kind = 'adjective'
        elif 'noun' in pos_text or 'noun' in caption_text:
            kind = 'noun'
        else:
            continue

        try:
            # read_html expects a file-like object; literal HTML strings are deprecated.
            html = StringIO(str(table))
            if kind == 'adjective':
                # Use the proper header parameter so that the real headers are read.
                df_list = pd.read_html(html, header=[0, 1, 2])
            else:
                df_list = pd.read_html(html)
            if not df_list:
                continue
            df = clean_dataframe(df_list[0])

            if kind == 'verb':
                parsed_forms = parse_verb_table(df, word, sequence_num, definition)
            elif kind == 'adjective':
                parsed_forms = parse_adjective_table(
                    df,
                    base_word=word,
                    sequence_num_start=sequence_num,
                    definition=definition
                )
            else:
                parsed_forms = parse_noun_table(
                    df,
                    base_word=word,
                    gender='-',
                    sequence_num_start=sequence_num,
                    definition=definition
                )
            forms.extend(parsed_forms)
            sequence_num += len(parsed_forms)
        except Exception as e:
            # Best effort: one malformed table must not abort the whole page.
            print(f"Error parsing {kind} table for {word}: {e}")
            continue

    return pd.DataFrame(forms)


In [35]:
# def update_csv_with_wiktionary(csv_path, base_word):
    # """
    # Update the CSV file with forms from Wiktionary by comparing existing morph codes
    # and adding any missing forms.
    # """
    # # Read the original CSV
    # df = pd.read_csv(csv_path)
    
    # # Get variations from Wiktionary
    # scraped_df = generate_variations(base_word)
    
    # if scraped_df.empty:
    #     print(f"No new forms found for {base_word}.")
    #     return csv_path  # Return the original path if no updates
    
    # # Ensure morph_code is a string
    # df['morph_code'] = df['morph_code'].astype(str)
    # scraped_df['morph_code'] = scraped_df['morph_code'].astype(str)
    
    # # Filter rows corresponding to the base word
    # base_word_block = df[df['base_form'] == base_word]
    # existing_morph_codes = set(base_word_block['morph_code'])
    
    # # Get new forms not in the original CSV
    # new_forms = scraped_df[~scraped_df['morph_code'].isin(existing_morph_codes)]
    
    # if new_forms.empty:
    #     print(f"All forms of {base_word} are already present in the dataset.")
    #     return csv_path  # Return the original path if no new forms to add
    
    # # Append new forms to the DataFrame
    # updated_df = pd.concat([df, new_forms], ignore_index=True)
    
    # # Save the updated DataFrame to a new CSV
    # updated_csv_path = csv_path.replace('.csv', '_updated.csv')
    # updated_df.to_csv(updated_csv_path, index=False)
    
    # print(f"Updated CSV saved to {updated_csv_path}")
    # return updated_csv_path

In [36]:
import pandas as pd
import random

SAMPLE_SIZE = 100       # number of headwords scraped in this test run
RANDOM_SEED = None      # set to an int to draw the same sample on every run
MILESTONE_STEP = 1000   # print a progress banner every this many scraped forms

# --- 1. Load your word bank (and old morphological entries) ---
greek_words_df = pd.read_csv('greek_words.csv',
                             encoding='utf-8', sep='\t', index_col=False)

# Ensure we have the columns we need
desired_columns = [
    'id', 'text', 'bare_text', 'sequence_num',
    'morph_code', 'base_form', 'bare_base_form', 'definition'
]
for col in desired_columns:
    if col not in greek_words_df.columns:
        greek_words_df[col] = ""

# Build a definition map for the scraper. Rows without a base form are left
# out: they cannot be normalized and have no headword to look up.
greek_words_df['normalized_base_form'] = greek_words_df['base_form'].map(
    normalize_word, na_action='ignore'
)
definition_map = (
    greek_words_df.dropna(subset=['normalized_base_form'])
    .set_index('normalized_base_form')['definition']
    .to_dict()
)

# --- 2. Pick up to SAMPLE_SIZE *random* unique headwords to test ---
unique_words = list(greek_words_df['base_form'].dropna().unique())
# Cap the sample at the number of available headwords; random.sample raises
# ValueError when asked for more items than the population holds.
sampled_words = random.Random(RANDOM_SEED).sample(
    unique_words, k=min(SAMPLE_SIZE, len(unique_words))
)

# Extract the “old” entries for those sampled words
old_entries = greek_words_df[greek_words_df['base_form'].isin(sampled_words)][desired_columns].copy()

# --- 3. Scrape Wiktionary for each of the sampled headwords ---
all_scraped = []
total_scraped = 0
next_milestone = MILESTONE_STEP

for word in sampled_words:
    print(f"Scraping '{word}'…", end='')
    word_forms = generate_variations(word, definition_map=definition_map)
    if word_forms.empty:
        print(" no forms found.")
        continue

    # Ensure correct columns & order
    for col in desired_columns:
        if col not in word_forms.columns:
            word_forms[col] = ""
    word_forms = word_forms[desired_columns]

    count = len(word_forms)
    total_scraped += count
    all_scraped.append(word_forms)

    print(f" found {count} forms (total so far: {total_scraped}).")
    if total_scraped >= next_milestone:
        print(f"=== Progress: {total_scraped} total forms scraped so far ===")
        next_milestone += MILESTONE_STEP

scraped_df = pd.concat(all_scraped, ignore_index=True) if all_scraped else pd.DataFrame(columns=desired_columns)

# --- 4. Merge old + new, drop duplicates, re-number full set ---
combined = pd.concat([old_entries, scraped_df], ignore_index=True)
combined = combined.drop_duplicates(subset=['text', 'morph_code'], keep='first')
combined = combined.reset_index(drop=True)
combined['id'] = combined.index + 1
combined['sequence_num'] = combined['id']
final_df = combined[desired_columns]
final_df.to_csv('greek_words_updated.csv', sep='\t', index=False, encoding='utf-8')
print(f"Done! {len(final_df)} unique forms written to greek_words_updated.csv.")

# --- 5. Identify *new* forms (in scraped_df but not in old_entries) ---
# Compare by (text, morph_code). The mask is built explicitly so that it is a
# boolean Series even when nothing was scraped.
old_set = set(zip(old_entries['text'], old_entries['morph_code']))
is_new = pd.Series(
    [(text, code) not in old_set
     for text, code in zip(scraped_df['text'], scraped_df['morph_code'])],
    index=scraped_df.index,
    dtype=bool,
)

# Re-number just the new ones
scraped_new = scraped_df[is_new].reset_index(drop=True)
scraped_new['id'] = scraped_new.index + 1
scraped_new['sequence_num'] = scraped_new['id']

# Save out the “new only” CSV
scraped_new.to_csv('greek_words_new_only.csv', sep='\t', index=False, encoding='utf-8')
print(f"Done! {len(scraped_new)} new forms written to greek_words_new_only.csv.")


Scraping 'προπονέω'…

KeyboardInterrupt: 