In [204]:
import random
import requests
import csv
import os
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.parse import quote
import re
import unicodedata


In [205]:
def strip_accents(s):
    """Remove accents from a given string."""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                    if unicodedata.category(c) != 'Mn')

def generate_morph_code(part_of_speech, person, number, tense, mood, voice, gender, case, degree):
    """
    Generate a morph code based on grammatical features according to the 9-position key.
    """
    morph_code = f"{part_of_speech}{person}{number}{tense}{mood}{voice}{gender}{case}{degree}"
    return morph_code

In [206]:
def parse_noun_table(df, base_word, gender, sequence_num_start):
    """Parse the noun inflection table and extract forms along with their grammatical features."""
    # Map for cases and numbers
    case_map = {
        'nominative': 'n',
        'genitive': 'g',
        'dative': 'd',
        'accusative': 'a',
        'vocative': 'v',
        'ablative': 'b',
        'locative': 'l',
        'instrumental': 'i',
        'notes:': ''
    }
    number_map = {
        'singular': 's',
        'dual': 'd',
        'plural': 'p'
    }

    forms = []
    sequence_num = sequence_num_start

    # Clean the DataFrame headers
    df.columns = [col.lower().strip() for col in df.columns]

    # The first column should be cases
    cases = df.iloc[:, 0].str.lower().str.strip()
    # The rest of the columns are numbers
    numbers = [col.lower().strip() for col in df.columns[1:]]

    for idx, case in enumerate(cases):
        case = case.strip()
        case_key = case_map.get(case, '-')
        if case_key == '':
            continue  # Skip notes or irrelevant rows
        for col_idx, num_col in enumerate(numbers):
            num_col = num_col.strip()
            number_key = number_map.get(num_col, '-')
            if number_key == '-':
                continue
            form = df.iloc[idx, col_idx + 1]
            if isinstance(form, str):
                form = form.strip()
                if form != '':
                    bare_text = strip_accents(form)
                    bare_base_form = strip_accents(base_word)
                    morph_code = generate_morph_code(
                        part_of_speech='n',
                        person='-',
                        number=number_key,
                        tense='-',
                        mood='-',
                        voice='-',
                        gender=gender,
                        case=case_key,
                        degree='-'
                    )
                    form_entry = {
                        'id': sequence_num,
                        'text': form,
                        'bare_text': bare_text,
                        'sequence_num': sequence_num,
                        'morph_code': morph_code,
                        'base_form': base_word,
                        'bare_base_form': bare_base_form,
                        'definition': ''  # Add definition if available
                    }
                    forms.append(form_entry)
                    sequence_num += 1
    return forms

def parse_adjective_table(df, base_word, sequence_num_start):
    """
    Parse the adjective inflection table and extract forms along with their grammatical features.
    Handles tables with extra header rows and reconstructs columns accordingly.
    """
    # Map for degrees
    degree_map = {
        'positive': 'p',
        'comparative': 'c',
        'superlative': 's'
    }
    # Map for genders
    gender_map = {
        'masculine': 'm',
        'feminine': 'f',
        'masculine / feminine': 'mf',
        'masculine/feminine': 'mf',
        'neuter': 'n'
    }
    # Map for cases
    case_map = {
        'nominative': 'n',
        'genitive': 'g',
        'dative': 'd',
        'accusative': 'a',
        'vocative': 'v',
        'ablative': 'b',
        'locative': 'l',
        'instrumental': 'i',
        'notes:': ''
    }
    # Map for numbers
    number_map = {
        'singular': 's',
        'dual': 'd',
        'plural': 'p'
    }

    forms = []
    sequence_num = sequence_num_start

    # Adjust the DataFrame
    # Set columns to the first row
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])

    # Clean up column names
    df.columns = [str(col).strip().lower() for col in df.columns]
    # Drop columns with 'nan' names
    df = df.loc[:, df.columns != 'nan']

    # Reconstruct df.columns to include 'number' and 'gender'
    df_columns = []
    for idx, col in enumerate(df.columns):
        if idx == 0:
            # First column is 'case'
            df_columns.append('case')
        else:
            if idx in [1, 2]:
                number = 'singular'
            elif idx in [3, 4]:
                number = 'dual'
            elif idx in [5, 6]:
                number = 'plural'
            else:
                number = ''
            gender = col.strip().lower()
            df_columns.append(f"{number} {gender}")

    df.columns = df_columns

    # Extract cases
    cases = df['case'].str.lower().str.strip()

    for idx, case in enumerate(cases):
        case_key = case_map.get(case, '-')
        if case_key == '':
            continue  # Skip notes or irrelevant rows

        for col_idx in range(1, len(df.columns)):
            col_name = df.columns[col_idx]
            tokens = col_name.split()
            if len(tokens) >= 2:
                number_name = tokens[0]
                gender_name = ' '.join(tokens[1:])
            else:
                continue  # Cannot extract number and gender

            number_key = number_map.get(number_name.strip(), '-')
            gender_key = gender_map.get(gender_name.strip(), '-')
            if number_key == '-' or gender_key == '-':
                continue

            form = df.iloc[idx, col_idx]
            if isinstance(form, str):
                form = form.strip()
                if form != '':
                    bare_text = strip_accents(form)
                    bare_base_form = strip_accents(base_word)
                    # Assume degree is 'positive' unless specified
                    degree_key = 'p'
                    # Handle 'masculine / feminine' gender
                    if gender_key == 'mf':
                        genders = ['m', 'f']
                    else:
                        genders = [gender_key]
                    for gender in genders:
                        morph_code = generate_morph_code(
                            part_of_speech='a',
                            person='-',
                            number=number_key,
                            tense='-',
                            mood='-',
                            voice='-',
                            gender=gender,
                            case=case_key,
                            degree=degree_key
                        )
                        form_entry = {
                            'id': sequence_num,
                            'text': form,
                            'bare_text': bare_text,
                            'sequence_num': sequence_num,
                            'morph_code': morph_code,
                            'base_form': base_word,
                            'bare_base_form': bare_base_form,
                            'definition': ''  # Add definition if available
                        }
                        forms.append(form_entry)
                        sequence_num += 1

    return forms


def parse_verb_table(df, base_word, sequence_num_start):
    """
    Parse the verb inflection table and extract forms along with their grammatical features.
    Handles tables with multiple header levels.
    """
    # Maps for grammatical features
    person_map = {
        'first person': '1',
        'second person': '2',
        'third person': '3',
    }
    
    number_map = {
        'singular': 's',
        'dual': 'd',
        'plural': 'p'
    }
    
    mood_map = {
        'indicative': 'i',
        'subjunctive': 's',
        'optative': 'o',
        'imperative': 'm',
        'infinitive': 'n',
        'participle': 'p'
    }
    
    voice_map = {
        'active': 'a',
        'middle': 'm',
        'passive': 'p',
        'mediopassive': 'e'  # For middle/passive forms
    }
    
    tense_map = {
        'present': 'p',
        'imperfect': 'i',
        'future': 'f',
        'aorist': 'a',
        'perfect': 'r',
        'pluperfect': 'l',
        'future perfect': 't'
    }
    
    forms = []
    sequence_num = sequence_num_start

    # Flatten MultiIndex columns
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [' '.join([str(s).strip().lower() for s in col if str(s) != 'nan']) for col in df.columns]
    else:
        df.columns = [col.lower().strip() for col in df.columns]

    # Identify the grammatical features from the table
    for col in df.columns:
        if ' ' in col:
            continue
        if col in ['person', 'number', 'person / number']:
            # This is the row index column
            df.rename(columns={col: 'person_number'}, inplace=True)
            break

    # Extract mood, voice, tense from the table caption or nearby headers
    # For this example, let's assume we have mood, voice, tense information passed to this function
    # Alternatively, you can modify the code to extract these from the HTML as needed

    # Since the tables are complex, we need to get mood, voice, tense from the column headers
    columns_info = []
    for col in df.columns:
        if col == 'person_number':
            columns_info.append({'column': col})
            continue
        features = {'column': col}
        # Split the column name to extract features
        tokens = col.split()
        for token in tokens:
            if token in mood_map:
                features['mood'] = mood_map[token]
            elif token in voice_map:
                features['voice'] = voice_map[token]
            elif token in tense_map:
                features['tense'] = tense_map[token]
        columns_info.append(features)

    # Iterate over the rows to extract forms
    for idx, row in df.iterrows():
        person_number = str(row.get('person_number', '')).lower()
        person = '-'
        number = '-'
        # Extract person and number from row labels
        for pn in person_map:
            if pn in person_number:
                person = person_map[pn]
                break
        for num in number_map:
            if num in person_number:
                number = number_map[num]
                break
        if person == '-' and number == '-':
            # Try splitting the person_number string
            tokens = person_number.split()
            for token in tokens:
                if token in person_map:
                    person = person_map[token]
                elif token in number_map:
                    number = number_map[token]
        # Iterate over the columns to get forms
        for col_info in columns_info:
            col = col_info['column']
            if col == 'person_number':
                continue
            form = row[col]
            if isinstance(form, str) and form.strip() != '':
                form = form.strip()
                bare_text = strip_accents(form)
                bare_base_form = strip_accents(base_word)
                mood = col_info.get('mood', '-')
                voice = col_info.get('voice', '-')
                tense = col_info.get('tense', '-')
                # Handle non-finite forms
                if person == '-' and number == '-' and 'person_number' in df.columns:
                    if 'singular' in person_number:
                        number = 's'
                    elif 'dual' in person_number:
                        number = 'd'
                    elif 'plural' in person_number:
                        number = 'p'
                    else:
                        number = '-'
                    if 'first' in person_number:
                        person = '1'
                    elif 'second' in person_number:
                        person = '2'
                    elif 'third' in person_number:
                        person = '3'
                    else:
                        person = '-'
                morph_code = generate_morph_code(
                    part_of_speech='v',
                    person=person,
                    number=number,
                    tense=tense,
                    mood=mood,
                    voice=voice,
                    gender='-',
                    case='-',
                    degree='-'
                )
                form_entry = {
                    'id': sequence_num,
                    'text': form,
                    'bare_text': bare_text,
                    'sequence_num': sequence_num,
                    'morph_code': morph_code,
                    'base_form': base_word,
                    'bare_base_form': bare_base_form,
                    'definition': ''  # Add definition if available
                }
                forms.append(form_entry)
                sequence_num += 1

    return forms

In [207]:
def strip_accents(s):
    """Remove accents from a given string."""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                    if unicodedata.category(c) != 'Mn')

def clean_dataframe(df):
    """Clean the DataFrame by removing unwanted characters and standardizing the data."""
    df = df.dropna(how='all', axis=0)  # Remove empty rows

    # Remove unwanted characters and symbols
    df = df.replace({'➤': '', '•': '', '\[': '', '\]': '', '\{': '', '\}': '', '—': ' '}, regex=True)
    df = df.replace(r"\([^)]*\)", "", regex=True)  # Remove content within parentheses

    # Make all strings lowercase
    df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)

    # Split strings to remove extraneous information
    df = df.applymap(lambda s: s.split(',', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('-', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('+', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('/', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('\\', 1)[0] if isinstance(s, str) else s)

    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    df = df.replace(r"\s\s+", " ", regex=True)  # Replace multiple spaces with a single space

    return df

def generate_variations(word, language='ancient greek', show=False):
    # Fetch the Wiktionary page
    response = requests.get(f'https://en.wiktionary.org/wiki/{quote(word)}')
    if response.status_code != 200:
        print(f"Failed to retrieve Wiktionary page for {word}")
        return pd.DataFrame()
    soup = bs(response.text, 'html.parser')

    # Remove unwanted elements
    for tag in soup.find_all(['span', 'table', 'sup'], {'class': ['tr Latn', 'audiotable']}):
        tag.decompose()

    # Read all tables from the prettified soup
    dataframes = pd.read_html(soup.prettify(), header=[0, 1])

    # Find all tables in the soup
    tables = soup.find_all('table')

    forms = []
    sequence_num = 1  # To keep track of sequence numbers

    for i, df in enumerate(dataframes):
        if i >= len(tables):
            continue  # In case there are more dataframes than tables

        # Get the header corresponding to the table
        header_tag = tables[i].find_previous('h2')
        if header_tag:
            header = header_tag.text.replace('[edit]', '').lower().strip()
        else:
            continue

        # Check if the language matches (case-insensitive)
        if header.lower() != language.lower():
            continue

        # Get the part of speech
        pos_tag = tables[i].find_previous('h3')
        if pos_tag:
            part_of_speech = pos_tag.text.replace('[edit]', '').replace('\"', '').lower().strip()
        else:
            part_of_speech = ''

        # Clean the DataFrame
        df = clean_dataframe(df)

        # Process based on part of speech
        forms_before = len(forms)
        if part_of_speech == 'noun':
            # Process noun
            gender = '-'  # You can extract gender if needed
            forms.extend(parse_noun_table(df, word, gender, sequence_num))
        elif part_of_speech == 'adjective':
            # Process adjective
            forms.extend(parse_adjective_table(df, word, sequence_num))
        elif part_of_speech == 'verb':
            # Process verb
            forms.extend(parse_verb_table(df, word, sequence_num))
        # Update sequence_num based on the number of forms added
        sequence_num += len(forms) - forms_before

    return pd.DataFrame(forms)

In [208]:
def update_csv_with_wiktionary(csv_path, base_word):
    """
    Update the CSV file with forms from Wiktionary by comparing existing morph codes
    and adding any missing forms.
    """
    # Read the original CSV
    df = pd.read_csv(csv_path)
    
    # Get variations from Wiktionary
    scraped_df = generate_variations(base_word)
    
    if scraped_df.empty:
        print(f"No new forms found for {base_word}.")
        return csv_path  # Return the original path if no updates
    
    # Ensure morph_code is a string
    df['morph_code'] = df['morph_code'].astype(str)
    scraped_df['morph_code'] = scraped_df['morph_code'].astype(str)
    
    # Filter rows corresponding to the base word
    base_word_block = df[df['base_form'] == base_word]
    existing_morph_codes = set(base_word_block['morph_code'])
    
    # Get new forms not in the original CSV
    new_forms = scraped_df[~scraped_df['morph_code'].isin(existing_morph_codes)]
    
    if new_forms.empty:
        print(f"All forms of {base_word} are already present in the dataset.")
        return csv_path  # Return the original path if no new forms to add
    
    # Append new forms to the DataFrame
    updated_df = pd.concat([df, new_forms], ignore_index=True)
    
    # Save the updated DataFrame to a new CSV
    updated_csv_path = csv_path.replace('.csv', '_updated.csv')
    updated_df.to_csv(updated_csv_path, index=False)
    
    print(f"Updated CSV saved to {updated_csv_path}")
    return updated_csv_path

In [209]:
verb_word = 'λύω'
verb_df = generate_variations(verb_word)
print("Verb Variations for λύω:")
print(verb_df.head(20))


  dataframes = pd.read_html(soup.prettify(), header=[0, 1])
  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split(',', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('-', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('+', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('/', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('\\', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split(',', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('-', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('+', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('/', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s

Verb Variations for λύω:
    id         text    bare_text  sequence_num morph_code base_form  \
0    1       active       active             1  v--------       λύω   
1    2   indicative   indicative             2  v--------       λύω   
2    3         λῡ́ω          λυω             3  v--------       λύω   
3    4       λῡ́εις        λυεις             4  v--------       λύω   
4    5        λῡ́ει         λυει             5  v--------       λύω   
5    6      λῡ́ετον       λυετον             6  v--------       λύω   
6    7      λῡ́ετον       λυετον             7  v--------       λύω   
7    8      λῡ́ομεν       λυομεν             8  v--------       λύω   
8    9       λῡ́ετε        λυετε             9  v--------       λύω   
9   10      λῡ́ουσῐ       λυουσι            10  v--------       λύω   
10  11       active       active            11  v--------       λύω   
11  12  subjunctive  subjunctive            12  v--------       λύω   
12  13         λῡ́ω          λυω            13  v---