In [35]:
import random
import requests
import csv
import os
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.parse import quote
import re
import unicodedata


In [36]:
def strip_accents(s):
    """Remove accents from a given string."""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                    if unicodedata.category(c) != 'Mn')

def generate_morph_code(part_of_speech, person, number, tense, mood, voice, gender, case, degree):
    """
    Generate a morph code based on grammatical features according to the 9-position key.
    """
    morph_code = f"{part_of_speech}{person}{number}{tense}{mood}{voice}{gender}{case}{degree}"
    return morph_code

In [None]:
def parse_noun_table(df, base_word, gender, sequence_num_start):
    """Parse the noun inflection table and extract forms along with their grammatical features."""
    # Map for cases and numbers
    case_map = {
        'nominative': 'n',
        'genitive': 'g',
        'dative': 'd',
        'accusative': 'a',
        'vocative': 'v',
        'ablative': 'b',
        'locative': 'l',
        'instrumental': 'i',
        'notes:': ''
    }
    number_map = {
        'singular': 's',
        'dual': 'd',
        'plural': 'p'
    }

    forms = []
    sequence_num = sequence_num_start

    # Clean the DataFrame headers
    df.columns = [col.lower().strip() for col in df.columns]

    # The first column should be cases
    cases = df.iloc[:, 0].str.lower().str.strip()
    # The rest of the columns are numbers
    numbers = [col.lower().strip() for col in df.columns[1:]]

    for idx, case in enumerate(cases):
        case = case.strip()
        case_key = case_map.get(case, '-')
        if case_key == '':
            continue  # Skip notes or irrelevant rows
        for col_idx, num_col in enumerate(numbers):
            num_col = num_col.strip()
            number_key = number_map.get(num_col, '-')
            if number_key == '-':
                continue
            form = df.iloc[idx, col_idx + 1]
            if isinstance(form, str):
                form = form.strip()
                if form != '':
                    bare_text = strip_accents(form)
                    bare_base_form = strip_accents(base_word)
                    morph_code = generate_morph_code(
                        part_of_speech='n',
                        person='-',
                        number=number_key,
                        tense='-',
                        mood='-',
                        voice='-',
                        gender=gender,
                        case=case_key,
                        degree='-'
                    )
                    form_entry = {
                        'id': sequence_num,
                        'text': form,
                        'bare_text': bare_text,
                        'sequence_num': sequence_num,
                        'morph_code': morph_code,
                        'base_form': base_word,
                        'bare_base_form': bare_base_form,
                        'definition': ''  # Add definition if available
                    }
                    forms.append(form_entry)
                    sequence_num += 1
    return forms

def parse_adjective_table(df, base_word, sequence_num_start):
    """Placeholder for adjective parsing."""
    # Implement parsing logic for adjectives
    return []

def parse_verb_table(df, base_word, sequence_num_start):
    """Placeholder for verb parsing."""
    # Implement parsing logic for verbs
    return []

In [38]:
def strip_accents(s):
    """Remove accents from a given string."""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                    if unicodedata.category(c) != 'Mn')

def clean_dataframe(df):
    """Clean the DataFrame by removing unwanted characters and standardizing the data."""
    df = df.dropna(how='all', axis=0)  # Remove empty rows

    # Remove unwanted characters and symbols
    df = df.replace({'➤': '', '•': '', '\[': '', '\]': '', '\{': '', '\}': '', '—': ' '}, regex=True)
    df = df.replace(r"\([^)]*\)", "", regex=True)  # Remove content within parentheses

    # Make all strings lowercase
    df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)

    # Split strings to remove extraneous information
    df = df.applymap(lambda s: s.split(',', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('-', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('+', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('/', 1)[0] if isinstance(s, str) else s)
    df = df.applymap(lambda s: s.split('\\', 1)[0] if isinstance(s, str) else s)

    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

    df = df.replace(r"\s\s+", " ", regex=True)  # Replace multiple spaces with a single space

    return df

def generate_variations(word, language='ancient greek', show=False):
    # Fetch the Wiktionary page
    url = requests.get(f'https://en.wiktionary.org/wiki/{quote(word)}')
    soup = bs(url.text, 'html.parser')

    # Remove unwanted elements
    for tag in soup.find_all('span', {'class': 'tr Latn'}):
        tag.decompose()
    for tag in soup.find_all('table', {'class': 'audiotable'}):
        tag.decompose()
    for tag in soup.find_all('sup'):
        tag.decompose()

    # Find all tables
    tables = soup.find_all('table')
    part_of_speech_h3 = []
    h4 = []  # Inflection/declension
    nav = []  # Verb tense, noun declension
    gender = None

    # DataFrames
    dataframes = pd.read_html(str(soup))

    # Initialize list to collect forms
    forms = []
    sequence_num = 1  # To keep track of sequence numbers

    for i, df in enumerate(dataframes):
        # Find the header corresponding to the table
        header = tables[i].find_previous('h2').text.replace('[edit]', '').lower().strip()

        # Check if the language matches
        if header != language:
            continue

        # Get the part of speech
        if tables[i].find_previous('h3'):
            part_of_speech = tables[i].find_previous('h3').text.replace('[edit]', '').replace('\"', '').lower().strip()
        else:
            part_of_speech = ''

        # Get gender
        gender_tag = tables[i].find_previous(class_="gender")
        if gender_tag:
            gender = gender_tag.find_next("abbr").text.lower()
            if 'm' in gender:
                gender = 'm'
            elif 'f' in gender:
                gender = 'f'
            elif 'n' in gender:
                gender = 'n'
            else:
                gender = '-'
        else:
            gender = '-'

        # Clean the DataFrame
        df = clean_dataframe(df)

        # Process based on part of speech
        if part_of_speech == 'noun':
            forms.extend(parse_noun_table(df, word, gender, sequence_num))
            sequence_num += len(df.index)
        elif part_of_speech == 'adjective':
            forms.extend(parse_adjective_table(df, word, sequence_num))
            sequence_num += len(df.index)
        elif part_of_speech == 'verb':
            forms.extend(parse_verb_table(df, word, sequence_num))
            sequence_num += len(df.index)

    return pd.DataFrame(forms)

In [None]:
def update_csv_with_wiktionary(csv_path, base_word):
    """
    Update the CSV file with forms from Wiktionary by comparing existing morph codes
    and adding any missing forms.
    """
    # Read the original CSV
    df = pd.read_csv(csv_path)
    
    # Get variations from Wiktionary
    scraped_df = generate_variations(base_word)
    
    if scraped_df.empty:
        print(f"No new forms found for {base_word}.")
        return csv_path  # Return the original path if no updates
    
    # Ensure morph_code is a string
    df['morph_code'] = df['morph_code'].astype(str)
    scraped_df['morph_code'] = scraped_df['morph_code'].astype(str)
    
    # Filter rows corresponding to the base word
    base_word_block = df[df['base_form'] == base_word]
    existing_morph_codes = set(base_word_block['morph_code'])
    
    # Get new forms not in the original CSV
    new_forms = scraped_df[~scraped_df['morph_code'].isin(existing_morph_codes)]
    
    if new_forms.empty:
        print(f"All forms of {base_word} are already present in the dataset.")
        return csv_path  # Return the original path if no new forms to add
    
    # Append new forms to the DataFrame
    updated_df = pd.concat([df, new_forms], ignore_index=True)
    
    # Save the updated DataFrame to a new CSV
    updated_csv_path = csv_path.replace('.csv', '_updated.csv')
    updated_df.to_csv(updated_csv_path, index=False)
    
    print(f"Updated CSV saved to {updated_csv_path}")
    return updated_csv_path


In [44]:
# noun test
noun_word = 'λόγος'
noun_df = generate_variations(noun_word)
print("Noun Variations for λόγος:")
print(noun_df)


Noun Variations for λόγος:
    id         text    bare_text  sequence_num morph_code base_form  \
0    1      ὁ λόγος      ο λογος             1  n-s---mn-     λόγος   
1    2      τὼ λόγω      τω λογω             2  n-d---mn-     λόγος   
2    3     οἱ λόγοι     οι λογοι             3  n-p---mn-     λόγος   
3    4    τοῦ λόγου    του λογου             4  n-s---mg-     λόγος   
4    5  τοῖν λόγοιν  τοιν λογοιν             5  n-d---mg-     λόγος   
5    6    τῶν λόγων    των λογων             6  n-p---mg-     λόγος   
6    7      τῷ λόγῳ      τω λογω             7  n-s---md-     λόγος   
7    8  τοῖν λόγοιν  τοιν λογοιν             8  n-d---md-     λόγος   
8    9  τοῖς λόγοις  τοις λογοις             9  n-p---md-     λόγος   
9   10    τὸν λόγον    τον λογον            10  n-s---ma-     λόγος   
10  11      τὼ λόγω      τω λογω            11  n-d---ma-     λόγος   
11  12  τοὺς λόγους  τους λογους            12  n-p---ma-     λόγος   
12  13         λόγε         λογε            13  n-

  dataframes = pd.read_html(str(soup))
  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split(',', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('-', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('+', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('/', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.split('\\', 1)[0] if isinstance(s, str) else s)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
