In [41]:
# Import modules
import pandas as pd

# Import the transliteration rules
path_data = r'../../UKR_DATA/'
path_rules = r'UKR_transliteration/ukrainian_transliteration_rules.xlsx' # You can find a copy in the GitHub repository

# Initiliase a DataFrame
transliteration_df = pd.read_excel(f"{path_data}{path_rules}")

# Initialize an empty dictionary
translit_dict = {}

# Iterate through the rows of the DataFrame
for index, row in transliteration_df.iterrows():
    key = row['Cyrillic']  # Get the value from the first column
    value = row['Latin1'] if not pd.isna(row['Latin1']) else ''  # Replace NaN with empty string
    translit_dict[key] = value  # Add key-value pair to the dictionary

# translit_dict = {'а': 'a', 'б': 'b', 'в': 'v', 'г': 'h', 'ґ': 'g', 'д': 'd', 'е': 'e', 'є': 'ie', 'ж': 'zh', 'з': 'z',
#                  'и': 'y', 'i': 'i', 'ї': 'i', 'й': 'i', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 
#                  'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts', 'ч': 'ch', 'ш': 'sh', 
#                  'щ': 'shch', 'ю': 'iu', 'я': 'ia', 'ь': '', 'ʼ': ''} 

# 1. Define a function that finds "ї" after Cyrillic vowels and replaces it with 'yi'
def find_ї_after_vowels(string):
    # List all the Ukrainian vowels    
    list_vowels = ['а', 'е', 'є', 'и', 'i', 'ї', 'й', 'о', 'у', 'ю', 'я']
    
    # Convert the input string to lowercase to handle words with upper case
    string = string.lower()
    
    # create a list of all the characters comprising the string
    char_list = list(string)
    
    # loop through each character in the list and when 'ї' follows a vowel 
    # defined in the list_vowels change it to 'yi'
    for count, value in enumerate(char_list):
        if char_list[count-1] in list_vowels and char_list[count] == 'ї':
            char_list[count] = 'yi'
        
    # rebuild the string
    modified_str = "".join(char_list)

    # return the modified string
    return modified_str

# 2. Define a function that deals with all the other exceptions and then maps the characters that remain to their appropriate Latin counterpart                   
def transliterate_the_rest(string):
    new_str = ''

    # change 'зг' in 'zgh' 
    if 'зг' in string:
        string = string.replace('зг', 'zgh')

    # if a string starts with 'є' change it into 'ye'
    if string[0] == 'є':
        new_str = 'ye'
        new_str = new_str + string[1:]
        string = new_str
    
    # if a string starts with 'й' change it into 'y'
    elif string[0] == 'й':
        new_str = 'y'
        new_str = new_str + string[1:]
        string = new_str

    # if a string starts with 'ю' change it into 'yu'
    elif string[0] == 'ю':
        new_str = 'yu'
        new_str = new_str + string[1:]
        string = new_str

    # if a string starts with 'я' change it into 'ya'
    elif string[0] == 'я':
        new_str = 'ya'
        new_str = new_str + string[1:]
        string = new_str

    # if a string starts with 'ї' change it into 'yi'
    elif string[0] == 'ї':
        new_str = 'yi'
        new_str = new_str + string[1:]
        string = new_str

    # Transliterate the remaining characters that have not yet been transliterated
    output_str = ''.join(translit_dict[char] if char in translit_dict else char for char in string)
    
    # # In case you need to (re)capitalise the first letter of the string
    # output_str = output_str[0].upper() + output_str[1:]

    # # In case you need to capitalise the entire string
    # output_str = output_str.upper()
    
    # return the final string
    return output_str

# Sequence both functions into a single one
def transliterate_ukrainian(string):
    modified_str = find_ї_after_vowels(string)
    output_str = transliterate_the_rest(modified_str)
    return output_str



In [49]:
# Create a test sample
import random

# List of Ukrainian words and names
ukrainian_words = [
    'відомо', 'україна', 'гарний', 'мова', 'слово', 'програмування', "здоров'я", 'розвиток', 'наука', 'сонце',
    'осінь', 'красивий', "комп'ютер", 'друзі', 'робота', 'пошта', 'медицина', 'футбол', 'освіта', 'книга'
]

ukrainian_names = [
    'Іван', 'Марія', 'Олег', 'Наталя', 'Василь', 'Софія', 'Петро', 'Анна', 'Андрій', 'Оксана',
    'Ірина', 'Сергій', 'Марина', 'Віктор', 'Юлія', 'Олена', 'Дмитро', 'Тетяна', 'Роман', 'Людмила'
]

# Generate 100 random Ukrainian words and names with different capitalization forms
words = []
for _ in range(25):
    word = random.choice(ukrainian_words)
    words.extend([word.lower(), word.capitalize(), word.upper()])

for _ in range(25):
    name = random.choice(ukrainian_names)
    words.extend([name, name.upper()])

# Shuffle the list of words
random.shuffle(words)

# Create a DataFrame
df = pd.DataFrame({'ukrainian_Words': words})

# Save the DataFrame
df.to_excel('test_words.xlsx')

# Apply the transliterate_ukrainian function to each row in the DataFrame
df['transliterated_words'] = df['ukrainian_words'].apply(lambda x: transliterate_ukrainian(x))

# Show the first 20 result
df.head(20)

Unnamed: 0,ukrainian_words,transliterated_words
0,Василь,vasyl
1,Освіта,osvіta
2,Тетяна,tetiana
3,гарний,harnyi
4,ВАСИЛЬ,vasyl
5,Друзі,druzі
6,медицина,medytsyna
7,Іван,іvan
8,Красивий,krasyvyi
9,СОФІЯ,sofіia
