#Import


In [None]:
# install critical dependency to upload a database
# https://github.com/EugenefedorovPro/wiktionary_rus
# !pip install git+https://github.com/EugenefedorovPro/wiktionary_rus.git
from wiktionary_rus.wiktionary import wiki_instances, find_item_from_wiki


In [None]:
from ipapy import UNICODE_TO_IPA
from ipapy import is_valid_ipa
from ipapy.ipachar import IPAConsonant
from ipapy.ipachar import IPAVowel
from ipapy.ipastring import IPAString


In [None]:
from pathlib import Path
import pandas as pd
import re
import numpy as np
import random


In [None]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.layers import (
    GRU,
    LSTM,
    Input,
    Dense,
    TimeDistributed,
    Activation,
    RepeatVector,
    Bidirectional,
    Dropout,
    LSTM,
)
from keras.layers import Embedding
from tensorflow.keras.models import load_model


In [None]:
# Inspect the attribute dictionary of one sample entry (index 1004) of
# wiki_instances to see which fields are available on each item.
vars(wiki_instances[1004])


#SelectWords


In [None]:
class SelectWords:
    """Mark which entries of ``wiki_instances`` are used for the stress model.

    All methods operate on the module-level ``wiki_instances`` collection and
    communicate through a boolean ``status`` attribute stored on each item.
    """

    @classmethod
    def set_all_attributes_to_false(cls):
        """Set ``status = False`` on every item of ``wiki_instances``."""
        for item in wiki_instances:
            item.status = False

    @classmethod
    def get_number_of_items_with_true_status(cls):
        """Return how many items currently have a truthy ``status``."""
        return sum(1 for item in wiki_instances if item.status)

    @classmethod
    def get_df_unstressed_words_lengths(cls, status="allwords"):
        """Build a frequency table of unstressed word lengths.

        Args:
            status: ``"allwords"`` to count every item, ``"truewords"`` to
                count only items whose ``status`` is truthy.

        Returns:
            A DataFrame with columns ``lengths``, ``freq`` and ``percent``
            (share of the counted words), sorted by ``freq`` ascending.

        Raises:
            ValueError: if ``status`` is not one of the two supported values.
        """
        # Items without a lowercase word cannot be measured, so they are left
        # out of the count instead of crashing ``len``.
        if status == "allwords":
            lengths = [
                len(item.word_lowcase)
                for item in wiki_instances
                if item.word_lowcase is not None
            ]
        elif status == "truewords":
            lengths = [
                len(item.word_lowcase)
                for item in wiki_instances
                if item.status and item.word_lowcase is not None
            ]
        else:
            raise ValueError(
                "status must be 'allwords' or 'truewords', got {!r}".format(status)
            )

        counts = pd.Series(lengths, dtype="object").value_counts()
        # Name the index explicitly so the resulting column does not depend on
        # the default name chosen by reset_index.
        table = counts.rename_axis("lengths").reset_index(name="freq")
        table = table.sort_values(by="freq", ascending=True)
        table["percent"] = table["freq"] / len(lengths)
        return table

    @classmethod
    def get_list_of_most_frequent_unstressed_lengths(cls, threshold_persent=0):
        """Return the sorted word lengths whose share exceeds the threshold.

        Args:
            threshold_persent: minimal share (fraction of all words, strictly
                greater-than comparison) a length needs to be kept.
        """
        table = cls.get_df_unstressed_words_lengths(status="allwords")
        frequent = table.loc[table["percent"] > threshold_persent, "lengths"]
        return sorted(frequent)

    @classmethod
    def select_unstressed_words_for_nn(cls):
        """Reset all statuses, then set ``status = True`` on usable words.

        A word is kept when it has a lowercase form matching the "simple word"
        pattern, an accepted part of speech, a sufficiently frequent length and
        an ``accent`` string containing the stress mark ``'``.

        Returns:
            A summary string with the number of selected items.
        """
        cls.set_all_attributes_to_false()
        # A set gives constant-time membership tests inside the loop.
        allowed_lengths = set(cls.get_list_of_most_frequent_unstressed_lengths())
        allowed_pos = ("noun", "verb", "adj", "name", "adv", "num", "pron")

        # NOTE(review): inside a character class "|" is a literal character,
        # so this pattern also accepts words containing "|". Left unchanged
        # because tightening it could change the character vocabulary that a
        # previously saved model depends on.
        pat_only_simple_words = re.compile("[^а-я|А-Я|ё|Ё|-]")

        for item in wiki_instances:
            word = item.word_lowcase
            if word is None or pat_only_simple_words.search(word) is not None:
                continue
            if item.pos not in allowed_pos:
                continue
            if len(word) not in allowed_lengths:
                continue
            if not item.accent or "'" not in item.accent:
                continue
            item.status = True

        return "number_of_items_with_true_status - {}".format(
            cls.get_number_of_items_with_true_status()
        )

    @classmethod
    def get_rand_words_with_true_status(cls, n_of_words_to_show):
        """Return up to ``n_of_words_to_show`` random selected words.

        Words are drawn without repetition from the items whose ``status`` is
        truthy; fewer words are returned when fewer are selected.
        """
        words_with_true_status = [
            item.word_lowcase for item in wiki_instances if item.status
        ]
        # Sample from the selected words themselves; indices into the selected
        # subset are not valid positions in the full wiki_instances list.
        sample_size = max(0, min(n_of_words_to_show, len(words_with_true_status)))
        return random.sample(words_with_true_status, sample_size)


In [None]:
# Reset every status and mark the entries that pass the SelectWords filters;
# the call returns a summary string with the number of selected items.
SelectWords.select_unstressed_words_for_nn()


#WordSelectionByLength


In [None]:
class WordsSelectionByLengths:
    """Filter the selected words by how common their accented length is.

    Lengths are measured on ``item.accent`` (the word including its stress
    mark) for items of ``wiki_instances`` whose ``status`` is truthy.
    """

    @classmethod
    def get_df_words_lengths(cls):
        """Return a frequency table of accented-word lengths.

        Returns:
            A DataFrame with columns ``lengths``, ``freq`` and ``percent``
            (share among the selected words), sorted by ``freq`` ascending.
        """
        accent_lengths = [len(item.accent) for item in wiki_instances if item.status]
        counts = pd.Series(accent_lengths).value_counts()
        # Name the index explicitly so the column name does not rely on the
        # default produced by reset_index.
        table = counts.rename_axis("lengths").reset_index(name="freq")
        table = table.sort_values(by="freq", ascending=True)
        table["percent"] = table["freq"] / len(accent_lengths)
        return table

    @classmethod
    def get_list_of_most_frequent_lengths(cls, threshold_persent):
        """Return the sorted lengths whose share is at least the threshold.

        Args:
            threshold_persent: minimal share (fraction of the selected words,
                inclusive comparison) a length needs to be kept.
        """
        table = cls.get_df_words_lengths()
        frequent = table.loc[table["percent"] >= threshold_persent, "lengths"]
        return sorted(frequent)

    @classmethod
    def unselect_words_with_low_freq_lengths(cls, threshold_persent):
        """Set ``status = False`` on selected words with a rare length.

        Args:
            threshold_persent: threshold passed on to
                ``get_list_of_most_frequent_lengths``.

        Returns:
            A tuple ``("number of unselected words is", count)``.
        """
        selected_lengths = set(cls.get_list_of_most_frequent_lengths(threshold_persent))
        n_of_changes = 0
        for item in wiki_instances:
            if item.status and len(item.accent) not in selected_lengths:
                item.status = False
                n_of_changes += 1
        return "number of unselected words is", n_of_changes


In [None]:
# Length-frequency table of the currently selected words, before the
# length-based filter below is applied.
WordsSelectionByLengths.get_df_words_lengths()


In [None]:
# Threshold 0 keeps every length (the comparison is percent >= threshold),
# so with this value no word is expected to be unselected.
WordsSelectionByLengths.unselect_words_with_low_freq_lengths(0)


In [None]:
# Same length-frequency table again, now after the length filter has run.
WordsSelectionByLengths.get_df_words_lengths()


#WordsProcessing


In [None]:
class WordsProcessing:
    """Encode the selected unstressed words as padded integer sequences.

    All methods read the items of ``wiki_instances`` whose ``status`` is
    truthy and use their ``word_lowcase`` attribute.
    """

    @classmethod
    def get_unique_chs_from_unstressed_words(cls):
        """Return the sorted list of distinct characters in the selected words."""
        unique_chs = set()
        for item in wiki_instances:
            if item.status:
                unique_chs.update(item.word_lowcase)
        return sorted(unique_chs)

    @classmethod
    def get_character2number_for_unstressed_words(cls):
        """Return a ``{character: code}`` mapping; codes start at 1.

        Code 0 is left free because it is used as the padding value.
        """
        unique_chs = cls.get_unique_chs_from_unstressed_words()
        return {ch: i for i, ch in enumerate(unique_chs, start=1)}

    @classmethod
    def get_number2character_for_unstressed_words(cls):
        """Return the inverse ``{code: character}`` mapping; codes start at 1."""
        unique_chs = cls.get_unique_chs_from_unstressed_words()
        return dict(enumerate(unique_chs, start=1))

    @classmethod
    def get_max_length_of_unstressed_word(cls):
        """Return the longest selected word length plus one.

        The extra position leaves room for the stress mark so that inputs and
        stress labels share the same padded width.

        Raises:
            ValueError: if no word is currently selected.
        """
        return (
            max(len(item.word_lowcase) for item in wiki_instances if item.status) + 1
        )

    @classmethod
    def all_unstressed_words_to_array(cls):
        """Return a 2-D integer array with one zero-padded row per selected word.

        The character mapping and the padded width are computed here from the
        current selection, so the method does not rely on module-level
        variables having been defined beforehand.
        """
        char2number = cls.get_character2number_for_unstressed_words()
        max_length = cls.get_max_length_of_unstressed_word()

        def _word2numbers(_word_lowcase):
            # Character codes followed by zeros up to the common width.
            _numbers = [char2number[ch] for ch in _word_lowcase]
            return _numbers + [0] * (max_length - len(_numbers))

        list_of_words_as_numbers = [
            _word2numbers(item.word_lowcase) for item in wiki_instances if item.status
        ]
        return np.vstack(list_of_words_as_numbers)


In [None]:
# Build the character <-> code lookup tables and the padded word width that
# the encoding and prediction code below reads as module-level variables.
character2number_for_unstressed_words = (
    WordsProcessing.get_character2number_for_unstressed_words()
)
print(character2number_for_unstressed_words)
number2character_for_unstressed_words = (
    WordsProcessing.get_number2character_for_unstressed_words()
)
# Show the inverse mapping (the forward one was printed above).
print(number2character_for_unstressed_words)
max_length_of_unstressed_word = WordsProcessing.get_max_length_of_unstressed_word()
print(max_length_of_unstressed_word)


In [None]:
# Encode every selected word as a zero-padded row of character codes and
# print the resulting array shape.
all_unstressed_words_as_array = WordsProcessing.all_unstressed_words_to_array()
print(all_unstressed_words_as_array.shape)


#StressProcessing


In [None]:
class StressProcessing:
    """Turn accented words into one-hot stress-position labels."""

    @classmethod
    def get_position_of_stress(cls, accent):
        """Return the index of the first stress mark ``'`` in ``accent``.

        Raises:
            ValueError: if ``accent`` is not a string containing ``'``.
        """
        try:
            return accent.index("'")
        except (ValueError, AttributeError) as err:
            raise ValueError(
                "no stress mark (') found in accent {!r}".format(accent)
            ) from err

    @classmethod
    def convert_stress2array(cls, accent, max_length=None):
        """Return a one-hot float vector marking the stress position.

        Args:
            accent: word containing the stress mark ``'``.
            max_length: length of the returned vector. When omitted it is
                taken from ``WordsProcessing.get_max_length_of_unstressed_word``.

        Raises:
            ValueError: if ``accent`` has no stress mark.
            IndexError: if the stress position does not fit into the vector.
        """
        if max_length is None:
            max_length = WordsProcessing.get_max_length_of_unstressed_word()
        stress_as_array = np.zeros(max_length)
        stress_as_array[cls.get_position_of_stress(accent)] = 1
        return stress_as_array

    @classmethod
    def all_stresses_to_array(cls):
        """Return a 2-D array with one one-hot stress row per selected word.

        Rows follow the order of the items in ``wiki_instances`` whose
        ``status`` is truthy.
        """
        # Compute the common width once instead of once per word.
        max_length = WordsProcessing.get_max_length_of_unstressed_word()
        _list_of_arrays = [
            cls.convert_stress2array(item.accent, max_length)
            for item in wiki_instances
            if item.status
        ]
        return np.vstack(_list_of_arrays)


In [None]:
# Build the one-hot stress labels for every selected word and print the
# resulting array shape.
all_stresses_as_array = StressProcessing.all_stresses_to_array()
print(all_stresses_as_array.shape)


#Train_Test_Split


In [None]:
# Print both shapes side by side; the split below indexes both arrays with
# the same boolean mask, so their first dimensions need to match.
print(all_unstressed_words_as_array.shape)
print(all_stresses_as_array.shape)


In [None]:
def split_data_by_ratio(
    split_ratio, all_unstressed_words_as_array, all_stresses_as_array
):
    """Randomly split inputs and labels into train and test parts.

    Each row is assigned to the training part with probability
    ``split_ratio``; the same mask is applied to both arrays so inputs and
    labels stay aligned. The NumPy global random state is seeded with 30 on
    every call, which makes the split reproducible.

    Args:
        split_ratio: expected fraction of rows placed in the training part.
        all_unstressed_words_as_array: input array, one row per word.
        all_stresses_as_array: label array with the same number of rows.

    Returns:
        A tuple ``(inputs, labels, inputs_test, labels_test)``.

    Raises:
        ValueError: if the two arrays have a different number of rows.
    """
    # Take the row count from the data itself rather than from global state,
    # so the mask always matches the arrays being split.
    number_of_rows = len(all_unstressed_words_as_array)
    if len(all_stresses_as_array) != number_of_rows:
        raise ValueError(
            "inputs and labels must have the same number of rows, got {} and {}".format(
                number_of_rows, len(all_stresses_as_array)
            )
        )

    np.random.seed(30)
    mask = np.random.rand(number_of_rows) < split_ratio

    inputs = all_unstressed_words_as_array[mask]
    inputs_test = all_unstressed_words_as_array[~mask]

    labels = all_stresses_as_array[mask]
    labels_test = all_stresses_as_array[~mask]

    return inputs, labels, inputs_test, labels_test


# Split with a ratio of 0.8, i.e. each row goes to the training part with
# probability 0.8, and print the shapes of the four resulting arrays.
inputs, labels, inputs_test, labels_test = split_data_by_ratio(
    0.8, all_unstressed_words_as_array, all_stresses_as_array
)
print("inputs", inputs.shape)
print("labels", labels.shape)
print("inputs_test", inputs_test.shape)
print("labels_test", labels_test.shape)


#Model


In [None]:
# Print the dimensions the model is built from.
print("lables.shape", labels.shape)
print("inputs.shape", inputs.shape)
print("max_length_of_unstressed_word", max_length_of_unstressed_word)
# +1 because character codes start at 1 and code 0 is used for padding.
size_of_vocab = len(character2number_for_unstressed_words) + 1
print("size_of_vocab", size_of_vocab)


In [None]:
def desigh_model_stress(inputs, size_of_vocab, max_length_of_unstressed_word):
    """Build the (uncompiled) stress-position classifier.

    The network embeds the character codes, encodes the sequence with a
    bidirectional GRU and ends in a softmax over the possible positions.

    Args:
        inputs: encoded words; only ``inputs.shape[1]`` (the sequence length)
            is used.
        size_of_vocab: number of distinct input codes for the embedding.
        max_length_of_unstressed_word: number of output classes.

    Returns:
        A ``tf.keras.Model`` mapping code sequences to position probabilities.
    """
    keras_layers = tf.keras.layers

    char_codes = tf.keras.Input(shape=(inputs.shape[1],))

    # 16-dimensional embedding for every character code.
    embedded_chars = keras_layers.Embedding(size_of_vocab, 16)(char_codes)

    # Bidirectional GRU, 32 units per direction, with input and recurrent dropout.
    sequence_encoder = keras_layers.Bidirectional(
        keras_layers.GRU(32, dropout=0.3, recurrent_dropout=0.3)
    )
    encoded_word = sequence_encoder(embedded_chars)

    # One probability per candidate stress position.
    position_probabilities = keras_layers.Dense(
        max_length_of_unstressed_word, activation="softmax"
    )(encoded_word)

    return tf.keras.Model(inputs=char_codes, outputs=position_probabilities)


# Instantiate the network with the dimensions computed above.
model_stress = desigh_model_stress(inputs, size_of_vocab, max_length_of_unstressed_word)


In [None]:
# Print the layer-by-layer summary of the model.
model_stress.summary()


In [None]:
# Checkpoint file located in the current working directory; the bare name on
# the last line evaluates to the path so it can be inspected.
filepath_stress = Path.cwd() / "checkpoint_put_stress.hdf5"
filepath_stress


In [None]:
# Checkpoint callback: with save_best_only=True and mode="max" it writes to
# filepath_stress only when the monitored "categorical_accuracy" improves.
# NOTE(review): the monitored metric is the training accuracy, not a
# validation metric — confirm this is intended.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath_stress,
    monitor="categorical_accuracy",
    verbose=1,
    save_best_only=True,
    mode="max",
)


In [None]:
# Configure training: categorical cross-entropy over the one-hot stress
# positions, Adam optimizer, and categorical accuracy as the reported metric.
# The metric is passed as a list, the form documented for Model.compile.
model_stress.compile(
    loss=keras.losses.CategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["categorical_accuracy"],
)


In [None]:
# model_stress.load_weights(filepath_stress)


In [None]:
# training model

# history_stress = model_stress.fit(inputs, labels,
#                     batch_size=64,
#                     epochs=100,
#                     #validation_split=0.2,
#                     verbose=1,
#                     callbacks=[checkpoint],
#                     )


In [None]:
# Location of the saved full model in the current working directory; the bare
# name on the last line evaluates to the path so it can be inspected.
path_model_stress = Path.cwd() / "model_put_stress.h5"
path_model_stress

In [None]:
# save entire model

# model_stress.save(path_model_stress)


#Load Model


In [None]:
# load model
# Drop the in-memory model and replace it with the one stored at
# path_model_stress.
# NOTE(review): `del` raises NameError when model_stress was not created
# earlier in the session.
del model_stress
model_stress = load_model(path_model_stress)


#Predict


In [None]:
# Evaluate the loaded model on the held-out test split.
# categorical_accuracy: 0.7945 (presumably recorded from an earlier run — verify)
model_stress.evaluate(inputs_test, labels_test)


In [None]:
def put_stress(word_lowcase):
    """Return ``word_lowcase`` with a stress mark ``'`` at the predicted position.

    The word is encoded with ``character2number_for_unstressed_words``,
    zero-padded to ``max_length_of_unstressed_word`` and passed to
    ``model_stress``; the mark is inserted at the index with the highest
    predicted score.

    Args:
        word_lowcase: the word to stress, using characters known to the
            character mapping.

    Raises:
        KeyError: if the word contains a character missing from the mapping.
        ValueError: if the word is longer than the padded input width.
    """
    numbers = []
    for ch in word_lowcase:
        try:
            numbers.append(character2number_for_unstressed_words[ch])
        except KeyError as err:
            raise KeyError(
                "character {!r} of word {!r} is not in the vocabulary".format(
                    ch, word_lowcase
                )
            ) from err

    # Fail with a clear message instead of an obscure reshape error.
    if len(numbers) > max_length_of_unstressed_word:
        raise ValueError(
            "word {!r} has {} characters, at most {} are supported".format(
                word_lowcase, len(numbers), max_length_of_unstressed_word
            )
        )

    # Zero-pad to the fixed width and add the batch dimension.
    numbers.extend([0] * (max_length_of_unstressed_word - len(numbers)))
    model_input = np.array(numbers).reshape((1, max_length_of_unstressed_word))

    stress_as_array = model_stress.predict(model_input)

    # The highest-scoring position is where the stress mark is inserted.
    index_of_stress = np.argmax(stress_as_array)
    return word_lowcase[:index_of_stress] + "'" + word_lowcase[index_of_stress:]


In [None]:
# Example: predict the stress position for a single word.
put_stress("лопотушечка")


In [None]:
def put_stress_comparison_dict_predict(word_unstressed):
    """Print the predicted stress next to the dictionary stress for a word.

    Args:
        word_unstressed: the word to look up and to pass to ``put_stress``.

    Returns:
        None. The comparison is written to standard output.
    """
    word_stressed_predicted = put_stress(word_unstressed)
    try:
        word_stressed_from_dict = Dictionary.get_word_from_Dict(word_unstressed)[
            0
        ].accent
    except Exception:
        # Best-effort lookup: any failure is reported as a missing word.
        # NOTE(review): `Dictionary` is not defined in the visible part of
        # this file; if it is undefined at runtime this branch is always
        # taken — confirm where the lookup is supposed to come from.
        word_stressed_from_dict = "no word in dict"

    stress_comparison = (
        "word_stressed_predicted - {},\n   word_stressed_from_dict - {},\n"
    ).format(word_stressed_predicted, word_stressed_from_dict)

    print(stress_comparison)


In [None]:
# Example: compare the predicted stress with the dictionary entry for a word.
put_stress_comparison_dict_predict("дума")
