#import

In [44]:
from ipapy import UNICODE_TO_IPA
from ipapy import is_valid_ipa
from ipapy.ipachar import IPAConsonant
from ipapy.ipachar import IPAVowel
from ipapy.ipastring import IPAString

from pathlib import Path
import pandas as pd
import re
import inspect
import numpy as np
import random
import pickle


import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.layers import GRU, LSTM, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers import Embedding


In [7]:
from wiktionary_rus.wiktionary import wiki_instances, find_item_from_wiki

In [8]:
vars(wiki_instances[0])

{'word': 'бёрдо',
 'word_lowcase': 'бёрдо',
 'stem': "бё'рдо",
 'pos': 'noun',
 'grammeme': ['accusative', 'nominative', 'singular'],
 'meanings': [['comb, reed (in a loom)']],
 'accent': "бё'рдо",
 'sounds': 'ˈbʲɵrdə',
 'status': True,
 'ipa': bilabial consonant palatalized plosive voiced
 central close-mid rounded vowel
 alveolar consonant trill voiced
 alveolar consonant plosive voiced
 central mid unrounded vowel,
 'npipa': array([ 3., 78., 39.,  6., 71.], dtype=float32),
 'intipa': [3, 78, 39, 6, 71]}

#SelectWords

In [10]:
class SelectWords():
  @classmethod
  def set_all_attributes_to_false(cls):
    for item in wiki_instances:
      setattr(item, "status", False)
  
  @classmethod
  def get_number_of_items_with_true_status(cls):
    number_of_items_with_true_status = len([1 for item in wiki_instances if item.status])
    return number_of_items_with_true_status


  @classmethod  
  def select_stressed_items_for_nn(cls):
    cls.set_all_attributes_to_false()
    pat_only_simple_words =  re.compile("[^а-я|А-Я|ё|Ё|'|-]")
    ipa_characters_to_remove = [UNICODE_TO_IPA[u"ɣ"], 
                                UNICODE_TO_IPA[u"‿"],
                                UNICODE_TO_IPA[u'|'],
                                UNICODE_TO_IPA[u'ǀ'],
                                UNICODE_TO_IPA[u"ɖ͡ʐ"],
                                UNICODE_TO_IPA[u"ɑ"],
                                UNICODE_TO_IPA[u"."],
                                UNICODE_TO_IPA[u"ʑ"],
                                UNICODE_TO_IPA[u"ˌ"],
                                UNICODE_TO_IPA[u"ʔ"],
                                UNICODE_TO_IPA[u"ʕ"],
                                #UNICODE_TO_IPA[u" "],
                                UNICODE_TO_IPA[u"m̥"],
                                UNICODE_TO_IPA[u'̩'],
                                UNICODE_TO_IPA[u"ː"]]
    pos_to_remain = ("noun", "verb", "adj", "name", "adv", 
                    "num", "pron", "intj", "prep", "conj", 
                    "particle", "det", "ambiposition") 
    

    stressed_items_for_nn = len([setattr(item, "status", True) for item in wiki_instances 
                            if item.sounds 
                            and item.accent
                            and re.search(pat_only_simple_words, item.accent) == None
                            and item.pos in pos_to_remain 
                            and item.word not in ["лахмаджун", "МГУ", "тпру"]
                            and set(ipa_characters_to_remove) & set(IPAString(unicode_string = item.sounds)) == set()])


    return stressed_items_for_nn

  @classmethod
  def get_rand_words_with_true_status(cls, n_of_words_to_show =10):
    rand_words_with_true_status = [item.accent for item in wiki_instances if item.status]
    rand_numbers = [random.randint(0, cls.get_number_of_items_with_true_status()) \
                    for i in range(n_of_words_to_show)]
    rand_words_with_true_status = [item.accent for i, item in enumerate(wiki_instances) \
                                  if i in rand_numbers and item.status
                                  ]
    return rand_words_with_true_status


In [11]:
SelectWords.select_stressed_items_for_nn()

396999

In [12]:
SelectWords.get_rand_words_with_true_status(5)

["благода'рный", "дио'дам", "вними'те", "при'быльностям", "хва'литесь"]

#SelectWordByLength

In [13]:
#class embracing functions demanded for unselecting words with lengths, 
#which have low grequency of occurance in Dictionary

class WordsSelectionByLengths():

  #get df with words' lengths
  @classmethod
  def get_df_words_lengths(cls):
    number_of_true_items = SelectWords.get_number_of_items_with_true_status()
    list_lengths_of_words = [len(item.accent) for item in wiki_instances if item.status]
    series_lengths_of_words = pd.Series(list_lengths_of_words).value_counts()
    pd_lengths_of_words = pd.DataFrame({"freq":series_lengths_of_words})
    pd_lengths_of_words = pd_lengths_of_words.reset_index()
    pd_lengths_of_words = pd_lengths_of_words.rename(columns = {"index": "lengths"})
    pd_lengths_of_words = pd_lengths_of_words.sort_values(by = "freq", ascending = True)
    pd_lengths_of_words["percent"] = list(map(lambda x: x/number_of_true_items, 
                                              list(pd_lengths_of_words["freq"])))
    return pd_lengths_of_words

  #selecting for nn only words with length, 
  #share of which is above a defined threshold (0.01) in the total amount of words
  @classmethod
  def get_list_of_most_frequent_lengths(cls, threshold_persent = 0):
    pd_lengths_of_words = cls.get_df_words_lengths()
    pd_selected = pd_lengths_of_words[pd_lengths_of_words["percent"] >= threshold_persent]
    list_of_selected_lengths = list(pd_selected["lengths"])
    list_of_selected_lengths.sort()
    return list_of_selected_lengths

  #setting False status to words unsatisfying condition of lengths
  @classmethod
  def unselect_words_with_low_freq_lengths(cls, threshold_persent = 0):
    pd_lengths_of_words = cls.get_df_words_lengths()
    list_of_selected_lengths = cls.get_list_of_most_frequent_lengths(threshold_persent)
    n_of_changes = len([setattr(item, "status", False) for item in wiki_instances if
                      item.status and
                      len(item.accent) not in list_of_selected_lengths])
    return "number of unselected words is", n_of_changes

In [14]:
WordsSelectionByLengths.unselect_words_with_low_freq_lengths()

('number of unselected words is', 0)

#IpaProcessing

In [15]:
class IpaProcessing:
  @classmethod
  def get_unique_ipa(cls):  
    all_signs = set()
    for item in wiki_instances:
        if item.status == True:
          value = IPAString(unicode_string=item.sounds)          
          for ch in value:
            all_signs.add(ch)
    list_unique_ipas = [ipa_ch for ipa_ch in list(all_signs)]
    list_unique_unicodes = [str(ip) for ip in list_unique_ipas]
    list_unique_unicodes.sort()
    return list_unique_unicodes

  @classmethod
  def get_sign2number(cls):
    sign2number = dict((UNICODE_TO_IPA[l],i) for i, l in enumerate(list_unique_unicodes, start = 1))
    return sign2number
  
  @classmethod
  def get_number2sign(cls):
    number2sign = dict((i,UNICODE_TO_IPA[l]) for i, l in enumerate(list_unique_unicodes, start = 1))
    return number2sign 

  @classmethod
  def get_max_length_of_ipa(cls):
    lenghts = [len(IPAString(unicode_string = item.sounds)) for item in wiki_instances 
               if item.status == True]
    max_length_of_ipa = max(lenghts)
    return max_length_of_ipa

  @classmethod
  #func to convert ipa string to sparse numbers padding
  def ipa_string_to_numbers(cls, ipa_string):
    ipa_as_numbers = []
    trans_as_IPAString = IPAString(unicode_string = ipa_string)

    for ch in trans_as_IPAString:
      n = sign2number[ch]
      ipa_as_numbers.append(n)
        
    current_length = len(ipa_as_numbers)
    ipa_as_numbers.extend(0 for i in range(max_length_of_ipa - current_length))
    return ipa_as_numbers


  #func to convert all ipa to array
  @classmethod
  def all_ipa_to_array(cls):

    #save intermediate results, including save at the end
    checks_and_save_points = [i for i in range(10000,400000,10000)]

    count = 0
    all_ipa_as_array = np.zeros(max_length_of_ipa)
    for i,item in enumerate(wiki_instances):
        
      if item.status == True:
        ipa_as_numbers = cls.ipa_string_to_numbers(item.sounds)
        ipa_as_numbers = np.array(ipa_as_numbers)

        all_ipa_as_array = np.vstack([all_ipa_as_array, ipa_as_numbers])
        count +=1

        if count in checks_and_save_points:
          print("count", count, " ", "i", i)
          
    all_ipa_as_array = all_ipa_as_array[1:]
    return all_ipa_as_array

In [16]:
list_unique_unicodes = IpaProcessing.get_unique_ipa()
print(list_unique_unicodes)

[' ', 'a', 'b', 'bʲ', 'bʲː', 'bː', 'd', 'dʲ', 'dʲː', 'dː', 'd͡z', 'd͡zʲ', 'e', 'f', 'fʲ', 'fʲː', 'i', 'j', 'jː', 'k', 'kʲ', 'kʲː', 'kː', 'lʲ', 'lʲː', 'lˠ', 'lˠː', 'm', 'mʲ', 'mʲː', 'mː', 'n', 'nʲ', 'nʲː', 'nː', 'o', 'p', 'pʲ', 'pʲː', 'pː', 'r', 'rʲ', 'rʲː', 'rː', 's', 'sʲ', 'sʲː', 'sː', 't', 'tʲ', 'tʲː', 'tː', 't͡s', 't͡sʲ', 't͡sː', 't͡ɕ', 't͡ɕː', 'u', 'v', 'vʲ', 'vʲː', 'vː', 'x', 'xʲ', 'z', 'zʲ', 'zʲː', 'zː', 'æ', 'ɐ', 'ɕ', 'ɕː', 'ə', 'ɛ', 'ɡ', 'ɡʲ', 'ɡː', 'ɨ', 'ɪ', 'ɵ', 'ʂ', 'ʂː', 'ʈ͡ʂ', 'ʉ', 'ʊ', 'ʐ', 'ʐː', 'ʑː', 'ˈ']


In [17]:
sign2number = IpaProcessing.get_sign2number()
print(len(sign2number))
print(sign2number)
number2sign = IpaProcessing.get_number2sign() 
print(len(number2sign))
print(number2sign)
max_length_of_ipa = IpaProcessing.get_max_length_of_ipa()
print("max_length_of_ipa", max_length_of_ipa)

89
{suprasegmental word-break: 1, front open unrounded vowel: 2, bilabial consonant plosive voiced: 3, bilabial consonant palatalized plosive voiced: 4, bilabial consonant palatalized plosive voiced: 5, bilabial consonant plosive voiced: 6, alveolar consonant plosive voiced: 7, alveolar consonant palatalized plosive voiced: 8, alveolar consonant palatalized plosive voiced: 9, alveolar consonant plosive voiced: 10, alveolar consonant sibilant-affricate voiced: 11, alveolar consonant palatalized sibilant-affricate voiced: 12, close-mid front unrounded vowel: 13, consonant labio-dental non-sibilant-fricative voiceless: 14, consonant labio-dental non-sibilant-fricative palatalized voiceless: 15, consonant labio-dental non-sibilant-fricative palatalized voiceless: 16, close front unrounded vowel: 17, approximant consonant palatal voiced: 18, approximant consonant palatal voiced: 19, consonant plosive velar voiceless: 20, consonant palatalized plosive velar voiceless: 21, consonant palataliz

In [19]:
ipa_string = find_item_from_wiki("молоко'")[0].sounds
print(ipa_string)
ex_of_ipa_string_as_numbers = IpaProcessing.ipa_string_to_numbers(ipa_string)
print(ex_of_ipa_string_as_numbers)

məɫɐˈko
[28, 73, 26, 70, 89, 20, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [20]:
#time 1 hours 13 min
#all_ipa_as_array = IpaProcessing.all_ipa_to_array()
#print(all_ipa_as_array.shape)
#print(all_ipa_as_array[0])

In [22]:
#transcriptions of words with accents + transcriptions without secondary accents, blank spaces
path_all_ipa_as_array = "all_ipa_as_array_with_accents_short.pickle"

#save
#with open(path_all_ipa_as_array, 'wb') as f:
  #pickle.dump(all_ipa_as_array, f)

#load
with open(path_all_ipa_as_array, "rb") as f:
  all_ipa_as_array = pickle.load(f)

In [23]:
print(all_ipa_as_array.shape)
print(all_ipa_as_array[0])

(396999, 34)
[89.  4. 80. 41.  7. 73.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


#FreqIpa

In [25]:
#get df: numbers(number of ipa sign) - freq(frequence of sign's occurance) -
#signs(ipa chaacters in unicode) - persents(share of this ipa in total)
#multiplication (the number to multiply class_weights in the model)
def get_freq_of_ipa_signs():
  number_of_true_items = SelectWords.get_number_of_items_with_true_status()
  no_classes = len(sign2number)
  table_of_all_signs_as_numbers = []
  for i,item in enumerate(wiki_instances):
    if item.status and i <= number_of_true_items:
      item_as_string = IPAString(unicode_string = item.sounds)
      for sign in item_as_string:
        number = sign2number[sign]
        table_of_all_signs_as_numbers.append(number)
  table_of_all_signs_as_numbers = pd.Series(table_of_all_signs_as_numbers).value_counts(ascending=False)

  #table_of_all_signs_as_numbers = pd.Series(table_of_all_signs_as_numbers)
  freq_of_ipa_signs = pd.DataFrame(table_of_all_signs_as_numbers).reset_index()
  freq_of_ipa_signs = freq_of_ipa_signs.rename(columns = {"index":"numbers", 0:'freq'})
  freq_of_ipa_signs["numbers"].sort_values(ascending = True)

  #adding ipa signs to df
  signs_to_df = []
  for i in freq_of_ipa_signs["numbers"]:
    signs_to_df.append(str(number2sign[i]))
  freq_of_ipa_signs["signs"] = signs_to_df

  #getting frequency of zeros from array for making class "0" 
  def get_freq_of_zeros():
    freq_of_zeros = sum([np.count_nonzero(a == 0) for a in all_ipa_as_array])
    return freq_of_zeros
  freq_of_zeros = get_freq_of_zeros()

  #adding row with freq of zeros to df
  def add_freq_of_zeros_to_df(freq_of_ipa_signs, freq_of_zeros):
    freq_of_ipa_signs = freq_of_ipa_signs.append({"numbers": 0, 
                                                  "freq": freq_of_zeros, 
                                                  "signs": "zero"}, ignore_index=True)
    return freq_of_ipa_signs
  freq_of_ipa_signs = add_freq_of_zeros_to_df(freq_of_ipa_signs, freq_of_zeros)

  #getting percents
  percents_as_map = list(map(lambda x: round(x/freq_of_ipa_signs["freq"].sum(),3), freq_of_ipa_signs["freq"]))
  freq_of_ipa_signs["percents"] = percents_as_map
  
  #getting multiplication for weights
  mult_as_map = list(map(lambda x: round(number_of_true_items/(len(sign2number)*x), 5), freq_of_ipa_signs["freq"]))
  freq_of_ipa_signs['multiplication'] = mult_as_map
  
  return freq_of_ipa_signs

freq_of_ipa_signs = get_freq_of_ipa_signs()
freq_of_ipa_signs


  freq_of_ipa_signs = freq_of_ipa_signs.append({"numbers": 0,


Unnamed: 0,numbers,freq,signs,percents,multiplication
0,89,369755,ˈ,0.028,0.01206
1,79,331385,ɪ,0.025,0.01346
2,73,293264,ə,0.022,0.01521
3,70,187883,ɐ,0.014,0.02374
4,18,147770,j,0.011,0.03019
...,...,...,...,...,...
85,22,5,kʲː,0.000,892.13258
86,43,5,rʲː,0.000,892.13258
87,5,2,bʲː,0.000,2230.33146
88,44,1,rː,0.000,4460.66292


In [27]:
#rare ipa signs to remove
#ɣ, ‿, d͡z, ʈ͡ʂ, ɖ͡ʐ
#"ɑ", ".", "ʑ"  

#search as ipa
#getting instances of words with a certain ipa character
#ɡ͡b(382), n͡m(387), k͡x(128),  d͡b(96), d͡b(386), #t͡p(385) 

#d͡z(119)+ ('дзюдо', 'd͡zʲzʲʊˈdo', 'noun'), alveolar consonant sibilant-affricate voiced
#ʈ͡ʂ(99)+(('обветшаю', 'ɐbvʲɪt͡ʂˈʂajʊ', 'verb')), consonant retroflex sibilant-affricate voiceless 
#ɖ͡ʐ(100)+(('камбоджа', 'kɐmˈbod͡ʐʐə', 'name')), alveolar consonant sibilant-affricate voiced

#getting instances of words with a certain ipa character
#ipa signs to remove: "ɑ", "."(syllable-break), ʑ(вещдок), ͡ (2046)
#ipa signs to change: ɣ-x(consonant non-sibilant-fricative velar voiced, 54)

def find_word_by_freq_of_ipa_signs_number_of_row(number_of_row):
  sign_to_find = UNICODE_TO_IPA[freq_of_ipa_signs.iloc[number_of_row,2]]
  wb = [(item.accent, item.sounds, item.pos) for item in wiki_instances if item.status 
        and sign_to_find in IPAString(unicode_string = item.sounds)]
  number2sign[freq_of_ipa_signs.iloc[number_of_row,0]]
  return number2sign, wb

number_of_row = 38
find_word_by_freq_of_ipa_signs_number_of_row(number_of_row)[1][0:10]

[("распя'тие", 'rɐˈspʲætʲɪje', 'noun'),
 ("запя'стье", 'zɐˈpʲæsʲtʲje', 'noun'),
 ("юпи'тер", 'jʉˈpʲitʲɪr', 'name'),
 ("лесопи'лка", 'lʲɪsɐˈpʲiɫkə', 'noun'),
 ("лесопи'льня", 'lʲɪsɐˈpʲilʲnʲə', 'noun'),
 ("лесопи'льный", 'lʲɪsɐˈpʲilʲnɨj', 'adj'),
 ("копе'йка", 'kɐˈpʲejkə', 'noun'),
 ("пенька'", 'pʲɪnʲˈka', 'noun'),
 ("пенька'", 'pʲɪnʲˈka', 'noun'),
 ("пя'ть", 'pʲætʲ', 'num')]

#WordsProcessing

In [28]:
class WordsProcessing():
  @classmethod
  def get_unique_chs_from_stressed_words(cls):
    unique_chs_from_stressed_words = set()
    for item in wiki_instances:
      if item.status:
        chs = list(item.accent)
        unique_chs_from_stressed_words.update(chs)
    unique_chs_from_stressed_words = list(unique_chs_from_stressed_words)
    unique_chs_from_stressed_words.sort()
    return unique_chs_from_stressed_words

  @classmethod
  def get_character2number_for_stressed_words(cls):
    unique_chs = cls.get_unique_chs_from_stressed_words() 
    character2number_for_stressed_words = dict((ch,i) for i, ch in enumerate(unique_chs, start = 1))
    return character2number_for_stressed_words
  
  @classmethod
  def get_number2character_for_stressed_words(cls): 
    unique_chs = cls.get_unique_chs_from_stressed_words() 
    number2character_for_stressed_words = dict((i,ch) for i, ch in enumerate(unique_chs, start = 1))
    return number2character_for_stressed_words 

  @classmethod
  def get_max_length_of_stressed_word(cls):
    #+1 is added as a place for accent for correct padding 
    max_length_of_stressed_word = max([len(item.word_lowcase) \
                                        for item in wiki_instances if item.status]) + 1
    #ipa and word arrays should be of the same length
    max_length_of_stressed_word = max(max_length_of_ipa, max_length_of_stressed_word) 
    return max_length_of_stressed_word


  @classmethod
  def word2numbers(cls, accent):
    numbers = []
    for ch in accent:
      n = character2number_for_stressed_words[ch]
      numbers.append(n)
      
    n_of_zeros_to_add = max_length_of_stressed_word - len(numbers) 
    numbers.extend([0 for i in range(n_of_zeros_to_add)])

    return numbers


  @classmethod
  def all_stressed_words_to_array(cls):
    
    list_of_words_as_numbers = [cls.word2numbers(item.accent) \
                                for item in wiki_instances if item.status]
    all_stressed_words_as_array = np.vstack(list_of_words_as_numbers)
    return all_stressed_words_as_array

In [29]:
character2number_for_stressed_words = WordsProcessing.get_character2number_for_stressed_words()
print(character2number_for_stressed_words)
number2character_for_stressed_words = WordsProcessing.get_number2character_for_stressed_words()
print(character2number_for_stressed_words)
max_length_of_stressed_word = WordsProcessing.get_max_length_of_stressed_word()
print(max_length_of_stressed_word)
print(max_length_of_ipa)

{"'": 1, '-': 2, 'а': 3, 'б': 4, 'в': 5, 'г': 6, 'д': 7, 'е': 8, 'ж': 9, 'з': 10, 'и': 11, 'й': 12, 'к': 13, 'л': 14, 'м': 15, 'н': 16, 'о': 17, 'п': 18, 'р': 19, 'с': 20, 'т': 21, 'у': 22, 'ф': 23, 'х': 24, 'ц': 25, 'ч': 26, 'ш': 27, 'щ': 28, 'ъ': 29, 'ы': 30, 'ь': 31, 'э': 32, 'ю': 33, 'я': 34, 'ё': 35}
{"'": 1, '-': 2, 'а': 3, 'б': 4, 'в': 5, 'г': 6, 'д': 7, 'е': 8, 'ж': 9, 'з': 10, 'и': 11, 'й': 12, 'к': 13, 'л': 14, 'м': 15, 'н': 16, 'о': 17, 'п': 18, 'р': 19, 'с': 20, 'т': 21, 'у': 22, 'ф': 23, 'х': 24, 'ц': 25, 'ч': 26, 'ш': 27, 'щ': 28, 'ъ': 29, 'ы': 30, 'ь': 31, 'э': 32, 'ю': 33, 'я': 34, 'ё': 35}
34
34


In [30]:
all_stressed_words_as_array = WordsProcessing.all_stressed_words_to_array()
print(all_stressed_words_as_array.shape)
print(all_stressed_words_as_array[10])

(396999, 34)
[ 4 19  8  1  7  8 16 31  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0]


In [31]:
print("all_stressed_words_as_array", all_stressed_words_as_array.shape)
print("all_ipa_as_array", all_ipa_as_array.shape)

all_stressed_words_as_array (396999, 34)
all_ipa_as_array (396999, 34)


#Train_Test_Split

In [32]:
def split_data_by_ratio(split_ratio, all_stressed_words_as_array, all_ipa_as_array):
  np.random.seed(30)
  number_of_items_with_true_status = SelectWords.get_number_of_items_with_true_status()
  
  mask = np.random.rand(number_of_items_with_true_status) < split_ratio
  
  inputs = all_stressed_words_as_array[mask]
  inputs_test = all_stressed_words_as_array[~mask]
  
  labels = all_ipa_as_array[mask]
  labels_test = all_ipa_as_array[~mask]

  return inputs, labels, inputs_test, labels_test, mask

inputs, labels, inputs_test, labels_test, mask = split_data_by_ratio(0.8, all_stressed_words_as_array, all_ipa_as_array)
print("inputs",inputs.shape)
print("labels",labels.shape)
print("inputs_test",inputs_test.shape)
print("labels_test",labels_test.shape)

inputs (317442, 34)
labels (317442, 34)
inputs_test (79557, 34)
labels_test (79557, 34)


In [124]:
indexes_of_test_items = list(np.where(mask == False)[0])
print(len(indexes_of_test_items))

79557


#Model

In [36]:
print("lables.shape",labels.shape)
print("inputs.shape",inputs.shape)
print("max_length_of_ipa", max_length_of_ipa)
print('max_length_of_word', max_length_of_stressed_word)
size_of_vocab = len(number2character_for_stressed_words)+1 
print("size_of_vocab", size_of_vocab)
no_classes = len(sign2number)+1 
print("no_classes", no_classes)

lables.shape (317442, 34)
inputs.shape (317442, 34)
max_length_of_ipa 34
max_length_of_word 34
size_of_vocab 36
no_classes 90


In [37]:
def make_model_ipa():
  #inputs_keras = keras.Input(shape = (34,1))
  inputs_keras = keras.Input(shape = (inputs.shape[1],))

  embd = tf.keras.layers.Embedding(size_of_vocab, 128)
  x_embd = embd(inputs_keras)

  rn = tf.keras.layers.Bidirectional(keras.layers.GRU(256, 
                                                      dropout=0.5, 
                                                      recurrent_dropout=0.2, 
                                                      return_sequences=True))
  r_0 = rn(x_embd)

  dns_1 = tf.keras.layers.TimeDistributed(keras.layers.Dense(512, activation="relu"))
  x = dns_1(r_0)

  dns_2 = tf.keras.layers.TimeDistributed(keras.layers.Dense(no_classes, activation="softmax"))
  outputs = dns_2(x)
  #print("outputs", outputs.shape)

  model_ipa = keras.Model(inputs = inputs_keras, outputs = outputs)
  return model_ipa

model_ipa = make_model_ipa() 

In [38]:
model_ipa.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 34)]              0         
                                                                 
 embedding (Embedding)       (None, 34, 128)           4608      
                                                                 
 bidirectional (Bidirectiona  (None, 34, 512)          592896    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 34, 512)          262656    
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, 34, 90)           46170     
 tributed)                                                       
                                                             

In [None]:
filepath="checkpoint_word_stressed2IPA.hdf5"
#model.load_weights(filepath)

In [40]:
model_ipa.compile(loss = keras.losses.SparseCategoricalCrossentropy(), 
              optimizer = keras.optimizers.Adam(learning_rate=0.001),
              metrics = "accuracy")

In [41]:
#func to create checkpoint
checkpoint = ModelCheckpoint(filepath, 
                             monitor='accuracy', 
                             verbose=1, 
                             save_best_only=True, 
                             mode='max')

In [None]:
#model_ipa.load_weights(filepath)

In [43]:
#accuracy = 0.99975

history = model_ipa.fit(inputs, labels, 
                    batch_size=1024, 
                    epochs=10, 
                    #validation_split=0.2, 
                    verbose=1, 
                    callbacks=[checkpoint],
                    )


Epoch 1/2
Epoch 1: accuracy improved from -inf to 0.86577, saving model to checkpoint_word_stressed2IPA.hdf5
Epoch 2/2
Epoch 2: accuracy improved from 0.86577 to 0.97402, saving model to checkpoint_word_stressed2IPA.hdf5


In [167]:
#save entire model
path_model = Path.cwd().parent / "word2ipa_rus" / "model_word2ipa_rus.h5"
path_model


# model_ipa.save("model_word2ipa_rus.h5")

WindowsPath('c:/Users/eugen/Documents/word2ipa_rus/word2ipa_rus/model_word2ipa_rus.h5')

#load model

In [168]:
# load model
# del model_ipa
model_ipa = load_model(path_model)

#Predict

In [49]:
#accuracy = 0.9996
model_ipa.evaluate(inputs_test, labels_test)



[0.0032051389571279287, 0.9996480345726013]

In [103]:
#func to convert word to IPA using model
def word2IPA(word_stressed):

  def make_array_from_word_for_model(word_stressed):
    word_as_array = WordsProcessing.word2numbers(word_stressed)
    word_as_array = np.array(word_as_array)  
    word_as_array = word_as_array.reshape(1,word_as_array.shape[0])
    predicted_array = model_ipa.predict(word_as_array, verbose=0)
    predicted_array = predicted_array.reshape(labels.shape[1], no_classes) 
    return predicted_array
  predicted_array = make_array_from_word_for_model(word_stressed)

  def turn_predicted_array_to_transcription(predicted_array):
    ipa_predicted = []
    for position in predicted_array:
      number = np.argmax(position)
      if number !=0:
        sign = number2sign[number]
        ipa_predicted.append(str(sign)) 
    ipa_predicted = ''.join(ipa_predicted)
    return ipa_predicted
  ipa_predicted = turn_predicted_array_to_transcription(predicted_array)
  
  return ipa_predicted

In [104]:
word2IPA("челове'к")

't͡ɕɪlˠɐˈvʲek'

In [114]:
def word2IPA_dict_comparison(word_stressed):
  try: 
    ipa_predicted = word2IPA(word_stressed)
  except:
    ipa_predicted = word2IPA(word_stressed.lower())

  try:
    ipa_from_dict = find_item_from_wiki(word_stressed)[0].sounds
  except IndexError:
    ipa_from_dict = "no word in dict"

  if ipa_from_dict == "no word in dict":
    status = "None"
  elif IPAString(unicode_string = ipa_predicted).is_equivalent(
      IPAString(unicode_string = ipa_from_dict)):
    status = "True"
  else:
    status = "False"
  
  # feedback = print("ipa_predicted {} \n ipa_from_dict {} \n status {}".
  #              format(ipa_predicted, ipa_from_dict, status))
  feedback = (ipa_predicted, ipa_from_dict, status)
  return feedback   

In [116]:
word2IPA_dict_comparison("молоко'")

('məlˠɐˈko', 'məɫɐˈko', 'True')

In [134]:
def word_by_word_ipa_comparison(sample = 5, verbose = 1):
  rand_numbers = random.sample(indexes_of_test_items, sample)
  true_status = 0
  n_items = 0

  if verbose == 0:
    for i, item in enumerate(wiki_instances): 
      if item.status and i in rand_numbers:
        status = word2IPA_dict_comparison(item.accent)[2]
        n_items += 1
        if status:
          true_status += 1

  if verbose == 1:
    for i, item in enumerate(wiki_instances): 
      if item.status and i in rand_numbers:
        ipa_predicted, ipa_from_dict, status = word2IPA_dict_comparison(item.accent)
        n_items += 1
        print("ipa_predicted {} \n ipa_from_dict {} \n status {}".
              format(ipa_predicted, ipa_from_dict, status))
        if status == "True":
          true_status += 1

  share_of_true = true_status/n_items
  
  return share_of_true

In [139]:
word_by_word_ipa_comparison(5,1)

ipa_predicted prʲɪəbrɐˈʐɛnʲɪje 
 ipa_from_dict prʲɪəbrɐˈʐɛnʲɪje 
 status True
ipa_predicted ˈvʲertʲɪtʲe 
 ipa_from_dict ˈvʲertʲɪtʲe 
 status True
ipa_predicted sʲɪmʲɪˈstrunək 
 ipa_from_dict sʲɪmʲɪˈstrunək 
 status True
ipa_predicted səsrʲɪdɐˈtot͡ɕɪvəjɪt 
 ipa_from_dict səsrʲɪdɐˈtot͡ɕɪvəjɪt 
 status True
ipa_predicted ɡrɨˈzʲa 
 ipa_from_dict ɡrɨˈzʲa 
 status True


1.0

In [141]:
word_by_word_ipa_comparison(1000,0)

1.0