# Import

In [3]:
import numpy as np
import pandas as pd
import dill
import re
import json
import random
from ipapy import UNICODE_TO_IPA
from ipapy import is_valid_ipa
from ipapy.ipastring import IPAString
from pathlib import Path

In [4]:
#wiktionary rus parsed 
#func to upload original rus wiktionary

def upload_kaikki(path_kaikki):
  wiki_uploaded = []
  with open(path_kaikki) as f:
    for line in f:
      wiki_uploaded.append(json.loads(line))
  return wiki_uploaded

path_kaikki = "kaikki.org-dictionary-Russian_2022_01_01.json"
wiki_uploaded = upload_kaikki(path_kaikki)
print(len(wiki_uploaded))

422821


In [5]:
#check of uploaded kaikki
def get_article_from_wiki(word):
  return [item for item in wiki_uploaded if item['word'] == word]
get_article_from_wiki('окнами')

[{'pos': 'noun',
  'head_templates': [{'name': 'head',
    'args': {'1': 'ru',
     '2': 'noun form',
     'head': 'о́кнами',
     'tr': '',
     'tr2': '',
     'tr3': '',
     'tr4': '',
     'tr5': '',
     'tr6': '',
     'g': 'n-in-p',
     'head2': '',
     'head3': '',
     'head4': '',
     'head5': '',
     'head6': '',
     'g2': '',
     'g3': '',
     'g4': '',
     'g5': '',
     'g6': '',
     'g7': '',
     'g8': '',
     'g9': '',
     'g10': '',
     'g11': '',
     'g12': ''},
    'expansion': 'о́кнами (óknami) n inan pl'},
   {'name': 'ru-noun form',
    'args': {'1': 'о́кнами', '2': 'n-in-p'},
    'expansion': 'о́кнами (óknami) n inan pl'}],
  'forms': [{'form': 'о́кнами',
    'tags': ['canonical', 'inanimate', 'neuter', 'plural']},
   {'form': 'óknami', 'tags': ['romanization']}],
  'word': 'окнами',
  'lang': 'Russian',
  'lang_code': 'ru',
  'sounds': [{'ipa': '[ˈoknəmʲɪ]'}],
  'senses': [{'raw_glosses': ['instrumental plural of окно́ (oknó)'],
    'glosses': ['i

# Dictionary

In [6]:
#defining basic class Dictionary, where all parsed items wil be stored
class Dictionary:
  def __init__(self, word, pos, forms, senses, sounds, status, word_lowcase, stem,
               grammeme, meanings, accent, ipa, npipa, intipa):
    self.word = word
    self.word_lowcase = word_lowcase
    self.stem = stem
    self.pos = pos
    self.grammeme = grammeme
    self.forms = forms
    self.senses = senses
    self.meanings = meanings
    self.accent = accent
    self.sounds = sounds
    self.status = status
    self.ipa = ipa
    self.npipa = npipa
    self.intipa = intipa

In [7]:
# func for populating class Dictionary with words
# creation of wiki_instances - list, containing all instances of a class

def wiki2class(wiki_uploaded):

    wiki_instances = []
    
    for item in wiki_uploaded:

        try:               
            word = item['word']
        except KeyError:
            word = None

        try:
            pos = item['pos']
        except KeyError:
            pos = None

        try:
            forms = item['forms']
        except KeyError:
            forms = None

        try:
            senses = item['senses']
        except KeyError:
            senses = None

        try:
            sounds = item['sounds']
        except KeyError:
            sounds = None

        status = False

        item = Dictionary(word, pos, forms, senses, sounds, status,
                        None, None, None, None, None, None, None, None)
        wiki_instances.append(item)
    
    return wiki_instances

In [8]:
#running func for populate Dictionary class with words. No return
wiki_instances = wiki2class(wiki_uploaded)
print(len(wiki_instances))

422821


In [9]:
# func to find item from wiki_instances
def find_item_from_dict(word_to_find, wiki_instances):
    item = [item for item in wiki_instances if item.word == word_to_find]
    return item

In [10]:
# check of the func
vars(find_item_from_dict("поездом", wiki_instances)[0])

{'word': 'поездом',
 'word_lowcase': None,
 'stem': None,
 'pos': 'noun',
 'grammeme': None,
 'forms': [{'form': 'по́ездом',
   'tags': ['canonical', 'inanimate', 'masculine']},
  {'form': 'pójezdom', 'tags': ['romanization']}],
 'senses': [{'raw_glosses': ['instrumental singular of по́езд (pójezd)'],
   'glosses': ['instrumental singular of по́езд (pójezd)'],
   'tags': ['form-of', 'instrumental', 'singular'],
   'form_of': [{'word': 'по́езд', 'extra': 'pójezd'}],
   'id': 'поездом-ru-noun--HD3.5KZ',
   'categories': []}],
 'meanings': None,
 'accent': None,
 'sounds': [{'ipa': '[ˈpo(j)ɪzdəm]'}],
 'status': False,
 'ipa': None,
 'npipa': None,
 'intipa': None}

# Transformations

## low_case

In [11]:
#inspecting
words_upper_case = [item.word for item in wiki_instances 
                    if re.search("[А-Я]", item.word)]
print(len(words_upper_case))
print(words_upper_case[0:10])

4249
['Израиль', 'Я', 'ГУЛАГ', 'Россия', 'КГБ', 'Смолян', 'По', 'По', 'Юпитер', 'СССР']


In [12]:
#replacing
def word2lowcase():
  for item in wiki_instances:
    setattr(item, "word_lowcase", item.word.lower())
word2lowcase()

In [13]:
#checking again
words_low_case = [item.word_lowcase for item in wiki_instances]
print(len(words_low_case))
print(words_low_case[0:10])

422821
['бёрдо', 'масленица', 'узус', 'баской', 'баять', 'бортник', 'бочаг', 'уст.', 'браный', 'братыня']


In [14]:
find_item_from_dict("ГУЛАГ", wiki_instances)[0].word_lowcase

'гулаг'

## IPA

### cleaning sound atr

In [15]:
#inspecting
words_wih_sounds  = [item.sounds[0] for item in wiki_instances 
                     if item.sounds and len(item.sounds) > 0]
print(len(words_wih_sounds))
print(words_wih_sounds[0:5])

420441
[{'ipa': '[ˈbʲɵrdə]'}, {'ipa': '[ˈmas⁽ʲ⁾lʲɪnʲɪt͡sə]'}, {'ipa': '[ˈuzʊs]'}, {'ipa': '[bɐˈskoj]'}, {'ipa': '[ˈba(j)ɪtʲ]'}]


In [16]:
words_with_only_ipa = [ipa for ipa in words_wih_sounds if len(ipa) != 1]
print(len(words_with_only_ipa))
print(words_with_only_ipa[0:5])

261
[{'ipa': '[ɛm]', 'tags': ['letter']}, {'ipa': '[ɛn]', 'tags': ['letter']}, {'ipa': '[o]', 'tags': ['stressed']}, {'ipa': '[p]', 'tags': ['phoneme']}, {'ipa': '[a]', 'tags': ['stressed']}]


In [17]:
#changing
def sounds2ipa():
  for item in wiki_instances:
    if item.sounds != None and len(item.sounds) > 0:
      setattr(item, "sounds", item.sounds[0])

  for item in wiki_instances:  
    if item.sounds != None and len(item.sounds) != 1:
      setattr(item, "sounds", None)

  for item in wiki_instances:
    if item.sounds != None and "ipa" not in item.sounds.keys():
      setattr(item, "sounds", None)

sounds2ipa()

In [18]:
# checking number of empty sounds attribute
len([item for item in wiki_instances if item.sounds == None])

2643

In [19]:
# final check: sounds_not_satisfying_conditions 
# expected 0
sounds_not_satisfying_conditions  = [item.sounds for item in wiki_instances 
      if item.sounds and len(item.sounds) != 1 and "ipa" not in item.sounds.keys()]
print(len(sounds_not_satisfying_conditions))
print(sounds_not_satisfying_conditions[0:50])

0
[]


### cleaning IPA from non-ipa characters

In [20]:
#getting all signs in ipa
def get_all_unique_signs_in_ipa():
  unique_ipa = set()
  for item in wiki_instances:
    if item.sounds:
      trans = item.sounds["ipa"]
      trans = set(list(trans))
      unique_ipa.update(trans)
  return unique_ipa
all_unique_signs_in_ipa = get_all_unique_signs_in_ipa()
print(len(all_unique_signs_in_ipa))
print(all_unique_signs_in_ipa)

62
{'⁾', '-', 'e', '|', 'ʑ', '̩', ')', 'ʂ', 'ɪ', 'ʕ', 'b', '[', 'ǀ', 'ʲ', 'ˑ', '.', 'i', 'ʉ', 'ɑ', 'k', 'n', 'ɛ', 'a', 'ɨ', '̠', 'ɵ', 'æ', 'v', '(', 't', 'ɫ', 'ɐ', 's', 'x', '̥', '/', 'u', ']', 'ʙ', '⁽', '͡', 'p', '‿', 'ɡ', 'ˈ', 'ʐ', 'ˌ', 'z', 'f', 'd', 'ː', 'r', 'ʔ', 'l', ' ', 'ɣ', 'm', 'ɕ', 'ə', 'ʊ', 'o', 'j'}


In [21]:
#getting all non ipa characters
def get_all_non_ipa_signs(all_unique_signs_in_ipa):
  all_non_ipa_signs = []
  for ch in all_unique_signs_in_ipa:
    if is_valid_ipa(ch) == False:
      all_non_ipa_signs.append(ch)
  return all_non_ipa_signs
all_non_ipa_signs = get_all_non_ipa_signs(all_unique_signs_in_ipa)
all_non_ipa_signs

['⁾', '-', ')', '[', '(', '/', ']', '⁽']

In [22]:
#removing exceptions
all_non_ipa_signs = re.compile("\]|\/|\⁾|\(|\⁽|\[|\)|\-")
def remove_nonipa_from_ipa(
  all_non_ipa_signs):
  for item in wiki_instances: 
    if item.sounds != None:
      #print(item.sounds["ipa"])
      attribute = item.sounds["ipa"]
      attribute = re.sub(all_non_ipa_signs, "", attribute)
      setattr(item, "sounds", attribute)
remove_nonipa_from_ipa(all_non_ipa_signs)

In [23]:
wiki_instances[1003].sounds

'ˌlʲesɐˈspusk'

## Accent

In [24]:
# func to put a stress in the word by parsing its IPA transcription
def put_accents_on_words():
  vs_re = re.compile('а|я|о|е|ё|у|ю|э|ы|и|А|Я|О|Е|Ё|У|Ю|Э|Ы|И')
  vs = 'аяоеёуюэыиАЯОЕЁУЮЭЫИ'

  # getting only vowels
  def get_IPA_vowels():
    vowels_ipa = set()
    for item in wiki_instances:
      if item.sounds:
        set_ipa = set(IPAString(unicode_string=item.sounds, ignore = True).vowels)
        vowels_ipa.update(set_ipa)
    return vowels_ipa
  vowels_ipa = get_IPA_vowels()

  for item in wiki_instances:
    if item.sounds:     

      # func to get an index of stress mark in the IPA sequence of vowels only
      def get_index_of_stress_mark_among_vowels(IPA_transcription):
        word_ipa = IPAString(unicode_string=IPA_transcription, ignore = True)
        pat_ipa = []
        for i, ipa_ch in enumerate(word_ipa):
          if ipa_ch in vowels_ipa or ipa_ch == UNICODE_TO_IPA[u"ˈ"]:
            pat_ipa.append(ipa_ch)
        if pat_ipa == [] or re.search(vs_re, item.word) == None:
          index_stress_among_vowels = []
        else: 
          try: 
            # index_stress_among_vowels = pat_ipa.index(UNICODE_TO_IPA[u"ˈ"]) 
            index_stress_among_vowels = [i for i, ip in enumerate(pat_ipa) 
                                         if ip == UNICODE_TO_IPA[u"ˈ"]]
            if index_stress_among_vowels == []:       
              index_stress_among_vowels = list([0])
          except:
            index_stress_among_vowels = list([0])
        return index_stress_among_vowels

      IPA_transcription = item.sounds
      index_stress_among_vowels = get_index_of_stress_mark_among_vowels(IPA_transcription)
      
      # func to put a stress mark to word
      word = item.word_lowcase

      def put_stress_on_word_by_index(word, index_stress_among_vowels):
        index = 0
        stressed_word = word
        for ii, ind_of_stressed_vowel in enumerate(index_stress_among_vowels):
          ind_of_stressed_vowel = ind_of_stressed_vowel - ii

          n_of_vowel = -1

          for i, ch in enumerate(stressed_word):
            if ch in vs:
              n_of_vowel += 1

              if n_of_vowel == ind_of_stressed_vowel:
                index = i+1
                stressed_word = stressed_word[:index]+ "'" + stressed_word[index:]
        return stressed_word 

      stressed_word = put_stress_on_word_by_index(word, index_stress_among_vowels)
      item.accent = stressed_word
put_accents_on_words()

In [25]:
# get number of empty accent attributes
len([i for i, item in enumerate(wiki_instances) if not item.accent])

2643

## Pos

### get_freq_of_pos

In [26]:
def get_freq_of_pos():
  list_of_pos = set(item.pos for item in wiki_instances)
  freq_of_pos = dict()
  for p in list_of_pos:
    name = "{}".format(p)
    items_pos = [item for item in wiki_instances if item.pos == p]
    length = len(items_pos)
    freq_of_pos[name] = length
  return pd.Series(freq_of_pos).sort_values(ascending = False)
freq_of_pos = get_freq_of_pos()
freq_of_pos

noun              205020
verb              195740
adj                13745
name                3151
adv                 2287
num                  690
pron                 551
intj                 380
phrase               284
suffix               233
prefix               153
prep                 139
proverb              107
conj                  95
combining_form        83
particle              59
character             51
det                   23
prep_phrase           20
interfix               5
punct                  2
symbol                 2
ambiposition           1
dtype: int64

In [27]:
name_of_pos = "ambiposition"
list_of_words_with_pos = [item for item in wiki_instances if item.pos == name_of_pos]
for i,item in enumerate(list_of_words_with_pos):
  if i in range(0,50):
    print(item.word, item.grammeme)

ради None


### pos: adv, intj

In [28]:
#func to populate stem attribute with word_lowcase in adverbs 
def set_stem_for_pos():
  for i, item in enumerate(wiki_instances):
    if item.stem == None and (item.pos == "adv" 
                              or item.pos == "prep" 
                              or item.pos == "conj" 
                              or item.pos == "particle"):
      item.stem = item.word_lowcase
set_stem_for_pos()

In [29]:
# check
find_item_from_dict("пешком", wiki_instances)[0].pos

'adv'

## Grammemes

### grammemes: save, remove

In [30]:
# downloading xlsx file with grammemes
path_stat_gram = "grammemes_statistics.xlsx"
gram_stat_done = pd.read_excel(path_stat_gram)

In [31]:
gram_stat_done

Unnamed: 0,grammeme,count,status
0,iterative,1,
1,regional,1,
2,slur,1,
3,literary,1,
4,offensive,1,
...,...,...,...
77,imperfective,97841,1.0
78,indicative,110234,1.0
79,plural,145627,1.0
80,singular,163451,1.0


In [32]:
grammemes_to_remove = list(gram_stat_done[np.isnan(gram_stat_done["status"])]["grammeme"])
print(len(grammemes_to_remove))
print(grammemes_to_remove)

57
['iterative', 'regional', 'slur', 'literary', 'offensive', 'collective', 'contemporary', 'Ukraine', 'ethnic', 'uncommon', 'only', 'often', 'reflexive', 'sarcastic', 'with-genitive', 'pejorative', 'usually', 'informal', 'relative', 'empty-gloss', 'impersonal', 'possessive', 'adjectival', 'dialectal', 'morpheme', 'partitive', 'rare', 'obsolete', 'indeclinable', 'poetic', 'noun-from-verb', 'historical', 'vulgar', 'nonstandard', 'derogatory', 'uncountable', 'slang', 'proscribed', 'dated', 'demonstrative', 'abstract', 'verb', 'vocative', 'no-comparative', 'augmentative', 'plural-only', 'concrete', 'superlative', 'archaic', 'colloquial', 'comparative', 'endearing', 'diminutive', 'animate', 'inanimate', 'adverbial', 'form-of']


In [33]:
grammemes_to_save = list(gram_stat_done[gram_stat_done["status"] == 1.0]["grammeme"])
print(len(grammemes_to_save))
print(grammemes_to_save)


25
['locative', 'passive', 'neuter', 'nominative', 'masculine', 'feminine', 'active', 'imperative', 'accusative', 'first-person', 'third-person', 'future', 'prepositional', 'dative', 'genitive', 'participle', 'second-person', 'instrumental', 'present', 'past', 'perfective', 'imperfective', 'indicative', 'plural', 'singular']


### item.senses[0]["tags"]

In [34]:
#populating 356323 grammemes with item.senses[0]["tags"]
def set_tag_grammemes():
  for item in wiki_instances:
    if item.grammeme == None and "tags" in item.senses[0].keys() and "form-of" in item.senses[0]["tags"]:
      attribute = item.senses[0]["tags"]
      [attribute.remove(gr) for gr in grammemes_to_remove if gr in attribute] 
      item.grammeme = attribute

set_tag_grammemes()

In [35]:
#checking
n_of_populated_grammemes = [(item.word, item.grammeme)
                            for item in wiki_instances if item.grammeme]
print(len(n_of_populated_grammemes))
print(n_of_populated_grammemes[0:3])

353295
[('как', ['genitive', 'plural']), ('лук', ['feminine', 'genitive', 'plural']), ('царица', ['feminine'])]


In [36]:
# number of empty grammemes
len([i for i, item in enumerate(wiki_instances) if item.grammeme == None])

66498

### item.forms

In [37]:
#func to populate None grammeme attributes with grammemes derived from tags in forms 
def set_form_grammemes():
  for i, item in enumerate(wiki_instances):
    
    if item.forms and item.grammeme == None:
      set_of_grs = set()
      set_of_grs_list = []
      
      for f in item.forms: 
        word_form = re.sub('́', "", f["form"]).lower()
        if word_form == item.word.lower():
          set_of_grs.update(f["tags"])
      for gr in set_of_grs:
        if gr in grammemes_to_save:   
          set_of_grs_list.append(gr)
      
      item.grammeme = set_of_grs_list
    
    if item.grammeme == []:
      item.grammeme = None

set_form_grammemes()

In [38]:
# number of empty grammemes
len([i for i, item in enumerate(wiki_instances) if item.grammeme == None])

24070

### item.senses[0]["glosses"]

In [39]:
# populating 22727 grammemes in item.senses[0]["glosses"]
def set_senses_grammemes():
  grammemes_to_save = list(gram_stat_done[gram_stat_done["status"] == 1.0]["grammeme"])
  for item in wiki_instances:
    if item.grammeme == None:
      if "tags" in item.senses[0].keys() and 'form-of' not in item.senses[0]["tags"] and 'glosses' in item.senses[0] and item.grammeme == None:
        raw_grammeme = item.senses[0]["glosses"]
        clean_gr = [gr for gr in grammemes_to_save if gr in raw_grammeme[0]]
        if clean_gr != []:
          item.grammeme = clean_gr
        else:
          item.grammeme = None
set_senses_grammemes()

In [40]:
# number of empty grammemes
len([i for i, item in enumerate(wiki_instances) if item.grammeme == None])

13777

### exceptions

In [41]:
def get_words_with_no_grammemes(start, stop):
  gr_excepts = [item for item in wiki_instances if item.grammeme == None]
  for i, item in enumerate(gr_excepts):
    if i in range(start,stop):
      print(item.word, item.grammeme)
  return gr_excepts

In [42]:
gr_excepts = get_words_with_no_grammemes(100,105)

будь здоров None
@ None
всё-таки None
всё-таки None
дома None


### sorting

In [43]:
def sorting_grammemes():
  for item in wiki_instances:
    if item.grammeme !=None:
      item.grammeme.sort()
sorting_grammemes()

## Stems

In [44]:
#setting all stems to None
def set_stems_to_none():
  for item in wiki_instances:
    item.stem = None
set_stems_to_none()

### get_populates_stems

In [45]:
#func to check number of populated stems: on -  361861, none - 60960
def get_populated_stems():
  list_of_stems = [(item.word_lowcase, item.stem) for item in wiki_instances if item.stem]
  return list_of_stems
list_of_stems = get_populated_stems()
print(len(list_of_stems))

0


### n of dupes in wiki

In [46]:
#func to get number of dupes in wiki 
def get_number_of_dupe_words_in_Dict():
  dupes = []
  unique = set()
  for item in wiki_instances:
    if item.word not in unique:
      unique.add(item.word)
    else:
      dupes.append(item.word)
  return dupes
# dupes = get_number_of_dupe_words_in_Dict()
# print(len(dupes)) 

### derivatives

In [47]:
find_item_from_dict("кабаном", wiki_instances)[0].senses[0]["glosses"]

['instrumental singular of каба́н (kabán)']

In [48]:
random_item_senses = find_item_from_dict("тусовщица", wiki_instances)[0].senses
random_item_senses

[{'raw_glosses': ["(slang) female equivalent of тусо́вщик (tusóvščik): female who often attends hangouts (тусо́вка (tusóvka)), especially teenagers' hangouts"],
  'tags': ['feminine'],
  'glosses': ["female equivalent of тусо́вщик (tusóvščik): female who often attends hangouts (тусо́вка (tusóvka)), especially teenagers' hangouts"],
  'form_of': [{'word': 'тусо́вщик',
    'extra': "(tusóvščik): female who often attends hangouts (тусо́вка (tusóvka)), especially teenagers' hangouts"}],
  'id': 'тусовщица-ru-noun-Fw2FSwTg',
  'categories': []},
 {'raw_glosses': ['(slang, figuratively) female who is a member of a coterie'],
  'tags': ['figuratively', 'slang'],
  'glosses': ['female who is a member of a coterie'],
  'id': 'тусовщица-ru-noun-tIfaunty',
  'categories': [{'name': 'Russian nouns with accent pattern a',
    'kind': 'other',
    'parents': [],
    'source': 'w+disamb',
    '_dis': '42 58'}]}]

In [49]:
class SetStemForDerivatives:
  @classmethod 
  def set_all_stems_to_none(cls):
    for item in wiki_instances:
      setattr(item, "stem", None)
    return len([i for i, item in enumerate(wiki_instances) if item.stem == None])

  @classmethod
  def extract_stems_from_trash(cls):
    for i, item in enumerate(wiki_instances):
      list_of_senses = item.senses

      for s in list_of_senses:
        
        if ("glosses" in s.keys() \
            and "of" in s["glosses"][0]
            and re.findall("-| ", item.word) == [] #words without whitespaces and -
            and item.pos in ("noun","verb","adj","name","adv","num","pron") #only meaningful words
            ): 
          
          # substituting stress mark
          stem_new = re.sub("́", "'", s["glosses"][0], count = 0) 

          # extracting only russian characters and "-", and white spaces
          stem_new = ''.join(re.findall("[а-я]|[А-Я]|ё|Ё|'|-| ", stem_new))

          # removing all white spaces before and after word
          stem_new = re.sub("^\s*\W|\s*$", '', stem_new)

          # lowcase  
          stem_new = stem_new.lower()

          # putting stress after "ё"
          vs = list('аяоеёуюэыи')         
          if "'" not in stem_new:
            stem_new = re.sub("ё","ё'", stem_new)

          # setting stem attributes
          if stem_new != '':
            item.stem = stem_new

        else:
          item.stem = None
        
      # remove all stems with "-" and white spaces
      def remove_stems_with_blank_dash():
        if item.stem and re.findall("-| ", item.stem):
          item.stem = None
      remove_stems_with_blank_dash()
  

    # func to put stress mark in one syllable word woithout stress mark
    def put_stress_in_one_syllable_word(word):
      vs = tuple('аяоеёуюэыи')

      # func to get a number of vowels in word 
      def get_number_of_vowels_in_word(word):
        def num_vs(vowel):
          return word.count(vowel)
        
        list_of_vs_numbers = map(num_vs, vs)
        number_of_vowels =  sum(list_of_vs_numbers)
        return number_of_vowels
      
      number_of_vowels = get_number_of_vowels_in_word(word)

      if "'" not in word and number_of_vowels == 1:
        
        def func_sub(vowel):
          new_w = re.sub(vowel, vowel+"'", word)
          return new_w

        list_of_new_ws = map(func_sub, list(vs))
        stressed_word = [a for a in list_of_new_ws if "'" in a][0]

      else:
        stressed_word = word

      return stressed_word  

    stressed_word = put_stress_in_one_syllable_word(item.stem)
    item.stem = stressed_word 

    

In [50]:
print(SetStemForDerivatives.set_all_stems_to_none())

422821


In [51]:
SetStemForDerivatives.extract_stems_from_trash()

In [52]:
#number of non populated stems
len([i for i, item in enumerate(wiki_instances) if item.stem == None]) 

163390

In [53]:
def get_stem_with_defined_pat(pat):
  pat = re.compile(pat)
  stem_with_defined_pat = [(item.word, item.pos, item.stem) \
                            for item in wiki_instances 
                            if item.stem 
                            and re.findall(pat, item.stem) != []
                            and len(item.word) == 3
                           ]
  return stem_with_defined_pat
stem_with_defined_pat = get_stem_with_defined_pat("ё")
print(len(stem_with_defined_pat))
stem_with_defined_pat[0:10]

10


[('псе', 'noun', "пё'с"),
 ('БТР', 'noun', "бронетранспортё'р"),
 ('псы', 'noun', "пё'с"),
 ('рёв', 'noun', "рё'ва"),
 ('пса', 'noun', "пё'с"),
 ('псу', 'noun', "пё'с"),
 ('нёб', 'noun', "нё'бо"),
 ('тёщ', 'noun', "тё'ща"),
 ('ежу', 'noun', "ё'ж"),
 ('еже', 'noun', "ё'ж")]

In [54]:
def get_random_stems(number_of_stems_to_return):
  import random 
  rns = [random.randint(0,len(wiki_instances)) for i in range(number_of_stems_to_return)]
  print(len(rns))
  random_stems = [(item.word, item.stem) for i, item in enumerate(wiki_instances) if i in rns]
  return random_stems
get_random_stems(10)

10


[('видеодисплею', None),
 ('разоблачило', "разоблачи'ть"),
 ('выпасают', None),
 ('выпололо', "вы'полоть"),
 ('симптомам', "симпто'м"),
 ('полисом', "по'лис"),
 ('отдуются', None),
 ('отлитый', "отли'ть"),
 ('сглаживая', "сгла'живать"),
 ('ракету-носитель', None)]

### nominative, canonical

In [55]:
#number of non populated grammemes
len([i for i, item in enumerate(wiki_instances) if item.grammeme == None]) 

13777

In [56]:
items_for_canonical = [(item.word, item.stem, item.grammeme)
                       for item in wiki_instances if item.stem == None
                       and item.grammeme 
                       and set(("canonical", "nominative")) & set(item.grammeme)]
print(len(items_for_canonical))
items_for_canonical[0:10]

33708


[('бёрдо', None, ['accusative', 'nominative', 'singular']),
 ('масленица', None, ['feminine', 'nominative', 'singular']),
 ('узус', None, ['accusative', 'masculine', 'nominative', 'singular']),
 ('баской',
  None,
  ['accusative',
   'dative',
   'feminine',
   'genitive',
   'instrumental',
   'masculine',
   'nominative',
   'prepositional']),
 ('бортник', None, ['masculine', 'nominative', 'singular']),
 ('бочаг', None, ['accusative', 'masculine', 'nominative', 'singular']),
 ('браный', None, ['accusative', 'masculine', 'nominative']),
 ('брашно', None, ['accusative', 'neuter', 'nominative', 'singular']),
 ('бредень', None, ['accusative', 'masculine', 'nominative', 'singular']),
 ('бредник', None, ['accusative', 'masculine', 'nominative', 'singular'])]

In [57]:
# func to populate stem attribute with word_lowcase in canonical words and nominative cases
def set_stem_for_canonical():
  for item in wiki_instances:
    if item.stem == None and item.grammeme != None:
      if "canonical" or "nominative" in item.grammeme:
        item.stem = item.accent
set_stem_for_canonical()

In [58]:
#number of non populated stems
len([i for i, item in enumerate(wiki_instances) if item.stem == None]) 

6774

In [59]:
def get_sample_of_unpopulated_stems(sample):
  random_numbers = [random.randint(0, len(wiki_instances)) for i in range(sample)]
  list_of_unpopulated_stems = [(item.word_lowcase, item.stem) 
                               for i,item in enumerate(wiki_instances) 
                               if item.stem == None
                               and i in random_numbers]
  return list_of_unpopulated_stems
list_of_unpopulated_stems = get_sample_of_unpopulated_stems(20)
print(len(list_of_unpopulated_stems))
list_of_unpopulated_stems

0


[]

## Senses

In [60]:
find_item_from_dict("ключом", wiki_instances)[0].senses[0]["glosses"]

['instrumental singular of ключ (ključ)']

In [61]:
find_item_from_dict("ключ", wiki_instances)[0].senses[2]["glosses"][0]

'clue, key'

In [62]:
find_item_from_dict("ключ", wiki_instances)[0].senses[0].keys()

dict_keys(['raw_glosses', 'tags', 'glosses', 'id'])

In [63]:
def set_meanings():
  for i, item in enumerate(wiki_instances):
    list_of_meanings = []
    
    for sns in item.senses:
      if "glosses" in sns.keys(): 
        mng = sns["glosses"]
        list_of_meanings.append(mng)
    
    item.meanings = list_of_meanings
set_meanings()

In [64]:
find_item_from_dict("ключ", wiki_instances)[0].meanings

[['key'],
 ['wrench, spanner, screw wrench'],
 ['clue, key'],
 ['clef, key'],
 ['radical (in Chinese characters)']]

## ipa_shorten_for_rhyme

### get_unique_ipa

In [65]:
def get_unique_ipa():  
  all_signs = set()
  for item in wiki_instances:
    #if item.status == True:
    value = IPAString(unicode_string=item.sounds, ignore = True)          
    for ch in value:
      all_signs.add(ch)
  list_unique_ipas = [ipa_ch for ipa_ch in list(all_signs)]
  list_unique_unicodes = [str(ip) for ip in list_unique_ipas]
  list_unique_unicodes.sort()
  return list_unique_unicodes

list_unique_unicodes = get_unique_ipa()
print(list_unique_unicodes)

[' ', '.', 'a', 'b', 'bʲ', 'bʲː', 'bː', 'd', 'dʲ', 'dʲː', 'dː', 'd͡z', 'd͡zʲ', 'd͡ʑ', 'e', 'f', 'fʲ', 'fʲː', 'i', 'j', 'jː', 'k', 'kʲ', 'kʲː', 'kː', 'lʲ', 'lʲː', 'lˠ', 'lˠː', 'm', 'mʲ', 'mʲː', 'mː', 'm̥', 'n', 'nʲ', 'nʲː', 'nː', 'o', 'p', 'pʲ', 'pʲː', 'pː', 'r', 'rʲ', 'rʲː', 'rː', 's', 'sʲ', 'sʲː', 'sː', 't', 'tʲ', 'tʲː', 'tː', 't͡s', 't͡sʲ', 't͡sː', 't͡ɕ', 't͡ɕː', 'u', 'v', 'vʲ', 'vʲː', 'vː', 'x', 'xʲ', 'z', 'zʲ', 'zʲː', 'zː', '|', 'æ', 'ǀ', 'ɐ', 'ɑ', 'ɕ', 'ɕː', 'ɖ͡ʐ', 'ə', 'ɛ', 'ɡ', 'ɡʲ', 'ɡː', 'ɣ', 'ɨ', 'ɪ', 'ɵ', 'ʂ', 'ʂː', 'ʈ͡ʂ', 'ʉ', 'ʊ', 'ʐ', 'ʐː', 'ʑː', 'ʔ', 'ʕ', 'ʙ', 'ʲ', 'ˈ', 'ˌ', 'ː', 'ˑ', '̠', '̩', '͡', '‿']


In [66]:
def find_sounds_with_pat(pat_find): 
  pat_find = IPAString(unicode_string = pat_find).ipa_chars
  list_matches = []
  for item in wiki_instances:
    where = IPAString(unicode_string = item.sounds).ipa_chars 
    for i,p in enumerate(where):
      try:
        if pat_find[0] == p:
          match = [p]
          for ii in range(len(pat_find)):
            if where[i+ii+1] == pat_find[ii+1]:
              match.append(where[i+ii+1])
              if len(match) == len(pat_find):
                list_matches.append((item.word, item.sounds))
      except IndexError:
        pass
  return list_matches  

pat_find = "ʐalʲ"
list_matches = find_sounds_with_pat(pat_find)
print(len(list_matches))
print(list_matches[0:5])

208
[('жалить', 'ˈʐalʲɪtʲ'), ('ужалить', 'ʊˈʐalʲɪtʲ'), ('жаль', 'ʐalʲ'), ('жаль', 'ʐalʲ'), ('лежали', 'lʲɪˈʐalʲɪ')]


### select items

In [67]:
def set_all_attributes_to_false():
  for item in wiki_instances:
    setattr(item, "status", False)

In [68]:
print(list_unique_unicodes)

[' ', '.', 'a', 'b', 'bʲ', 'bʲː', 'bː', 'd', 'dʲ', 'dʲː', 'dː', 'd͡z', 'd͡zʲ', 'd͡ʑ', 'e', 'f', 'fʲ', 'fʲː', 'i', 'j', 'jː', 'k', 'kʲ', 'kʲː', 'kː', 'lʲ', 'lʲː', 'lˠ', 'lˠː', 'm', 'mʲ', 'mʲː', 'mː', 'm̥', 'n', 'nʲ', 'nʲː', 'nː', 'o', 'p', 'pʲ', 'pʲː', 'pː', 'r', 'rʲ', 'rʲː', 'rː', 's', 'sʲ', 'sʲː', 'sː', 't', 'tʲ', 'tʲː', 'tː', 't͡s', 't͡sʲ', 't͡sː', 't͡ɕ', 't͡ɕː', 'u', 'v', 'vʲ', 'vʲː', 'vː', 'x', 'xʲ', 'z', 'zʲ', 'zʲː', 'zː', '|', 'æ', 'ǀ', 'ɐ', 'ɑ', 'ɕ', 'ɕː', 'ɖ͡ʐ', 'ə', 'ɛ', 'ɡ', 'ɡʲ', 'ɡː', 'ɣ', 'ɨ', 'ɪ', 'ɵ', 'ʂ', 'ʂː', 'ʈ͡ʂ', 'ʉ', 'ʊ', 'ʐ', 'ʐː', 'ʑː', 'ʔ', 'ʕ', 'ʙ', 'ʲ', 'ˈ', 'ˌ', 'ː', 'ˑ', '̠', '̩', '͡', '‿']


In [69]:
def select_words_for_ipa_shortening():
  set_all_attributes_to_false()
  pat_only_simple_words =  re.compile("[^а-я|А-Я|ё|Ё|'|-]")
  ipa_characters_to_remove = [UNICODE_TO_IPA[u"ɣ"], 
                              UNICODE_TO_IPA[u"‿"],
                              UNICODE_TO_IPA[u'|'],
                              UNICODE_TO_IPA[u'ǀ'],
                              UNICODE_TO_IPA[u"ɖ͡ʐ"],
                              UNICODE_TO_IPA[u"ɑ"],
                              UNICODE_TO_IPA[u"."],
                              UNICODE_TO_IPA[u"ʑ"],
                              UNICODE_TO_IPA[u"ˌ"],
                              UNICODE_TO_IPA[u"ʔ"],
                              UNICODE_TO_IPA[u"ʕ"],
                              UNICODE_TO_IPA[u" "],
                              UNICODE_TO_IPA[u"m̥"],
                              UNICODE_TO_IPA[u'̩'],
                              UNICODE_TO_IPA[u"ː"]]
  pos_to_remain = ("noun", "verb", "adj", "name", "adv", 
                   "num", "pron", "intj", "prep", "conj", 
                   "particle", "det", "ambiposition") 

  words_for_ipa_shortening = len([setattr(item, "status", True) for item in wiki_instances 
                          if item.sounds 
                          and item.accent
                          and re.search(pat_only_simple_words, item.accent) == None
                          and item.pos in pos_to_remain 
                          and item.word not in ["лахмаджун", "МГУ", "тпру"]
                          and set(ipa_characters_to_remove) & set(IPAString(unicode_string = item.sounds)) == set()])
              
  return words_for_ipa_shortening
words_for_ipa_shortening = select_words_for_ipa_shortening()
words_for_ipa_shortening


395692

In [70]:
def get_unique_ipa_true():  
  all_signs = set()
  for item in wiki_instances:
    if item.status:
      value = IPAString(unicode_string=item.sounds, ignore = True)          
      for ch in value:
        all_signs.add(ch)
  list_unique_ipas_true = [ipa_ch for ipa_ch in list(all_signs)]
  list_unique_unicodes_true = [str(ip) for ip in list_unique_ipas_true]
  list_unique_unicodes_true.sort()
  return list_unique_unicodes_true

list_unique_unicodes_true = get_unique_ipa_true()
print(list_unique_unicodes_true)

['a', 'b', 'bʲ', 'bʲː', 'bː', 'd', 'dʲ', 'dʲː', 'dː', 'd͡z', 'd͡zʲ', 'e', 'f', 'fʲ', 'fʲː', 'i', 'j', 'jː', 'k', 'kʲ', 'kʲː', 'kː', 'lʲ', 'lʲː', 'lˠ', 'lˠː', 'm', 'mʲ', 'mʲː', 'mː', 'n', 'nʲ', 'nʲː', 'nː', 'o', 'p', 'pʲ', 'pʲː', 'pː', 'r', 'rʲ', 'rʲː', 'rː', 's', 'sʲ', 'sʲː', 'sː', 't', 'tʲ', 'tʲː', 'tː', 't͡s', 't͡sʲ', 't͡sː', 't͡ɕ', 't͡ɕː', 'u', 'v', 'vʲ', 'vʲː', 'vː', 'x', 'xʲ', 'z', 'zʲ', 'zʲː', 'zː', 'æ', 'ɐ', 'ɕ', 'ɕː', 'ə', 'ɛ', 'ɡ', 'ɡʲ', 'ɡː', 'ɨ', 'ɪ', 'ɵ', 'ʂ', 'ʂː', 'ʈ͡ʂ', 'ʉ', 'ʊ', 'ʐ', 'ʐː', 'ʑː', 'ˈ']


In [71]:
def get_number_of_items_with_true_status():
  number_of_items_with_true_status = len([1 for item in wiki_instances if item.status])
  return number_of_items_with_true_status
print(get_number_of_items_with_true_status())

395692


### shortenning

In [72]:
#func to shorten ipa by scheme: one consonant before stressed vowel (if available)
#stressed vowel + the rest of characters to the end of word
#e.g. ipa ʂɨpʲɪˈlʲævʲɪtʲ shorened to lʲævʲɪtʲ
stress_as_uni = 'ˈ'
stress_as_ipa = UNICODE_TO_IPA[stress_as_uni]
palat_uni = 'ʲ'
palat_ipa = UNICODE_TO_IPA[palat_uni]

def get_ipa_shortened(trans_uni):  
  trans_ipa = IPAString(unicode_string = trans_uni)
  def shorten_word_without_stress(trans_ipa):
  #first vowel or first single consonant
    if "vowel" in trans_ipa[0].name or \
    ("consonant" in trans_ipa[0].name and "vowel" in trans_ipa[1].name):
      ipa_short = trans_ipa
    #more than one initial consonants
    else:
      first_vowel = trans_ipa.vowels[0]
      index_of_first_vowel = trans_ipa.index(first_vowel)
      ipa_short = trans_ipa[index_of_first_vowel-1:]
      
      if ipa_short[0] == palat_ipa:
        ipa_short = trans_ipa[index_of_first_vowel-2:]    
    return ipa_short
  
  
  def shorten_all_variants(trans_ipa):
    #no stress
    if stress_as_ipa not in trans_ipa:
      ipa_short = shorten_word_without_stress(trans_ipa)
    #with stress
    else:
      index_of_last_stress = trans_ipa[::-1].index(stress_as_ipa)
      ipa_after_last_stress = trans_ipa[-index_of_last_stress:]
      ipa_after_last_stress = IPAString(ipa_chars = ipa_after_last_stress)
      ipa_short = shorten_word_without_stress(ipa_after_last_stress)
  
    return ipa_short
  
  #exception for single consonant words 
  try:
    ipa_short = shorten_all_variants(trans_ipa)
    
    #convert list type to IPA
    if type(ipa_short) == list:
      ipa_short = IPAString(ipa_chars = ipa_short)  
    
    return ipa_short  
  
  except IndexError:
    ipa_short = None
  
  

trans_uni = find_item_from_dict("клок", wiki_instances)[0].sounds
print(trans_uni)
ipa_short = get_ipa_shortened(trans_uni)
ipa_short
#print(''.join(map(lambda x: str(x), ipa_short)))     

kɫok


alveolar consonant lateral-approximant velarized voiced
back close-mid rounded vowel
consonant plosive velar voiceless

In [73]:
def all_ipa_shortened(): 

  def set_all_ipa_none():
    for item in wiki_instances:
      setattr(item, "ipa", None)  
  set_all_ipa_none()
  
  n_of_ipa_set = 0
  for item in wiki_instances:
    if item.status:
      setattr(item, "ipa", get_ipa_shortened(item.sounds))
      n_of_ipa_set += 1
  return n_of_ipa_set

all_ipa_shortened()

395692

In [74]:
# checking number of empty ipa
len([i for i, item in enumerate(wiki_instances) if item.ipa == None])

27144

# Final check

In [75]:
def get_items_with_full_attributes():
  items_with_full_attributes = [item for item in wiki_instances 
                                if item.accent
                                and item.grammeme
                                and item.meanings
                                and item.pos
                                and item.sounds
                                and item.status
                                and item.stem
                                and item.word
                                and item.word_lowcase
                                and item.ipa
                                ]
  return items_with_full_attributes
items_with_full_attributes = get_items_with_full_attributes()
print(len(items_with_full_attributes))

384515


In [76]:
def get_random_words(sample):
  r_numbers = [random.randint(0, len(wiki_instances)) for i in range(sample)]
  random_words = [vars(item) for i, item in enumerate(wiki_instances) \
                  if i in r_numbers]
  return random_words
random_words =   get_random_words(1)
random_words

[{'word': 'приглашавший',
  'word_lowcase': 'приглашавший',
  'stem': "приглаша'ть",
  'pos': 'verb',
  'grammeme': ['active', 'imperfective', 'participle', 'past'],
  'forms': [{'form': 'приглаша́вший', 'tags': ['canonical']},
   {'form': 'priglašávšij', 'tags': ['romanization']},
   {'form': 'no-short-form', 'source': 'Declension', 'tags': ['table-tags']},
   {'form': 'приглаша́вший',
    'tags': ['masculine', 'nominative'],
    'source': 'Declension',
    'roman': 'priglašávšij'},
   {'form': 'приглаша́вшее',
    'tags': ['neuter', 'nominative'],
    'source': 'Declension',
    'roman': 'priglašávšeje'},
   {'form': 'приглаша́вшая',
    'tags': ['feminine', 'nominative'],
    'source': 'Declension',
    'roman': 'priglašávšaja'},
   {'form': 'приглаша́вшие',
    'tags': ['nominative', 'plural'],
    'source': 'Declension',
    'roman': 'priglašávšije'},
   {'form': 'приглаша́вшего',
    'tags': ['genitive', 'masculine', 'neuter'],
    'source': 'Declension',
    'roman': 'priglašávš

# Ipa2Np

## IpaProcessing

In [77]:
class IpaProcessing:
  @classmethod
  def get_unique_ipa(cls):
        all_signs = set()
        for item in wiki_instances:
            if item.ipa:
                value = item.ipa
                for ch in value:
                    all_signs.add(ch)
        list_unique_ipas = [ipa_ch for ipa_ch in list(all_signs)]
        list_unique_unicodes = [str(ip) for ip in list_unique_ipas]
        list_unique_unicodes.sort()
        return list_unique_unicodes

  @classmethod
  def get_sign2number(cls):
    sign2number = dict((UNICODE_TO_IPA[l],i) for i, l in enumerate(list_unique_unicodes, start = 1))
    return sign2number
  
  @classmethod
  def get_number2sign(cls):
    number2sign = dict((i,UNICODE_TO_IPA[l]) for i, l in enumerate(list_unique_unicodes, start = 1))
    return number2sign 

  @classmethod
  def get_max_length_of_ipa(cls):
    lengths = [len(IPAString(unicode_string = item.sounds)) for item in wiki_instances 
               if item.status]
    max_length_of_ipa = max(lengths)
    return max_length_of_ipa

  @classmethod
  #func to convert ipa string to sparse numbers padding
  def ipa_string_to_numbers(cls, ipa_string):
    ipa_as_numbers = []
    trans_as_IPAString = IPAString(unicode_string = ipa_string)

    for ch in trans_as_IPAString:
      n = sign2number[ch]
      ipa_as_numbers.append(n)
        
    current_length = len(ipa_as_numbers)
    ipa_as_numbers.extend(0 for i in range(max_length_of_ipa - current_length))
    return ipa_as_numbers


  #func to convert all ipa to array
  @classmethod
  def all_ipa_to_array(cls):

    #save intermediate results, including save at the end
    checks_and_save_points = [i for i in range(10000,400000,10000)]

    count = 0
    all_ipa_as_array = np.zeros(max_length_of_ipa)
    for i,item in enumerate(wiki_instances):
        
      if item.status == True:
        ipa_as_numbers = cls.ipa_string_to_numbers(item.sounds)
        ipa_as_numbers = np.array(ipa_as_numbers)

        all_ipa_as_array = np.vstack([all_ipa_as_array, ipa_as_numbers])
        count +=1

        if count in checks_and_save_points:
          print("count", count, " ", "i", i)
          
    all_ipa_as_array = all_ipa_as_array[1:]
    return all_ipa_as_array

## NpIpaProcessing

In [78]:
class NpIpaProcessing(IpaProcessing):
  #func to convert ipa string to sparse numbers padding
  @classmethod
  def ipa_string_to_numbers(cls, ipa_string):
    ipa_as_numbers = []

    for ch in ipa_string:
      n = sign2number[ch]
      ipa_as_numbers.append(n)
    ipa_as_numbers = np.array(ipa_as_numbers, dtype = "float32")
    return ipa_as_numbers
  
  @classmethod
  def all_ipa_to_array(cls):
    for item in wiki_instances:
      if item.ipa:
        ipa_string = item.ipa
        item.npipa = cls.ipa_string_to_numbers(ipa_string)
        
  @classmethod
  def uni_string_to_int(cls, ipa_string):
    ipa_as_int = []

    for ch in ipa_string:
      n = sign2number[ch]
      ipa_as_int.append(n) 
    return ipa_as_int
  
  @classmethod
  def all_ipa_to_int(cls):
    for item in wiki_instances:
      if item.ipa:
        ipa_string = item.ipa
        item.intipa = cls.uni_string_to_int(ipa_string)

In [79]:
list_unique_unicodes = NpIpaProcessing.get_unique_ipa()
print(list_unique_unicodes)
number2sign = IpaProcessing.get_number2sign()
print(len(number2sign))
sign2number = IpaProcessing.get_sign2number()
print(len(sign2number))

['a', 'b', 'bʲ', 'bʲː', 'bː', 'd', 'dʲ', 'dʲː', 'dː', 'd͡z', 'd͡zʲ', 'e', 'f', 'fʲ', 'i', 'j', 'jː', 'k', 'kʲ', 'kʲː', 'kː', 'lʲ', 'lʲː', 'lˠ', 'lˠː', 'm', 'mʲ', 'mʲː', 'mː', 'n', 'nʲ', 'nʲː', 'nː', 'o', 'p', 'pʲ', 'pʲː', 'pː', 'r', 'rʲ', 'rʲː', 'rː', 's', 'sʲ', 'sʲː', 'sː', 't', 'tʲ', 'tʲː', 'tː', 't͡s', 't͡sʲ', 't͡sː', 't͡ɕ', 't͡ɕː', 'u', 'v', 'vʲ', 'vʲː', 'vː', 'x', 'xʲ', 'z', 'zʲ', 'zʲː', 'zː', 'æ', 'ɐ', 'ɕ', 'ɕː', 'ə', 'ɛ', 'ɡ', 'ɡʲ', 'ɡː', 'ɨ', 'ɪ', 'ɵ', 'ʂ', 'ʂː', 'ʈ͡ʂ', 'ʉ', 'ʊ', 'ʐ', 'ʐː']
85
85


## all_ipa_to_array()

In [80]:
NpIpaProcessing.all_ipa_to_array()

In [81]:
ipa_string = find_item_from_dict("облако", wiki_instances)[0].ipa
ipa_string

back close-mid rounded vowel
bilabial consonant plosive voiced
alveolar consonant lateral-approximant velarized voiced
central mid unrounded vowel
consonant plosive velar voiceless
central mid unrounded vowel

In [82]:
NpIpaProcessing.ipa_string_to_numbers(ipa_string)

array([34.,  2., 24., 71., 18., 71.], dtype=float32)

In [83]:
ipa_string = find_item_from_dict("облако", wiki_instances)[0].ipa
ipa_string

back close-mid rounded vowel
bilabial consonant plosive voiced
alveolar consonant lateral-approximant velarized voiced
central mid unrounded vowel
consonant plosive velar voiceless
central mid unrounded vowel

In [84]:
ipa_as_numbers = NpIpaProcessing.ipa_string_to_numbers(ipa_string)
ipa_as_numbers

array([34.,  2., 24., 71., 18., 71.], dtype=float32)

## all_ipa_to_int

In [85]:
NpIpaProcessing.all_ipa_to_int()

In [86]:
find_item_from_dict("корыто", wiki_instances)[0].npipa

array([39., 76., 47., 71.], dtype=float32)

# Delete attributes

In [87]:
# delattr(object, name)
def remove_attributes():
  for item in wiki_instances:
    delattr(item, "forms")
    delattr(item, "senses")
remove_attributes()

In [88]:
vars(find_item_from_dict("дома", wiki_instances)[0])

{'word': 'дома',
 'word_lowcase': 'дома',
 'stem': None,
 'pos': 'adv',
 'grammeme': None,
 'meanings': [['at home']],
 'accent': "до'ма",
 'sounds': 'ˈdomə',
 'status': True,
 'ipa': alveolar consonant plosive voiced
 back close-mid rounded vowel
 bilabial consonant nasal voiced
 central mid unrounded vowel,
 'npipa': array([ 6., 34., 26., 71.], dtype=float32),
 'intipa': [6, 34, 26, 71]}

# Save

In [6]:
Path.cwd().parent

WindowsPath('c:/Users/eugen/Documents/wiktionary_rus')

In [90]:
path_wiki_parsed = Path.cwd().parent / "source/wiki_parsed.pkl"

# save
with open(path_wiki_parsed, 'wb') as f:
  dill.dump(wiki_instances, f)



In [91]:
del(wiki_instances)

NameError: name 'wiki_instances' is not defined

In [93]:
# load
with open(path_wiki_parsed, "rb") as f:
  wiki_instances = dill.load(f)
print(len(wiki_instances))
print(vars(wiki_instances[199]))

422821
{'word': 'свинина', 'word_lowcase': 'свинина', 'stem': "свини'на", 'pos': 'noun', 'grammeme': ['feminine', 'nominative', 'singular'], 'meanings': [['pork']], 'accent': "свини'на", 'sounds': 'svʲɪˈnʲinə', 'status': True, 'ipa': alveolar consonant nasal palatalized voiced
close front unrounded vowel
alveolar consonant nasal voiced
central mid unrounded vowel, 'npipa': array([31., 15., 30., 71.], dtype=float32), 'intipa': [31, 15, 30, 71]}


In [95]:
vars(wiki_instances[450])

{'word': 'понедельник',
 'word_lowcase': 'понедельник',
 'stem': "понеде'льник",
 'pos': 'noun',
 'grammeme': ['accusative', 'masculine', 'nominative', 'singular'],
 'meanings': [['Monday']],
 'accent': "понеде'льник",
 'sounds': 'pənʲɪˈdʲelʲnʲɪk',
 'status': True,
 'ipa': alveolar consonant palatalized plosive voiced
 close-mid front unrounded vowel
 alveolar consonant lateral-approximant palatalized voiced
 alveolar consonant nasal palatalized voiced
 near-close near-front unrounded vowel
 consonant plosive velar voiceless,
 'npipa': array([ 7., 12., 22., 31., 77., 18.], dtype=float32),
 'intipa': [7, 12, 22, 31, 77, 18]}