### Установка библиотек и модулей

In [None]:
import json
from collections import defaultdict

In [None]:
!pip install korean_romanizer
from korean_romanizer.romanizer import Romanizer

!pip install konlpy
from konlpy.tag import Kkma
kkma = Kkma()

In [None]:
from pymystem3 import Mystem
m = Mystem()
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /root/.local/bin/mystem

In [None]:
!pip install nltk==3.6.6
import nltk
nltk.download('averaged_perceptron_tagger')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet 
nltk_lemmatizer = WordNetLemmatizer()

In [None]:
!pip install eng-to-ipa
import eng_to_ipa as eti

In [None]:
# Read the reference file and glue all of its lines (without newline
# characters) into one string of Hangul characters.
with open('hangul without readings.txt', 'r', encoding='utf-8') as han_f:
    han_t = han_f.readlines()
hangul_string = ''.join(line.strip('\n') for line in han_t)

hangul = set(hangul_string)  # set of Korean characters

#### Открываем .json файл и считываем текст песни

In [None]:
# %TEXT% is a placeholder for the name of the .json file to process.
# The parsed JSON is indexed elsewhere in this file as [0] (song metadata)
# and [1] (lyrics keyed by "<paragraph>-<line>").
file_name = '%TEXT%.json'
with open(file_name, 'r', encoding='utf-8') as f:
    song_meta_lyrics = json.load(f)

### Токенизация, определение языка, транслитерация, перевод тегов

In [None]:
from eng_to_ipa.rhymes import get_rhymes
# ПЕРЕВОД

# Korean-Russian dictionaries used by definer(): the "light" one is consulted
# first, the "heavy" one only when the light one has no entry.
light_dict_name = 'kor_rus_dict_light.json'
heavy_dict_name = 'kor_rus_dict_heavy.json'

# Context managers make sure both file handles are closed after loading
# (they used to stay open for the whole session).
with open(light_dict_name, 'r', encoding='utf-8') as ld_f:
    ld = json.load(ld_f)

with open(heavy_dict_name, 'r', encoding='utf-8') as hd_f:
    hd = json.load(hd_f)


def definer(word, gr, light_dict, heavy_dict):
    """Look up a translation for a Korean root.

    To save time only roots are looked up, not arbitrary morphemes: the
    lookup happens only when ``gr`` is one of the verb-like or noun-like
    tags listed below.  Verb-like roots are looked up with the ending '다'
    appended.

    Args:
        word: the root to translate.
        gr: the tag assigned to the word.
        light_dict: primary dictionary; its values are iterated and joined
            with '; '.
        heavy_dict: fallback dictionary; its value is returned as is.

    Returns:
        The translation, the string 'перевод не найден' when neither
        dictionary has the key, or '' when ``gr`` is not a translatable tag.
    """
    verb_tags = ['V', 'A', 'V, aux', 'A, aux', 'V, cop', 'V, cop, neg',
                 'V, ger', 'V, partcp', 'A, partcp']
    non_verb_tags = ['S', 'S, aux', 'QUANT', 'NUM', 'ANUM', 'ADV',
                     'INTJ', 'PRO']
    not_found = 'перевод не найден'

    # Pick the dictionary key; tags outside both lists are not translated.
    if gr in verb_tags:
        key = word + '다'
    elif gr in non_verb_tags:
        key = word
    else:
        return ''

    translation = '; '.join(light_dict.get(key, [not_found])).strip('; ')
    if translation == not_found:
        # Nothing in the light dictionary: fall back to the heavy one.
        translation = heavy_dict.get(key, not_found)
    return translation

In [None]:
# РАЗБОР KKMA / NLTK

def analizer(s):
    """Run the KKMA tagger (or NLTK for foreign words) on one token.

    Args:
        s: the token text.

    Returns:
        A list of dicts ``{'lex', 'gr', 'lang'}``.  If KKMA tags any piece as
        'OL' (presumably its foreign-language tag -- confirm against the KKMA
        tagset), the whole of ``s`` is tagged with ``nltk.pos_tag`` and
        returned as a single entry with ``lang`` 1.  Pieces whose KKMA tag
        starts with 'S' get ``gr`` 'punct' and ``lang`` 'none'; all other
        pieces keep their KKMA tag with ``lang`` 0.
    """
    analysed = []
    for piece in kkma.pos(s):
        morph, kkma_tag = piece[0], piece[1]

        if kkma_tag == 'OL':
            # Non-Korean word: drop the KKMA analysis, tag the whole string.
            return [{'lex': s, 'gr': nltk.pos_tag([s])[0][1], 'lang': 1}]

        # Punctuation is kept just in case; it is easy to filter out later.
        if kkma_tag[0] == 'S':
            entry = {'lex': morph, 'gr': 'punct', 'lang': 'none'}
        else:
            # Endings could be attached to predicates here.
            entry = {'lex': morph, 'gr': kkma_tag, 'lang': 0}
        analysed.append(entry)

    return analysed

In [None]:
# ТРАНСЛИТЕРАЦИЯ

def transl2(han):
    """Romanize a Hangul string.

    Returns the result of ``Romanizer(han).romanize()``, or the marker
    string 'transcription_error' if anything goes wrong along the way.
    """
    try:
        return Romanizer(han).romanize()
    except Exception:
        return 'transcription_error'

In [None]:
# РАЗДЕЛЕНИЕ СУЩЕСТВИТЕЛЬНЫХ И ЧАСТИЦ

def nj_separator(m_list):
    """Split a noun + particle(s) word into separately tagged morphemes.

    The longest expected pattern is NN_ + JK_ + JK.

    Args:
        m_list: morpheme dicts with at least the keys 'lex' and 'gr'.

    Returns:
        One ``[lex, tag, 0, lex]`` entry per morpheme, in input order.
        'NNP'/'NNG' become 'S'; 'JKC', 'JKG' and 'JKO' become 'PART, nom',
        'PART, gen' and 'PART, acc'; everything else becomes 'PART'.
    """
    noun_tags = ('NNP', 'NNG')
    case_particles = {'JKC': 'PART, nom',
                      'JKG': 'PART, gen',
                      'JKO': 'PART, acc'}

    cut = []
    for morph in m_list:
        kkma_tag = morph['gr']
        if kkma_tag in noun_tags:
            tag = 'S'
        else:
            # Plain particles and any unexpected tag share the same label.
            tag = case_particles.get(kkma_tag, 'PART')
        cut.append([morph['lex'], tag, 0, morph['lex']])
    return cut

In [None]:
# ПЕРЕВОД ТЕГОВ KKMA В СИСТЕМУ НКРЯ, ТРАНСЛИТЕРАЦИЯ И ПЕРЕВОД
# fw - словоформа от пробела до пробела
# ms - [{'lex':морфема, 'gr':gr, 'lang':0, 
#        'transl':перевод, 'transcr':транслитерация}]

def ncrl(fw, ms):
    """Collapse the morpheme analysis of one word form into a corpus tag.

    Args:
        fw: the word form, from space to space.
        ms: morpheme dicts ``{'lex', 'gr', 'lang'}`` for that word form.

    Returns:
        A list of ``[word_form, tag, lang, lemma]`` entries.  As soon as a
        morpheme with ``lang`` 1 is met, the result of ``en_ncrl`` is
        returned; for any other non-zero ``lang`` the morpheme's own tag is
        returned with language code 3.  A noun followed by particles is
        delegated to ``nj_separator``.  When no rule fires, the tag is ''
        and the language code is 3.
    """
    # First collect the KKMA tags and morphemes of the Korean pieces.
    tags = []
    mors = []
    for morph in ms:
        if morph['lang'] == 0:
            tags.append(morph['gr'])
            mors.append(morph['lex'])
        elif morph['lang'] == 1:
            return en_ncrl(ms, fw)
        else:
            return [[fw, morph['gr'], 3, fw]]

    # The first morpheme serves as the lemma for predicate-like words.
    root = mors[0] if mors else fw

    def has(*wanted):
        """True when at least one of the given KKMA tags was collected."""
        return any(tag in tags for tag in wanted)

    def korean(tag, lemma):
        """Build the single-entry result for a Korean word."""
        return [[fw, tag, 0, lemma]]

    # Morphemes that assign a part of speech are checked first.

    # converbs (an 'ECD' ending realised as '게' is handled further below)
    if has('EFI', 'ECE') or ('ECD' in tags and ms[-1]['lex'] != '게'):
        if has('VA', 'VXA'):
            converb_tag = 'A, ger'
        elif has('VV', 'VXV', 'VCP', 'VCN'):
            converb_tag = 'V, ger'
        else:
            converb_tag = ''
        return korean(converb_tag, root)

    # participles
    if 'ETD' in tags:
        return korean('A, partcp' if 'VA' in tags else 'V, partcp', root)

    # nominalisation, verbalisation and adjectivisation
    derivations = (('ETN', 'S'), ('XSN', 'S'), ('XSV', 'V'), ('XSA', 'A'))
    for marker, derived_tag in derivations:
        if marker in tags:
            return korean(derived_tag, root)

    # '게' forms are treated as adverbs
    if 'ECD' in tags and ms[-1]['lex'] == '게':
        return korean('ADV', fw)

    # separate nouns from particles and case markers
    particles = ('JX', 'JKS', 'JKC', 'JKG', 'JKO', 'JKM', 'JC')
    if has(*particles) and has('NNG', 'NNP'):
        return nj_separator(ms)

    # All remaining tags, in priority order: (KKMA tags, corpus tag, lemma).
    remaining = (
        (('VV',), 'V', root),
        (('VA', 'MDT'), 'A', root),
        (('NNP', 'NNG'), 'S', fw),
        (('NNB', 'NNM'), 'S, aux', fw),
        (('NR',), 'NUM', fw),
        (('NP',), 'PRO', fw),
        (('VXV',), 'V, aux', root),
        (('VXA',), 'A, aux', fw),
        (('VCP',), 'V, cop', fw),
        (('VCN',), 'V, cop, neg', fw),
        (('MDN',), 'ANUM', fw),
        (('MAG', 'MAC'), 'ADV', fw),
        (('IC',), 'INTJ', fw),
    )
    for triggers, corpus_tag, lemma in remaining:
        if has(*triggers):
            return korean(corpus_tag, lemma)

    # Nothing matched (e.g. a non-word slipped through): empty tag, code 3.
    return [[fw, '', 3, fw]]

In [None]:
# эта функция разлепляет contracted forms
def uncontract(contracted):
    """Split an English contracted form into separately tagged parts.

    Args:
        contracted: a word containing an apostrophe, e.g. "don't".

    Returns:
        A list of ``[form, tag, 1]`` entries (1 is the language code used for
        English in this file).  Genitives ("John's", "boys'") are returned as
        a single entry; other contractions yield one entry per part.

    Fixes compared with the previous version:
        * the "...n't've" branch recursed with a list instead of a string,
          which raised AttributeError on ``.split``;
        * ``nltk.pos_tag(...)[0]`` stored the whole ``(token, tag)`` pair
          where only the tag belongs;
        * the fallback called ``nltk.pos_tag`` on a bare string, i.e. tagged
          its individual characters;
        * the bare ``except`` is replaced by an explicit membership test;
        * the "s'" test no longer raises IndexError on an empty part.
    """
    annots = {'t': ['not', 'ADV', 1], 've': ['have', 'V', 1],
              'll': ['will', 'MD', 1], 'd': ['would', 'MD', 1],
              're': ['are', 'V', 1], 'm': ['am', 'V', 1]}
    pros = ['he', 'she', 'it', 'there', 'here', 'where', 'who', 'that']

    def tag_of(word):
        # nltk.pos_tag takes a list of tokens and returns (token, tag) pairs.
        return nltk.pos_tag([word])[0][1]

    if contracted == 'let\'s':
        return [['let', 'V', 1], ['us', 'PRO', 1]]

    parts = contracted.split('\'')
    last = parts[-1]

    # mustn't've, couldn't've etc.: resolve the inner contraction first
    if len(parts) == 3 and last == 've':
        forms = uncontract('\''.join(parts[:-1]))
        forms.append(list(annots['ve']))
        return forms

    # n't
    # NOTE(review): cutting the final letter yields 'ca' for "can't" and
    # 'wo' for "won't"; irregular stems are not restored here.
    if last == 't':
        true_first = parts[0][:-1]
        return [[true_first, tag_of(true_first), 1], list(annots['t'])]

    # 's is either a genitive or a contracted "is" ("has")
    if last == 's':
        if parts[0] in pros:
            return [[parts[0], tag_of(parts[0]), 1], ['is', 'V', 1]]
        return [[contracted, 'S, sg, gen', 1]]

    # s' is a plural genitive
    if last == '' and len(parts) > 1 and parts[-2].endswith('s'):
        return [[contracted, 'S, pl, gen', 1]]

    # 've, 'll, 'd, 're, 'm
    if last in annots:
        return [[parts[0], tag_of(parts[0]), 1], list(annots[last])]

    # Unknown pattern: keep the form intact and let NLTK tag it as one token.
    return [[contracted, tag_of(contracted), 1]]

In [None]:
# ПЕРЕВОД ТЕГОВ nltk В СИСТЕМУ НКРЯ <-
# ems - {'lex':token, 'gr':gr, 'transcr':transcr, 'lang':1}

def en_ncrl(ems, full_word):
    """Translate NLTK (Penn Treebank) tags of an English word to corpus tags.

    Args:
        ems: analysis dicts ``{'lex', 'gr', ...}`` for the word.
        full_word: the word form as it appears in the text.

    Returns:
        ``[[full_word, tag, 1]]``, or the result of ``uncontract`` when the
        last analysed piece contains an apostrophe.  The tag is '' when no
        rule matches.

    Fixes compared with the previous version:
        * possessive pronouns are tagged 'PRP$' by the Penn tagset; the table
          only knew the misspelt 'PRPS', so they always got an empty tag
          ('PRPS' is still accepted for backward compatibility);
        * an empty ``ems`` no longer raises NameError.
    """
    etags = [em['gr'] for em in ems]

    # Contracted forms cause trouble because there is no context: NLTK tags
    # nearly all of them as NN, so they are handled separately.
    if ems and '\'' in ems[-1]['lex']:
        return uncontract(full_word)

    # Rules are applied top to bottom and a later match overrides an earlier
    # one, exactly as the chain of independent ``if`` statements used to.
    # NOTE(review): VBP and VBZ are present-tense tags in the Penn tagset but
    # are mapped to 'praet' here -- confirm before changing, since existing
    # corpus data may depend on these labels.
    tag_rules = (
        (('FW',), 'NONLEX'),
        (('UH',), 'INTJ'),
        (('IN',), 'PR'),
        (('CC',), 'CONJ'),
        (('PDT',), 'ANUM'),
        (('RP',), 'PART'),
        (('DT',), 'ART'),
        (('LS',), 'LS'),
        (('TO',), 'PART'),
        (('WDT',), 'APRO'),
        (('WP', 'PRP'), 'PRO'),
        (('WRB',), 'ADVPRO'),
        (('PRP$', 'PRPS'), 'APRO'),
        (('CD',), 'NUM'),
        (('EX', 'RB'), 'ADV'),
        (('RBR',), 'ADV, comp'),
        (('RBS',), 'ADV, supr'),
        (('JJ',), 'ADJ'),
        (('JJR',), 'ADJ, comp'),
        (('JJS',), 'ADJ, supr'),
        (('NN', 'NNP'), 'S, sg'),
        (('NNS', 'NNPS'), 'S, pl'),
        (('VB',), 'V'),
        (('VBG',), 'V, ger'),
        (('VBD', 'VBP'), 'V, praet'),
        (('VBN',), 'V, partcp'),
        (('VBZ',), 'V, praet, 3p, sg'),
        (('MD',), 'V'),
        (('POS',), 'S, gen'),  # these forms should be caught earlier anyway
    )

    final_tag = ''  # stays empty if something went wrong
    for penn_tags, corpus_tag in tag_rules:
        if any(tag in etags for tag in penn_tags):
            final_tag = corpus_tag

    return [[full_word, final_tag, 1]]

In [None]:
# ТОКЕНИЗАЦИЯ

def tag_ana(org, ld, hd):
    """Tokenize a lyrics line and build the per-word annotation records.

    Args:
        org: the original line; tokens are separated by single spaces.
        ld: the light dictionary passed on to ``definer``.
        hd: the heavy dictionary passed on to ``definer``.

    Returns:
        A list of word dicts with the keys 'wf', 'wtype', 'ana',
        'sentence_index', 'off_start', 'off_end', 'sentence_index_neg' and,
        for every word except the last one, 'next_word'.  Offsets are those
        of the space-delimited token the word came from, so all words split
        out of one token share them.

    Fixes compared with the previous version:
        * 'next_word' used to be decided by comparing the word counter with
          the number of space-separated tokens; when a token expands into
          several words (noun + particles) the last word got a dangling
          'next_word' and an earlier word lost its own.  It is now assigned
          once the real number of words is known;
        * the set of stripped characters is spelled without invalid escape
          sequences (same characters as before).
    """
    # Characters trimmed from both ends of a token: backslash, common
    # punctuation and the newline.
    edge_chars = '\\.,:\'"-+=$%#@/[]{}()*&^`~\n'

    anas = []
    i_char = 0  # character offset of the current token in the line

    for tok in org.split(' '):
        # Offsets refer to the untrimmed token.
        end_of_the_token = i_char + len(tok) - 1
        tok = tok.strip(edge_chars)
        done_tok = analizer(tok)     # plain kkma / nltk analysis
        words = ncrl(tok, done_tok)  # merged morphemes + corpus tags
        # each w = [full_form, tag, lang, (lemma - Korean only)]

        for w in words:
            small_ana = {'pos': w[1], 'lang': w[2]}
            if w[2] == 0:
                small_ana['lex'] = w[3]
                small_ana['transcr'] = transl2(w[0])
                translation = definer(w[3], w[1], ld, hd)
                if translation != '':
                    small_ana['trans_ru'] = translation
            elif w[2] == 1:
                small_ana['lex'] = nltk_lemmatizer.lemmatize(w[0])

            anas.append({'wf': w[0].lower(), 'wtype': 'word',
                         'ana': small_ana,
                         'sentence_index': len(anas),
                         'off_start': i_char,
                         'off_end': end_of_the_token})

        i_char = end_of_the_token + 2  # skip the separating space

    total = len(anas)
    for a in anas:
        if a['sentence_index'] != total - 1:
            a['next_word'] = a['sentence_index'] + 1
        a['sentence_index_neg'] = total - a['sentence_index']

    return anas

### Обработка русского текста

In [None]:
def rus_tagger(phrase, para_alignment):
    """Analyse a Russian line with Mystem and build its sentence record.

    Args:
        phrase: the Russian text of the line.
        para_alignment: alignment dict; 'off_start' and 'off_end' are added
            to it in place.

    Returns:
        A sentence dict with 'text', 'lang' (2), 'para_alignment', 'meta'
        and 'words'.  The 'meta' fields are filled with empty values because
        they are not relevant for Russian.

    Fixes compared with the previous version:
        * the last analysis item is skipped but was still counted, so the
          'next_word' guard was always true and the final word pointed at a
          non-existent index;
        * for the same reason 'sentence_index_neg' was one higher than the
          value ``tag_ana`` produces for the same position; both now count
          the emitted words only;
        * the punctuation set is spelled without invalid escape sequences
          (same characters as before).
    """
    para_alignment['off_start'] = 0
    para_alignment['off_end'] = len(phrase) - 1
    ready = {'text': phrase, 'lang': 2, 'para_alignment': para_alignment,
             'meta': {'possibly_rhymed_with': '', 'rhymed_with': '',
                      'last_word': '', 'ending': '', 'last_vowel': '',
                      'length_in_syllables': 0}}

    punct = '\\.,:\'"-+=$%#@/[]{}()*&^`~\n '
    phrase = phrase.strip(punct)

    # Drop bare-space items from the analysis.  The final item is skipped as
    # before (presumably the trailing newline item that Mystem appends --
    # confirm against the pymystem3 output).
    tokens = [tok for tok in m.analyze(phrase) if tok['text'] != ' '][:-1]
    total = len(tokens)

    words = []
    char_i = 0
    for word_i, m_word in enumerate(tokens):
        text = m_word['text']
        # Items without an 'analysis' key are treated as punctuation.
        word = {'wf': text,
                'wtype': 'word' if 'analysis' in m_word else 'punct',
                'sentence_index': word_i,
                'sentence_index_neg': total - word_i}

        if m_word.get('analysis'):
            # Only the first analysis is used; the part of speech is the
            # first comma-separated item before '=' in the grammar string.
            first = m_word['analysis'][0]
            pos = first['gr'].split('=')[0].split(',')[0]
            word['ana'] = [{'pos': pos, 'lex': first['lex']}]

        if word_i + 1 < total:
            word['next_word'] = word_i + 1

        word['off_start'] = char_i
        word['off_end'] = char_i + len(text) - 1
        char_i += len(text) + 1  # account for the separating space

        words.append(word)

    ready['words'] = words
    return ready

### Стиховедческая разметка

#### Рифма

In [None]:
# ТРАНСЛИТЕРАЦИЯ КОРЕЙСКОГО -> МФА
def rom_to_ipa(rom):
    """Convert a romanized Korean string to an IPA-like transcription.

    Args:
        rom: the romanized text.

    Returns:
        The transcription string.

    Fixes compared with the previous version:
        * 'ch' was replaced by 'ʧʰ' and then aspirated once more by the main
          loop, producing 'ʧʰʰ'; a consonant already followed by 'ʰ' is no
          longer aspirated again;
        * the word-initial rule ' ui' -> ' ɰi' relied on a leading space that
          the output string never had, so it could not fire for the first
          word; the string is now padded while the vowel rules are applied.
    """
    # consonant digraphs -> IPA
    rom = rom.replace('ng', 'ŋ')
    rom = rom.replace('ch', 'ʧʰ')
    rom = rom.replace('j', 'ʤ')

    vowels = 'euioa'
    voiced = 'mnlŋ'
    unvoiced = 'ktpʧ'
    voiced_paired = 'gdbʤ'
    palatalized = 'gkdtbplmrnh'
    pairs = {'g': 'k', 'd': 't', 'b': 'p', 'ʤ': 'ʧ'}

    # Pad with spaces so every character has a left and a right neighbour.
    padded = ' ' + rom + ' '
    out = []

    for i in range(1, len(padded) - 1):
        prev, cur, nxt = padded[i - 1], padded[i], padded[i + 1]

        # aspirated consonants (doubled letters are left unaspirated)
        if cur in unvoiced and prev != cur and nxt != cur:
            out.append(cur)
            if nxt != 'ʰ':  # already marked by the 'ch' replacement
                out.append('ʰ')

        # palatalization; a 'y' after any other character is dropped
        elif cur == 'y':
            if prev == ' ' or prev in vowels:
                out.append('y')
            elif prev in palatalized:
                out.append('ʲ')

        # plain consonants:
        # between vowels or after a sonorant before a vowel -> voiced,
        # word-initial, after a voiceless consonant, word-final -> voiceless
        elif cur in voiced_paired:
            if nxt in vowels and (prev in vowels or prev in voiced):
                out.append(cur)
            else:
                out.append(pairs[cur])

        # everything else is copied as is
        else:
            out.append(cur)

    # Leading pad so that the word-initial 'ui' rule also sees the first word.
    new_rom = ' ' + ''.join(out)

    # 'ae' and a bare 'e' are both rendered as ɛ
    new_rom = new_rom.replace('ae', 'ɛ')
    new_rom = new_rom.replace('oe', 'wɛ')
    new_rom = new_rom.replace('eu', 'ɯ')
    new_rom = new_rom.replace(' ui', ' ɰi')
    new_rom = new_rom.replace('ui', 'i')
    new_rom = new_rom.replace('eo', 'ʌ')
    new_rom = new_rom.replace('a', 'ɑ')
    new_rom = new_rom.replace('e', 'ɛ')

    return new_rom[1:]

In [None]:
# ВЫДЕЛЕНИЕ ПОСЛЕДНЕГО СЛОГА И ПОСЛЕДНЕГО ГЛАСНОГО

def ender(transcr):
    """Extract the final vowel group and the final syllable of a transcription.

    The string is scanned from its end: trailing consonants are collected,
    then the last run of vowels, then one consonant before that run (with any
    'ʰ' / 'ʲ' marks attached to it).

    Args:
        transcr: an IPA-like transcription.

    Returns:
        A ``(last_vowel, ending)`` tuple of strings; both are '' for an empty
        input, and ``last_vowel`` is '' when there is no vowel at all.
    """
    vowels = 'ɛieuoaɐɤʌɯɔæɶɑɒəʊɜɪ:'
    modifiers = 'ʰʲ'

    # Scanner states: 0 - in the trailing consonants, 1 - in the vowel run,
    # 2 - the consonant before the vowel run has been taken, 3 - finished.
    state = 0
    last_vowel = ''
    ending = ''

    for symbol in reversed(transcr):
        if state in (0, 1):
            ending = symbol + ending
            if symbol in vowels:
                last_vowel = symbol + last_vowel
                state = 1
            elif state == 1 and symbol not in modifiers:
                # First real consonant before the vowel run closes the syllable.
                state = 2
        elif state == 2 and symbol in vowels:
            state = 3
        elif state == 3:
            break  # nothing further to the left can change the result

    return last_vowel, ending

In [None]:
# FOR "POSSIBLY RHYMED"
# Maps a single vowel symbol to a string of vowel symbols; the rhyme search
# below treats two lines as a possible rhyme when the vowel of one line occurs
# in the string stored under the vowel of the other.

rhymed_vowels = {'i':'ɯɪi', 'ɯ':'ɯɪi', 'ɪ':'ɪɯi', 
                 'u':'uʊ', 'ʊ':'uʊ', 'ə':'ɐ',
                 'e':'eæɛ', 'æ':'eæɛ', 'ɛ':'eæɛ',
                 'a':'aɐɑ', 'ɐ':'aɐɑə', 'ɑ':'aɐɑ',
                 'o':'oɤɔʌ', 'ɤ':'ʌoɤɔ', 'ɔ':'ʌoɤɔ', 'ʌ':'ʌoɔɤ'}

In [None]:
# НАХОЖДЕНИЕ ЗАРИФМОВАННЫХ СТРОК

strings = defaultdict(list)
# strings = {paragraph: [[line_number, ending, last_vowel, last_word], ...]}
# (ender() returns (last_vowel, ending); the ending is stored at index 1 and
# the vowel at index 2, which is the layout the rest of the notebook reads.)


line_keys = song_meta_lyrics[1].keys()

for lk in line_keys:
    punct = ' ,.\"\'[]()!?-%:;'
    last_word = song_meta_lyrics[1][lk][1].split(' ')[-1].strip(punct).lower()
    lat_letters = 'qwertyuioplkjhgfdsazxcvbnm'

    # Reset for every line: a line without a usable last word used to raise
    # NameError (first line) or silently reuse the previous line's values.
    last_vowel = ''
    ending = ''
    if len(last_word) != 0:
        if last_word[0].lower() in lat_letters:
            last_vowel, ending = ender(eti.convert(last_word).strip('*'))
        elif last_word[0] in hangul:
            last_vowel, ending = ender(
                rom_to_ipa(Romanizer(last_word).romanize()))

    par = lk.split('-')[0]
    l = lk.split('-')[1]

    strings[par].append([l, ending, last_vowel, last_word])


rhymes = defaultdict(list)  # exact rhymes
possible_rhymes = defaultdict(list)  # possible rhymes

for par_i in strings.keys():
    paragraph = strings[par_i]
    for i in range(0, len(paragraph)):
        for j in range(0, len(paragraph)):
            # Lines without an ending cannot rhyme with anything.
            if j == i or paragraph[i][1] == '' or paragraph[j][1] == '':
                continue
            line_key = par_i + '-' + str(paragraph[i][0])
            # exact rhymes: identical endings
            if paragraph[j][1] == paragraph[i][1]:
                rhymes[line_key].append(paragraph[j][3])
            # possible rhymes: single vowels from the same similarity group.
            # .get() avoids a KeyError for vowels missing from the table, and
            # an empty vowel no longer matches every group by substring.
            if len(paragraph[i][2]) == 1 and paragraph[j][2] != '':
                if paragraph[j][2] in rhymed_vowels.get(paragraph[i][2], ''):
                    possible_rhymes[line_key].append(paragraph[j][3])
        

#### Количество слогов и определение языка

In [None]:
def syll_lang(line, hangul):
    """Count the syllables of a line and detect which scripts it contains.

    Args:
        line: the text of the line; words are separated by spaces.
        hangul: a collection of Hangul characters used for membership tests.

    Returns:
        ``(syllable_length, lang)`` where ``lang`` is 0 for Korean only,
        1 for Latin-script words only, 4 for both and 3 for neither.  Latin
        words are counted with ``eti.syllable_count``; a Korean word
        contributes one syllable per character.
    """
    en_letters = 'qwertyuioplkjhgfdsazxcvbnm'
    # (has Korean, has English) -> language code
    lang_codes = {(False, False): 3, (True, False): 0,
                  (False, True): 1, (True, True): 4}

    syllable_length = 0
    has_korean = False
    has_english = False

    for raw in line.split(' '):
        word = raw.strip(' ,.\"\'[]()!?-%:;')
        if len(word) == 0:
            continue
        first = word[0]
        if first.lower() in en_letters:
            syllable_length += eti.syllable_count(word)
            has_english = True
        elif first in hangul:
            # Korean writing is syllabic: one character is one syllable.
            syllable_length += len(word)
            has_korean = True

    return syllable_length, lang_codes[(has_korean, has_english)]

### Запись в JSON

In [None]:
# The loaded JSON is a two-item sequence: the first item is stored as the
# song metadata, the second as the lyrics mapping iterated below.
meta = song_meta_lyrics[0]
lyrics = song_meta_lyrics[1]

In [None]:
# КООРДИНИРОВАНИЕ ВСЕХ ФУНКЦИЙ
# Build one sentence record per lyrics line for the original text and one for
# its Russian counterpart; all Russian sentences go after the original ones.
processed_korean = []
processed_russian = []

for lk in lyrics:
    key_parts = lk.split('-')
    para = key_parts[0]
    string = key_parts[1]
    para_id = int(para) * 1000 + int(string)
    kor_text = lyrics[lk][1]

    # original (Korean / English) line
    kor_words = tag_ana(kor_text, ld, hd)
    syll_length, lang = syll_lang(kor_text, hangul)

    rhymed = ', '.join(rhymes[lk])
    # possible rhymes that are not already listed as exact rhymes
    pos_rhymed = ', '.join(prhm for prhm in possible_rhymes[lk]
                           if prhm not in rhymes[lk])

    # pick up the last word and last vowel stored for this line
    for line in strings[para]:
        if line[0] == string:
            last_word, last_vowel = line[3], line[2]

    kor_sent_meta = {'possibly_rhymed_with': pos_rhymed.strip(', '),
                     'rhymed_with': rhymed.strip(', '),
                     'last_word': last_word,
                     'last_vowel': last_vowel,
                     'length_in_syllables': syll_length}
    # alignment
    kor_alignment = {'off_start': 0,
                     'off_end': len(kor_text) - 1,
                     'para_id': para_id}

    kor_sent = {'text': kor_text,
                'words': kor_words,
                'lang': lang,
                'meta': kor_sent_meta,
                'para_alignment': kor_alignment}
    processed_korean.append(kor_sent)

    # Russian line
    rus_sent = rus_tagger(lyrics[lk][0], {'para_id': para_id})
    processed_russian.append(rus_sent)


processed_korean += processed_russian
final = {'meta': meta, 'sentences': processed_korean}

In [None]:
# WRITE THE RESULT FILE
# The output name is the input file name prefixed with 'Processed '.
with open('Processed ' + file_name, 'w', encoding='utf-8') as out_file:
    json.dump(final, out_file, ensure_ascii=False, indent=3)