In [1]:
import os

import torch
import torch.nn as nn

import numpy as np
import pandas as pd

In [2]:
# Toy dimensions for a random CTC-loss example. The names follow the shape
# letters used by nn.CTCLoss: the input is (T, N, C) and the targets are (N, S).
T = 50
C = 20
N = 1
S = 30

# Inclusive lower bound for the randomly drawn target lengths below.
S_min = 10
# Random log-probabilities of shape (T, N, C): log_softmax is taken over dim 2
# (the C axis); detach() + requires_grad_() make the tensor a leaf that will
# receive .grad after backward().
# NOTE(review): the name `input` shadows Python's builtin input() from here on.
# NOTE(review): no seed is set in this cell, so the printed values change per run.
input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
print("Inputs:", input[:2])

# Random integer targets of shape (N, S) drawn from [1, C) -- index 0 is never
# produced as a target value.
target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
print("\n\nTargets:", target[:2])

# Every batch item is declared to use the full input length T.
input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
print("Input lengths:\n", input_lengths)

# One target length per batch item, drawn from [S_min, S) (upper bound exclusive),
# so only a prefix of each row of `target` is declared valid.
target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)
print("Target lengths:\n", target_lengths)

Inputs: tensor([[[-4.3921, -2.7748, -2.6272, -3.1844, -5.8391, -2.9118, -1.5081,
          -2.0204, -2.4960, -4.6648, -3.1822, -5.2760, -4.1566, -5.2143,
          -3.3396, -3.1048, -4.1634, -2.6075, -3.3762, -3.2947]],

        [[-3.9547, -3.0358, -3.1015, -4.6223, -4.6874, -1.6695, -3.2782,
          -3.1245, -2.7917, -3.6702, -6.2349, -3.0321, -2.4299, -2.5736,
          -5.0681, -2.5046, -2.2475, -2.7110, -3.3705, -5.8073]]],
       grad_fn=<SliceBackward>)


Targets: tensor([[ 1,  8, 16,  9,  7, 12,  1, 13, 10, 19,  6, 19, 17, 18, 18, 16,  1,  6,
          2,  9,  5,  7,  8,  7, 11,  2, 14, 13,  7,  8]])
Input lengths:
 tensor([50])
Target lengths:
 tensor([18])


In [3]:
# Build the CTC criterion with the library's default settings.
ctc_loss = nn.CTCLoss()
# Score the random log-probabilities against the random targets, using the
# per-item input/target lengths prepared in the previous cell.
loss = ctc_loss(input, target, input_lengths, target_lengths)
# Back-propagate so that `input` (a leaf with requires_grad) receives a gradient.
loss.backward()
# Display the gradient for the first two leading-dimension slices only.
input.grad[:2]

tensor([[[-0.0059, -0.0455,  0.0040,  0.0023,  0.0002,  0.0030,  0.0123,
           0.0074,  0.0046,  0.0005,  0.0023,  0.0003,  0.0009,  0.0003,
           0.0020,  0.0025,  0.0009,  0.0041,  0.0019,  0.0021]],

        [[-0.0091, -0.0255,  0.0025,  0.0005,  0.0005,  0.0105,  0.0021,
           0.0024, -0.0138,  0.0014,  0.0001,  0.0027,  0.0049,  0.0042,
           0.0003,  0.0045,  0.0059,  0.0037,  0.0019,  0.0002]]])

In [4]:
# Service tokens come first, so they occupy the lowest indices
# ("<blank>" is 0, the space separator is 4).
extra_tokens = ["<blank>", "<sos>", "<eos>", "<unk>", " "]

# Full vocabulary: service tokens followed by the lower-case letters.
# The position of an entry in this list IS its index, so order matters.
tokens = [
    *extra_tokens,
    'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к',
    'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х',
    'ц', 'ч', 'ш', 'щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', 'ґ',
]

# Two lookup tables over the same ordering: index -> token and token -> index.
index_to_char = dict(enumerate(tokens))
char_to_index = {char: index for index, char in index_to_char.items()}

print(char_to_index)
print(index_to_char)

{'<blank>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, ' ': 4, 'а': 5, 'б': 6, 'в': 7, 'г': 8, 'д': 9, 'е': 10, 'ж': 11, 'з': 12, 'и': 13, 'й': 14, 'к': 15, 'л': 16, 'м': 17, 'н': 18, 'о': 19, 'п': 20, 'р': 21, 'с': 22, 'т': 23, 'у': 24, 'ф': 25, 'х': 26, 'ц': 27, 'ч': 28, 'ш': 29, 'щ': 30, 'ь': 31, 'ю': 32, 'я': 33, 'є': 34, 'і': 35, 'ї': 36, 'ґ': 37}
{0: '<blank>', 1: '<sos>', 2: '<eos>', 3: '<unk>', 4: ' ', 5: 'а', 6: 'б', 7: 'в', 8: 'г', 9: 'д', 10: 'е', 11: 'ж', 12: 'з', 13: 'и', 14: 'й', 15: 'к', 16: 'л', 17: 'м', 18: 'н', 19: 'о', 20: 'п', 21: 'р', 22: 'с', 23: 'т', 24: 'у', 25: 'ф', 26: 'х', 27: 'ц', 28: 'ч', 29: 'ш', 30: 'щ', 31: 'ь', 32: 'ю', 33: 'я', 34: 'є', 35: 'і', 36: 'ї', 37: 'ґ'}


In [5]:
import string

def remove_stop_signs(sentence):
    """Return `sentence` with all punctuation characters deleted.

    The deleted set is ASCII `string.punctuation` plus a few typographic
    marks (en/em dashes, guillemets, ellipsis, curly quotes). Characters are
    removed outright, not replaced by spaces, so whitespace around a removed
    mark is left as it was.
    """
    unwanted = string.punctuation + "–—»«…“”’"
    # A deletion-only translation table strips every unwanted character in one pass.
    return sentence.translate(str.maketrans("", "", unwanted))

def sentence_to_indeces(sentence, cti : dict):
    """
    Encode a sentence as a list of token indices wrapped in <sos> ... <eos>.

    Punctuation is stripped with remove_stop_signs, the text is lower-cased
    and split on whitespace; words are then joined by exactly one " " token.

    args:
        sentence - raw text to encode
        cti - char to index dictionary; looked up for "<sos>", "<eos>",
              " " and, for characters missing from it, "<unk>"
    returns:
        list of indices, always starting with cti["<sos>"] and ending
        with cti["<eos>"]
    """
    words = remove_stop_signs(sentence).lower().split()

    encoded = []
    for word in words:
        # Characters missing from the dictionary fall back to the <unk> index.
        encoded.extend(cti.get(ch, cti["<unk>"]) for ch in word)
        encoded.append(cti[" "])
    # Drop the separator that was added after the last word (no-op when empty).
    del encoded[-1:]

    return [cti["<sos>"], *encoded, cti["<eos>"]]


# Round-trip demo: encode one sample sentence, then map the indices back to tokens.
sent = "Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм"
sent_to_idxs = sentence_to_indeces(sent, char_to_index)

# Numeric encoding, space separated on a single line.
print(f"Sentence \n'{sent}'\nto indeces:\n")
print(*sent_to_idxs, end=" ")

# The same sequence decoded token by token through the reverse table.
print(f"\n\nSentence \n'{sent}'\nfrom indeces to chars:\n")
print(*(index_to_char[idx] for idx in sent_to_idxs), end=" ")

Sentence 
'Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм'
to indeces:

1 17 19 22 15 19 7 13 23 5 17 4 9 19 12 7 19 16 10 18 19 4 22 23 7 19 21 13 23 13 4 22 7 19 32 4 9 10 21 11 5 7 24 4 5 4 23 5 23 5 21 5 17 4 28 10 28 10 18 27 33 17 4 18 35 4 5 16 10 4 27 10 4 21 5 22 13 12 17 2 

Sentence 
'Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм'
from indeces to chars:

<sos> м о с к о в и т а м   д о з в о л е н о   с т в о р и т и   с в о ю   д е р ж а в у   а   т а т а р а м   ч е ч е н ц я м   н і   а л е   ц е   р а с и з м <eos> 

In [6]:
# Folder that contains train.tsv. The original absolute Windows path is kept as
# the default so existing setups behave exactly as before; on any other machine
# set the NLP_DIPLOMA_DATA_DIR environment variable instead of editing this cell.
DATA_DIR = os.environ.get(
    "NLP_DIPLOMA_DATA_DIR",
    "D:\\ML\\Speech recognition\\NLP_diploma\\uk",
)
# Tab-separated training metadata; later cells read its "sentence" column.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.tsv"), sep="\t")

In [7]:
# Scan every training sentence and record the length of the longest encoding
# (the count includes the <sos>/<eos> markers added by sentence_to_indeces).
#
# The accumulator is deliberately NOT called `max`: binding that name at cell
# level replaces Python's builtin max() for the rest of the kernel session.
# (If an older version of this cell already ran, restart the kernel to get the
# builtin back.)
max_sentence_len = 0
is_show_prints = False  # flip to True to dump every sentence's encoding (very verbose)
for sent in train_df["sentence"]:
    sent_to_idxs = sentence_to_indeces(sent, char_to_index)
    if len(sent_to_idxs) > max_sentence_len:
        max_sentence_len = len(sent_to_idxs)

    if is_show_prints:
        print(f"Sentence \n'{sent}'\nto indeces:")
        print(*sent_to_idxs, end=" ")
        print("\n\nfrom indeces to chars:")
        print(*(index_to_char[idx] for idx in sent_to_idxs), end=" ")
        print("\n\n")


print(f"Max symbols {max_sentence_len}")

Max symbols 137


In [8]:
# Show the vocabulary size next to the hand-entered sequence-length limit.
# NOTE(review): 152 is a literal, not computed in this notebook -- confirm its origin.
len(tokens), 152

(38, 152)

In [9]:
# ENCODER PARAMS
# Encoder inputs
enc_n_feats = 256   # spectrogram height
enc_d_model = 1024  # spectrogram max width

# DECODER PARAMS
# Decoder inputs
# NOTE(review): 38 is hard-coded; it matches len(tokens) for the vocabulary defined
# earlier in the notebook and must be updated by hand if that list changes.
dec_n_feats = 38    # len(tokens)
# NOTE(review): the scan over train.tsv earlier reported a maximum of 137 symbols;
# 152 presumably leaves headroom or covers other data splits -- confirm.
dec_d_model = 152   # maximum symbols in sentence

In [10]:
def idxs_to_onehot(sent_idxs, length:int):
    """Turn a sequence of class indices into a one-hot matrix.

    args:
        sent_idxs - iterable of integer indices, one per output row
        length - number of classes, i.e. the width of every one-hot row
    returns:
        float64 array of shape (len(sent_idxs), length) with a single 1 per
        row. The result is always 2-D: an empty `sent_idxs` yields shape
        (0, length) rather than a 1-D empty array, so downstream code can
        rely on the class axis being present.
    raises:
        IndexError if an index does not fit into `length` (numpy indexing rules).
    """
    idxs = list(sent_idxs)
    onehots = np.zeros((len(idxs), length))
    if idxs:
        # One vectorised assignment: row k gets its 1 in column idxs[k].
        onehots[np.arange(len(idxs)), idxs] = 1
    return onehots


def onehot_matrix_to_idxs(onehots):
    """Recover class indices from a one-hot (or score) matrix.

    args:
        onehots - 2-D array-like of shape (n_rows, n_classes)
    returns:
        1-D integer array of length n_rows holding, for each row, the column
        of its largest value (first occurrence on ties). An input with zero
        rows yields an empty *integer* array, so the result can always be
        used for indexing.
    """
    rows = np.asarray(onehots)
    if rows.shape[0] == 0:
        # argmax cannot produce a dtype-stable result for "no rows"; return an
        # explicitly integer-typed empty array instead.
        return np.empty(0, dtype=np.intp)
    # Row-wise argmax in a single vectorised call.
    return rows.argmax(axis=1)


# Round-trip demo: sentence -> indices -> one-hot matrix -> indices -> tokens.
sent = "Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм"
print(f"Basic: '{sent}'")

sent_to_idxs = sentence_to_indeces(sent, char_to_index)
a = idxs_to_onehot(sent_to_idxs, dec_n_feats)

# Header, the matrix itself, then its (rows, classes) shape -- one item per line.
print("idxs_to_onehot result:", a, a.shape, sep="\n")

print("\nonehot_matrix_to_idxs result:")
r = onehot_matrix_to_idxs(a)
# Decode the recovered indices back to tokens on a single line.
print(*(index_to_char[idx] for idx in r), end=" ")

Basic: 'Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм'
idxs_to_onehot result:
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
(80, 38)

onehot_matrix_to_idxs result:
<sos> м о с к о в и т а м   д о з в о л е н о   с т в о р и т и   с в о ю   д е р ж а в у   а   т а т а р а м   ч е ч е н ц я м   н і   а л е   ц е   р а с и з м <eos> 