In [1]:
import os

import torch
import torch.nn as nn

import numpy as np
import pandas as pd

In [2]:
T = 50
C = 20
N = 1
S = 30

S_min = 10
input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
print("Inputs:", input[:2])

target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long)
print("\n\nTargets:", target[:2])

input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
print("Input lengths:\n", input_lengths)

target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long)
print("Target lengths:\n", target_lengths)

Inputs: tensor([[[-3.0902, -3.7390, -3.4956, -5.1916, -1.6517, -2.0307, -3.0985,
          -3.1335, -2.5695, -4.4338, -2.9636, -2.3014, -4.7290, -2.4146,
          -2.8782, -4.7997, -3.2969, -4.4470, -4.6960, -3.7967]],

        [[-3.5079, -5.3225, -4.1056, -4.3521, -3.6322, -3.6178, -1.7566,
          -3.5656, -2.7448, -3.7311, -3.3192, -4.1413, -3.8689, -2.2083,
          -1.5062, -3.9838, -4.9298, -2.0877, -3.4894, -4.7769]]],
       grad_fn=<SliceBackward0>)


Targets: tensor([[19, 10, 10, 17, 17,  9, 13, 13,  9, 12, 17, 10, 10,  6, 11,  6,  3,  8,
         18, 15, 19,  5,  5, 14,  6,  8,  5, 16,  5, 18]])
Input lengths:
 tensor([50])
Target lengths:
 tensor([25])


In [3]:
ctc_loss = nn.CTCLoss()
loss = ctc_loss(input, target, input_lengths, target_lengths)
loss.backward()
input.grad[:2]

tensor([[[-0.0159,  0.0010,  0.0012,  0.0002,  0.0077,  0.0052,  0.0018,
           0.0017,  0.0031,  0.0005,  0.0021,  0.0040,  0.0004,  0.0036,
           0.0022,  0.0003,  0.0015,  0.0005,  0.0004, -0.0214]],

        [[-0.0157,  0.0002,  0.0007,  0.0005,  0.0011,  0.0011,  0.0069,
           0.0011,  0.0026,  0.0010, -0.0140,  0.0006,  0.0008,  0.0044,
           0.0089,  0.0007,  0.0003,  0.0050,  0.0012, -0.0072]]])

In [4]:
extra_tokens = ["<blank>", "<sos>", "<eos>", "<unk>", " "]
tokens = extra_tokens + ['а', 'б', 'в', 'г', 'д', 
        'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 
        'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 
        'х', 'ц', 'ч', 'ш', 'щ', 'ь', 'ю', 'я',
        'є', 'і', 'ї', 'ґ']
char_to_index = {c:i for i, c in enumerate(tokens)}
index_to_char = {i:c for i, c in enumerate(tokens)}
print(char_to_index)
print(index_to_char)

{'<blank>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, ' ': 4, 'а': 5, 'б': 6, 'в': 7, 'г': 8, 'д': 9, 'е': 10, 'ж': 11, 'з': 12, 'и': 13, 'й': 14, 'к': 15, 'л': 16, 'м': 17, 'н': 18, 'о': 19, 'п': 20, 'р': 21, 'с': 22, 'т': 23, 'у': 24, 'ф': 25, 'х': 26, 'ц': 27, 'ч': 28, 'ш': 29, 'щ': 30, 'ь': 31, 'ю': 32, 'я': 33, 'є': 34, 'і': 35, 'ї': 36, 'ґ': 37}
{0: '<blank>', 1: '<sos>', 2: '<eos>', 3: '<unk>', 4: ' ', 5: 'а', 6: 'б', 7: 'в', 8: 'г', 9: 'д', 10: 'е', 11: 'ж', 12: 'з', 13: 'и', 14: 'й', 15: 'к', 16: 'л', 17: 'м', 18: 'н', 19: 'о', 20: 'п', 21: 'р', 22: 'с', 23: 'т', 24: 'у', 25: 'ф', 26: 'х', 27: 'ц', 28: 'ч', 29: 'ш', 30: 'щ', 31: 'ь', 32: 'ю', 33: 'я', 34: 'є', 35: 'і', 36: 'ї', 37: 'ґ'}


In [5]:
import string

def remove_stop_signs(sentence):
    stop_signs = string.punctuation + "–—»«…“”’"
    for sign in stop_signs:
        sentence = sentence.replace(sign, "")
    return sentence

def sentence_to_indeces(sentence, cti : dict):
    """
    args:
        cti - char to index dictionary
    """
    sent = remove_stop_signs(sentence)
    sent = sent.lower()
    sent = sent.split()
    result = []
    for word in sent:
        for c in word:
            result.append(cti.get(c, cti["<unk>"]))
        result.append(cti[" "])
    result = result[:-1]
    result = [cti["<sos>"]] + result + [cti["<eos>"]]
    return result


sent = "Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм"
sent_to_idxs = sentence_to_indeces(sent, char_to_index)
print(f"Sentence \n'{sent}'\nto indeces:\n")
for i in sent_to_idxs:
    print(i, end=" ")

print(f"\n\nSentence \n'{sent}'\nfrom indeces to chars:\n")
for i in sent_to_idxs:
    print(index_to_char[i], end=" ")

Sentence 
'Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм'
to indeces:

1 17 19 22 15 19 7 13 23 5 17 4 9 19 12 7 19 16 10 18 19 4 22 23 7 19 21 13 23 13 4 22 7 19 32 4 9 10 21 11 5 7 24 4 5 4 23 5 23 5 21 5 17 4 28 10 28 10 18 27 33 17 4 18 35 4 5 16 10 4 27 10 4 21 5 22 13 12 17 2 

Sentence 
'Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм'
from indeces to chars:

<sos> м о с к о в и т а м   д о з в о л е н о   с т в о р и т и   с в о ю   д е р ж а в у   а   т а т а р а м   ч е ч е н ц я м   н і   а л е   ц е   р а с и з м <eos> 

In [6]:
DATA_DIR = "D:\\ML\\Speech recognition\\NLP_diploma\\uk"
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.tsv"), sep="\t")

In [7]:
max = 0
is_show_prints = False
for sent in train_df["sentence"]:
    sent_to_idxs = sentence_to_indeces(sent, char_to_index)
    if len(sent_to_idxs) > max:
        max = len(sent_to_idxs)    
    
    if is_show_prints:
        print(f"Sentence \n'{sent}'\nto indeces:")
        for i in sent_to_idxs:
            print(i, end=" ")
        print(f"\n\nfrom indeces to chars:")
        for i in sent_to_idxs:
            print(index_to_char[i], end=" ")
        print("\n\n")
        
        
print(f"Max symbols {max}")

Max symbols 137


In [8]:
len(tokens), 152

(38, 152)

In [9]:
# ENCODER PARAMS
# Encoder inputs
enc_n_feats = 256   # spectrogram height
enc_d_model = 1024  # spectrogram max width

# DECODER PARAMS
# Decoder inputs
dec_n_feats = 38    # len(tokens)
dec_d_model = 152   # maximum symbols in sentence

In [46]:
def idxs_to_onehot(sent_idxs, length:int):
    result = []
    for number in sent_idxs:
        onehot = np.zeros((length,))
        onehot[number] = 1
        result.append(onehot)
    return np.array(result)


def onehot_matrix_to_idxs(onehots):
    result = []
    for i in range(onehots.shape[0]):
        onehot = onehots[i, :]
        number = np.argmax(onehot)
        result.append(number)
    return np.array(result)


sent = "Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм"
print(f"Basic: '{sent}'")

sent_to_idxs = sentence_to_indeces(sent, char_to_index)
a = idxs_to_onehot(sent_to_idxs, dec_n_feats)
print("idxs_to_onehot result:")
print(a)
print(a.shape)

print("\nonehot_matrix_to_idxs result:")
r = onehot_matrix_to_idxs(a)
print(r)
for i in r:
    print(index_to_char[i], end=" ")

Basic: 'Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм'
idxs_to_onehot result:
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
(80, 38)

onehot_matrix_to_idxs result:
[ 1 17 19 22 15 19  7 13 23  5 17  4  9 19 12  7 19 16 10 18 19  4 22 23
  7 19 21 13 23 13  4 22  7 19 32  4  9 10 21 11  5  7 24  4  5  4 23  5
 23  5 21  5 17  4 28 10 28 10 18 27 33 17  4 18 35  4  5 16 10  4 27 10
  4 21  5 22 13 12 17  2]
<sos> м о с к о в и т а м   д о з в о л е н о   с т в о р и т и   с в о ю   д е р ж а в у   а   т а т а р а м   ч е ч е н ц я м   н і   а л е   ц е   р а с и з м <eos> 

In [17]:
import torch.nn.functional as F

A = torch.Tensor([1, 2, 3]).long()
print(A)

F.one_hot(A, num_classes=38)

tensor([1, 2, 3])


tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [33]:
sent_num = sentence_to_indeces(sent, char_to_index) # list of ints
A = torch.Tensor(sent_num).long()
a1 = F.one_hot(A, num_classes=dec_n_feats)

torch.Size([80, 38]) torch.Size([80, 38])


3040

In [58]:
class LangHandling():
    def __init__(self, tokens):
        self.token_to_index = {c:i for i, c in enumerate(tokens)}
        self.index_to_token = {i:c for i, c in enumerate(tokens)}
        self.num_classes = len(tokens)


class LangCharHandling(LangHandling):
    def __init__(self, tokens):
        super().__init__(tokens)
        self.stop_signs = string.punctuation + "–—»«…“”’"
        
    def remove_stop_signs(self, sentence):
        for sign in self.stop_signs:
            sentence = sentence.replace(sign, "")
        return sentence

    def sentence_to_indeces(self, sentence):
        sent = self.remove_stop_signs(sentence)
        sent = sent.lower()
        sent = sent.split()
        result = []
        for word in sent:
            for c in word:
                char = self.token_to_index.get(c, self.token_to_index["<unk>"])
                result.append(char)
            result.append(self.token_to_index[" "])
        result = result[:-1]
        result = [self.token_to_index["<sos>"]] + result + [self.token_to_index["<eos>"]]
        return result

    def sentence_to_one_hots(self, sent):
        sent_to_idxs = self.sentence_to_indeces(sent)
        sent_to_idxs = torch.Tensor(sent_to_idxs).long()
        one_hots = F.one_hot(sent_to_idxs, num_classes=self.num_classes)
        return one_hots
    
    def one_hots_to_sentence(self, one_hots):
        result = ""
        idxs = self.onehot_matrix_to_idxs(one_hots)
        print(idxs)
        for index in idxs:
            result += self.index_to_token[int(index)]
        return result
        
    def onehot_matrix_to_idxs(self, one_hots):
        result = []
        for i in range(one_hots.shape[0]):
            one_hot = one_hots[i, :]
            number = np.argmax(one_hot)
            result.append(number)
        return result

    
sent = "Московитам дозволено створити свою державу а татарам чеченцям – ні Але це – расизм"
lang_handle = LangCharHandling(tokens)
one_hots = lang_handle.sentence_to_one_hots(sent)
print(one_hots.shape, one_hots)
sent_result = lang_handle.one_hots_to_sentence(one_hots)
print(sent_result)

torch.Size([80, 38]) tensor([[0, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0]])
[tensor(1), tensor(17), tensor(19), tensor(22), tensor(15), tensor(19), tensor(7), tensor(13), tensor(23), tensor(5), tensor(17), tensor(4), tensor(9), tensor(19), tensor(12), tensor(7), tensor(19), tensor(16), tensor(10), tensor(18), tensor(19), tensor(4), tensor(22), tensor(23), tensor(7), tensor(19), tensor(21), tensor(13), tensor(23), tensor(13), tensor(4), tensor(22), tensor(7), tensor(19), tensor(32), tensor(4), tensor(9), tensor(10), tensor(21), tensor(11), tensor(5), tensor(7), tensor(24), tensor(4), tensor(5), tensor(4), tensor(23), tensor(5), tensor(23), tensor(5), tensor(21), tensor(5), tensor(17), tensor(4), tensor(28), tensor(10), tensor(28), tensor(10), tensor(18), tensor(27), tensor(33), tensor(17), tensor(4), tensor(18), tensor(35), te