In [3]:
import numpy as np
import os
import torch
import torch.nn.functional as F

from data import SpeechDataset, SpeechDataLoader, featurelen, cer, wer
from uyghur import uyghur_latin
from tqdm import tqdm
from UModel import UModel

from torch.optim.lr_scheduler import CosineAnnealingLR

class CustOpt:
    """Adam optimizer bundled with a cosine-annealing learning-rate schedule.

    One call to ``step()`` advances both the optimizer and the scheduler, so
    the learning rate moves once per batch rather than once per epoch.

    Args:
        params: Parameters (or parameter groups) handed to ``torch.optim.Adam``.
        datalen: Passed as ``T_max`` to ``CosineAnnealingLR`` (number of
            scheduler steps for one annealing sweep).
        lr: Initial learning rate.
        min_lr: Lower bound of the schedule (``eta_min``). When omitted it is
            set to ``lr``, which makes the schedule flat.
    """

    def __init__(self, params, datalen, lr, min_lr = None):
        # No explicit floor -> anneal from lr down to lr, i.e. a constant rate.
        floor_lr = lr if min_lr is None else min_lr

        self.optimizer = torch.optim.Adam(
            params,
            lr=lr,
            weight_decay=0.000001,
        )
        # Not read anywhere inside this class; kept as part of the object's attributes.
        self._step = 0
        self.scheduler = CosineAnnealingLR(
            self.optimizer,
            T_max=datalen,
            eta_min=floor_lr,
        )

    def step(self):
        """Apply one optimizer update, advance the schedule, and return the new LR.

        Returns:
            The learning rate of the first parameter group after the scheduler step.
        """
        self.optimizer.step()
        self.scheduler.step()
        current_rates = self.scheduler.get_last_lr()
        return current_rates[0]

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self.optimizer.zero_grad()

#outputs format = B x F x T
def calctc_loss(outputs, targets, output_lengths, target_lengths):
    """Mean CTC loss for one batch.

    Args:
        outputs: Model output laid out as B x F x T (see the note above).
            NOTE(review): F.ctc_loss expects log-probabilities; confirm the
            model applies log_softmax before returning.
        targets: Label indices for the batch.
        output_lengths: Valid output length per sample.
        target_lengths: Valid label length per sample.

    Returns:
        Loss tensor with ``reduction='mean'``; infinite losses are zeroed
        (``zero_infinity=True``). The blank symbol is ``uyghur_latin.pad_idx``.
    """
    # Move the time axis to the front: B x F x T -> T x B x F.
    time_major = outputs.permute(2, 0, 1).contiguous()
    return F.ctc_loss(
        time_major,
        targets,
        output_lengths,
        target_lengths,
        blank=uyghur_latin.pad_idx,
        reduction='mean',
        zero_infinity=True,
    )

def validate(model, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` and report character/word error rates.

    Reads the notebook-level globals ``device`` (where tensors are moved) and
    ``log_name`` (file the summary line is appended to). The model is switched
    to eval mode and left there; gradients are disabled for the whole pass.

    Args:
        model: Callable as ``model(inputs, input_lengths)`` returning
            ``(outputs, output_lengths)`` and providing ``greedydecode``.
        valid_loader: Sized iterable yielding 5-tuples
            ``(inputs, targets, input_lengths, target_lengths, _)``.

    Returns:
        Corpus-level CER (total character errors / total reference characters).
        ``float("inf")`` is returned when nothing could be scored (empty loader
        or no reference characters), so that an unmeasured run can never look
        like a perfect score to a "keep the best CER" comparison.
    """
    def _ratio(errors, total):
        # Display-only rate; 0.0 instead of ZeroDivisionError when nothing was counted.
        return errors / total if total else 0.0

    chars = 0
    words = 0
    e_chars = 0
    e_words = 0
    avg_loss = 0
    iter_cnt = 0
    msg = ""

    # Decoded predictions / references of the most recent batch, kept for the
    # sample printout below. Pre-initialised so an empty loader cannot leave
    # them unbound.
    last_preds = []
    last_refs = []

    model.eval()
    with torch.no_grad():
        tlen = len(valid_loader)
        # Context manager closes the bar even if a batch raises.
        with tqdm(iter(valid_loader), leave=True, total=tlen) as vbar:
            for inputs, targets, input_lengths, target_lengths, _ in vbar:

                inputs  = inputs.to(device)
                targets = targets.to(device)
                outputs, output_lengths = model(inputs, input_lengths)
                loss = calctc_loss(outputs, targets, output_lengths, target_lengths)
                last_preds = model.greedydecode(outputs, output_lengths)
                last_refs = [uyghur_latin.decode(target) for target in targets]

                for pred, src in zip(last_preds, last_refs):
                    e_char_cnt, char_cnt = cer(pred, src)
                    e_word_cnt, word_cnt = wer(pred, src)
                    e_chars += e_char_cnt
                    e_words += e_word_cnt

                    chars += char_cnt
                    words += word_cnt

                iter_cnt += 1
                avg_loss += loss.item()

                # NOTE(review): ":4f" is a width-4 spec (six decimals); ".4f" may
                # have been intended. Left as-is to keep the log format unchanged.
                msg = f"  VALIDATION: [CER:{_ratio(e_chars, chars):.2%} ({e_chars}/{chars} letters) WER:{_ratio(e_words, words):.2%} ({e_words}/{words} words), Avg loss:{avg_loss/iter_cnt:4f}]"
                vbar.set_description(msg)

    if iter_cnt == 0:
        # Nothing was evaluated: skip the log entry and the sample printout.
        print("  VALIDATION: no batches to evaluate")
        return float("inf")

    cer_val = e_chars / chars if chars else float("inf")

    with open(log_name,'a', encoding='utf-8') as fp:
        fp.write(msg+"\n")

    # Print the first three utterances of the final batch
    result = ""
    for shown, (pred, src) in enumerate(zip(last_preds, last_refs), start=1):
        e_char_cnt, char_cnt = cer(pred, src)
        e_word_cnt, word_cnt = wer(pred, src)
        result += f"   O:{src}\n"
        result += f"   P:{pred}\n"
        result += f"     CER: {_ratio(e_char_cnt, char_cnt):.2%} ({e_char_cnt}/{char_cnt} letters), WER: {_ratio(e_word_cnt, word_cnt):.2%} ({e_word_cnt}/{word_cnt} words)\n"
        if shown >= 3:
            break

    print(result)
    return cer_val


def train(model, train_loader):
    """Train ``model`` for one mini-epoch and append a summary line to the log.

    Reads the notebook-level globals ``optimizer`` (its ``step()`` must return
    the current learning rate), ``device``, ``mini_epoch_length``, ``log_name``
    and ``epoch``.

    At most ``mini_epoch_length`` batches are processed (at least one whenever
    the loader yields anything), matching the progress-bar total. The previous
    ``>`` comparison ran one batch more than that.

    Args:
        model: Callable as ``model(inputs, input_lengths)`` returning
            ``(outputs, output_lengths)``.
        train_loader: Iterable yielding 5-tuples
            ``(inputs, targets, input_lengths, target_lengths, _)``.
    """
    total_loss = 0
    iter_cnt = 0
    msg = ''
    model.train()
    # Context manager closes the bar on early exit and on exceptions alike.
    with tqdm(iter(train_loader), leave=True, total=mini_epoch_length) as pbar:
        for data in pbar:
            optimizer.zero_grad()
            inputs, targets, input_lengths, target_lengths, _ = data
            inputs  = inputs.to(device)
            targets = targets.to(device)

            outputs, output_lengths = model(inputs, input_lengths)
            loss = calctc_loss(outputs, targets, output_lengths, target_lengths)
            loss.backward()

            lr = optimizer.step()
            # Read the scalar once; it is used for both the running total and the message.
            batch_loss = loss.item()
            total_loss += batch_loss
            iter_cnt += 1

            msg = f'[LR: {lr: .7f} Loss: {batch_loss: .5f}, Avg loss: {(total_loss/iter_cnt): .5f}]'
            pbar.set_description(msg)
            # iter_cnt already counts this batch, so >= stops after exactly
            # mini_epoch_length batches.
            if iter_cnt >= mini_epoch_length:
                break

    with open(log_name,'a', encoding='utf-8') as fp:
        msg = f'Epoch[{(epoch+1):d}]:\t{msg}\n'
        fp.write(msg)


In [55]:
# Build the sample index: one [wav_path, encoded_sentence] entry per clip whose
# wav file exists on disk.
prefix_path = "../datasets/cv-corpus-17.0-delta-2024-03-15/fa/"
prefix_path2 = "clips_wav/"
train_file = 'other.tsv'

print(train_file)

# utf_8_sig drops a leading byte-order mark if the file has one.
with open(prefix_path + train_file,encoding='utf_8_sig') as f:
    lines = f.readlines()       # raw TSV rows (6209 in the recorded run)

idxs  = []      # [wav_path, encoded_sentence] per usable clip (6208 in the recorded run)
idxs2 = []      # raw sentence text, index-aligned with idxs
for x in lines:
    fields = x.strip().split("\t")
    if len(fields) < 4:
        # Blank or truncated row: skip it instead of failing on unpacking.
        continue

    # Column 2 holds the clip file name, column 4 the transcript.
    path, sentence = fields[1], fields[3]

    # Swap the clip's extension for .wav. splitext removes only the final
    # extension, so a stem that itself contains dots is kept whole.
    path = os.path.splitext(path.strip())[0]
    path = prefix_path + prefix_path2 + path + ".wav"

    # Rows without a matching wav are dropped here.
    # NOTE(review): presumably this is also what discards the TSV header row - confirm.
    if os.path.exists(path):
        # Encode first so a failure cannot leave idxs and idxs2 out of step.
        char_indx = uyghur_latin.encode(sentence)
        line = [path, char_indx]
        idxs2.append(sentence)
        idxs.append(line)
        


other.tsv


In [56]:
# Usable clips first, then raw TSV rows, one number per line.
print(len(idxs), len(lines), sep="\n")

6208
6209


In [57]:
# Vocabulary size followed by the full symbol list, one per line.
print(uyghur_latin.vocab_size, uyghur_latin.vocab_list(), sep="\n")

38
['<pad>', '<sos>', '<eos>', 'آ', 'ئ', 'ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ه', 'ی', 'ز', ' ']


In [58]:
# Peek at the first three [wav_path, encoded_sentence] entries.
print(idxs[:3])

[['../datasets/cv-corpus-17.0-delta-2024-03-15/fa/clips_wav/common_voice_fa_39591341.wav', [5, 26, 18, 16, 37, 18, 16, 16, 19, 8, 34, 37, 14, 5, 16, 35]], ['../datasets/cv-corpus-17.0-delta-2024-03-15/fa/clips_wav/common_voice_fa_39591342.wav', [13, 33, 5, 6, 19, 37, 32, 31, 35, 6, 16, 14, 37, 33, 37, 5, 26, 28, 5, 16, 37, 19, 33, 16, 35, 14, 34, 35, 37, 5, 33, 37, 31, 8, 33, 10, 34, 37, 31, 18, 5, 6, 27, 34, 35, 37, 26, 16, 14, 5, 37, 6, 33, 14]], ['../datasets/cv-corpus-17.0-delta-2024-03-15/fa/clips_wav/common_voice_fa_39591343.wav', [5, 33, 37, 16, 5, 37, 14, 16, 37, 28, 30, 35, 18, 5, 35, 37, 20, 33, 31, 24, 34, 37, 14, 26, 32, 37, 28, 16, 14, 32, 14]]]


In [88]:
# Round-trip check for one sample: encoded indices, their decoded text, and the
# raw transcript the indices were built from.
num = 63
test_case_raw = idxs2[num]
test_case = idxs[num][1]
print(test_case, uyghur_latin.decode(test_case), test_case_raw, sep="\n")

[7, 33, 30, 37, 13, 33, 14, 37, 16, 5, 37, 14, 16, 37, 18, 34, 5, 31, 37, 18, 16, 31, 5, 35, 34, 37, 29, 15, 5, 16, 35, 37, 28, 16, 14, 32]
پول خود را در سهام سرمایه گذاری کردن
پول خود را در سهام سرمایه گذاری کردن
