In [1]:
# Hardware toggle: the Trainer cell further down passes gpus=1 when this is
# truthy and gpus=0 otherwise.
COLAB_GPU = False

In [2]:
from pathlib import Path
from src.config import get_config

# Directory holding the Common Voice "fa" release, under the user's home folder.
PATH = Path.home() / "Documents" / "common-voice-fa" / "fa"

# config.yaml is looked up one level above the current working directory.
# The ".." segment is kept as-is (not resolved), so the path stays lexical.
CONFIG = Path.cwd() / ".." / "config.yaml"

In [3]:
import warnings

# Silence every UserWarning for the rest of the session. No module or message
# filter is given, so this applies globally, including to libraries imported
# in later cells.
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
import pandas as pd

# Read the tab-separated training split (train.tsv) into a DataFrame.
dataset = pd.read_csv(
    PATH / "train.tsv",
    sep="\t",
)

In [5]:
from src.utils.text import TextUtility

# Text helper built from the project config. Later cells read utils.blank_id
# to size the decoder and to tell the model which index is the CTC blank.
# Note: get_config(CONFIG) is called here and again when the CommonVoice
# dataset is constructed.
utils = TextUtility(config=get_config(CONFIG))

In [6]:
# Preview the first rows of the training manifest loaded from train.tsv.
dataset.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment
0,23c0ee3fe3ab2f504f1bf0e27e4d10b8a23ae5fe777847...,common_voice_fa_19258088.mp3,دعویهای قضایی بیفایده و طولانی، کمپینهای عمومی...,2,0,twenties,female,,fa,
1,23c0ee3fe3ab2f504f1bf0e27e4d10b8a23ae5fe777847...,common_voice_fa_19258097.mp3,جاش ، اینا دوستام هستن,2,0,twenties,female,,fa,
2,23c0ee3fe3ab2f504f1bf0e27e4d10b8a23ae5fe777847...,common_voice_fa_19258102.mp3,بسه دیگه دیگه حرف نزن,2,0,twenties,female,,fa,
3,2543e37ab831f1c42bcdc2accd4beda86dea3931e95bbc...,common_voice_fa_20871570.mp3,بعد از ناهار آزاد خواهم بود.,2,0,twenties,male,,fa,
4,2543e37ab831f1c42bcdc2accd4beda86dea3931e95bbc...,common_voice_fa_20871593.mp3,من رئیس دپارتمان هستم.,2,0,twenties,male,,fa,


In [7]:
from src.utils.data.speech import CommonVoice, ctc_collate_function
from torch.utils.data import DataLoader

In [8]:
# Wrap the manifest DataFrame in the project's CommonVoice dataset; the data
# directory and a freshly loaded config are handed over alongside it.
commonvoice_dataset = CommonVoice(path=PATH, df=dataset, config=get_config(CONFIG))

# Shuffled mini-batches of 10 items, assembled by the project's CTC collate function.
dataloader = DataLoader(
    dataset=commonvoice_dataset,
    batch_size=10,
    shuffle=True,
    collate_fn=ctc_collate_function,
)

In [9]:
import torch
from torch.nn import Linear, Sequential
from src.nn import (
    EncDecCTCModel, 
    DepthwiseSeperableConv1D as Conv
)

from src.nn.quartz import (
    QuartzNet,
    BlockConfig,
    PreConfig,
    PostConfig
)

  rank_zero_deprecation(


In [10]:
# Encoder assembled from the project's QuartzNet config objects.
# NOTE(review): 40 input channels is presumably the per-frame feature count
# produced by the dataset — confirm against the CommonVoice config.
encoder = QuartzNet(
    # Entry stage: 40 channels in, 40 filters out.
    pre_config=PreConfig(
        input_channels=40,
        kernel_size=12,
        filter_size=40,
        dropout=0.6
    ),
    # Four blocks described by parallel lists (presumably one entry per block).
    # Each input_channels value equals the previous stage's filter count:
    # 40 -> 64 -> 128 -> 256 -> 256.
    block_config=BlockConfig(
        input_channels=[40, 64, 128, 256],
        filters=[64, 128, 256, 256],
        kernels=[3, 3, 3, 3],
        drop_rates=[0.6, 0.6, 0.6, 0.6],
        repeat=[5, 5, 5, 5]
    ),
    # Three post layers, all 256 channels wide. The final filter count (256)
    # is the input width of the Linear decoder built in the model cell.
    post_config=PostConfig(
        input_channels=[256, 256, 256],
        filters=[256, 256, 256],
        kernels=[11, 11, 11],
        # One dropout rate per post layer, matching the length of the
        # input_channels / filters / kernels lists above.
        drop_rates=[0.6, 0.6, 0.6]
    )
)

In [11]:
# CTC model: QuartzNet encoder followed by a linear classifier.
# The decoder maps the encoder's 256 output channels to blank_id + 1 classes,
# so the blank symbol occupies the highest class index.
# The optimizer is passed as a class (torch.optim.AdamW), not an instance.
model = EncDecCTCModel(
    encoder=encoder,
    decoder=Linear(256, utils.blank_id + 1),
    blank_id=utils.blank_id,
    optimizer=torch.optim.AdamW,
)

In [12]:
# Print the per-layer parameter summary. The trailing semicolon suppresses the
# notebook's echo of the returned summary object, which would otherwise show
# the same table a second time beneath the logged copy.
model.summarize();


  | Name    | Type      | Params
--------------------------------------
0 | encoder | QuartzNet | 6.7 M 
1 | decoder | Linear    | 11.8 K
2 | loss    | CTCLoss   | 0     
--------------------------------------
6.7 M     Trainable params
0         Non-trainable params
6.7 M     Total params
26.825    Total estimated model params size (MB)


  | Name    | Type      | Params
--------------------------------------
0 | encoder | QuartzNet | 6.7 M 
1 | decoder | Linear    | 11.8 K
2 | loss    | CTCLoss   | 0     
--------------------------------------
6.7 M     Trainable params
0         Non-trainable params
6.7 M     Total params
26.825    Total estimated model params size (MB)

In [13]:
from pytorch_lightning import Trainer

# Request one GPU only when COLAB_GPU is truthy; otherwise gpus=0.
# No other Trainer options are passed, so everything else (including the
# number of epochs) is left at the library defaults.
trainer = Trainer(gpus=1 if COLAB_GPU else 0)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [28]:
# Train the CTC model on the Common Voice dataloader (no validation loader is passed).
# NOTE(review): this cell was executed as In [28] after In [13], and the summary
# it logged (2.6 M params) differs from the earlier model.summarize() output
# (6.7 M), so the saved outputs were not produced by one top-to-bottom run.
# Re-run from a fresh kernel before relying on them.
trainer.fit(model, dataloader)


  | Name    | Type      | Params
--------------------------------------
0 | encoder | QuartzNet | 2.6 M 
1 | decoder | Linear    | 11.8 K
2 | loss    | CTCLoss   | 0     
--------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.590    Total estimated model params size (MB)
Epoch 0:   1%|          | 6/760 [01:22<2:52:45, 13.75s/it, loss=8.22, v_num=1]