In [1]:
%load_ext autoreload
%autoreload 2
# Mapping from CMUdict phoneme symbols (keys) to one-character ARPAbet-style
# labels (values). The empty-string key maps to a single space.
# NOTE(review): insertion order looks significant -- in the sample output of
# this notebook, index 1 decodes to "[SIL]" and index 26 to "AH", which matches
# the position of those keys here. Confirm against dataset.py before reordering.
CMUdict_ARPAbet = {
    "" : " ",
    "[SIL]": "-", "NG": "G", "F" : "f", "M" : "m", "AE": "@",
    "R"    : "r", "UW": "u", "N" : "n", "IY": "i", "AW": "W",
    "V"    : "v", "UH": "U", "OW": "o", "AA": "a", "ER": "R",
    "HH"   : "h", "Z" : "z", "K" : "k", "CH": "C", "W" : "w",
    "EY"   : "e", "ZH": "Z", "T" : "t", "EH": "E", "Y" : "y",
    "AH"   : "A", "B" : "b", "P" : "p", "TH": "T", "DH": "D",
    "AO"   : "c", "G" : "g", "L" : "l", "JH": "j", "OY": "O",
    "SH"   : "S", "D" : "d", "AY": "Y", "S" : "s", "IH": "I",
    "[SOS]": "[SOS]", "[EOS]": "[EOS]"
}

# Full symbol tables, including the sequence-boundary markers.
CMUdict = list(CMUdict_ARPAbet.keys())
ARPAbet = list(CMUdict_ARPAbet.values())

# Sequence-boundary markers that are excluded from the phoneme vocabulary.
SPECIAL_TOKENS = ("[SOS]", "[EOS]")

# Filter the markers out by name rather than slicing off "the last two"
# entries, so adding or reordering dictionary entries cannot silently drop a
# real phoneme or keep a marker.
PHONEMES = [phoneme for phoneme in CMUdict if phoneme not in SPECIAL_TOKENS]
LABELS = [CMUdict_ARPAbet[phoneme] for phoneme in PHONEMES]

# Each phoneme needs its own label, otherwise decoded strings are ambiguous.
assert len(set(LABELS)) == len(LABELS), "ARPAbet labels must be unique"

## The Data

In [36]:
import dataset
from dataset import AudioDataset

# probably want to collect some RAM before we go.
import gc
gc.collect()

# Single source of truth for the dataset root, so the train and validation
# splits cannot drift apart when the data is moved.
DATA_DIR = "../../data/ARPAbet_kaggle"

# Split names are passed through to AudioDataset together with the shared root.
train_data = AudioDataset('train-clean-100', data_dir=DATA_DIR)
val_data = AudioDataset('dev-clean', data_dir=DATA_DIR)


create dataset from data  ../../data/ARPAbet_kaggle/train-clean-100
	total mfcc cnt:  28539
	total transcript cnt:  28539
create dataset from data  ../../data/ARPAbet_kaggle/dev-clean
	total mfcc cnt:  2703
	total transcript cnt:  2703


One observation to make is that each transcript
starts with [SIL] and ends with [SIL]. There are moments of silence at the beginning and end of each recording, which should be mapped to this token.

The \<sos\> and \<eos\> tokens have been removed in the dataset.

In [33]:
# Inspect the first two training samples. For each one, print the feature
# array's shape and dtype, the raw integer transcript, and the transcript
# decoded to symbols via train_data.idx_to_str.
for sample_idx in (0, 1):
    mfcc, transcript = train_data[sample_idx]
    print(mfcc.shape, mfcc.dtype)
    print(transcript)
    string = [train_data.idx_to_str(x) for x in transcript]
    print(" ".join(string))


(1174, 28) float32
[ 1 26  8 37 15 20 40 19 16  5 28  9 15 39 15 18 26  4 39 23  5  8 39 26
 17 25  7  4 38 23  1 40  3  1 26 11  5  8 26 27 17 15 11 40  2 23 15  8
 26 11  4 38  8 37 26  8 37  8 14 23  7  4 26 19 26 11 26 16 10 39 20 38
  3 23  7 27  9 40  8 16 25  7  4 26  8  1 16  5 11 32 40 11 26  8 30 26
 39 24  8 23  6 26 33  3 40 32 25 15 26 11 30 40 39 23 31  6  9  1 33 24
 39 18 15 39 15  9 26 23 24  8 36 26  8  1]
[SIL] AH N D ER W IH CH HH AE P IY ER S ER K AH M S T AE N S AH Z Y UW M AY T [SIL] IH F [SIL] AH V AE N AH B Z ER V IH NG T ER N AH V M AY N D AH N D N AA T UW M AH CH AH V AH HH AW S W AY F T UW B IY IH N HH Y UW M AH N [SIL] HH AE V G IH V AH N DH AH S EH N T R AH L F IH G Y ER AH V DH IH S T AO R IY [SIL] L EH S K ER S ER IY AH T EH N SH AH N [SIL]
(1463, 28) float32
[ 1 26  8 37 26 33 38 23  8 40  2 33 38 18  4  7 11  4 26  8 23  1 39 26
  4 20 12 37 18 31 33 40 23 33 26 18  1 16  9  4  5  8 26 34 37 23  7 39
 21 11 16 40  4 39 24 33  3  1 40  8 30 40 39 20 21 

In [37]:
from torch.utils.data import DataLoader

batch_size = 64

# drop_last=True discards the final incomplete batch; shuffle=True reshuffles
# the samples every epoch. Batching is delegated to the dataset's collate_fn.
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
    collate_fn=train_data.collate_fn,
)

print("Train dataset samples = {}, batches = {}".format(len(train_data), len(train_loader)))

# sanity check: pull exactly one batch and show the shape of each element.
# NOTE(review): presumably (padded features, padded transcripts, feature
# lengths, transcript lengths) -- confirm against AudioDataset.collate_fn.
data = next(iter(train_loader))
x, y, lx, ly = data
print(x.shape, y.shape, lx.shape, ly.shape)


Train dataset samples = 28539, batches = 445
torch.Size([64, 1611, 28]) torch.Size([64, 226]) torch.Size([64]) torch.Size([64])


## The Model

In [8]:
from model import ASRModel
from defines import PHONEMES

# input_size=28 matches the last dimension of the feature arrays printed
# earlier in this notebook; output_size is one class per entry in PHONEMES.
model = ASRModel(
    input_size=28,
    embed_size=64,
    output_size=len(PHONEMES),
)

# Smoke test: push a single batch through the model.
# The model instance is called instead of model.forward(...) so that
# nn.Module.__call__ runs any registered hooks (assumes ASRModel subclasses
# torch.nn.Module -- TODO confirm in model.py).
data = next(iter(train_loader))
x, y, lx, ly = data
decoder_out, encoder_lens = model(x, lx)

torch.Size([64, 1671, 28]) torch.Size([64, 281]) torch.Size([64]) torch.Size([64])


In [20]:
# Vocabulary size followed by the label list itself, one per line.
print(len(LABELS), LABELS, sep="\n")

# Decode and print the first ten validation transcripts.
# NOTE(review): this cell decodes with val_data.int_to_str, whereas the
# training-sample cell uses train_data.idx_to_str -- confirm that both
# methods exist on AudioDataset, or unify the name.
for i in range(10):
    mfcc, transcript = val_data[i]
    sent = list(map(val_data.int_to_str, transcript))
    print("".join(sent))

# Same labels again in sorted order, which makes them easier to scan.
print(sorted(LABELS))

41
[' ', '-', 'G', 'f', 'm', '@', 'r', 'u', 'n', 'i', 'W', 'v', 'U', 'o', 'a', 'R', 'h', 'z', 'k', 'C', 'w', 'e', 'Z', 't', 'E', 'y', 'A', 'b', 'p', 'T', 'D', 'c', 'g', 'l', 'j', 'O', 'S', 'd', 'Y', 's', 'I']
[SOS]AND THE POOR SILLY THINGS RUFFLED UP THEIR FEATHERS AND LOOKED MISERABLE AS ONLY A LITTLE BIRD CAN LOOK WHEN IT IS UNHAPPY[EOS]
[SOS]HE THOUGHT IT WAS A LAST BURST OF ENERGY HE KNEW HOW CLOSE THEY BOTH WERE TO EXHAUSTION[EOS]
[SOS]WE WERE INURED TO PRIVATIONS AND HARDSHIPS HAD BEEN UPON EVERY MARCH IN EVERY BATTLE IN EVERY SKIRMISH IN EVERY ADVANCE IN EVERY RETREAT IN EVERY VICTORY IN EVERY DEFEAT[EOS]
[SOS]THIS WAS THE OLD ESTABLISHMENT OF URSUS ITS PROPORTIONS AUGMENTED BY SUCCESS AND IMPROVED FROM A WRETCHED BOOTH INTO A THEATRE[EOS]
[SOS]LETTY FINDING HERSELF NOT QUITE EQUAL TO THE EMERGENCY CAME IN HER TURN TO CALL MARY SHE WENT AS QUIETLY AS IF SHE WERE LEAVING A TIRESOME VISITOR[EOS]
[SOS]BUT EARNEST AS THE FATHER WAS IN WATCHING THE YET LIVING HE HAD EYES AND EARS FOR