In [10]:
import os

import numpy as np
from lab3_tools import *
from lab3_proto import *
from lab2_tools import *
from lab2_proto import *

from sklearn.preprocessing import StandardScaler

In [11]:
# Load the phone-level HMMs. The archive holds a single pickled dict under the
# key 'phoneHMMs', so allow_pickle=True is required and .item() unwraps the
# 0-d object array back into that dict.
phoneHMMs = np.load('lab2_models_all.npz', allow_pickle=True)['phoneHMMs'].item()
phones = sorted(phoneHMMs.keys())
# Number of states per phone = length of the first axis of its 'means' array.
nstates = {ph: phoneHMMs[ph]['means'].shape[0] for ph in phones}
# One label '<phone>_<k>' per (phone, state) pair, in sorted-phone order.
# The inner index is called `state` rather than `id` to avoid shadowing the builtin.
stateList = [ph + '_' + str(state) for ph in phones for state in range(nstates[ph])]
stateList

FileNotFoundError: [Errno 2] No such file or directory: 'lab2_models_all.npz'

In [None]:
# Cache the state list on disk. It is passed positionally, so np.savez stores
# it under the default key 'arr_0' (which is the key used when it is reloaded).
np.savez("./statelist.npz",stateList)

In [None]:
# Load one example utterance (the second return value of loadAudio is
# discarded) and display what path2info returns for the same file path.
test_example, _= loadAudio('tidigits/disc_4.1.1/tidigits/train/man/ae/z9z6531a.wav')
path2info('tidigits/disc_4.1.1/tidigits/train/man/ae/z9z6531a.wav')

In [11]:
# Worked example on a single utterance: load the audio, compute its features
# with mfcc(), and use element [2] of path2info()'s result, split into a list
# of characters, as the word-level transcription.
# NOTE(review): presumably element [2] is the digit string encoded in the
# file name -- confirm against lab3_tools.path2info.
filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
samples, samplingrate = loadAudio(filename)
lmfcc = mfcc(samples)
wordTrans = list(path2info(filename)[2])
wordTrans

LibsndfileError: ignored

In [None]:
# Convert the word-level transcription of the example utterance into a
# phone-level transcription using the pronunciation dictionary `prondict`.
from prondict import prondict
phoneTrans = words2phones(wordTrans, prondict)
phoneTrans

In [None]:
# Build one utterance-level HMM by concatenating the phone models listed in
# phoneTrans (concatHMMs comes from the star-imported lab modules).
utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

In [None]:
# Expand the phone transcription into per-state labels '<phone>_<k>', one for
# each of the nstates[phone] states of every phone, then peek at one entry.
stateTrans = [
    '_'.join([ph, str(k)])
    for ph in phoneTrans
    for k in range(nstates[ph])
]
stateTrans[10]

In [None]:
# Forced alignment of the example utterance's lmfcc frames against its phone
# transcription; the result is passed to frames2trans in the next cell.
viterbiStateTrans = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)

In [None]:
# Write the aligned state sequence to 'z43a.lab' via frames2trans.
# NOTE(review): presumably a transcription/label file for inspecting the
# alignment in an external tool -- confirm against lab3_tools.frames2trans.
frames2trans(viterbiStateTrans, outfilename='z43a.lab')

In [None]:
import os

# Walk the training directory and, for every .wav file, compute the lmfcc and
# mspec features plus the forced-alignment targets. Each utterance becomes one
# dict in `traindata`.
traindata = []
for root, dirs, files in os.walk('tidigits/disc_4.1.1/tidigits/train'):
    for file in files:
        if file.endswith('.wav'):
            filename = os.path.join(root, file)
            samples, _ = loadAudio(filename)  # sampling rate is not used here
            lmfcc = mfcc(samples)
            spec = mspec(samples)
            wordTrans = list(path2info(filename)[2])
            phoneTrans = words2phones(wordTrans, prondict)
            targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
            # Store the computed mspec array itself; the previous version
            # stored the string literal 'mspec' by mistake.
            traindata.append({'filename': filename, 'lmfcc': lmfcc,
                              'mspec': spec, 'targets': targets})

In [12]:
def loaddata(datadir='tidigits/disc_4.1.1/tidigits/train'):
    """Extract features and alignment targets for every .wav file under a directory.

    The directory tree is walked recursively. For each file ending in '.wav'
    the audio is loaded, lmfcc and mspec features are computed, the word
    transcription is taken from element [2] of path2info(filename) and
    converted to phones with `prondict`, and the frames are force-aligned
    against the global `phoneHMMs`.

    Parameters
    ----------
    datadir : str
        Root directory to search. Defaults to the TIDIGITS training directory
        used elsewhere in this notebook.

    Returns
    -------
    list of dict
        One dict per utterance with keys 'filename', 'lmfcc', 'mspec' and
        'targets'. Empty if no .wav file is found.
    """
    data = []
    for root, dirs, files in os.walk(datadir):
        # os.walk yields entries in arbitrary (filesystem) order; sorting in
        # place makes the utterance order, and hence any later seeded
        # shuffle/split, reproducible across machines.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.wav'):
                continue
            filename = os.path.join(root, file)
            samples, _ = loadAudio(filename)  # sampling rate is not used here
            lmfcc = mfcc(samples)
            spec = mspec(samples)
            wordTrans = list(path2info(filename)[2])
            phoneTrans = words2phones(wordTrans, prondict)
            targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans)
            # Store the mspec array itself (not the string 'mspec').
            data.append({'filename': filename, 'lmfcc': lmfcc,
                         'mspec': spec, 'targets': targets})
    return data

In [None]:
# Build the training set from the TIDIGITS train directory and cache it.
# The list of per-utterance dicts is stored as an object array, so reloading
# it requires allow_pickle=True.
traindata = loaddata('tidigits/disc_4.1.1/tidigits/train')
np.savez('traindata.npz', traindata=traindata)

In [None]:
# Build the test set from the TIDIGITS test directory and cache it. The
# in-memory `testdata` is also what later cells pass to get_features.
testdata = loaddata('tidigits/disc_4.1.1/tidigits/test')
np.savez('testdata.npz', testdata=testdata)

In [None]:
# Reload the cached training utterances (an object array of dicts, hence
# allow_pickle=True) and reserve one tenth of them, rounded down, for validation.
train_data = np.load('traindata.npz', allow_pickle=True)['traindata']
n_val = len(train_data) // 10

In [None]:
# Shuffle the utterances and hold out the first n_val of them for validation.
# The permutation length is taken from train_data itself (the previous
# version referenced an undefined name `N`), and a fixed seed makes the split
# saved to disk identical across kernel restarts.
# NOTE(review): this is a random per-utterance split, so the same speaker can
# appear in both train and val -- confirm that this is acceptable.
rng = np.random.default_rng(0)
indexes = rng.permutation(len(train_data))
shuffled = np.take(train_data, indexes)
val = shuffled[:n_val]
train = shuffled[n_val:]
np.savez('train.npz', train=train)
np.savez('val.npz', val=val)

## Acoustic Context (Dynamic Features)

In [None]:
def _stack_context(frames, pad_size):
    """Return, for every frame, the frame and its `pad_size` neighbours on each side, concatenated."""
    n_frames = frames.shape[0]
    # Reflect padding supplies mirrored context at the edges, e.g. for
    # pad_size=3 the first frame sees frames 3,2,1,0,1,2,3.
    padded = np.pad(frames, pad_width=((pad_size, pad_size), (0, 0)), mode='reflect')
    # Shifted view k holds, for each original frame t, the padded frame t + k;
    # stacking the views side by side gives row t = padded[t : t + 2*pad_size + 1].
    return np.hstack([padded[k:k + n_frames] for k in range(2 * pad_size + 1)])


def get_features(data, dynamic=True):
    """Flatten a list of utterances into frame-level feature matrices and targets.

    Parameters
    ----------
    data : sequence of dict
        Each item has 2-D arrays 'lmfcc' and 'mspec' (frames along axis 0)
        and a sequence 'targets' with one entry per frame.
    dynamic : bool, default True
        If True, every output row is the frame stacked with its 3 preceding
        and 3 following frames (7 frames in total, mirrored at the utterance
        edges). If False, every output row is the single frame.

    Returns
    -------
    mfcc_features : numpy.ndarray, shape (total_frames, lmfcc_dim * k)
    mspec_features : numpy.ndarray, shape (total_frames, mspec_dim * k)
        k is 7 when `dynamic` is True and 1 otherwise.
    targets : list
        Concatenation of all utterances' targets; entry i belongs to row i.

    Raises
    ------
    IndexError
        If `data` is empty (the feature dimensions are read from data[0]).
    ValueError
        If an utterance's 'lmfcc', 'mspec' and 'targets' disagree on the
        number of frames.
    """
    lmfcc_dim = data[0]['lmfcc'].shape[1]
    mspec_dim = data[0]['mspec'].shape[1]
    total_frames = sum(len(x['targets']) for x in data)
    pad_size = 3
    num_features = 2 * pad_size + 1 if dynamic else 1

    mfcc_features = np.zeros((total_frames, lmfcc_dim * num_features))
    mspec_features = np.zeros((total_frames, mspec_dim * num_features))
    targets = []
    row = 0

    for utterance in data:
        lmfcc = np.asarray(utterance['lmfcc'])
        spec = np.asarray(utterance['mspec'])
        n_frames = lmfcc.shape[0]
        if spec.shape[0] != n_frames or len(utterance['targets']) != n_frames:
            raise ValueError(
                'lmfcc, mspec and targets must have the same number of frames '
                '(got %d, %d and %d)' % (n_frames, spec.shape[0], len(utterance['targets'])))

        # Exactly one output row per original frame, so rows stay aligned
        # with `targets` and never run past the preallocated matrices.
        if dynamic:
            mfcc_features[row:row + n_frames] = _stack_context(lmfcc, pad_size)
            mspec_features[row:row + n_frames] = _stack_context(spec, pad_size)
        else:
            mfcc_features[row:row + n_frames] = lmfcc
            mspec_features[row:row + n_frames] = spec

        targets.extend(utterance['targets'])
        row += n_frames

    return mfcc_features, mspec_features, targets

In [None]:
# Context-stacked (dynamic=True) features and frame-level targets for the
# train / validation / test splits. `testdata` is the list built by the
# earlier loaddata cell for the test directory.
d_lmfcc_train, d_mspec_train, train_y = get_features(train,dynamic=True)
d_lmfcc_val, d_mspec_val, val_y = get_features(val,dynamic=True)
d_lmfcc_test, d_mspec_test, test_y = get_features(testdata,dynamic=True)

In [None]:
# Single-frame (dynamic=False) features for the same three splits. The
# targets are discarded here because they were already collected by the
# dynamic=True calls.
lmfcc_train_x, mspec_train_x, _ = get_features(train,dynamic=False)
lmfcc_val_x, mspec_val_x, _ = get_features(val,dynamic=False)
lmfcc_test_x, mspec_test_x, _ = get_features(testdata,dynamic=False)


In [None]:
# Cache the context-stacked features, one archive per array, keyed by the
# variable name.
np.savez('d_lmfcc_train.npz', d_lmfcc_train=d_lmfcc_train)
np.savez('d_lmfcc_val.npz', d_lmfcc_val=d_lmfcc_val)
np.savez('d_lmfcc_test.npz', d_lmfcc_test=d_lmfcc_test)

np.savez('d_mspec_train.npz', d_mspec_train=d_mspec_train)
np.savez('d_mspec_val.npz', d_mspec_val=d_mspec_val)
np.savez('d_mspec_test.npz', d_mspec_test=d_mspec_test)

# Frame-level targets. np.savez appends '.npz' when the name lacks it, so
# these end up as train_y.npz, val_y.npz and test_y.npz.
np.savez('train_y',train_y=train_y)
np.savez('val_y',val_y=val_y)
np.savez('test_y',test_y=test_y)

In [None]:
# Cache the single-frame features, one archive per array, keyed by the
# variable name.
np.savez('lmfcc_train_x.npz', lmfcc_train_x=lmfcc_train_x)
np.savez('lmfcc_val_x.npz', lmfcc_val_x=lmfcc_val_x)
np.savez('lmfcc_test_x.npz', lmfcc_test_x=lmfcc_test_x)
np.savez('mspec_train_x.npz', mspec_train_x=mspec_train_x)
np.savez('mspec_val_x.npz', mspec_val_x=mspec_val_x)
np.savez('mspec_test_x.npz', mspec_test_x=mspec_test_x)

## Feature Standardisation

In [None]:
# Reload the cached state list ('arr_0' is the default key np.savez assigns to
# a positional array). Its length is the number of output classes.
stateList = np.load('statelist.npz',allow_pickle=True)['arr_0']
output_dim = len(stateList)

In [None]:
def _standardise(train_x, val_x, test_x):
    """Fit a StandardScaler on train_x only and apply it to all three splits.

    Each feature set needs its own scaler: refitting a shared one would leave
    only the statistics (and dimensionality) of the last fitted array.
    copy=False asks scikit-learn to scale in place where it can; the returned
    arrays are used anyway because in-place operation is not guaranteed.
    """
    scaler = StandardScaler(copy=False)
    return (scaler.fit_transform(train_x),
            scaler.transform(val_x),
            scaler.transform(test_x))


# Statistics always come from the training split of the same feature set, and
# the validation and test splits are scaled with them.
# NOTE: re-running this cell re-standardises already standardised arrays;
# regenerate the features first if it has to be executed again.
d_lmfcc_train, d_lmfcc_val, d_lmfcc_test = _standardise(d_lmfcc_train, d_lmfcc_val, d_lmfcc_test)
d_mspec_train, d_mspec_val, d_mspec_test = _standardise(d_mspec_train, d_mspec_val, d_mspec_test)
lmfcc_train_x, lmfcc_val_x, lmfcc_test_x = _standardise(lmfcc_train_x, lmfcc_val_x, lmfcc_test_x)
mspec_train_x, mspec_val_x, mspec_test_x = _standardise(mspec_train_x, mspec_val_x, mspec_test_x)

# One-hot encode the frame-level targets over the output_dim state classes.
# NOTE(review): `np_utils` is not imported in any visible cell (historically
# `from keras.utils import np_utils`), and to_categorical expects integer
# class indices -- confirm that the targets are state indices, not state names.
targets_train = np_utils.to_categorical(train_y, output_dim)
targets_val = np_utils.to_categorical(val_y, output_dim)
targets_test = np_utils.to_categorical(test_y, output_dim)