# Feature Extraction for Phoneme Recognition on TIMIT

## Goals

- Loading and testing the datasets exported by the previous notebook.
- Feature design for the LSTM-250 network.
- Feature extraction from the TIMIT dataset.
- Exporting the features as standalone dataset for training the network.
    - Feature dataset record: (phoneme, feature-sequence)

# Environment Setup

In [None]:
import torch

import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
from tqdm.auto import tqdm


# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using PyTorch version:', torch.__version__, ' Device:', device)

# Loading and Testing the Datasets

In [None]:
# Load the curated dataset bundle exported by the previous notebook.
ds_path = './session/curated-dataset.pt'
ds_dict = torch.load(ds_path)
print(ds_dict.keys())
print(ds_dict['note'])

# The bundle keeps the training and test record lists under 'train' and 'test'.
Train_ds = ds_dict['train']
Test_ds  = ds_dict['test']

# Sanity check: size of each split and its first three records.
print('Train_ds:', len(Train_ds))
ipd.display(Train_ds[:3])
print('Test_ds:', len(Test_ds))
ipd.display(Test_ds[:3])

In [None]:
# Smoke test: cut one phoneme out of its source recording and play it.
rec = Train_ds[1]
phone, audio_path, start, end = rec    # record layout: (phoneme, path-to-audio, start-index, end-index)
wave, rate = librosa.load(audio_path, sr=None)    # sr=None: keep the file's own sampling rate
phone_wave = wave[start:end]
print(phone)
ipd.display(ipd.Audio(phone_wave, rate=rate))

In [None]:
# delete redundant variables to avoid confusion
del ds_path, ds_dict
del rec, phone, audio_path, start, end, wave, rate, phone_wave
# List the names that are still defined, to confirm the cleanup.
print(dir())

## Convert to Audio dataset

Convert records from (phoneme, path-to-audio, start-index, end-index) to (phoneme, wave, rate).

In [None]:
# Loads an audio file and returns (waveform, sampling rate).
# Each file is read from disk only once; later requests for the same path are
# answered from audio_cache.
audio_cache = {}    # audio path -> (waveform, sampling rate)
def getAudio(audio_path):
    try:
        return audio_cache[audio_path]
    except KeyError:
        pass
    # Cache miss: read the file at its own sampling rate and remember the result.
    samples, sampling_rate = librosa.load(audio_path, sr=None)
    audio_cache[audio_path] = (samples, sampling_rate)
    return audio_cache[audio_path]


# Returns (waveform[start:end], sampling rate) for the audio file at audio_path.
# The file itself is fetched through getAudio, so repeated slices of one file hit the cache.
def getAudioSlice(audio_path, start, end):
    full_wave, sampling_rate = getAudio(audio_path)
    segment = full_wave[start:end]
    return segment, sampling_rate


# Exercise getAudio on every training record; this also fills audio_cache.
# Timings noted by the author for this loop:
#   without caching: 14 seconds
#   with caching   : < 1 second
for rec in tqdm(Train_ds):    
    _, audio_path, *_ = rec    # only the audio path (second field) is needed here
    getAudio(audio_path)
    
    
# delete redundant variables to avoid confusion
del rec, audio_path

In [None]:
# Builds an audio dataset from a list of phoneme records.
#   list_phone_rec[i]: (phoneme, path-to-audio, start-index, end-index)
#   return[i]        : [phoneme, wave, rate]  (a list, so records can be modified in place)
def makeAudioDS(list_phone_rec):
    def toAudioRec(phone_rec):
        phone, audio_path, start, end = phone_rec
        wave, rate = getAudioSlice(audio_path, start, end)
        return [phone, wave, rate]

    return [toAudioRec(phone_rec) for phone_rec in list_phone_rec]


# Convert both splits to audio datasets: records become [phoneme, wave, rate]
Train_audio_ds = makeAudioDS(Train_ds)
Test_audio_ds  = makeAudioDS(Test_ds)
        
# The record counts should match the original splits.
print('Train_audio_ds:', len(Train_audio_ds))
print('Test_audio_ds :', len(Test_audio_ds))

# Feature Design

**NOTE:**
- Feature vector is designed following [[Speech-Recog paper]](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6638947)
- The log-scaled mel-spectrogram is used as the base feature; 40 Mel bands are generated.
- The energy term is computed per frame from the waveform (RMS energy in dB), using the same frame and hop lengths as the mel-spectrogram.
- First and second order derivatives of those terms are used.
- Total length of the feature vector: 41 x 3 = 123.

In [None]:
# Given an audio waveform, get the MFCC coefficients
# Params,
#   n_mfcc    : no. of MFCC coefficient to return
#   n_mels    : number of Mel bands to generate
#   fft_window: length of the FFT window
#   hop_len   : number of audio samples between adjacent STFT columns.
# Note: Change fft_window and hop_len to play with the sequence lengths.
#def getMFCC(wave, sample_rate, n_mfcc, fft_window, hop_length, n_mels):
#    #mfcc = librosa.feature.mfcc(y=wave, sr=sample_rate, n_mfcc=n_mfcc, n_fft=fft_window, hop_length=hop_length, n_mels=n_mels)
#    mel_spec = librosa.feature.melspectrogram(y=wave, sr=sample_rate, n_fft=fft_window, hop_length=hop_length, n_mels=n_mels)
#    log_mel_spec = librosa.power_to_db(mel_spec)  # Convert to log-scale
#    return log_mel_spec
    #return mfcc


# Given a sequence of mfcc coefficients, returns a sequence of corresponding energy terms
#def getEnergy(mfcc_seq):
#    energy_seq = np.sum(mfcc_seq**2, axis=0)
#    return energy_seq


# Computes the log-scaled (dB) mel-spectrogram of a waveform.
# Params,
#   wave        : audio samples
#   sample_rate : sampling rate of wave
#   n_mels      : number of Mel bands to generate
#   fft_window  : length of the FFT window
#   hop_length  : number of audio samples between adjacent STFT columns.
# Note: Change fft_window and hop_length to play with the sequence lengths.
def getMelSpec(wave, sample_rate, n_mels, fft_window, hop_length):
    return librosa.power_to_db(    # convert the power spectrogram to log-scale
        librosa.feature.melspectrogram(
            y=wave, sr=sample_rate, n_fft=fft_window, hop_length=hop_length, n_mels=n_mels
        )
    )


# Given a waveform, compute the energy (in decibel) of each frame.
# Params,
#   wave         : audio samples
#   frame_length : number of audio samples per analysis frame
#   hop_length   : number of audio samples between adjacent frames
# Returns the dB-scaled RMS values, in the array shape produced by librosa.feature.rms.
def getEnergy(wave, frame_length, hop_length):
    rms = librosa.feature.rms(y=wave, frame_length=frame_length, hop_length=hop_length)
    # RMS is an amplitude, not a power: amplitude_to_db applies 20*log10, which puts
    # the energy row on the same dB scale as the power-based mel-spectrogram.
    # (power_to_db on an amplitude would halve every dB value.)
    log_energy = librosa.amplitude_to_db(rms)
    return log_energy


# Returns the derivative of the requested order of a feature sequence,
# as computed by librosa.feature.delta.
def getDelta(seq, order):
    return librosa.feature.delta(seq, order=order)


# Plotting utilities ---
def plotAudio(wave, sample_rate, axis):
    # Plots the waveform against time (in seconds) on the given matplotlib axis.
    # Sample i is taken at i / sample_rate, so the last sample sits one sampling
    # period before the clip duration; linspace(0, duration, n) stretched the axis.
    time = np.arange(len(wave)) / sample_rate
    axis.plot(time, wave)
    

# Use librosa.display.specshow to display 2D features.
# Draws data on the given axis, attaches a colorbar to fig, and returns the image
# object created by specshow.
def plotSpecshow(data, fig, axis):
    # Import the submodule explicitly: a plain `import librosa` is not guaranteed
    # to expose librosa.display on every librosa release.
    import librosa.display
    img = librosa.display.specshow(data, ax=axis)
    fig.colorbar(img, ax=axis)
    return img
    

In [None]:
# Test above functions ---
# Get an audio record from the test split
rec = Test_audio_ds[1200]
phone, wave, rate = rec
print('phone:', phone, '  wave:', len(wave), '  rate:', rate)

# (disabled) earlier MFCC-based variant
#mfcc = getMFCC(wave, rate, n_mfcc=40, fft_window=64, hop_length=16, n_mels=40)
#print('mfcc:', mfcc.shape)

# 40-band log mel-spectrogram with a short window and hop
mel_spec = getMelSpec(wave, rate, n_mels=40, fft_window=64, hop_length=16)
print('mel_spec:', mel_spec.shape)

# Frame energy, using the same frame and hop lengths as the mel-spectrogram
#energy = getEnergy(mfcc)
energy = getEnergy(wave, frame_length=64, hop_length=16)
print('energy:', energy.shape)

# (disabled) earlier MFCC-based variant
#mfcc_eng = np.vstack([mfcc, energy])
#print('mfcc_eng:', mfcc_eng.shape)

# Stack the energy row under the mel rows so the deltas cover both
mel_eng = np.vstack([mel_spec, energy])

# First and second order derivatives of the stacked features
delta1 = getDelta(mel_eng, order=1)
delta2 = getDelta(mel_eng, order=2)
print('delta1:', delta1.shape)
print('delta2:', delta2.shape)

In [None]:
# Plot the features
print('phoneme:', phone)
ipd.display(ipd.Audio(wave, rate=rate))

# One figure: the waveform across the top row, four feature panels below it.
fig = plt.figure(figsize=(3*4, 7))
fig.subplots_adjust(hspace=0.5)
# NOTE(review): tight_layout() runs before any axes are added — confirm it has the intended effect.
fig.tight_layout()

# Top row (3x1 grid, slot 1): raw waveform
ax_wave = fig.add_subplot(3, 1, 1)
plotAudio(wave, rate, ax_wave)
ax_wave.set_title('Audio')

# Middle row (3x2 grid, slots 3 and 4): mel-spectrogram and energy
ax_mel = fig.add_subplot(3, 2, 3)
ax_mel.set_title('Mel Spectrogram')
plotSpecshow(mel_spec, fig, ax_mel)

ax_energy = fig.add_subplot(3, 2, 4)
ax_energy.set_title('Energy')
ax_energy.plot(energy[0], color='r')    # energy is 2D; its first row holds the per-frame values

# Bottom row (3x2 grid, slots 5 and 6): first and second order deltas
ax_delta1 = fig.add_subplot(3, 2, 5)
ax_delta1.set_title('Delta-1')
plotSpecshow(delta1, fig, ax_delta1)

ax_delta2 = fig.add_subplot(3, 2, 6)
ax_delta2.set_title('Delta-2')
plotSpecshow(delta2, fig, ax_delta2)

In [None]:
# delete redundant variables to avoid confusion:
# the sample record, its features, and the plotting handles
del rec, phone, wave, rate
del mel_spec, energy, mel_eng, delta1, delta2
del ax_wave, ax_mel, ax_energy, ax_delta1, ax_delta2, fig

## Check Audio Dataset Distribution

Check the audio dataset to determine the parameters for the features.  

In [None]:
from math import inf as INF


# Scans an audio dataset for its shortest and longest clips.
#   ds_list[i]: (phoneme, wave, rate)
# Returns (min_len, max_len, min_rec, max_rec, all_len), where all_len lists the
# length of every clip in dataset order. On ties the earliest record wins.
# For an empty dataset the result is (INF, -INF, None, None, []).
def getMinMaxRec(ds_list):
    records = list(ds_list)
    all_len = [len(audio_rec[1]) for audio_rec in records]
    if not all_len:
        return INF, -INF, None, None, all_len
    positions = range(len(all_len))
    shortest = min(positions, key=all_len.__getitem__)    # first index with the minimum length
    longest = max(positions, key=all_len.__getitem__)     # first index with the maximum length
    return all_len[shortest], all_len[longest], records[shortest], records[longest], all_len


# Reports the clip-length distribution of an audio dataset:
# prints the min/max/median lengths, plots the shortest clip's waveform,
# plays the shortest and the longest clips, and draws a histogram of all lengths.
#   audio_ds[i]: (phoneme, wave, rate)
def showAudioLenDisrib(audio_ds):
    min_len, max_len, min_rec, max_rec, all_len = getMinMaxRec(audio_ds)

    # Summary statistics
    print('min_rec:', min_len, min_rec[0])
    print('max_rec:', max_len, max_rec[0])
    print('median:', np.median(all_len))

    # Waveform of the shortest clip
    shortest_wave, shortest_rate = min_rec[1:]
    print('min_rec waveform:')
    plt.plot(shortest_wave)
    plt.show()

    # Audio widgets: shortest clip first, then the longest one
    ipd.display(ipd.Audio(shortest_wave, rate=shortest_rate))
    longest_wave, longest_rate = max_rec[1:]
    ipd.display(ipd.Audio(longest_wave, rate=longest_rate))

    # Histogram of all clip lengths
    plt.hist(all_len, bins='auto', rwidth=0.8)
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.title('Distribution of audio lengths')

In [None]:
# Clip-length distribution of the training split.
print('Train_audio_ds ---')
showAudioLenDisrib(Train_audio_ds)

In [None]:
# Clip-length distribution of the test split.
print('Test_audio_ds ---')
showAudioLenDisrib(Test_audio_ds)

## Zero Padding of Short Audio

**NOTE:**
Based on the above observation
- Because the median clip length is around 1024 samples, shorter clips are zero-padded so that every clip is at least 1024 samples long.

In [None]:
# Given an audio dataset, appends zeros in place so that every clip has length >= min_len.
# Clips that are already long enough are left untouched.
#   audio_ds[i]: [phoneme, wave, rate]  (records must be mutable: the wave is replaced in place)
#   min_len    : minimum number of samples every clip must have afterwards
# Returns None; the dataset is modified in place.
def padZeroAudio(audio_ds, min_len):
    for audio_rec in audio_ds:
        wave = audio_rec[1]
        missing = min_len - len(wave)
        if missing > 0:
            # np.pad appends the zeros in the wave's own dtype. Concatenating with
            # np.zeros() (float64) promoted padded float32 clips to float64, leaving
            # the dataset with mixed dtypes.
            audio_rec[1] = np.pad(wave, (0, missing))

            
# Pad zeros to audio: both splits are padded in place to the same minimum length
F_min_audio_len = 1024    # minimum clip length, in samples
padZeroAudio(Train_audio_ds, F_min_audio_len)
padZeroAudio(Test_audio_ds, F_min_audio_len)    

In [None]:
# Clip-length distribution of the training split after zero padding.
print('Train_audio_ds ---')
showAudioLenDisrib(Train_audio_ds)

In [None]:
# Clip-length distribution of the test split after zero padding.
print('Test_audio_ds ---')
showAudioLenDisrib(Test_audio_ds)

# Data Augmentation

Data augmentation is performed to reduce over-fitting on the training dataset. This can potentially improve accuracy on unseen data.

## Design Augmentations

In [None]:
# Adds zero-mean Gaussian noise to the audio.
#   waveform     : audio samples
#   noise_factor : standard deviation of the added noise
# Returns a new array; the input waveform is not modified.
def add_noise(waveform, noise_factor):
    # randn is centred on zero. rand() draws from [0, 1), which made every noise
    # sample positive and shifted the whole waveform by a DC offset.
    noise = np.random.randn(len(waveform))
    noisy_audio = waveform + noise_factor * noise
    return noisy_audio


# Scales the audio amplitude: every sample is multiplied by volume_factor.
def change_volume(waveform, volume_factor):
    return waveform * volume_factor


# Shifts the audio pitch by n_steps using librosa.effects.pitch_shift.
# A 256-point FFT window (n_fft=256) is passed along with the call.
def pitch_shift(waveform, sample_rate, n_steps):
    return librosa.effects.pitch_shift(waveform, sr=sample_rate, n_steps=n_steps, n_fft=256)


# Time-stretches the audio with librosa.effects.time_stretch, passing speed as its rate.
# A 256-point FFT window (n_fft=256) is passed along with the call.
def speed_change(waveform, speed):
    return librosa.effects.time_stretch(waveform, rate=speed, n_fft=256)
    

In [None]:
# Take one cached recording (a whole file, not a phoneme slice) as the reference
# clip for trying out the augmentations.
audio_keys = list(audio_cache.keys())
wave, rate = audio_cache[audio_keys[100]]
print('original audio')
# Last expression of the cell: the notebook renders it as an audio widget.
ipd.Audio(wave, rate=rate)

In [None]:
# Runs one augmentation and presents the result:
# an audio widget with the new waveform, then the original (left) and the
# augmented (right) waveforms plotted side by side.
#   augmenter : function called as augmenter(waveform, **options)
#   options   : keyword arguments forwarded to the augmenter
def testAugmentation(augmenter, waveform, rate, **options):
    new_wave = augmenter(waveform, **options)
    ipd.display(ipd.Audio(new_wave, rate=rate))
    ax_org, ax_new = (plt.subplot(1, 2, position) for position in (1, 2))
    for panel, signal in ((ax_org, waveform), (ax_new, new_wave)):
        panel.plot(signal)
    plt.show()
    
    

# Test the augmentations, one per cell, starting with additive noise
print('add_noise')
testAugmentation(add_noise, wave, rate, noise_factor=0.002)  # factor range: 0.002 - 0.005

In [None]:
# Try the volume augmentation on the reference clip.
print('change_volume')
testAugmentation(change_volume, wave, rate, volume_factor=1.5)  # factor range: 0.2 - 1.5

In [None]:
# Try the pitch augmentation on the reference clip.
print('pitch_shift')
testAugmentation(pitch_shift, wave, rate, sample_rate=rate, n_steps=2)    # n_steps: -2 to 2

In [None]:
# Try the speed augmentation on the reference clip.
print('speed_change')
testAugmentation(speed_change, wave, rate, speed=0.85)   # speed range: 0.85 - 1.2

In [None]:
# delete the reference clip variables to avoid confusion
del audio_keys, wave, rate

## Build Augmented Audio Dataset

In [None]:
# Define augmentation functions with randomized parameters
import random


def augment_noise(waveform, rate):
    # Adds noise with a factor drawn uniformly from [0.002, 0.005].
    # rate is unused; it is accepted so that all augmenters share the
    # (waveform, rate) prototype.
    return add_noise(waveform, noise_factor=random.uniform(0.002, 0.005))


def augment_volume(waveform, rate):
    # Scales the volume by a factor drawn uniformly from [0.2, 1.5].
    # rate is unused; it is accepted so that all augmenters share the
    # (waveform, rate) prototype.
    return change_volume(waveform, volume_factor=random.uniform(0.2, 1.5))


def augment_pitch(waveform, rate):
    # Shifts the pitch by a number of steps drawn uniformly from [-2, 2].
    # The bounds are named like in the other augmenters; the previous
    # step_min/step_max pair (0.2, 1.5) was never used.
    step_min, step_max = -2, 2
    step = random.uniform(step_min, step_max)
    return pitch_shift(waveform, sample_rate=rate, n_steps=step)


def augment_speed(waveform, rate):
    # Changes the speed by a factor drawn uniformly from [0.85, 1.2].
    # rate is unused; it is accepted so that all augmenters share the
    # (waveform, rate) prototype.
    return speed_change(waveform, speed=random.uniform(0.85, 1.2))
    

    
# Builds an augmented copy of an audio dataset.
# For every record one augmentation is picked at random (uniformly) and applied
# to the wave; phoneme and rate are carried over unchanged.
#   audio_ds[i]     : (phoneme, wave, rate)
#   all_augments[i] : function with the prototype (waveform, rate)
#   return[i]       : [phoneme, augmented wave, rate]
def makeAugmentedDS(audio_ds, all_augments):
    last_index = len(all_augments) - 1    # randint is inclusive on both ends
    augmented_ds = []
    for record in tqdm(audio_ds):
        augmenter = all_augments[random.randint(0, last_index)]
        phone, wave, rate = record
        augmented_ds.append([phone, augmenter(wave, rate), rate])
    return augmented_ds
    
    
    
# Build the augmented training set from the training audio dataset;
# each clip receives one of the four augmentations.
#Train_audio_ds = makeAudioDS(Train_ds)
Train_aug_audio_ds = makeAugmentedDS(Train_audio_ds, [augment_noise, augment_volume, augment_pitch, augment_speed])
print('Train_aug_audio_ds:', len(Train_aug_audio_ds))

In [None]:
# Listen to one clip before and after augmentation.
select_index = 2
item = Train_audio_ds[select_index]    # original clip
ipd.display(ipd.Audio(item[1], rate=item[2]))

item = Train_aug_audio_ds[select_index]    # augmented clip at the same index
ipd.display(ipd.Audio(item[1], rate=item[2]))

del select_index, item

# Feature Extraction, Normalization, and Export

## Build Feature Dataset

In [None]:
# Feature extraction parameters, read by getFeatureRecord.
F_fft_window = 512    # FFT window length; also used as the energy frame length
F_hop_length = 64     # number of audio samples between adjacent frames
F_n_mels = 40         # number of Mel bands

# Human-readable summary of the parameters; appended to the note of the exported datasets.
F_note = f'''
F_n_mels    : {F_n_mels}
F_fft_window: {F_fft_window}
F_hop_length: {F_hop_length}
'''

# Given an audio record, returns a feature record: [phoneme, feature-sequence]
#   audio_record: (phoneme, wave, rate)
# The feature-sequence stacks, row-wise: the mel-spectrogram, the energy row,
# the first order deltas of both, and the second order deltas of both.
# Extraction uses the module-level parameters F_n_mels, F_fft_window and F_hop_length.
def getFeatureRecord(audio_record):
    phone, wave, rate = audio_record
    # Base features: mel-spectrogram with the frame energy stacked underneath,
    # so that the deltas are computed for both at once
    mel_eng = np.vstack([
        getMelSpec(wave, rate, n_mels=F_n_mels, fft_window=F_fft_window, hop_length=F_hop_length),
        getEnergy(wave, frame_length=F_fft_window, hop_length=F_hop_length),
    ])
    # First and second order derivatives of the base features
    deltas = [getDelta(mel_eng, order=order) for order in (1, 2)]
    # A list rather than a tuple, so the record is easier to modify later
    return [phone, np.vstack([mel_eng, *deltas])]

    
# Test above function on one training clip
audio_rec = Train_audio_ds[4]
phone, feat_vec = getFeatureRecord(audio_rec)
print('phone:', phone)
print('audio_rec[1]:', len(audio_rec[1]))
print('feat_vec:', feat_vec.shape)
# Number of rows in the feature array; kept for the note of the exported datasets
F_feat_len = len(feat_vec)
ipd.display(ipd.Audio(audio_rec[1], rate=audio_rec[2]))


# delete redundant variables to avoid confusion
del audio_rec, phone, feat_vec

In [None]:
# Plots the four parts of a feature-sequence in a 2x2 grid.
#   feature_sequence: 2D array whose rows are the stacked features
#     rows 0-39 : mel coefficients
#     row  40   : energy
#     rows 41-81: delta-1
#     rows 82-  : delta-2
def showFeatures(feature_sequence):
    fig = plt.figure(figsize=(12, 5))
    fig.subplots_adjust(hspace=0.5)
    fig.tight_layout()

    # Creates the titled subplot at the given position of the 2x2 grid
    def addPanel(position, title):
        panel = fig.add_subplot(2, 2, position)
        panel.set_title(title)
        return panel

    plotSpecshow(feature_sequence[0:40, :], fig, addPanel(1, 'Mel Coefficients'))
    addPanel(2, 'Energy').plot(feature_sequence[40, :])
    plotSpecshow(feature_sequence[41:82, :], fig, addPanel(3, 'Delta-1'))
    plotSpecshow(feature_sequence[82:, :], fig, addPanel(4, 'Delta-2'))
    


In [None]:
# Extracts the features of every record of an audio dataset (with a progress bar).
#   audio_ds[i]: (phone, wave, rate)
#   return[i]  : [phone, feature-sequence]
def makeFeatureDS(audio_ds):
    return [getFeatureRecord(audio_rec) for audio_rec in tqdm(audio_ds)]


# Build feature datasets for the plain training, augmented training, and test audio
Train_feat_ds     = makeFeatureDS(Train_audio_ds)
Train_aug_feat_ds = makeFeatureDS(Train_aug_audio_ds)
Test_feat_ds      = makeFeatureDS(Test_audio_ds)

In [None]:
# Visual check: plot the features of the first training record.
item = Train_feat_ds[0]
showFeatures(item[1])
del item

## Normalize

In [None]:
# Merges a list of arrays into a single 1D array.
# Every array is flattened (copied) first; the pieces are then joined in list order.
def makeFlattened(sequence_list):
    flat_parts = [seq.flatten() for seq in sequence_list]
    return np.concatenate(flat_parts).flatten()
    

# Returns (mean, standard deviation) over all values of the given features.
#   all_features: list of arrays, or an already flattened array
#   flattened   : pass True when all_features is already a flat array,
#                 which skips the merge-and-flatten step
def getMeanStd(all_features, flattened=False):
    values = all_features if flattened else makeFlattened(all_features)
    return np.mean(values), np.std(values)


# Show the histogram of the values in array on the current pyplot figure.
def plotHistogram(array):
    plt.hist(array, bins='auto', rwidth=0.8)    # bins='auto': the bin count is chosen automatically
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    
        
# Splits every feature array of a feature-dataset into its four parts.
#   feat_ds[i]: (phoneme, feature array)
# Returns four lists, each in dataset order:
#   all_mel    : rows 0-39
#   all_energy : row 40
#   all_delta1 : rows 41-81
#   all_delta2 : rows 82-122
def separateFeatures(feat_ds):
    all_mel, all_energy, all_delta1, all_delta2 = [], [], [], []
    for item in feat_ds:
        features = item[1]
        all_mel.append(features[0:40, :])
        all_energy.append(features[40, :])
        all_delta1.append(features[41:82, :])
        all_delta2.append(features[82:123, :])
    return all_mel, all_energy, all_delta1, all_delta2




# Test above functions: statistics and histogram of the test-split energy values
all_mel, all_energy, all_delta1, all_delta2 = separateFeatures(Test_feat_ds)
array = makeFlattened(all_energy)
# array is already flat. Say so explicitly; otherwise getMeanStd flattens it
# again, one scalar at a time.
mean, std = getMeanStd(array, flattened=True)
plotHistogram(array)
print('mean:', mean, "   std:", std)
del all_mel, all_energy, all_delta1, all_delta2
del array, mean, std

In [None]:
# Given a list of features, returns dynamic range-related parameters:
#   (min_all, max_all, min_abs, max_abs, min_abs_0, mean, std)
#   min_all, max_all : smallest / largest value
#   min_abs, max_abs : smallest / largest absolute value
#   min_abs_0        : smallest absolute value among the non-zero values
#   mean, std        : mean and standard deviation of all values
def getDynRange(all_features):
    flat = makeFlattened(all_features)
    magnitudes = np.abs(flat)
    min_all, max_all = np.min(flat), np.max(flat)
    min_abs, max_abs = np.min(magnitudes), np.max(magnitudes)
    min_abs_0 = np.min(magnitudes[flat != 0])    # zeros are left out of this minimum
    mean, std = getMeanStd(flat, flattened=True)
    return min_all, max_all, min_abs, max_abs, min_abs_0, mean, std


# Prints the dynamic range parameters, one labelled value per line.
# The arguments come in the order returned by getDynRange.
def printDynRangeParams(min_all, max_all, min_abs, max_abs, min_abs_0, mean, std):
    labelled_values = (
        ('min_all  :', min_all),
        ('max_all  :', max_all),
        ('min_abs  :', min_abs),
        ('max_abs  :', max_abs),
        ('min_abs_0:', min_abs_0),
        ('mean     :', mean),
        ('std      :', std),
    )
    for label, value in labelled_values:
        print(label, value)

    
# Prints the dynamic range of a feature-dataset: first for the entire feature
# arrays, then separately for the mel-coefficients, energy, delta-1 and delta-2.
#   feat_ds[i]: (phoneme, feature array)
def printAllRange(feat_ds):
    all_feat = [item[1] for item in feat_ds]
    all_mel, all_energy, all_delta1, all_delta2 = separateFeatures(feat_ds)

    # (header, features) pairs in print order; every header but the first starts
    # with a blank line
    sections = (
        ('[all_feat]', all_feat),
        ('\n[all_mel]', all_mel),
        ('\n[all_energy]', all_energy),
        ('\n[all_delta1]', all_delta1),
        ('\n[all_delta2]', all_delta2),
    )
    for header, features in sections:
        range_params = getDynRange(features)
        print(header)
        printDynRangeParams(*range_params)
    

    
    
# Print the dynamic ranges before normalization, starting with the test split
print('Dynamic Ranges before normalization')
print('---- Test_feat_ds ----')
printAllRange(Test_feat_ds)

In [None]:
# Dynamic ranges of the training features before normalization.
print('---- Train_feat_ds ----')
printAllRange(Train_feat_ds)

In [None]:
# Dynamic ranges of the augmented training features before normalization.
print('---- Train_aug_feat_ds ----')
printAllRange(Train_aug_feat_ds)

In [None]:
from copy import deepcopy


# Given a feature dataset, returns the mean and std of each feature subset:
#   ((mel_mean, mel_std), (eng_mean, eng_std), (d1_mean, d1_std), (d2_mean, d2_std))
# The subsets are those produced by separateFeatures, in the same order.
def getNormalizationParams(feature_ds):
    return tuple(getMeanStd(subset) for subset in separateFeatures(feature_ds))


# Normalizes each feature subset as (value - mean) / std and returns the result
# as a new dataset; the dataset passed in is left untouched.
#   feature_ds[i] : (phoneme, feature array)
#   normal_params : ((mel_mean, mel_std), (eng_mean, eng_std), (d1_mean, d1_std), (d2_mean, d2_std))
# A std of exactly zero is replaced by a small constant to avoid division by zero.
def normalizeFeatures(feature_ds, normal_params):
    mel_stats, eng_stats, d1_stats, d2_stats = normal_params
    eps = 1e-4            # replacement for zero
    normalized_ds = deepcopy(feature_ds)    # don't modify original dataset

    # Rescales the given rows of every feature array of the copy, in place
    def rescale(rows, mean, std):
        divisor = eps if std == 0 else std    # avoid division by zero
        for record in normalized_ds:
            features = record[1]
            features[rows, :] = (features[rows, :] - mean) / divisor

    # Normalize mel-coefficients (rows 0-39)
    mean, std = mel_stats
    print(f'INFO: Normalizing mel-coefficients, mean: {mean}   std: {std}')
    rescale(slice(0, 40), mean, std)

    # Normalize energy (row 40)
    mean, std = eng_stats
    print(f'INFO: Normalizing energy, mean: {mean}   std: {std}')
    rescale(40, mean, std)

    # Normalize delta-1 (rows 41-81)
    mean, std = d1_stats
    print(f'INFO: Normalizing delta-1, mean: {mean}   std: {std}')
    rescale(slice(41, 82), mean, std)

    # Normalize delta-2 (rows 82-122)
    mean, std = d2_stats
    print(f'INFO: Normalizing delta-2, mean: {mean}   std: {std}')
    rescale(slice(82, 123), mean, std)

    return normalized_ds

In [None]:
# Normalize Train and Test features using Train normalization parameters:
# the statistics come from the plain training features only and are applied to
# all three feature datasets.
train_normal_params = getNormalizationParams(Train_feat_ds)
Test_norm_feat_ds = normalizeFeatures(Test_feat_ds, normal_params=train_normal_params)
Train_norm_feat_ds = normalizeFeatures(Train_feat_ds, normal_params=train_normal_params)
Train_norm_aug_feat_ds = normalizeFeatures(Train_aug_feat_ds, normal_params=train_normal_params)

In [None]:
# Print the dynamic ranges after normalization, starting with the test split
print('\nDynamic Ranges after normalization')
print('---- Test_norm_feat_ds ----')
printAllRange(Test_norm_feat_ds)

In [None]:
# Dynamic ranges of the training features after normalization.
print('---- Train_norm_feat_ds ----')
printAllRange(Train_norm_feat_ds)

In [None]:
# Dynamic ranges of the augmented training features after normalization.
print('---- Train_norm_aug_feat_ds ----')
printAllRange(Train_norm_aug_feat_ds)

In [None]:
# Keep the normalization parameters under their final name; saveFeatureDS and
# the export note read Normal_params.
Normal_params = train_normal_params
# Visual check: plot the normalized features of the first test record.
item = Test_norm_feat_ds[0]
showFeatures(item[1])
del item, train_normal_params

## Export Normalized Feature Datasets

In [None]:
# Delete extra datasets to save memory: only the normalized feature datasets
# are exported below.
del Train_ds, Train_audio_ds, Train_feat_ds
del Test_ds, Test_audio_ds, Test_feat_ds
del Train_aug_audio_ds, Train_aug_feat_ds

In [None]:
# Export the dataset with necessary information for the next notebook.
# The note records the record layout, the feature length, the normalization
# parameters and the feature extraction parameters (F_note).
normal_param_scheme = '(mel_mean, mel_std), (eng_mean, eng_std), (d1_mean, d1_std), (d2_mean, d2_std)'
note = f'''
Notes:
- Feature record: (phone, feature_sequence)
- Feature_seqence: list(feature_vector)
- Normalization params: {normal_param_scheme}
- len(feature_vector): {F_feat_len}

Normalization Parameters:
{Normal_params}

Features are extracted using following parameters''' + F_note

print(note)

In [None]:
# Saves a feature-dataset to save_path with torch.save.
# The saved dict bundles the note, the record schema, the normalization
# parameters (read from the module-level Normal_params) and the dataset itself.
def saveFeatureDS(feat_ds, save_path, note):
    export = {}
    export['note'] = note
    export['data-schema'] = '(phoneme, feature-sequence)'
    export['normal_params'] = Normal_params
    export['data'] = feat_ds
    torch.save(export, save_path)
    print(f'INFO: Saved {save_path}')

In [None]:
# Save the normalized test feature dataset
saveFeatureDS(Test_norm_feat_ds, './session/test-norm-features.pt', note)

In [None]:
# Save the normalized training feature dataset
saveFeatureDS(Train_norm_feat_ds, './session/train-norm-features.pt', note)

In [None]:
# Save the normalized augmented training feature dataset
saveFeatureDS(Train_norm_aug_feat_ds, './session/train-norm-aug-features.pt', note)

# Label-to-Index Mapping and Export

In [None]:
# Export Label-to-index map: set to True to write the map to disk in the last cell
export_labels = False


# Returns the set of all labels of a dataset.
# The label is the first field of every record, e.g. (label, feature-sequence).
def getAllLabels(audio_ds):
    return {item[0] for item in audio_ds}


# Build the labels dictionary.
# The audio datasets were deleted before the export step, so the labels are
# collected from the normalized feature datasets; their records also start
# with the phoneme label.
all_train_labels = getAllLabels(Train_norm_feat_ds)
all_test_labels  = getAllLabels(Test_norm_feat_ds)
print('all_train_labels:', len(all_train_labels), '\n', all_train_labels)
print('all_test_labels :', len(all_test_labels) , '\n', all_test_labels)

print('')
# Sort before enumerating: the iteration order of a set of strings is not stable
# between sessions, which would give the same label a different index on every run.
Label_to_index = {label:index for index, label in enumerate(sorted(all_train_labels))}
Index_to_label = {index:label for label, index in Label_to_index.items()}
print('Label_to_index:\n', Label_to_index)
print('Index_to_label:\n', Index_to_label)

In [None]:
# Write the label-to-index map to disk only when exporting was requested above.
if export_labels:
    save_path = './session/label-to-index.pt'
    torch.save(Label_to_index, save_path)
    # Notebook shell command: list the session directory to confirm the file was written
    !ls -ltrh ./session