To run this notebook, please just go from the top cell all the way to the bottom cell sequentially.
All model structures are detailed below, with hyperparameters being tuned and tested with wandb record on.

# Installs

In [1]:
%pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchtext==0.14.1 torchaudio==0.13.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -q
!pip install wandb --quiet
!pip install python-Levenshtein -q
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget -q
%cd ctcdecode
!pip install . -q
%cd ..
!pip install torchsummaryX -q
!pip install torchsummaryx==1.1.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m909.7 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.3/24.3 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

# Imports

In [2]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torchaudio.transforms as tat
import torchaudio
from sklearn.metrics import accuracy_score
import gc
import zipfile
import pandas as pd
from tqdm import tqdm
import os
import wandb
import datetime
import ctcdecode # imports for decoding and distance calculation
import Levenshtein
from ctcdecode import CTCBeamDecoder
import warnings
warnings.filterwarnings('ignore')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


# Kaggle Setup

In [3]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8 -q
!mkdir /root/.kaggle
with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username": "dunhan777", "key": "8fd87309c15da3fc8173ced30a2dfa76"}') # TODO: Put your kaggle username & key here
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c hw3p2asr-s24
'''
This will take a couple minutes, but you should see at least the following:
11-785-s24-hw3p2  ctcdecode  hw3p2asr-s24.zip  sample_data
'''
!unzip -q hw3p2asr-s24.zip
!ls

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
Downloading hw3p2asr-s24.zip to /content
 99% 3.72G/3.74G [00:30<00:00, 221MB/s]
100% 3.74G/3.74G [00:30<00:00, 130MB/s]
11-785-s24-hw3p2  best_early_submission.pth  ctcdecode	hw3p2asr-s24.zip  sample_data


# Dataset and Dataloader

In [4]:
CMUdict_ARPAbet = {
    "" : " ",
    "[SIL]": "-", "NG": "G", "F" : "f", "M" : "m", "AE": "@",
    "R"    : "r", "UW": "u", "N" : "n", "IY": "i", "AW": "W",
    "V"    : "v", "UH": "U", "OW": "o", "AA": "a", "ER": "R",
    "HH"   : "h", "Z" : "z", "K" : "k", "CH": "C", "W" : "w",
    "EY"   : "e", "ZH": "Z", "T" : "t", "EH": "E", "Y" : "y",
    "AH"   : "A", "B" : "b", "P" : "p", "TH": "T", "DH": "D",
    "AO"   : "c", "G" : "g", "L" : "l", "JH": "j", "OY": "O",
    "SH"   : "S", "D" : "d", "AY": "Y", "S" : "s", "IH": "I",
    "[SOS]": "[SOS]", "[EOS]": "[EOS]"
} # ARPABET PHONEME MAPPING DO NOT CHANGE
CMUdict = list(CMUdict_ARPAbet.keys())
ARPAbet = list(CMUdict_ARPAbet.values())
PHONEMES = CMUdict[:-2]
LABELS = ARPAbet[:-2]

### Train Data

In [5]:
class AudioDataset(torch.utils.data.Dataset):
    # For this homework, we give you full flexibility to design your data set class. Hint: The data from HW1 is very similar to this HW
    def __init__(self, directory, PHONEMES):
        # Initializes the dataset. INPUTS: What inputs do you need here? Load the directory and all files in them.
        self.mfcc_dir = directory + '/mfcc/' #TODO
        self.transcript_dir = directory + '/transcript/' #TODO
        self.mfcc_files = sorted(os.listdir(self.mfcc_dir)) #TODO
        self.transcript_files = sorted(os.listdir(self.transcript_dir)) #TODO
        self.PHONEMES = PHONEMES
        self.length = len(self.mfcc_files) # WHAT SHOULD THE LENGTH OF THE DATASET BE?
        # HOW CAN WE REPRESENT PHONEMES? CAN WE CREATE A MAPPING FOR THEM? HINT: TENSORS CANNOT STORE NON-NUMERICAL VALUES OR STRINGS
        # CREATE AN ARRAY OF ALL FEATUERS AND LABELS WHAT NORMALIZATION TECHNIQUE DID YOU USE IN HW1? CAN WE USE IT HERE?
        # You may decide to do this in __getitem__ if you wish. However, doing this here will make
        # the __init__ function take the load of loading the data, and shift it away from training.
        self.mfccs = []
        self.transcripts = []
        for i in range(len(self.mfcc_files)):
            self.mfccs.append(np.load(self.mfcc_dir + self.mfcc_files[i]))
            transcript  = np.load(self.transcript_dir + self.transcript_files[i])[1:-1] # remove sos & eos
            self.transcripts.append([self.PHONEMES.index(j) for j in transcript])
    def __len__(self):
        return self.length
    def __getitem__(self, ind):
        # RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS
        # If you didn't do the loading and processing of the data in __init__, do that here. Once done, return a tuple of features and labels.
        return torch.FloatTensor(self.mfccs[ind]), torch.tensor(self.transcripts[ind])
    def collate_fn(self, batch):
        '''
        1.  Extract the features and labels from 'batch'
        2.  We will additionally need to pad both features and labels, look at pytorch's docs for pad_sequence
        3.  This is a good place to perform transforms, if you so wish. Performing them on batches will speed the process up a bit.
        4.  Return batch of features, labels, lenghts of features, and lengths of labels.
        '''
        batch_mfcc = [mfcc for mfcc, _ in batch] # TODO batch of input mfcc coefficients
        batch_transcript = [ts for _, ts in batch] # TODO batch of output phonemes
        # HINT: CHECK OUT -> pad_sequence (imported above), also be sure to check the input format (batch_first)
        mfcc_all = [len(mfcc) for mfcc, _ in batch]
        ts_all = [len(ts) for _, ts in batch]
        audio_transforms = nn.Sequential(
            PermuteBlock(),
            torchaudio.transforms.FrequencyMasking(freq_mask_param=5),
            torchaudio.transforms.TimeMasking(time_mask_param=100),
            PermuteBlock()
        )
        # You may apply some transformation, Time and Frequency masking, here in the collate function;
        # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
        #                  -> Would we apply transformation on the validation set as well?
        #                  -> Is the order of axes / dimensions as expected for the transform functions?
        # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
        padded_features = audio_transforms(pad_sequence(batch_mfcc,batch_first=True))
        padded_labels = pad_sequence(batch_transcript,batch_first=True)
        return padded_features, padded_labels, torch.tensor(mfcc_all), torch.tensor(ts_all)
class PermuteBlock(torch.nn.Module):
    def forward(self, x):
        return x.transpose(1, 2)

In [6]:
class AudioDatasetValid(torch.utils.data.Dataset):
    # For this homework, we give you full flexibility to design your data set class. Hint: The data from HW1 is very similar to this HW
    def __init__(self, directory, PHONEMES):
        # Initializes the dataset. INPUTS: What inputs do you need here? Load the directory and all files in them.
        self.mfcc_dir = directory + '/mfcc/' #TODO
        self.transcript_dir = directory + '/transcript/' #TODO
        self.mfcc_files = sorted(os.listdir(self.mfcc_dir)) #TODO
        self.transcript_files = sorted(os.listdir(self.transcript_dir)) #TODO
        self.PHONEMES = PHONEMES
        self.length = len(self.mfcc_files) # WHAT SHOULD THE LENGTH OF THE DATASET BE?
        # HOW CAN WE REPRESENT PHONEMES? CAN WE CREATE A MAPPING FOR THEM? HINT: TENSORS CANNOT STORE NON-NUMERICAL VALUES OR STRINGS
        # CREATE AN ARRAY OF ALL FEATUERS AND LABELS WHAT NORMALIZATION TECHNIQUE DID YOU USE IN HW1? CAN WE USE IT HERE?
        # You may decide to do this in __getitem__ if you wish. However, doing this here will make
        # the __init__ function take the load of loading the data, and shift it away from training.
        self.mfccs = []
        self.transcripts = []
        for i in range(len(self.mfcc_files)):
            self.mfccs.append(np.load(self.mfcc_dir + self.mfcc_files[i]))
            transcript  = np.load(self.transcript_dir + self.transcript_files[i])[1:-1] # remove sos & eos
            self.transcripts.append([self.PHONEMES.index(j) for j in transcript])
    def __len__(self):
        return self.length
    def __getitem__(self, ind):
        # RETURN THE MFCC COEFFICIENTS AND ITS CORRESPONDING LABELS
        # If you didn't do the loading and processing of the data in __init__, do that here. Once done, return a tuple of features and labels.
        return torch.FloatTensor(self.mfccs[ind]), torch.tensor(self.transcripts[ind])
    def collate_fn(self, batch):
        '''
        1.  Extract the features and labels from 'batch'
        2.  We will additionally need to pad both features and labels, look at pytorch's docs for pad_sequence
        3.  This is a good place to perform transforms, if you so wish. Performing them on batches will speed the process up a bit.
        4.  Return batch of features, labels, lenghts of features, and lengths of labels.
        '''
        batch_mfcc = [mfcc for mfcc, _ in batch] # TODO batch of input mfcc coefficients
        batch_transcript = [ts for _, ts in batch] # TODO batch of output phonemes
        # HINT: CHECK OUT -> pad_sequence (imported above), also be sure to check the input format (batch_first)
        mfcc_all = [len(mfcc) for mfcc, _ in batch]
        ts_all = [len(ts) for _, ts in batch]
        # You may apply some transformation, Time and Frequency masking, here in the collate function;
        # Food for thought -> Why are we applying the transformation here and not in the __getitem__?
        #                  -> Would we apply transformation on the validation set as well?
        #                  -> Is the order of axes / dimensions as expected for the transform functions?
        # Return the following values: padded features, padded labels, actual length of features, actual length of the labels
        padded_features = pad_sequence(batch_mfcc,batch_first=True)
        padded_labels = pad_sequence(batch_transcript,batch_first=True)
        return padded_features, padded_labels, torch.tensor(mfcc_all), torch.tensor(ts_all)

### Test Data

In [7]:
class AudioDatasetTest(torch.utils.data.Dataset):
  def __init__(self, directory):
      self.mfcc_dir = directory + '/mfcc/'
      self.mfcc_files = sorted(os.listdir(self.mfcc_dir))
      self.length = len(self.mfcc_files)
      self.mfccs = []
      for i in range(len(self.mfcc_files)):
        self.mfccs.append(np.load(self.mfcc_dir + self.mfcc_files[i]))
  def __len__(self):
      return self.length
  def __getitem__(self, ind):
      return torch.FloatTensor(self.mfccs[ind])
  def collate_fn(self, batch):
      batch_mfcc = [mfcc for mfcc in batch]
      mfcc_all = list(map(len, batch_mfcc))
      padded_features = pad_sequence(batch_mfcc,batch_first=True)
      return padded_features, torch.tensor(mfcc_all)

### Data loaders

In [8]:
config = {
    "beam_width" : 5,
    "lr"         : 0.002,
    "batch_size" : 64,
    "epochs"     : 50,
    "watch": True
}
gc.collect() # get me RAMMM!!!!
train_data = AudioDataset('/content/11-785-s24-hw3p2/train-clean-100', PHONEMES) #TODO Create objects for the dataset class
val_data =  AudioDatasetValid('/content/11-785-s24-hw3p2/dev-clean', PHONEMES) # TODO
test_data = AudioDatasetTest('/content/11-785-s24-hw3p2/test-clean') #TODO
train_loader = torch.utils.data.DataLoader(
    dataset     = train_data,
    shuffle     = True,
    num_workers = 8,
    batch_size  = config["batch_size"],
    pin_memory  = True,
    collate_fn = train_data.collate_fn)
val_loader = torch.utils.data.DataLoader(
    dataset     = val_data,
    shuffle     = False,
    num_workers = 4,
    batch_size  = config["batch_size"],
    pin_memory  = True,
    collate_fn = val_data.collate_fn)
test_loader = torch.utils.data.DataLoader(
    dataset     = test_data,
    shuffle     = False,
    num_workers = 2,
    batch_size  = 1,
    pin_memory  = True,
    collate_fn = test_data.collate_fn)
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Val dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(test_data.__len__(), len(test_loader)))
# sanity check
for data in train_loader:
    x, y, lx, ly = data
    print("sanity check:", x.shape, y.shape, lx.shape, ly.shape)
    break

Train dataset samples = 28539, batches = 446
Val dataset samples = 2703, batches = 43
Test dataset samples = 2620, batches = 2620
sanity check: torch.Size([64, 1688, 27]) torch.Size([64, 206]) torch.Size([64]) torch.Size([64])


### Pyramid Bi-LSTM (pBLSTM)

In [9]:
class pBLSTM(torch.nn.Module):
    '''
    Pyramidal BiLSTM: Read the write up/paper and understand the concepts and then write your implementation here. At each step:
    1. Pad your input if it is packed (Unpack it)
    2. Reduce the input length dimension by concatenating feature dimension (Tip: Write down the shapes and understand)
        (i) How should  you deal with odd/even length input? (ii) How should you deal with input length array (x_lens) after truncating the input?
    3. Pack your input
    4. Pass it into LSTM layer
    To make our implementation modular, we pass 1 layer at a time.
    '''
    def __init__(self, input_size, hidden_size):
        super(pBLSTM, self).__init__()
        # TODO: Initialize a single layer bidirectional LSTM with the given input_size and hidden_size
        self.blstm = nn.LSTM(input_size=2*input_size,hidden_size=hidden_size,num_layers=1,bidirectional=True,dropout=0.25,batch_first=True)
    def forward(self, x_packed): # x_packed is a PackedSequence
        x, unpacked = pad_packed_sequence(x_packed, batch_first=True) # TODO: Pad Packed Sequence
        # Call self.trunc_reshape() which downsamples the time steps of x and increases the feature dimensions as mentioned above
        x, x_lens = self.trunc_reshape(x, unpacked) # self.trunc_reshape will return 2 outputs.
        x = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False) # TODO: Pack Padded Sequence. What output(s) would you get?
        x, _ = self.blstm(x) # TODO: Pass the sequence through bLSTM
        return x
    def trunc_reshape(self, x, x_lens):
        # TODO: If you have odd number of timesteps, how can you handle it? (Hint: You can exclude them)
        x = x[:, :-1, :] if x.shape[1] % 2 != 0 else x
        # TODO: Reshape x, while reducing number of timesteps by a downsampling factor while increasing number of features by the same factor
        # TODO: Reduce lengths by the same downsampling factor
        return x.reshape(x.shape[0], x.shape[1]//2, x.shape[2]*2), x_lens//2

### Encoder

In [10]:
class lDropout(nn.Module):
    def __init__(self, p):
        super(lDropout, self).__init__()
        self.p = p
    def forward(self, x):
        if not self.training or not self.p: return x # turn it off during inference
        else:
          x, x_lens = pad_packed_sequence(x, batch_first=True)
          m = (x.new_empty(x.size(0), 1, x.size(2), requires_grad=False).bernoulli_(1 - self.p)) / (1 - self.p)
          return pack_padded_sequence(x*(m.expand_as(x)), x_lens, batch_first=True, enforce_sorted=False)
class Encoder(torch.nn.Module):
    # The Encoder takes utterances as inputs and returns latent feature representations
    def __init__(self, input_size, encoder_hidden_size):
        super(Encoder, self).__init__()
        self.dim = [128, 256]
        #TODO: You can use CNNs as Embedding layer to extract features. Keep in mind the Input dimensions and expected dimension of Pytorch CNN.
        self.embedding = torch.nn.Sequential(
            PermuteBlock(),
            nn.Conv1d(in_channels=input_size, out_channels=self.dim[0], kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(num_features=self.dim[0]),
            nn.GELU(),
            nn.Conv1d(in_channels=self.dim[0], out_channels=self.dim[1], kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(num_features=self.dim[1]),
            PermuteBlock())
        self.pBLSTMs = torch.nn.Sequential( # How many pBLSTMs are required?
            # TODO: Fill this up with pBLSTMs - What should the input_size be?
            # Hint: You are downsampling timesteps by a factor of 2, upsampling features by a factor of 2 and the LSTM is bidirectional)
            # Optional: Dropout/Locked Dropout after each pBLSTM (Not needed for early submission)
            # https://github.com/salesforce/awd-lstm-lm/blob/dfd3cb0235d2caf2847a4d53e1cbd495b781b5d2/locked_dropout.py#L5
            pBLSTM(input_size=self.dim[1], hidden_size=encoder_hidden_size),
            lDropout(0.4),
            pBLSTM(input_size=2*encoder_hidden_size, hidden_size=encoder_hidden_size),
            lDropout(0.25)
        )
    def forward(self, x, x_lens): # Where are x and x_lens coming from? The dataloader
        #TODO: Call the embedding layer and the Pack Padded Sequence
        x = self.embedding(x)
        x = pack_padded_sequence(x, x_lens.clamp(max=x.shape[1]), batch_first=True, enforce_sorted=False)
        x = self.pBLSTMs(x) #TODO: Pass Sequence through the pyramidal Bi-LSTM layer
        encoder_outputs, encoder_lens = pad_packed_sequence(x, batch_first=True) #TODO: Pad Packed Sequence
        return encoder_outputs, encoder_lens

### Decoder

In [11]:
class Decoder(torch.nn.Module):
    def __init__(self, embed_size, output_size=41):
        super().__init__()
        self.mlp = torch.nn.Sequential(
            PermuteBlock(), 
            torch.nn.BatchNorm1d(embed_size), 
            PermuteBlock(),
            #TODO define your MLP arch. Refer HW1P2. Use Permute Block before and after BatchNorm1d() to match the size
            nn.Linear(embed_size, 2048),
            nn.GELU(),
            PermuteBlock(), 
            torch.nn.BatchNorm1d(2048), 
            PermuteBlock(),
            nn.Dropout(0.25),
            nn.Linear(2048, 1024),
            nn.GELU(),
            PermuteBlock(), 
            torch.nn.BatchNorm1d(1024), 
            PermuteBlock(),
            nn.Dropout(0.25),
            nn.Linear(1024, output_size)
        )
        self.softmax = torch.nn.LogSoftmax(dim=2)
    def forward(self, encoder_out):
        #TODO call your MLP, think what should be the final output of the decoder for the classification
        return self.softmax(self.mlp(encoder_out))
class ASRModel(torch.nn.Module):
    def __init__(self, input, embed=192, output=len(PHONEMES)):
        super().__init__()
        self.encoder = Encoder(input, embed) # TODO: Initialize Encoder
        self.decoder = Decoder(embed*2, output) # TODO: Initialize Decoder
    def forward(self, x, lengths_x):
        encoder_out, encoder_lens = self.encoder(x, lengths_x)
        decoder_out = self.decoder(encoder_out)
        return decoder_out, encoder_lens

# Initialize model

In [12]:
model = ASRModel(input=27, embed=512, output=len(PHONEMES)).to(device)
criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=False) # Define CTC loss as the criterion
# CTC Loss: https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html
optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr']) # What goes in here?
# Declare the decoder. Use the CTC Beam Decoder to decode phonemes
decoder = CTCBeamDecoder(LABELS, beam_width=config["beam_width"], log_probs_input=True) # TODO Doc: https://github.com/parlance/ctcdecode
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=4, threshold=1e-2) #TODO
scaler = torch.cuda.amp.GradScaler() # Mixed Precision, if you need it

# Decode Prediction

In [13]:
def decode_prediction(output, output_lens, decoder, PHONEME_MAP=LABELS):
    # TODO: look at docs for CTC.decoder and find out what is returned here. Check the shape of output and expected shape in decode.
    beam, _, _, seq = decoder.decode(output, seq_lens=output_lens) #lengths - list of lengths
    pred_strings = []
    for i in range(output_lens.shape[0]):
        #TODO: Create the prediction from the output of decoder.decode. Don't forget to map it using PHONEMES_MAP.
        pred_strings.append(''.join([PHONEME_MAP[n] for n in beam[i][0][:seq[i][0]]]))
    return pred_strings
def calculate_levenshtein(output, label, output_lens, label_lens, decoder, PHONEME_MAP= LABELS): # y - sequence of integers
    dist            = 0
    batch_size      = label.shape[0]
    pred_strings    = decode_prediction(output, output_lens, decoder, PHONEME_MAP)
    for i in range(batch_size): # TODO: Get predicted string and label string for each element in the batch
        pred_string = pred_strings[i] #TODO
        label_string = ''.join([PHONEME_MAP[n] for n in label[i][:label_lens[i]]]) #TODO
        dist += Levenshtein.distance(pred_string, label_string)
    dist /= batch_size # TODO: Uncomment this, but think about why we are doing this
    return dist

# Test Implementation

In [14]:
model.eval()
for i, data in enumerate(val_loader, 0):
    x, y, lx, ly = data
    x, y = x.to(device), y.to(device)
    h, lh = model(x, lx)
    print(h.shape)
    print(calculate_levenshtein(h, y, lx, ly, decoder, LABELS))
    h = torch.permute(h, (1, 0, 2))
    print(h.shape, y.shape)
    loss = criterion(h, y, lh, ly)
    print(loss)
    break

torch.Size([64, 734, 41])
201.703125
torch.Size([734, 64, 41]) torch.Size([64, 265])
tensor(7.6629, device='cuda:0', grad_fn=<MeanBackward0>)


# WandB

In [None]:
if config['watch']:
  print("initializing wandb watch for current experiment")
  wandb.login(key="e82cd60c71ce53e010026113443de725b0d4fb58")
  run = wandb.init(
    name = "early-submission", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # run_id = "4219x9e1", # Insert specific run id here if you want to resume a previous run
    # resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw3p2-ablations", ### Project should be created in your wandb account
    config = config ### Wandb Config for your run
  )

initializing wandb watch for current experiment


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdunhanj[0m ([33mdunhan[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Train Functions

In [15]:
def train_model(model, train_loader, criterion, optimizer):
    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')
    total_loss = 0
    for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)
        with torch.cuda.amp.autocast():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)
        total_loss += loss.item()
        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))
        batch_bar.update() # Update tqdm bar
        # Another couple things you need for FP16.
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16
        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()
    batch_bar.close() # You need this to close the tqdm bar
    return total_loss / len(train_loader)
def validate_model(model, val_loader, decoder, phoneme_map= LABELS):
    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
    total_loss = 0
    vdist = 0
    for i, data in enumerate(val_loader):
        x, y, lx, ly = data
        x, y = x.to(device), y.to(device)
        with torch.inference_mode():
            h, lh = model(x, lx)
            h = torch.permute(h, (1, 0, 2))
            loss = criterion(h, y, lh, ly)
        total_loss += float(loss)
        vdist += calculate_levenshtein(torch.permute(h, (1, 0, 2)), y, lh, ly, decoder, phoneme_map)
        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))), dist="{:.04f}".format(float(vdist / (i + 1))))
        batch_bar.update()
        del x, y, lx, ly, h, lh, loss
        torch.cuda.empty_cache()
    batch_bar.close()
    total_loss = total_loss/len(val_loader)
    val_dist = vdist/len(val_loader)
    return total_loss, val_dist

## Training Setup

In [16]:
def save_model(model, optimizer, scheduler, metric, epoch, path):
    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : scheduler.state_dict(),
         metric[0]                  : metric[1],
         'epoch'                    : epoch},
         path
    )
def load_model(path, model, metric='valid_acc', optimizer=None, scheduler=None):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer != None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler != None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    return model

In [17]:
# This is for checkpointing, if you're doing it over multiple sessions
last_epoch_completed = 0
start = last_epoch_completed
end = config["epochs"]
best_lev_dist = float("inf") # if you're restarting from some checkpoint, use what you saw there.
best_model_path = '/content/best_early_submission.pth' #TODO set best model path

In [None]:
# model = load_model(best_model_path, model)
torch.cuda.empty_cache()
gc.collect()
#TODO: Please complete the training loop
for epoch in range(0, config['epochs']):
    print("Epoch: {}/{}".format(epoch+1, config['epochs']))
    curr_lr = float(optimizer.param_groups[0]['lr']) #TODO
    train_loss = train_model(model, train_loader, criterion, optimizer) #TODO
    valid_loss, valid_dist = validate_model(model, val_loader, decoder, phoneme_map=LABELS) #TODO
    scheduler.step(valid_dist)
    print("\ttrain_loss {:.04f}\t lr {:.05f}\t val_loss {:.04f}%\t val_dist {:.04f}".format(train_loss, curr_lr, valid_loss, valid_dist))
    if config['watch']:
      wandb.log({
          'train_loss': train_loss,
          'valid_dist': valid_dist,
          'valid_loss': valid_loss,
          'lr'        : curr_lr
      })
    if valid_dist <= best_lev_dist:
        best_lev_dist = valid_dist
        save_model(model, optimizer, scheduler, ['valid_dist', valid_dist], epoch, best_model_path)
        if config['watch']:
          wandb.save(best_model_path)
        print("the best model saved")
run.finish()

Epoch: 1/5




	train_loss 0.2194	 lr 0.00020	 val_loss 0.2581%	 val_dist 4.9390




the best model saved
Epoch: 2/5




	train_loss 0.2205	 lr 0.00020	 val_loss 0.2597%	 val_dist 4.8538
the best model saved
Epoch: 3/5




	train_loss 0.2192	 lr 0.00020	 val_loss 0.2611%	 val_dist 4.9212
Epoch: 4/5




	train_loss 0.2148	 lr 0.00020	 val_loss 0.2616%	 val_dist 4.9159
Epoch: 5/5




	train_loss 0.2135	 lr 0.00020	 val_loss 0.2603%	 val_dist 4.8491
the best model saved


VBox(children=(Label(value='218.135 MB of 218.135 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
lr,▁▁▁▁▁
train_loss,▇█▇▂▁
valid_dist,█▁▇▆▁
valid_loss,▁▄▇█▅

0,1
lr,0.0002
train_loss,0.21345
valid_dist,4.84913
valid_loss,0.26025


# Generate Predictions and Submit to Kaggle

In [19]:
best_model = load_model(best_model_path, model)
#TODO: Make predictions. Follow the steps below:
# 1. Create a new object for CTCBeamDecoder with larger (why?) number of beams
# 2. Get prediction string by decoding the results of the beam decoder
TEST_BEAM_WIDTH = 40
test_decoder =  CTCBeamDecoder(LABELS, beam_width = TEST_BEAM_WIDTH, log_probs_input = True)
results = []
best_model.eval()
for data in tqdm(test_loader):
    x, lx   = data
    x       = x.to(device)
    with torch.no_grad():
        h, lh = best_model(x, lx)
    prediction_string= decode_prediction(h, lh, test_decoder) # TODO call decode_prediction
    #TODO save the output in results array.
    results.extend(prediction_string)
    del x, lx, h, lh
    torch.cuda.empty_cache()

100%|██████████| 2620/2620 [03:24<00:00, 12.84it/s]


In [20]:
data_dir = "/content/11-785-s24-hw3p2/test-clean/random_submission.csv"
df = pd.read_csv(data_dir)
df.label = results
df.to_csv('submission.csv', index=False)

In [21]:
!kaggle competitions submit -c hw3p2asr-s24 -f submission.csv -m "I made it!"

100% 209k/209k [00:01<00:00, 114kB/s]
Successfully submitted to HW3P2_ASR-S24