# LSTM-250 Numpy Model Extraction

## Goals

- Load the dataset and the model exported by the training notebook and validate them.
- Translate the model into NumPy operations and validate the translation.
- Update the dataset with the predicted index from the NumPy model.
- Export the model and the dataset as sqlite3 databases for implementation in C.

**NOTE:** The dataset exported by the training notebook may contain stale predicted indices, because the model was retrained several times without the dataset being updated. We re-run the predictions here and update the predicted index in the dataset.

# Environment Setup

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import os
import numpy as np
import IPython.display as ipd
from tqdm.auto import tqdm


# CUDA auto-selection (torch.cuda.is_available()) is disabled in this notebook;
# the device is fixed to the CPU.
device = 'cpu'
print('Using PyTorch version:', torch.__version__)

Using PyTorch version: 2.0.1


# Load and Validate torch.nn.Module Implementation

## Define Model

**NOTE:** Always copy the following cell from the training notebook.

In [2]:
import torch.nn.utils.rnn as rnn_utils


# LSTM model definition
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer that classifies a whole sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.debug = False    # Set it to true to print debug info
        # Dropout is applied between the stacked layers and is inactive in eval() mode
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, pad_seq, lengths):
        """Classify each sequence of a padded batch.

        Args:
            pad_seq: padded batch tensor, batch first: (batch, max_len, input_size).
            lengths: valid length of each sequence in the batch; handed to
                pack_padded_sequence, which requires it on the CPU when it is a tensor.

        Returns:
            Output of the linear layer for the last valid time step of every
            sequence, one row per batch element.
        """
        if self.debug: print('DEBUG START: LSTM model ---')

        # Extract batch size for initialization of hidden state
        batch_size = pad_seq.size(0)

        # Zero initial hidden and cell states, created on the device/dtype of the
        # input so the module does not depend on a module-level `device` global
        h0 = pad_seq.new_zeros(self.num_layers, batch_size, self.hidden_size)
        c0 = pad_seq.new_zeros(self.num_layers, batch_size, self.hidden_size)

        # Convert padded sequence to variable length packed sequence for LSTM,
        # so the padding values are never fed to the LSTM
        packed_seq = rnn_utils.pack_padded_sequence(pad_seq, lengths, enforce_sorted=False, batch_first=True)

        # Forward propagate LSTM, returns a packed sequence
        out_packed, _ = self.lstm(packed_seq, (h0, c0))

        # Extract final hidden states of each sequence for the output layer
        out_pad, out_lens = rnn_utils.pad_packed_sequence(out_packed, batch_first=True)
        out_indx = (out_lens - 1).to(out_pad.device)   # indices of the last valid hidden state in the padded sequence
        batch_indx = torch.arange(batch_size, device=out_pad.device)
        last_hidden = out_pad[batch_indx, out_indx].contiguous()  # last valid state of each sequence

        if self.debug:
            print('last_hidden size:', last_hidden.size())
            print('last_hidden:\n', last_hidden)

        # Decode the hidden state of the last time step only (for whole batch)
        out = self.fc(last_hidden)
        if self.debug: print('DEBUG END: LSTM model ---')
        return out

## Load Saved Model

In [3]:
!ls -ltrh ./session/
print('')

# Load saved model dictionary
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model_path = './session/trained-lstm250.pt'
# map_location='cpu' remaps tensors that were saved from a GPU session, so the
# checkpoint also loads on a machine without CUDA.
model_dict = torch.load(model_path, map_location='cpu')
print(model_dict.keys())


# Parse the values for easier use
Accuracy = model_dict['accuracy']
Correct_count = model_dict['correct_count']
Index_to_label = model_dict['index_to_label']
Label_to_index = {label:index for index, label in Index_to_label.items()}   # inverse mapping
Hparam = model_dict['Hparam']
Model_state_dict = model_dict['state_dict']
Model_perf = f'Model Performance:   accuracy: {Accuracy:.2f}%   correct_count: {Correct_count}'  # to be used later
print('Hparam:', Hparam)
print('Model_perf:', Model_perf)


# Instantiate the model with the saved hyper-parameters and restore its weights
model_pt = LSTM(Hparam['input_size'], Hparam['hidden_size'], Hparam['num_layers'], Hparam['num_classes'])
model_pt.load_state_dict(Model_state_dict)
model_pt.to('cpu')
model_pt.eval()     # we are always evaluating here
print(model_pt)

total 1.7G
-rw-rw-r-- 1 makabir makabir  310 Jun 24 16:57 README.md
-rw-rw-r-- 1 makabir makabir 1.3M Jun 26 12:52 curated-dataset.pt
-rw-rw-r-- 1 makabir makabir 5.4M Jun 27 18:24 trained-lstm250-83.09p.pt
-rw-rw-r-- 1 makabir makabir  829 Jun 27 18:30 label-to-index.pt
-rw-rw-r-- 1 makabir makabir 167M Jun 28 23:12 test-features.pt
-rw-rw-r-- 1 makabir makabir 1.2G Jun 28 23:15 train-features.pt
-rw-rw-r-- 1 makabir makabir 125M Jun 28 23:48 Test_feat.pt
-rw-rw-r-- 1 makabir makabir 5.4M Jun 28 23:48 trained-lstm250-82.26p.pt
-rw-rw-r-- 1 makabir makabir 125M Jun 28 23:49 Test_feat-82.26p.pt
drwxrwxr-x 2 makabir makabir 4.0K Jun 29 13:23 backup
-rw-rw-r-- 1 makabir makabir 5.4M Jun 29 15:14 trained-lstm250.pt
-rw-rw-r-- 1 makabir makabir 125M Jun 29 15:28 test-export-ds.pt

dict_keys(['accuracy', 'correct_count', 'index_to_label', 'Hparam', 'state_dict'])
Hparam: {'input_size': 123, 'hidden_size': 250, 'num_layers': 3, 'num_classes': 39}
Model_perf: Model Performance:   accuracy: 73.

In [4]:
# Delete the temporary loading names so later cells cannot use them by accident
del model_path, model_dict

## Load Saved Dataset

In [5]:
# Prints a two-line summary of one dataset item
# item: indexable record; fields 0-3 are printed as label, label_index,
#       predicted_index and sequence_length, field 4 must expose `.shape`
def print_dataItem(item):
    label, label_index, predicted_index, sequence_length, feature_sequence = (item[i] for i in range(5))
    summary = f"label: {label}, label_index: {label_index}, predicted_index: {predicted_index}, sequence_length: {sequence_length},"
    details = f"\nfeature_sequence shape: {feature_sequence.shape}, feature_seq type: {type(feature_sequence)}"
    print(summary + " " + details)

    
# Load the test dataset
# The loaded object is used below as a dict with the keys
# 'label_dict', 'dataset_schema' and 'dataset'.
ds_path = "./session/test-export-ds.pt"
DS_loaded = torch.load(ds_path)
print(DS_loaded.keys())
print('DS_loaded len:', len(DS_loaded['dataset']))
print('schema:', DS_loaded['dataset_schema'])


# Make sure the label-to-index dictionary matches the one in the model
# Only the labels present in the dataset dictionary are compared; a label that
# is missing from the model mapping raises KeyError on the lookup.
for key in DS_loaded['label_dict']:
    assert DS_loaded['label_dict'][key] == Label_to_index[key], 'Dataset and Model Label-to-index are different'
print('INFO: Dataset and Model label-to-index matched')


# Show the information of the first dataset item
item = DS_loaded['dataset'][0]
print_dataItem(item)

dict_keys(['label_dict', 'dataset_schema', 'dataset'])
DS_loaded len: 7333
schema: (label, label_index, predicted_index, sequence_length, feature_sequence)
INFO: Dataset and Model label-to-index matched
label: h#, label_index: 6, predicted_index: 6, sequence_length: 38, 
feature_sequence shape: (38, 123), feature_seq type: <class 'numpy.ndarray'>


In [6]:
# Delete the temporary names of the loading cell so later cells cannot use them by accident
del ds_path, key, item

## Validate Loaded Model

In [7]:
# Builds one batch <tensor> out of a <list> of variable length sequences.
# Shorter sequences are filled with zeros up to the longest one (batch first).
# The padding values are not passed to the LSTM during training/testing.
def pad_sequence_lstm(batch):
    return rnn_utils.pad_sequence(batch, padding_value=0., batch_first=True)


# Turns a list of (label, feature_sequence) pairs into the tensors of one batch.
# The feature sequences are numpy arrays; they are converted to tensors and
# zero-padded into a single batch tensor, and the labels are encoded as indices
# through Label_to_index.
# Returns (padded feature tensor, target indices, sequence lengths);
# the lengths are needed for pack_padded_sequence in LSTM.forward()
def collate_fn_lstm(batch):
    # Convert every feature sequence once and reuse it for the length lookup
    seq_tensors = [torch.from_numpy(feat_seq) for _, feat_seq in batch]
    target_indices = [Label_to_index[label] for label, _ in batch]
    seq_lengths = [seq.size()[0] for seq in seq_tensors]

    # Group the list of tensors into a batched tensor
    tensors = pad_sequence_lstm(seq_tensors)
    targets = torch.tensor(target_indices)
    lengths = torch.tensor(seq_lengths)
    return tensors, targets, lengths

In [8]:
# Returns the index of the largest value along the last dimension,
# i.e. the most likely label index for each element in the batch
def get_likely_index(tensor):
    return torch.argmax(tensor, dim=-1)


# Given an item from the test dataset, returns an example for the predict functions
# data_item: dataset record; field 0 is the label and field 4 the feature sequence
# Returns the (label, feature_sequence) pair
def make_example(data_item):
    label = data_item[0]      # read from the argument, not from a global `item`
    feat_seq = data_item[4]
    return label, feat_seq


# Return the predicted label index for one example using an nn.Module instance
# example: (label, feature_sequence) pair; the feature sequence is a numpy array
# model:   module called as model(padded_batch, lengths); required
# Raises ValueError if no model is given
def predictNN(example, model=None):
    if model is None:
        raise ValueError('predictNN() requires a model instance')
    model.eval()
    batch = [example]   # make a batch with single example
    tensor, _, lengths = collate_fn_lstm(batch)   # the target index is not needed for prediction
    # Inference only: no autograd graph is needed for the forward pass
    with torch.no_grad():
        output = model(tensor, lengths)
    pred = get_likely_index(output)[0]   # indexing to get the prediction from batch
    return pred.item()


# Run a prediction on a single dataset item as a quick sanity check
select_index = 1006   # position of the dataset item used for the check
item = DS_loaded['dataset'][select_index]
example = make_example(item)
pred_index = predictNN(example, model=model_pt)
pred_label  = Index_to_label[pred_index]   # map the predicted index back to its label
phone, *_, feat_seq = item   # first field: expected label, last field: feature sequence
print(f"Expected: {phone}. Predicted: {pred_label}.")

Expected: iy. Predicted: iy.


In [9]:
# Runs predict_fn over every item of the loaded dataset and reports the accuracy
# model:      model object forwarded to predict_fn through the `model` keyword
# predict_fn: callable invoked as predict_fn(example, model=model), returns a label index
# Returns (accuracy in percent, correct_count, expect_miss, total_count)
def validateModel(model, predict_fn):
    correct_count = 0
    expect_miss = 0      # predictions that differ from the predicted index stored in the dataset
    total_count = 0
    for total_count, record in enumerate(tqdm(DS_loaded['dataset']), start=1):
        _, true_index, stored_pred, _, _ = record
        pred = predict_fn(make_example(record), model=model)
        if pred == true_index:
            correct_count += 1   # prediction matched the actual label-index
        if pred != stored_pred:
            expect_miss += 1     # prediction does not match prediction in dataset
    # Compute and print statistics
    accuracy = 100.0 * correct_count / total_count
    print(f'Validation accuracy: {accuracy:.2f}%   correct_count: {correct_count}   expected-miss: {expect_miss}   total_count: {total_count}')
    return accuracy, correct_count, expect_miss, total_count

            
# Validate the loaded model on the whole dataset, then print the performance
# string built from the checkpoint (Model_perf) for comparison
validateModel(model_pt, predictNN)
print('Expected', Model_perf)

  0%|          | 0/7333 [00:00<?, ?it/s]

Validation accuracy: 73.03%   correct_count: 5355   expected-miss: 0   total_count: 7333
Expected Model Performance:   accuracy: 73.03%   correct_count: 5355


# Implementation Using torch.tensor Operations

# Implement Using Numpy Matrix Operations

# Update the dataset with the Numpy Model Predicted index

---

# Export Numpy Model as sqlite3 DB

# Export the Dataset as sqlite3 DB