In [1]:
import os
import sys

import numpy as np
import torch

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.io.wavfile as wav

import importlib
import librosa
import soundfile as sf

import torchnet.meter as tnt
from collections import OrderedDict

from IPython.display import clear_output

sys.path.append(os.path.join(os.getcwd(), '..'))
sys.path.append(os.path.join(os.getcwd(), '../models/'))
sys.path.append(os.path.join(os.getcwd(), '../datasets/'))

import ai8x

from types import SimpleNamespace

In [2]:
trained_checkpoint_path = os.path.join("/home/merveeyuboglu/Github/ai8x-training-merve/ai8x-training/logs/train_kws35/2024.05.23-103316/qat_best.pth.tar")
mod_qat = importlib.import_module("ai85net-kws20-nas")
dataset = importlib.import_module("kws20")

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Working with device:", device)

ai8x.set_device(device=85, simulate=False, round_avg=False)
# qat_policy = {'start_epoch': 10, 'weight_bits': 8, 'bias_bits': 8}

Working with device: cuda
Configuring device: MAX78000, simulate=False.


In [4]:
model = mod_qat.AI85KWS20NetNAS(num_classes=35, num_channels=128, dimensions=(128, 1), bias=True, 
                           quantize_activation=False)


checkpoint = torch.load(trained_checkpoint_path)

state_dict = checkpoint['state_dict']
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    if k.startswith('module.'):
        k = k[7:]
    new_state_dict[k] = v
checkpoint['state_dict'] = new_state_dict

ai8x.fuse_bn_layers(model)

model.load_state_dict(checkpoint['state_dict'], strict=False)

ai8x.update_model(model)

In [5]:
sn = SimpleNamespace()
sn.truncate_testset = False
sn.act_mode_8bit = False

_, test = dataset.KWS_35_get_datasets( ('/data_ssd', sn), load_train=False, load_test=True)


test_loader = torch.utils.data.DataLoader(test, batch_size=256)


Processing test...
test set: 11413 elements
Filtering silence elements...
test set: 11005 elements
Class backward (# 0): 165 elements
Class bed (# 1): 207 elements
Class bird (# 2): 185 elements
Class cat (# 3): 194 elements
Class dog (# 4): 220 elements
Class down (# 5): 406 elements
Class eight (# 6): 408 elements
Class five (# 7): 445 elements
Class follow (# 8): 172 elements
Class forward (# 9): 155 elements
Class four (# 10): 400 elements
Class go (# 11): 402 elements
Class happy (# 12): 203 elements
Class house (# 13): 191 elements
Class learn (# 14): 161 elements
Class left (# 15): 412 elements
Class marvin (# 17): 195 elements
Class nine (# 18): 408 elements
Class no (# 19): 405 elements
Class off (# 20): 402 elements
Class on (# 21): 396 elements
Class one (# 22): 399 elements
Class right (# 23): 396 elements
Class seven (# 24): 406 elements
Class sheila (# 25): 212 elements
Class six (# 27): 394 elements
Class stop (# 28): 411 elements
Class three (# 29): 405 elements
Class 

In [6]:
test.new_class_dict

{'backward': 0,
 'bed': 1,
 'bird': 2,
 'cat': 3,
 'dog': 4,
 'down': 5,
 'eight': 6,
 'five': 7,
 'follow': 8,
 'forward': 9,
 'four': 10,
 'go': 11,
 'happy': 12,
 'house': 13,
 'learn': 14,
 'left': 15,
 'marvin': 16,
 'nine': 17,
 'no': 18,
 'off': 19,
 'on': 20,
 'one': 21,
 'right': 22,
 'seven': 23,
 'sheila': 24,
 'six': 25,
 'stop': 26,
 'three': 27,
 'tree': 28,
 'two': 29,
 'up': 30,
 'visual': 31,
 'wow': 32,
 'yes': 33,
 'zero': 34,
 'UNKNOWN': 35}

In [14]:
classerr_test = tnt.ClassErrorMeter(accuracy=True, topk=(1, min(35, 5)))
test_confusion = tnt.ConfusionMeter(35)
outputs_all = np.zeros((len(test), 35))
targets_all = np.zeros((len(test)))
with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        outputs = model(inputs)
        classerr_test.add(outputs, targets)
        test_confusion.add(outputs, targets)
        #print("Batch: [",batch_idx*256 ,"/", len(val_dataset),"]")
        test_acc = classerr_test.value()[0]
        #print("Accuracy: ", acc)
        outputs_all[batch_idx*256:batch_idx*256+len(inputs)] = outputs.cpu().numpy()
        targets_all[batch_idx*256:batch_idx*256+len(inputs)] = targets.cpu().numpy()
    print("Total Accuracy: ", test_acc)

Total Accuracy:  92.90653580191835


In [7]:
classes_35 = [5, 11, 15, 18, 19, 20, 22, 26, 30, 33]

In [8]:
classerr_test = tnt.ClassErrorMeter(accuracy=True, topk=(1, min(11, 5)))
test_confusion = tnt.ConfusionMeter(11)
outputs_all = np.zeros((len(test), 11))
targets_all = np.zeros((len(test)))
with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        outputs = model(inputs)
        targets_12_idx = [idx for idx, i in enumerate(targets) if i not in classes_35]
        
        for i in targets_12_idx:
            targets[i] = 10

        outputs_12_idx = [idx for idx, i in enumerate(np.argmax(outputs, 1)) if i not in classes_35]
        outputs_12 = torch.empty((len(inputs), 11))
        for idx in range(len(inputs)):
        
            if targets[idx] == 5:
                targets[idx] = 0
            elif targets[idx] == 11:
                targets[idx] = 1
            elif targets[idx] == 15:
                targets[idx] = 2
            elif targets[idx] == 18:
                targets[idx] = 3
            elif targets[idx] == 19:
                targets[idx] = 4
            elif targets[idx] == 20:
                targets[idx] = 5
            elif targets[idx] == 22:
                targets[idx] = 6
            elif targets[idx] == 26:
                targets[idx] = 7
            elif targets[idx] == 30:
                targets[idx] = 8
            elif targets[idx] == 33:
                targets[idx] = 9
            else:
                targets[idx] = 10

            if idx in outputs_12_idx:
                outputs_12[idx][10] = 100
                for i in [0,1,2,3,4,5,6,7,8,9]:
                    outputs_12[idx][i] = 0
            else:
                outputs_12[idx][0] = outputs[idx][5]
                outputs_12[idx][1] = outputs[idx][11]
                outputs_12[idx][2] = outputs[idx][15]
                outputs_12[idx][3] = outputs[idx][18]
                outputs_12[idx][4] = outputs[idx][19]
                outputs_12[idx][5] = outputs[idx][20]
                outputs_12[idx][6] = outputs[idx][22]
                outputs_12[idx][7] = outputs[idx][26]
                outputs_12[idx][8] = outputs[idx][30]
                outputs_12[idx][9] = outputs[idx][33]
                outputs_12[idx][10] = -100

        outputs_12 = outputs_12.to(device)
        targets = targets.to(device)

        classerr_test.add(outputs_12, targets)
        test_confusion.add(outputs_12, targets)
        #print("Batch: [",batch_idx*256 ,"/", len(val_dataset),"]")
        test_acc = classerr_test.value()[0]
        #print("Accuracy: ", acc)
        outputs_all[batch_idx*256:batch_idx*256+len(inputs)] = outputs_12.cpu().numpy()
        targets_all[batch_idx*256:batch_idx*256+len(inputs)] = targets.cpu().numpy()
    print("Total Accuracy: ", test_acc)

Total Accuracy:  95.93820990458882


In [9]:
test_confusion.value()

array([[  9,   0,   0, ...,   0,   0,   0],
       [  0,  11,   0, ...,   0,   0,   0],
       [  0,   0,  10, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,  12,   0,   0],
       [  0,   0,   0, ...,   0, 407,   1],
       [  0,   0,   0, ...,   0,   0,  22]], dtype=int32)

In [15]:
np.disp(test_confusion.value())

[[  9   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  11   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  10   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  10   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   7   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   2   0   4   6 360   0   1   1   0   0  12   0   1   2   0   0   4
    8   0   0   0   0   0   1   0   3   0   0   0   1   0   0   0   0]
 [  0   0   0   0   0   0  34   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   

In [13]:
np.set_printoptions(threshold=sys.maxsize)

In [13]:
sum = 0
for i in range(35):
    sum += test_confusion.value()[i][i]
print(sum)
print(np.sum(test_confusion.value()))

IndexError: index 11 is out of bounds for axis 0 with size 11

In [7]:
classes = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']

In [6]:
classes_35 = [5, 11, 15, 18, 19, 20, 22, 26, 30, 33]

In [9]:
classerr_test = tnt.ClassErrorMeter(accuracy=True, topk=(1, min(11, 5)))
test_confusion = tnt.ConfusionMeter(11)
outputs_all = np.zeros((len(test), 11))
targets_all = np.zeros((len(test)))
with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        outputs = model(inputs)
        targets_12_idx = [idx for idx, i in enumerate(targets) if i not in classes_35]
        
        for i in targets_12_idx:
            targets[i] = 10

        outputs_12_idx = [idx for idx, i in enumerate(np.argmax(outputs, 1)) if i not in classes_35]
        outputs_12 = torch.empty((len(inputs), 11))
        for idx in range(len(inputs)):
        
            if targets[idx] == 5:
                targets[idx] = 0
            elif targets[idx] == 11:
                targets[idx] = 1
            elif targets[idx] == 15:
                targets[idx] = 2
            elif targets[idx] == 18:
                targets[idx] = 3
            elif targets[idx] == 19:
                targets[idx] = 4
            elif targets[idx] == 20:
                targets[idx] = 5
            elif targets[idx] == 22:
                targets[idx] = 6
            elif targets[idx] == 26:
                targets[idx] = 7
            elif targets[idx] == 30:
                targets[idx] = 8
            elif targets[idx] == 33:
                targets[idx] = 9
            else:
                targets[idx] = 10

            if idx in outputs_12_idx:
                outputs_12[idx][10] = 100
                for i in [0,1,2,3,4,5,6,7,8,9]:
                    outputs_12[idx][i] = 0
            else:
                outputs_12[idx][0] = outputs[idx][5]
                outputs_12[idx][1] = outputs[idx][11]
                outputs_12[idx][2] = outputs[idx][15]
                outputs_12[idx][3] = outputs[idx][18]
                outputs_12[idx][4] = outputs[idx][19]
                outputs_12[idx][5] = outputs[idx][20]
                outputs_12[idx][6] = outputs[idx][22]
                outputs_12[idx][7] = outputs[idx][26]
                outputs_12[idx][8] = outputs[idx][30]
                outputs_12[idx][9] = outputs[idx][33]
                outputs_12[idx][10] = -100

        outputs_12 = outputs_12.to(device)
        targets = targets.to(device)

        classerr_test.add(outputs_12, targets)
        test_confusion.add(outputs_12, targets)
        #print("Batch: [",batch_idx*256 ,"/", len(val_dataset),"]")
        test_acc = classerr_test.value()[0]
        #print("Accuracy: ", acc)
        outputs_all[batch_idx*256:batch_idx*256+len(inputs)] = outputs_12.cpu().numpy()
        targets_all[batch_idx*256:batch_idx*256+len(inputs)] = targets.cpu().numpy()
    print("Total Accuracy: ", test_acc)

Total Accuracy:  95.93820990458882


In [10]:
np.disp(test_confusion.value())

[[ 360   12    0    8    0    0    0    3    1    0   22]
 [   2  375    1    8    0    0    1    0    1    1   13]
 [   0    1  399    1    0    0    0    0    1    2    8]
 [   1    1    6  383    0    0    0    0    0    1   13]
 [   0    1    0    1  352    4    0    0   34    0   10]
 [   0    0    0    0   11  372    1    0    1    0   11]
 [   1    0    3    0    2    0  365    0    1    0   24]
 [   0    1    0    2    0    0    0  391    6    0   11]
 [   0    1    1    0   24    0    0    4  384    0   11]
 [   0    2    2    0    0    0    3    0    0  407    5]
 [  17   20   19   26   15   23   12   14   13    2 6770]]


## tests on benchmark dataset without silence class

In [11]:
sn = SimpleNamespace()
sn.truncate_testset = False
sn.act_mode_8bit = False

_, test_benchmark = dataset.KWS_12_benchmark_get_datasets( ('/data_ssd', sn), load_train=False, load_test=True)
benchmark_loader = torch.utils.data.DataLoader(test_benchmark, batch_size=256)


Processing test...
test set: 4891 elements
Class down (# 5): 406 elements
Class go (# 11): 402 elements
Class left (# 15): 412 elements
Class no (# 19): 405 elements
Class off (# 20): 402 elements
Class on (# 21): 396 elements
Class right (# 23): 396 elements
Class stop (# 28): 411 elements
Class up (# 32): 425 elements
Class yes (# 35): 419 elements
Class silence (# 26): 408 elements
Class UNKNOWN: 409 elements


In [12]:
test_benchmark.new_class_dict

{'down': 0,
 'go': 1,
 'left': 2,
 'no': 3,
 'off': 4,
 'on': 5,
 'right': 6,
 'stop': 7,
 'up': 8,
 'yes': 9,
 'silence': 10,
 'UNKNOWN': 11}

In [13]:
classerr_test = tnt.ClassErrorMeter(accuracy=True, topk=(1, min(11, 5)))
test_confusion = tnt.ConfusionMeter(11)
outputs_all = np.zeros((len(test), 11))
targets_all = np.zeros((len(test)))

with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(benchmark_loader):
        outputs = model(inputs)

        outputs_12_idx = [idx for idx, i in enumerate(np.argmax(outputs, 1)) if i not in classes_35]
        outputs_12 = torch.empty((len(inputs), 11))
        
        for idx in range(len(inputs)):

            if targets[idx] == 11:
                targets[idx] = 10

            if idx in outputs_12_idx:
                outputs_12[idx][10] = 100
                for i in [0,1,2,3,4,5,6,7,8,9]:
                    outputs_12[idx][i] = 0
            else:
                outputs_12[idx][0] = outputs[idx][5]
                outputs_12[idx][1] = outputs[idx][11]
                outputs_12[idx][2] = outputs[idx][15]
                outputs_12[idx][3] = outputs[idx][18]
                outputs_12[idx][4] = outputs[idx][19]
                outputs_12[idx][5] = outputs[idx][20]
                outputs_12[idx][6] = outputs[idx][22]
                outputs_12[idx][7] = outputs[idx][26]
                outputs_12[idx][8] = outputs[idx][30]
                outputs_12[idx][9] = outputs[idx][33]
                outputs_12[idx][10] = -100

        outputs_12 = outputs_12.to(device)
        targets = targets.to(device)

        classerr_test.add(outputs_12, targets)
        test_confusion.add(outputs_12, targets)
        test_acc = classerr_test.value()[0]
        outputs_all[batch_idx*256:batch_idx*256+len(inputs)] = outputs_12.cpu().numpy()
        targets_all[batch_idx*256:batch_idx*256+len(inputs)] = targets.cpu().numpy()
    print("Total Accuracy: ", test_acc)

Total Accuracy:  91.65814761807401


In [14]:
np.disp(test_confusion.value())

[[360  12   0   8   0   0   0   3   1   0  22]
 [  2 375   1   8   0   0   1   0   1   1  13]
 [  0   1 399   1   0   0   0   0   1   2   8]
 [  1   1   6 383   0   0   0   0   0   1  13]
 [  0   1   0   1 352   4   0   0  34   0  10]
 [  0   0   0   0  11 372   1   0   1   0  11]
 [  1   0   3   0   2   0 365   0   1   0  24]
 [  0   1   0   2   0   0   0 391   6   0  11]
 [  0   1   1   0  24   0   0   4 384   0  11]
 [  0   2   2   0   0   0   3   0   0 407   5]
 [  6  36   7   6   5   6   0  27  10  19 695]]


In [5]:
sn = SimpleNamespace()
sn.truncate_testset = False
sn.act_mode_8bit = False

_, test = dataset.KWS_35_get_datasets( ('/data_ssd', sn), load_train=False, load_test=True)


test_loader = torch.utils.data.DataLoader(test, batch_size=256)


Processing test...
test set: 4891 elements
Filtering silence elements...
test set: 4483 elements
Class backward (# 0): 9 elements
Class bed (# 1): 11 elements
Class bird (# 2): 10 elements
Class cat (# 3): 10 elements
Class dog (# 4): 7 elements
Class down (# 5): 406 elements
Class eight (# 6): 34 elements
Class five (# 7): 23 elements
Class follow (# 8): 9 elements
Class forward (# 9): 13 elements
Class four (# 10): 18 elements
Class go (# 11): 402 elements
Class happy (# 12): 21 elements
Class house (# 13): 10 elements
Class learn (# 14): 10 elements
Class left (# 15): 412 elements
Class marvin (# 17): 11 elements
Class nine (# 18): 24 elements
Class no (# 19): 405 elements
Class off (# 20): 402 elements
Class on (# 21): 396 elements
Class one (# 22): 25 elements
Class right (# 23): 396 elements
Class seven (# 24): 18 elements
Class sheila (# 25): 17 elements
Class six (# 27): 28 elements
Class stop (# 28): 411 elements
Class three (# 29): 26 elements
Class tree (# 30): 10 elements


In [7]:
classes_35 = [5, 11, 15, 18, 19, 20, 22, 26, 30, 33]

In [11]:
classerr_test = tnt.ClassErrorMeter(accuracy=True, topk=(1, min(11, 5)))
test_confusion = tnt.ConfusionMeter(11)
outputs_all = np.zeros((len(test), 11))
targets_all = np.zeros((len(test)))
with torch.no_grad():
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        outputs = model(inputs)
        targets_12_idx = [idx for idx, i in enumerate(targets) if i not in classes_35]
        
        for i in targets_12_idx:
            targets[i] = 10

        outputs_12_idx = [idx for idx, i in enumerate(np.argmax(outputs, 1)) if i not in classes_35]
        outputs_12 = torch.empty((len(inputs), 11))
        for idx in range(len(inputs)):
        
            if targets[idx] == 5:
                targets[idx] = 0
            elif targets[idx] == 11:
                targets[idx] = 1
            elif targets[idx] == 15:
                targets[idx] = 2
            elif targets[idx] == 18:
                targets[idx] = 3
            elif targets[idx] == 19:
                targets[idx] = 4
            elif targets[idx] == 20:
                targets[idx] = 5
            elif targets[idx] == 22:
                targets[idx] = 6
            elif targets[idx] == 26:
                targets[idx] = 7
            elif targets[idx] == 30:
                targets[idx] = 8
            elif targets[idx] == 33:
                targets[idx] = 9
            else:
                targets[idx] = 10

            if idx in outputs_12_idx:
                outputs_12[idx][10] = 100
                for i in [0,1,2,3,4,5,6,7,8,9]:
                    outputs_12[idx][i] = 0
            else:
                outputs_12[idx][0] = outputs[idx][5]
                outputs_12[idx][1] = outputs[idx][11]
                outputs_12[idx][2] = outputs[idx][15]
                outputs_12[idx][3] = outputs[idx][18]
                outputs_12[idx][4] = outputs[idx][19]
                outputs_12[idx][5] = outputs[idx][20]
                outputs_12[idx][6] = outputs[idx][22]
                outputs_12[idx][7] = outputs[idx][26]
                outputs_12[idx][8] = outputs[idx][30]
                outputs_12[idx][9] = outputs[idx][33]
                outputs_12[idx][10] = -100

        outputs_12 = outputs_12.to(device)
        targets = targets.to(device)

        classerr_test.add(outputs_12, targets)
        test_confusion.add(outputs_12, targets)
        #print("Batch: [",batch_idx*256 ,"/", len(val_dataset),"]")
        test_acc = classerr_test.value()[0]
        #print("Accuracy: ", acc)
        outputs_all[batch_idx*256:batch_idx*256+len(inputs)] = outputs_12.cpu().numpy()
        targets_all[batch_idx*256:batch_idx*256+len(inputs)] = targets.cpu().numpy()
    print("Total Accuracy: ", test_acc)

Total Accuracy:  93.44189159045283


In [10]:
max(targets)

tensor(15, device='cuda:0')