## 1. Packages

In [1]:
import os
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import random 
import IPython.display as ipd
import math
from pydub import AudioSegment
import numpy as np
from sklearn.metrics import confusion_matrix



## 2. Creating the voice files

The two block below split the .wav files to smaller .wav files one second each

In [2]:
class SplitWavfile():
    def __init__(self, folder, filename):
        self.folder = folder
        self.filename = filename
        self.filepath = './EE413_dataset' + folder + '/' + filename
        self.audio = AudioSegment.from_wav(self.filepath)
    
    def get_duration(self):
        return self.audio.duration_seconds
    
    def single_split(self, s_start, s_end, split_filename):
        t1 = s_start  * 1000
        t2 = s_end * 1000
        split_audio = self.audio[t1:t2]
        if len(split_audio) > 800 :
            split_audio.export('./EE413_dataset/' +self.folder + '/' + split_filename, format="wav")
        return len(split_audio)
        
    def multiple_split(self, seconds_per_split):
        number_of_splits = math.ceil(self.get_duration() )
        for i in range(0, number_of_splits, seconds_per_split):
            name_f, _ = os.path.splitext(self.filename)
            split_fn = name_f + '-' + str(i)+'.wav'
            length_ms = self.single_split(i, i+seconds_per_split, split_fn)
            if length_ms > 800 :
                self.write_file_name(split_fn)
        print("file splited to: " + str(i)+"  files")    
        print('All splited successfully')
    
    def write_file_name(self,name):
        line = os.path.basename(self.folder)+ '/' + name
        if random.uniform(0, 1) > 0.15: # use 15% of file for testing and rest for training
            file = 'training_list.txt'
        else:
            file = 'testing_list.txt'
        with open(file,'a') as f:
            f.write(line)
            f.write('\n')

In [3]:
def no_num(string):
    return not any([c.isdigit() for c in string])

if os.path.exists('training_list.txt') == True:
    os.remove('training_list.txt')
if os.path.exists('testing_list.txt') == True:
    os.remove('testing_list.txt')
# loop through all the files to split it if the .wav file has number in the name it will not be splited
# to avoid splitting a file twice
for _, dirs, _ in os.walk('./EE413_dataset'):
    for dire in dirs:
        print(dire)
        for _, _, files in os.walk('./EE413_dataset/'+ dire):
            for file in files:
                _, ext = os.path.splitext(file)
                if ext == '.wav' and no_num(file):
                    folder = './' + dire
                    print("folder:",dire,"  file : ",file)
                    split_wav = SplitWavfile(folder, file)
                    split_wav.multiple_split(1)

abdullh
folder: abdullh   file :  abdullh.wav
file splited to: 101  files
All splited successfully
folder: abdullh   file :  abdullh_second.wav
file splited to: 31  files
All splited successfully
book
folder: book   file :  book_first.wav
file splited to: 55  files
All splited successfully
folder: book   file :  book_second.wav
file splited to: 56  files
All splited successfully
folder: book   file :  book_third.wav
file splited to: 33  files
All splited successfully
dahab
folder: dahab   file :  dahab.wav
file splited to: 121  files
All splited successfully
folder: dahab   file :  dahab_second.wav
file splited to: 37  files
All splited successfully
Eftar
folder: Eftar   file :  Eftar.wav
file splited to: 178  files
All splited successfully
Electricity
folder: Electricity   file :  Electricity.wav
file splited to: 175  files
All splited successfully
Fuel
folder: Fuel   file :  Fuel.wav
file splited to: 202  files
All splited successfully
gzr
folder: gzr   file :  gzr.wav
file splited t

## 3. Dataset and Dataloader
Here we define the data set and the data loader we load the data and take the fourier transform of it\
\
Note: the data will be normlized in the dataloader

In [4]:
# define the data set 
def load_item(filepath):
    normp = os.path.normpath(filepath)
    label, _ = os.path.split(normp)
    pa = './EE413_dataset'
    waveform, sample_rate = torchaudio.load(pa + filepath[1:])
    return waveform, sample_rate, label

class EE413_Data(Dataset):
    def __init__(self, subset):
        self.filename1 = subset
        if subset == "training":
            self._walker = self._load_list('training_list.txt')
        elif subset == "testing":
            self._walker = self._load_list("testing_list.txt")
    
    def __len__(self):
        return len(self._walker)
    
    def __getitem__(self,index):
        fileid = self._walker[index]
        waveform, sample_rate, label = load_item(fileid)
        fourier_transform = torch.fft.fft(waveform, n=8000).abs() # calculate 8000 point fft
        return waveform, sample_rate, label, fourier_transform
    
    def _load_list(self, filename):
        output = []
        pa = "./"
        filepath = os.path.join(pa, filename)
        with open(filepath) as fileobj:
            output += [os.path.join(pa, line.strip()) for line in fileobj]
        return output

train_set = EE413_Data("training")
test_set = EE413_Data("testing")  

In [5]:
# list of the word the model will be traind for
labels = sorted(list(set(datapoint[2] for datapoint in train_set)))
print(labels)
print("the number of the words is ",len(labels)) # we need to convert the label to number for training 
#the simplest choice is to use the index 

def label_to_index(word):
    return torch.tensor(labels.index(word))

arabic_word = {'Eftar':'إفطار' , 'Quran':"قرآن", 'Ramadan':"رمضان", 'abdullh':"عبدالله", 'book':"كتاب", 'dahab':"ذهب",'gzr':"جزر",
               'lbn':"لبن", 'mal':"مال", 'myah':"مياه", 'qyam':"قيام", 'sadaga':"صدقة", 'saif':"سيف",'saleh':"صالح", 'sky':"سماء",
               'slam':"سلام", 'syah':"صياح", 'thoub':"ثوب", 'zamen':"زمن",'Electricity':"كهرباء", 'Fuel':"وقود", 'Night':"ليل",
               'Prayer':"صلاة", 'Zakat':"زكاة"}

['Eftar', 'Electricity', 'Fuel', 'Night', 'Prayer', 'Quran', 'Ramadan', 'Zakat', 'abdullh', 'book', 'dahab', 'gzr', 'lbn', 'mal', 'myah', 'qyam', 'sadaga', 'saif', 'saleh', 'sky', 'slam', 'syah', 'thoub', 'zamen']
the number of the words is  24


In [26]:
# Using GPU or CPU 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device == "cuda":
    pin_memory = True
else:
    pin_memory = False
    
print(device)

cuda


In [27]:
def collate_fn(batch):
    tensors, fft_tensor, targets = [], [], []
    # Gather in lists, and encode labels as indices
    for waveform,_ , label, fft_data in batch:
        
        targets += [label_to_index(label)]         # gather the labels 
        fft_tensor += [fft_data] # gather the fft of the data 
        
        # gather the waveform 
        waveform = waveform - waveform.mean()
        waveform = waveform/waveform.abs().max()
        if waveform.shape[1] >= 8000:
            tensors += [torch.reshape(waveform[0,0:8000], (1,-1))] 
        else:
            waveform = torch.reshape(waveform, (-1,))
            z = torch.zeros(8000 - waveform.shape[0])
            tensors += [torch.reshape(torch.cat((waveform,z)) ,(1,-1))]
                 
    targets = torch.stack(targets)
    tensors = torch.stack(tensors)
    fft_tensor = torch.stack(fft_tensor)

    return tensors, targets, fft_tensor

batch_size = 4
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    pin_memory=pin_memory,
    drop_last=True,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    pin_memory=pin_memory,
)

# 4. Training

## Define the model

The model used here is called M5 it is a model developed in (https://doi.org/10.48550/arXiv.1610.00087 )\
because our dataset is small we use the M5 model pretraind with 35 English word from the speechcommand dataset and use the transfer learning the pretrained model is in pytorch website 
https://pytorch.org/tutorials/intermediate/speech_command_classification_with_torchaudio_tutorial.html


In [28]:
# the model 
class M5(nn.Module):
    def __init__(self, n_input=1, n_output=35, stride=16, n_channel=32):
        super().__init__()
        self.conv1 = nn.Conv1d(n_input, n_channel, kernel_size=80, stride=stride)
        self.bn1 = nn.BatchNorm1d(n_channel)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(n_channel, n_channel, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(n_channel)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(n_channel, 2 * n_channel, kernel_size=3)
        self.bn3 = nn.BatchNorm1d(2 * n_channel)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(2 * n_channel, 2 * n_channel, kernel_size=3)
        self.bn4 = nn.BatchNorm1d(2 * n_channel)
        self.pool4 = nn.MaxPool1d(4)
        self.fc1 = nn.Linear(2 * n_channel, n_output)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = F.avg_pool1d(x, x.shape[-1])
        x = x.permute(0, 2, 1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=2)


#### we use transfer learning in the project to improve the performance

In [29]:
#load the pretrained model
PATH = './transfer_learining_model/eng_command.pth'
model = M5()
model.load_state_dict(torch.load(PATH))
# change the fully connected layer
num_ftrs = model.fc1.in_features
model.fc1 = nn.Linear(num_ftrs, len(labels))

model.to(device)
print(model)

M5(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(16,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=24, bias=True)
)


### Train the model with the waveform

In [30]:
# the train function
def train(model1, epoch,input_type): 
    
        
    model1.train()
    for batch_idx, (wave, target,fft_d) in enumerate(train_loader):
        
        if input_type == "waveform":
            data = wave
        elif input_type =="fft":
            data = fft_d
            
        data = data.to(device)
        target = target.to(device)
        output = model1(data)
        loss = F.nll_loss(output.squeeze(), target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 200 == 0: # print training stats
            print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")

        pbar.update(pbar_update)         # update progress bar

        losses.append(loss.item())         # record loss

In [31]:
def number_of_correct(pred, target):
    return pred.squeeze().eq(target).sum().item()


def get_likely_index(tensor):
    return tensor.argmax(dim=-1)

# test the model
def test(model1, epoch,input_type):
    
    model1.eval()
    correct = 0
    for wave, target, fft_d in test_loader:
        
        if input_type == "waveform":
            data = wave
        elif input_type =="fft":
            data = fft_d
            
        data = data.to(device)
        target = target.to(device)
        output = model1(data)
        pred = get_likely_index(output)
        correct += number_of_correct(pred, target)

        pbar.update(pbar_update) # update progress bar

    print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n")

In [32]:
n_epoch = 4

pbar_update = 1 / (len(train_loader) + len(test_loader))
losses = []
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
with tqdm(total=n_epoch) as pbar:
    for epoch in range(1, n_epoch + 1):
        train(model, epoch, "waveform")
        test(model, epoch, "waveform")


  1%|          | 0.022058823529411763/4 [00:00<01:14, 18.85s/it]  



  6%|▌         | 0.22163865546218442/4 [00:02<00:33,  8.87s/it] 



 11%|█         | 0.4327731092436963/4 [00:04<00:30,  8.63s/it] 



 16%|█▌        | 0.648109243697477/4 [00:06<00:29,  8.86s/it]  



 21%|██▏       | 0.8518907563025183/4 [00:08<00:44, 14.04s/it]



 25%|██▌       | 1.0084033613445347/4 [00:09<00:19,  6.36s/it]


Test Epoch: 1	Accuracy: 546/563 (97%)



 30%|███       | 1.2100840336134415/4 [00:11<00:33, 11.98s/it]



 36%|███▌      | 1.4348739495798273/4 [00:14<00:23,  9.12s/it]



 41%|████      | 1.6460084033613391/4 [00:16<00:25, 10.94s/it]



 46%|████▌     | 1.8466386554621788/4 [00:19<01:05, 30.28s/it]



 50%|█████     | 2.008403361344531/4 [00:21<00:15,  7.60s/it] 


Test Epoch: 2	Accuracy: 556/563 (99%)



 56%|█████▌    | 2.2279411764705808/4 [00:23<00:17,  9.80s/it]



 61%|██████    | 2.430672268907555/4 [00:26<00:19, 12.29s/it] 



 66%|██████▌   | 2.6418067226890667/4 [00:29<00:19, 14.00s/it]



 72%|███████▏  | 2.8623949579831836/4 [00:32<00:10,  9.47s/it]



 75%|███████▌  | 3.0189075630252/4 [00:33<00:06,  7.00s/it]   


Test Epoch: 3	Accuracy: 551/563 (98%)



 81%|████████  | 3.2205882352941066/4 [00:35<00:08, 11.36s/it]



 86%|████████▌ | 3.4422268907562907/4 [00:38<00:06, 12.12s/it]



 91%|█████████ | 3.6439075630251976/4 [00:40<00:04, 11.55s/it]



 96%|█████████▌| 3.8466386554621717/4 [00:42<00:02, 14.86s/it]



100%|█████████▉| 3.9999999999999862/4 [00:44<00:00, 11.00s/it]


Test Epoch: 4	Accuracy: 560/563 (99%)






### View some of the prediction 

In [44]:
def predict(tensor):
    model.eval()
    tensor = tensor.to(device)
    output = model(tensor.unsqueeze(0))
    index = get_likely_index(output) 
    return index, labels[index]

batch = iter(test_loader)
wave, label, _ = batch.next()
sample_rate = 8000;
for i in range(4):
    waveform = wave[i]
    _, predection = predict(waveform)
    print(f"The word: {arabic_word[labels[label[i].item()]]}. Predicted: {arabic_word[predection]}.")
    ipd.display(ipd.Audio(waveform.numpy(), rate=sample_rate))

The word: ثوب. Predicted: ثوب.


The word: ثوب. Predicted: ثوب.


The word: زمن. Predicted: زمن.


The word: ليل. Predicted: ليل.


## 5. Print each word accurecy and the wrong prediction

In [35]:
correct_pred = {word: 0 for word in labels}
total_number = {word: 0 for word in labels}

for wave, label,_ in test_loader:
    for i in range(list(label.shape)[0]):
        waveform = wave[i]
        pred_index, prediction = predict(waveform)
        l = label[i].item()
        # collect the correct predictions for each class
        if l == pred_index:
            correct_pred[labels[l]] += 1
        else: # play the sound for the wrong prediction
            print(f"wrong prediction - the word: {arabic_word[labels[l]]}. Predicted: {arabic_word[prediction]}.")
            ipd.display(ipd.Audio(waveform.numpy(), rate=sample_rate))
        total_number[labels[l]] += 1

# print accuracy for each class
for key in total_number:
    accuracy = 100 * float(correct_pred[key]) / total_number[key]
    print(f'Accuracy for class: {arabic_word[key]:5s} is {accuracy:.1f} %')

wrong prediction - the word: سماء. Predicted: وقود.


wrong prediction - the word: سلام. Predicted: سماء.


wrong prediction - the word: صدقة. Predicted: رمضان.


Accuracy for class: إفطار is 100.0 %
Accuracy for class: كهرباء is 100.0 %
Accuracy for class: وقود  is 100.0 %
Accuracy for class: ليل   is 100.0 %
Accuracy for class: صلاة  is 100.0 %
Accuracy for class: قرآن  is 100.0 %
Accuracy for class: رمضان is 100.0 %
Accuracy for class: زكاة  is 100.0 %
Accuracy for class: عبدالله is 100.0 %
Accuracy for class: كتاب  is 100.0 %
Accuracy for class: ذهب   is 100.0 %
Accuracy for class: جزر   is 100.0 %
Accuracy for class: لبن   is 100.0 %
Accuracy for class: مال   is 100.0 %
Accuracy for class: مياه  is 100.0 %
Accuracy for class: قيام  is 100.0 %
Accuracy for class: صدقة  is 97.0 %
Accuracy for class: سيف   is 100.0 %
Accuracy for class: صالح  is 100.0 %
Accuracy for class: سماء  is 94.1 %
Accuracy for class: سلام  is 95.7 %
Accuracy for class: صياح  is 100.0 %
Accuracy for class: ثوب   is 100.0 %
Accuracy for class: زمن   is 100.0 %


## 6. Confusion Matrix

In [36]:
all_preds = torch.tensor([])
true_val  = torch.tensor([])
for wave, label,_ in test_loader:
    wave = wave.to(device)
    output = model(wave)
    index = get_likely_index(output) 
    all_preds = torch.cat((all_preds, index.to('cpu')),dim=0)
    true_val  = torch.cat((true_val , label.to('cpu')),dim=0)
    

matrix = confusion_matrix(true_val.numpy(),all_preds.numpy(),list(range(24)))
print(labels)
print(matrix)

['Eftar', 'Electricity', 'Fuel', 'Night', 'Prayer', 'Quran', 'Ramadan', 'Zakat', 'abdullh', 'book', 'dahab', 'gzr', 'lbn', 'mal', 'myah', 'qyam', 'sadaga', 'saif', 'saleh', 'sky', 'slam', 'syah', 'thoub', 'zamen']
[[29  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 37  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 35  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 29  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 28  



### Save the model

In [37]:
PATH = './EE413_project_waveform.pth'
torch.save(model.state_dict(), PATH)

# 7. Train the model with the DFT

First we load the model to use the transfer learning


In [38]:
PATH = './transfer_learining_model/eng_command_fft.pth'
ft_model = M5()
ft_model.load_state_dict(torch.load(PATH))
# change the fully connected layer
num_ftrs = ft_model.fc1.in_features
ft_model.fc1 = nn.Linear(num_ftrs, len(labels))

ft_model.to(device)
print(ft_model)

M5(
  (conv1): Conv1d(1, 32, kernel_size=(80,), stride=(16,))
  (bn1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 32, kernel_size=(3,), stride=(1,))
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (bn3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=64, out_features=24, bias=True)
)


In [39]:
n_epoch = 4

pbar_update = 1 / (len(train_loader) + len(test_loader))
losses = []
optimizer = optim.Adam(ft_model.parameters(), lr=0.001, weight_decay=0.0001)
with tqdm(total=n_epoch) as pbar:
    for epoch in range(1, n_epoch + 1):
        train(ft_model, epoch, "fft")
        test(ft_model, epoch, 'fft')


  0%|          | 0.013655462184873948/4 [00:00<00:58, 14.78s/it]



  6%|▌         | 0.22373949579831887/4 [00:02<00:33,  8.92s/it] 



 11%|█         | 0.43592436974789794/4 [00:04<00:30,  8.44s/it]



 16%|█▋        | 0.6544117647058804/4 [00:06<00:27,  8.27s/it] 



 22%|██▏       | 0.8613445378151233/4 [00:08<00:23,  7.51s/it]



 25%|██▍       | 0.9999999999999968/4 [00:09<00:27,  9.27s/it]


Test Epoch: 1	Accuracy: 509/563 (90%)



 31%|███       | 1.236344537815122/4 [00:12<00:23,  8.62s/it] 



 36%|███▌      | 1.438025210084029/4 [00:16<00:23,  9.22s/it] 



 41%|████      | 1.6491596638655408/4 [00:18<00:23,  9.95s/it]



 47%|████▋     | 1.8644957983193216/4 [00:21<00:18,  8.67s/it]



 50%|█████     | 2.0105042016806656/4 [00:22<00:12,  6.43s/it]


Test Epoch: 2	Accuracy: 530/563 (94%)



 55%|█████▌    | 2.2121848739495724/4 [00:24<00:42, 24.00s/it]



 61%|██████    | 2.435924369747891/4 [00:28<00:19, 12.30s/it] 



 66%|██████▌   | 2.6418067226890667/4 [00:31<00:16, 12.30s/it]



 71%|███████   | 2.845588235294108/4 [00:35<00:16, 14.37s/it] 



 75%|███████▌  | 3.010504201680662/4 [00:36<00:09, 10.02s/it] 


Test Epoch: 3	Accuracy: 547/563 (97%)



 80%|████████  | 3.212184873949569/4 [00:39<00:08, 10.18s/it] 



 86%|████████▌ | 3.428571428571417/4 [00:42<00:05, 10.13s/it] 



 91%|█████████ | 3.6428571428571304/4 [00:46<00:03, 10.45s/it]



 96%|█████████▋| 3.858193277310911/4 [00:48<00:01,  9.57s/it] 



100%|█████████▉| 3.9999999999999862/4 [00:49<00:00, 12.34s/it]


Test Epoch: 4	Accuracy: 553/563 (98%)






## View some of the predictions of the model

In [40]:
def predict_ft(tensor):
    ft_model.eval()
    tensor = tensor.to(device)
    output = ft_model(tensor.unsqueeze(0))
    index = get_likely_index(output) 
    return index, labels[index]

wave, label, DFT = batch.next()
sample_rate = 8000;
for i in range(4):
    waveform = wave[i]
    DFT_t = DFT[i]
    _, predection = predict_ft(DFT_t)
    print(f"The word: {arabic_word[labels[label[i].item()]]}. Predicted: {arabic_word[predection]}.")
    ipd.display(ipd.Audio(waveform.numpy(), rate=sample_rate))

the word: كتاب. Predicted: كتاب.


the word: ذهب. Predicted: ذهب.


the word: كتاب. Predicted: كتاب.


the word: سماء. Predicted: سماء.


## 8. Print each word accurecy and the wrong prediction

In [41]:
correct_pred = {word: 0 for word in labels}
total_number = {word: 0 for word in labels}

for wave, label,DFT in test_loader:
    for i in range(list(label.shape)[0]):
        waveform = wave[i]
        DFT_t = DFT[i]
        pred_index, prediction = predict_ft(DFT_t)
        l = label[i].item()
        # collect the correct predictions for each class
        if l == pred_index:
            correct_pred[labels[l]] += 1
        else: # play the sound for the wrong prediction
            print(f"Wrong prediction - The word: {arabic_word[labels[l]]}. Predicted: {arabic_word[prediction]}.")
            ipd.display(ipd.Audio(waveform.numpy(), rate=sample_rate))
        total_number[labels[l]] += 1

# print accuracy for each class
for key in total_number:
    accuracy = 100 * float(correct_pred[key]) / total_number[key]
    print(f'Accuracy for class: {arabic_word[key]:5s} is {accuracy:.1f} %')

wrong prediction - the word: صالح. Predicted: ثوب.


wrong prediction - the word: سلام. Predicted: صلاة.


wrong prediction - the word: سماء. Predicted: وقود.


wrong prediction - the word: كتاب. Predicted: سلام.


wrong prediction - the word: كتاب. Predicted: ثوب.


wrong prediction - the word: سماء. Predicted: صلاة.


wrong prediction - the word: إفطار. Predicted: صدقة.


wrong prediction - the word: سماء. Predicted: صدقة.


wrong prediction - the word: زمن. Predicted: صلاة.


wrong prediction - the word: كتاب. Predicted: عبدالله.


Accuracy for class: إفطار is 96.6 %
Accuracy for class: كهرباء is 100.0 %
Accuracy for class: وقود  is 100.0 %
Accuracy for class: ليل   is 100.0 %
Accuracy for class: صلاة  is 100.0 %
Accuracy for class: قرآن  is 100.0 %
Accuracy for class: رمضان is 100.0 %
Accuracy for class: زكاة  is 100.0 %
Accuracy for class: عبدالله is 100.0 %
Accuracy for class: كتاب  is 78.6 %
Accuracy for class: ذهب   is 100.0 %
Accuracy for class: جزر   is 100.0 %
Accuracy for class: لبن   is 100.0 %
Accuracy for class: مال   is 100.0 %
Accuracy for class: مياه  is 100.0 %
Accuracy for class: قيام  is 100.0 %
Accuracy for class: صدقة  is 100.0 %
Accuracy for class: سيف   is 100.0 %
Accuracy for class: صالح  is 96.6 %
Accuracy for class: سماء  is 82.4 %
Accuracy for class: سلام  is 95.7 %
Accuracy for class: صياح  is 100.0 %
Accuracy for class: ثوب   is 100.0 %
Accuracy for class: زمن   is 94.7 %


## 9. Confusion Matrix for fourier transform

In [42]:
all_preds = torch.tensor([])
true_val  = torch.tensor([])
for _, label,dft in test_loader:
    dft = dft.to(device)
    output = ft_model(dft)
    index = get_likely_index(output) 
    all_preds = torch.cat((all_preds, index.to('cpu')),dim=0)
    true_val  = torch.cat((true_val , label.to('cpu')),dim=0)
    
matrix = confusion_matrix(true_val.numpy(),all_preds.numpy(),list(range(24)))
print(labels)
print(matrix)

['Eftar', 'Electricity', 'Fuel', 'Night', 'Prayer', 'Quran', 'Ramadan', 'Zakat', 'abdullh', 'book', 'dahab', 'gzr', 'lbn', 'mal', 'myah', 'qyam', 'sadaga', 'saif', 'saleh', 'sky', 'slam', 'syah', 'thoub', 'zamen']
[[28  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0 25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 37  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 35  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 29  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1 11  0  0  0  0  0  0  0  0  0  0  1  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0 28  



## Save the model

In [43]:
PATH = './EE413_project_DFT.pth'
torch.save(ft_model.state_dict(), PATH)