## this notebook is to prepare audio speech from the dataset "IEMOCAP".


<p> first, we will prepare folder tree, we implement this by copying necessary audio files (sentences) from the dataset into a specified directory (TARGET_PATH). folder tree is as follows:
    
    root:
    ---- /IEMOCAP
    ---- ---- /Session1
    ---- ---- ---- /EmoEval
    ---- ---- ---- /Sentences
    ---- ---- /Session2
    ---- ---- ---- /EmoEval
    ---- ---- ---- /Sentences
    ---- ---- /Session3
    ---- ---- ---- /EmoEval
    ---- ---- ---- /Sentences
    ---- ---- /Session4
    ---- ---- ---- /EmoEval
    ---- ---- ---- /Sentences
    ---- ---- /Session5
    ---- ---- ---- /EmoEval
    ---- ---- ---- /Sentences
    </p>
    

In [2]:
#first we need to import all necessary packages

import json
import os
import re
import tqdm
import shutil

#constant files are global UPPERCASE
TARGET_PATH = "../Datasets/IEMOCAP"
SRC_PATH = "../Datasets/IEMOCAP_full_release"

def prepare_directory(src_dataset, trg_dataset):
    """
    prepare dataset_folder as folllow:
    trg_dataset contains 5 sessions, each session folder containes 2 subfolders : EmoEval, Sentences
    - EmoEval has the values of (filename, emotion label)
    - Sentences containes the .wav files 
    """
    os.makedirs(trg_dataset, exist_ok=True)
    for dir_path, dir_names, filenames in os.walk(src_dataset):
        folder_names = dir_path.split(sep="/")
        
        if folder_names[-1] == "EmoEvaluation":
            for file in filenames:
                target_path = os.path.join(trg_dataset,folder_names[-3],"EmoEval")
                os.makedirs(target_path, exist_ok=True)
                current_path = os.path.join(dir_path, file)
                shutil.copy(current_path, target_path)
                
                
        if folder_names[-2] == "sentences" and folder_names[-1] == 'wav':
            target_path = os.path.join(trg_dataset,folder_names[-3],"Sentences")
            for d in dir_names:
                current_path = os.path.join(dir_path, d)
                shutil.copytree(current_path, target_path, symlinks=False, ignore=None, ignore_dangling_symlinks=False, dirs_exist_ok=True)

    
prepare_directory(SRC_PATH, TARGET_PATH)

 9 target classification were recorded in the dataset, first we will enumerate them to get a dictionary {emotion : label }

In [4]:
y = "angry, happy, sad, neutral, frustrated, excited, fearful, disgusted, other".split(",")
y = sorted(y)
y = [i.strip() for i in y]
x = [f[0:3] for f in y]
x = dict.fromkeys(x)
for i, j in enumerate(x.keys()):
    x[j] = i
    
label_index = x


In [5]:
#prepare json file that stores filenames, genders, and labels from all sessions of the dataset


ROOT_PATH = "../Datasets/IEMOCAP"
JSON_PATH = "../Datasets/IEMOCAP/json/data.json"


def prepare_json(src_path, trg_path):
        
    
    global filenames_labels
    filenames_labels = {
        "filenames" : [],
        "genders" : [],
        "labels" : []
    }
    
    
    for dir_path, dir_names, filenames in os.walk(src_path):
        # if current dir is Session_i      
        folder_names = dir_path.split(sep="/")
        if folder_names[-1] == "EmoEval":
            for f in tqdm.tqdm(filenames, desc="Reading Folder: {}".format(dir_path), ncols=140):
                if f.split(".")[-1] == "txt":
                    current_path = os.path.join(dir_path, f)
                    with open(current_path, "r") as fr:
                        for l in fr:
                            reg = re.findall("\[\d+.\d+ \- \d+.\d+\]", l)
                            
                            if reg is not None:
                                splits = l.split()
                                if len(splits) < 5:
                                    continue
                                sentence_path = splits[3]
                                emo = splits[4]
                                
                                if emo in label_index:
                                    track_path = os.path.join(dir_path, '../Sentences', sentence_path + ".wav")
                                    try:
                                        librosa.load(track_path)
                                        filenames_labels["filenames"].append(track_path)
                                        filenames_labels["genders"].append(sentence_path[-4])
                                        filenames_labels["labels"].append(label_index[emo])
                                    except:
                                        pass
                                
        if dir_path.split(sep="/")[-1][:-1] == "Session" :
            session_number = dir_path.split(sep="/")[-1][-1]
            current_path = os.path.join(dir_path, 'dialog/EmoEvaluation/Categorical')
    
    os.makedirs(os.path.dirname(track_path), exist_ok=True)
    with open(trg_path, 'w') as fp:
        json.dump(filenames_labels, fp, indent=4)
    
    
    print("output location: {}".format(trg_path))
    return filenames_labels["labels"]
        
        
files = prepare_json(ROOT_PATH, JSON_PATH)


Reading Folder: ../Datasets/IEMOCAP/Session3/EmoEval: 100%|███████████████████████████████████████████████| 32/32 [00:00<00:00, 2147.48it/s]
Reading Folder: ../Datasets/IEMOCAP/Session5/EmoEval: 100%|███████████████████████████████████████████████| 31/31 [00:00<00:00, 2685.77it/s]
Reading Folder: ../Datasets/IEMOCAP/Session4/EmoEval: 100%|███████████████████████████████████████████████| 30/30 [00:00<00:00, 3025.90it/s]
Reading Folder: ../Datasets/IEMOCAP/Session1/EmoEval: 100%|███████████████████████████████████████████████| 28/28 [00:00<00:00, 2398.75it/s]
Reading Folder: ../Datasets/IEMOCAP/Session2/EmoEval: 100%|███████████████████████████████████████████████| 30/30 [00:00<00:00, 3139.61it/s]


FileNotFoundError: [Errno 2] No such file or directory: '../Datasets/IEMOCAP/json/data.json'

In [44]:
len(files)

7135

# feature extraction:

for each speech utternace U = {u_1, u_2 ... u_N} : u_i is a vector of 26 features (13 MFCC + 13 Delta coefficients). 
N is the total number of frames for each utterance input (fixed, default is 50, each frame contains 100ms of audio ==> each sample input represents 5 seconds of speech).

input : Raw audio file (utterance) with shape (1, T) where T is the length in seconds.
output : set of shape (N, d_input)  N is Sequence Length, d_input is feature vector 




In [5]:
import matplotlib.pyplot as plt
import numpy as np
import librosa
import os

SAMPLE_RATE=22050


def extract_features(waveform, sr, win_size=0.025, stride=0.01, n_mfcc=13):
    """
    initially we will use 13 MFCC + 13 Delta as low level descriptors 
    :param track_path: path of the audio track
    :param win_size: length of the hamming window (in seconds)
    :param stride: stride length (in seconds)
    :param n_mfcc: number of desired mel frequency cepstral coefficients
    :returns : spectogram of mfcc+elta features
    """
    
    #specifty parameters of mfcc
    n_fft = int(win_size*sr)
    hop_length = int(stride*sr)
    
    # get mfcc, deltas 
    MFCC = librosa.feature.mfcc(y=waveform,
                                n_mfcc=n_mfcc,
                                n_fft=n_fft,
                                hop_length=hop_length)
    # get the
    MFCC_delta = librosa.feature.delta(MFCC)
    #librosa.display.specshow(MFCC, x_axis="time")
    #plt.colorbar()
    #plt.xlabel('time')
    #plt.ylabel('MFCC')
    
    MFCC = MFCC
    MFCC_delta = MFCC_delta
    
    features = np.vstack((MFCC, MFCC_delta))
    return features
    

audio data preprocessing consists of : voice activity detection to remove silences from beginning and end of each audio utterance. then data-segmenting to obtain fixed-length audio utterances as inputs. Each utterance is windowed into many segments 

we will adapt similar approach to : 
@ARTICLE{gong_psla, 
    author={Gong, Yuan and Chung, Yu-An and Glass, James},  
    journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},   
    title={PSLA: Improving Audio Tagging with Pretraining, Sampling, Labeling, and Aggregation},   
    year={2021}, 
    doi={10.1109/TASLP.2021.3120633}
}

In [8]:
#extract features into json file 
import tqdm
import json

def feature_to_json(filenames: dict, trg_path_json, n_mfcc=13 ):
    """
    accepts dictonary filenames_labels and extract features of all including tracks to a json outputfile"""
    
    output = {
        "dataset" : "IEMOCAP",
        "n_mfcc": int(n_mfcc),
        "features" : ["mfcc", "mfcc-delta"],     
        "MFCCs" : [],
        "label" : [],
        "n_features":0,
        "n_classes" : 0,
    }
    
    # read filenames
    failed_names = []
    successful_iterations = 0
    total = len(filenames["labels"])
    
    for it, track in tqdm.tqdm(enumerate(filenames["filenames"]), desc="extracting features from audio ",ncols=140, total=total):
        try:
    
            features = extract_features(track, n_mfcc=n_mfcc)
            waveform = features[1]
            features = features[0]
            output["MFCCs"].append(features.tolist())
            output["label"].append(filenames["labels"][it])
            output["n_features"] = int(features.shape[0])
            successful_iterations += 1
        except:
            failed_names.append(track)
    
    output["n_classes"] = int(np.max(output["label"], 0) + 1)
    print("{}/{} audio tracks have been processed successfully".format(successful_iterations, total))
        
    # extract features into output dir
    print("dumping to json file...")
    with open(trg_path_json, 'w') as fp:
        json.dump(output, fp, indent=4)
    
    print("Finished jsonifying, output directory is {}".format(trg_path_json))
    return output
    
    
dataset = feature_to_json(filenames=filenames_labels, trg_path_json="./results.json")

    
    

extracting features from audio : 0it [00:00, ?it/s]


ValueError: zero-size array to reduction operation maximum which has no identity

In [7]:
len(dataset["waveforms"])



NameError: name 'dataset' is not defined

# SER architecture

In [45]:
from torch.utils.data import Dataset, DataLoader
import json
import numpy as np

class SER_Dataset(Dataset):
    """this class loads waveform and labels from a json file, and process it to extract necessary features"""
    def __init__(self, json_config_file):
        """
        :param json_config_file is of shape {waveforms: [], labels: []}"""
        
        # load json data to local
        print("loading data from json file...")
        with open(json_config_file, 'r') as fp:
            data = json.load(fp)
        print("data loaded successfully")
        
        self.dataset = data["dataset"]
        self.n_features = data["n_features"]
        self.n_samples = len(data["label"])
        self.features_extracted = data["features"]
        self.n_classes = data["n_classes"]
        self.data = data["MFCCs"]
        self.targets = data["label"]
        
        # print information about the data : number of samples, features, number of classes
        print("dataset derived from : {}".format(self.dataset))
        print("total number of samples in set: {}".format(self.n_samples))
        print("features used: {}".format(self.features_extracted))
        print("number of target classes: {}".format(self.n_classes))
        
        
    def __getitem__(self, index):
        return np.array(self.data[index]), self.targets[index]
        
    def __len__(self):
        return self.n_samples
    

my_set = SER_Dataset("./results.json")

loading data from json file...
data loaded successfully
dataset derived from : IEMOCAP
total number of samples in set: 7135
features used: ['mfcc', 'mfcc-delta']
number of target classes: 9



## create dataset with fixed_length signals for easier processing

In [46]:
def extract_MFCC_delta(waveform, sr, win_size=0.025, stride=0.01, n_mfcc=13):
    """
    initially we will use 13 MFCC + 13 Delta as low level descriptors 
    :param track_path: path of the audio track
    :param win_size: length of the hamming window (in seconds)
    :param stride: stride length (in seconds)
    :param n_mfcc: number of desired mel frequency cepstral coefficients
    :returns : spectogram of mfcc+elta features
    """
    
    #specifty parameters of mfcc
    n_fft = int(win_size*sr)
    hop_length = int(stride*sr)
    
    # get mfcc, deltas 
    MFCC = librosa.feature.mfcc(y=waveform,
                                n_mfcc=n_mfcc,
                                n_fft=n_fft,
                                hop_length=hop_length)
    
    MFCC_delta = librosa.feature.delta(MFCC)
    #librosa.display.specshow(MFCC, x_axis="time")
    #plt.colorbar()
    #plt.xlabel('time')
    #plt.ylabel('MFCC')
    
    MFCC = MFCC
    MFCC_delta = MFCC_delta
    
    features = np.vstack((MFCC, MFCC_delta))
    print("features shape is : {}".format(features.shape))
    return features

In [11]:
import torchaudio
import numpy as np
import librosa

def extract_mel_bins(waveform, sr, win_size=0.025, stride=0.01, n_mels= 64):
    mel_spectorgram = torchaudio.transforms.MelSpectrogram(sample_rate=sr,
                                                    n_fft=int(win_size*sr),
                                                   hop_length=int(stride*sr),
                                                   n_mels= n_mels)
    features = mel_spectorgram(waveform)
    features = np.array(features)
    features = np.squeeze(features, axis=0)
    return features

waveform, sr = torchaudio.load("/home/bashar/Study/Research_SER/Notebook/Ses03F_impro01_F004.wav")
extract_mel_bins(waveform, sr)

array([[8.52478552e-04, 1.48494318e-02, 1.58146140e-04, ...,
        2.39356887e-03, 9.67159495e-03, 3.36409220e-03],
       [6.00285223e-03, 1.34558845e-02, 1.49821187e-03, ...,
        1.80263328e-03, 8.19785800e-03, 3.04659340e-03],
       [1.99372172e-02, 8.61566141e-03, 5.12828398e-03, ...,
        2.70832079e-05, 3.50711984e-03, 1.94517721e-03],
       ...,
       [1.88185368e-07, 3.91738382e-07, 3.24324077e-07, ...,
        1.64962717e-06, 2.87581133e-06, 3.83244605e-06],
       [1.24415592e-07, 6.52505108e-08, 7.93541659e-08, ...,
        2.99635474e-07, 1.58410913e-07, 4.11869223e-06],
       [2.33628612e-07, 9.12033826e-08, 6.63071020e-08, ...,
        1.33230188e-07, 5.96501977e-08, 3.51766357e-06]], dtype=float32)

In [12]:

import torchaudio
import torch
signal, sr = torchaudio.load("/home/bashar/Study/Research_SER/Notebook/Ses03F_impro01_F004.wav")

padded = torch.nn.functional.pad(signal, (0, 20))


extract_mel_bins(padded, sr).shape

(64, 364)

In [13]:
from torch.utils.data import Dataset, DataLoader
import json
import numpy as np
import librosa
import torchaudio
import torch

SAMPLE_RATE=22050
NUM_SAMPLES = 3 * SAMPLE_RATE
JSON_PATH = "../Datasets/IEMOCAP/json/data.json"

class SER_dataset(Dataset):
    def pad_cut_if_necessary(self, waveform):
        # check length of wave form
        wav_length = waveform.shape[1]
        
        # if longer, cut the waveform
        if wav_length > self.num_samples:
            waveform = waveform[:, :self.num_samples]
            
        # if shorter, pad the waveform with zeros
        if wav_length < self.num_samples:
            num_missing_samples = self.num_samples - wav_length
            waveform = torch.nn.functional.pad(waveform, (0, num_missing_samples))
   
        return waveform
  
    
    def __init__(self, json_config_file, transformation,  num_samples=NUM_SAMPLES):
        """
        :param json_config_file : .json file which includes tracks paths and their labels
        :param transformation : a function pointer that will be used to extract features from input audio signals
        :num_samples : total number of samples allowed for the input"""
        with open(json_config_file, 'r') as pf:
            data = json.load(pf)
            
        self.filenames = data["filenames"]
        self.labels = data["labels"]
        self.num_samples = num_samples
        self.transformation = transformation
        
    def __getitem__(self, index):
        
        # get waveform from current index
        waveform, sr = torchaudio.load(self.filenames[index])
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq= SAMPLE_RATE)
        waveform = resampler(waveform)
        waveform = (waveform)
        waveform = self.pad_cut_if_necessary(waveform)
        features = self.transformation(waveform, sr)
        
        # get label of the current index
        label = self.labels[index]
        
        return features, label
        
        
    def __len__(self):
        return len(self.filenames)
            



In [47]:
my_set = SER_dataset(JSON_PATH, transformation=extract_mel_bins)
my_set[0]

(array([[7.28028186e-04, 7.92935584e-03, 2.41690315e-03, ...,
         2.82597292e-04, 2.40223925e-03, 1.59090552e-02],
        [5.83066139e-04, 6.81230566e-03, 2.07271893e-03, ...,
         2.29460318e-04, 2.28847400e-03, 1.22437123e-02],
        [1.37512412e-04, 3.21439281e-03, 9.65998799e-04, ...,
         6.50228467e-05, 1.80890015e-03, 1.15526537e-03],
        ...,
        [9.72811591e-08, 7.84727128e-09, 1.08825287e-08, ...,
         7.89695953e-09, 1.38550735e-08, 2.69502402e-06],
        [5.97432859e-08, 2.10878670e-09, 1.47968982e-09, ...,
         1.16064758e-09, 1.62070479e-09, 2.61602918e-06],
        [7.60158017e-08, 4.96486807e-09, 4.27872893e-09, ...,
         8.42900472e-10, 1.47550061e-09, 2.64305868e-06]], dtype=float32),
 5)

# Building the model 

this layer applies 1d convolution on the spectogram, to get the embeddings of each frame

In [23]:
import torch
import torch.nn as nn
from torchsummary import summary

class input_embedding(nn.Module):
    def __init__(self, d_input, d_model, stride=10, kernel_size=16):
        """
        :param d_input: feature-dimension of input spectogram
        :param d_model: output dimension of the embeddings
        """
        super().__init__()
        self.proj = nn.Conv2d(1, d_model, kernel_size=(d_input, kernel_size), stride=stride)
    def forward(self, x):
        """
        :param x: input of shape [B, d_input, t_length]
        return : output of shape [B, max_length, d_model]"""
        # we need to add new axis for number of channels
        x = torch.unsqueeze(x, dim=1)
        # we apply the projection layer
        x = self.proj(x)
        # we remove the height dimension, because it is always equal to 1 
        x = torch.squeeze(x, dim=2)
        #x = torch.einsum('bij->bji', x)
        x = x.transpose(1,2)
        return x

    
inp = torch.randn([5, 64, 414])
embed = input_embedding(64, 128)
summary(embed, (64, 414))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 128, 1, 40]         131,200
Total params: 131,200
Trainable params: 131,200
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.10
Forward/backward pass size (MB): 0.04
Params size (MB): 0.50
Estimated Total Size (MB): 0.64
----------------------------------------------------------------


now we will work on positional encoding PE, which is added to the output of the projection layer (AKA embeddings) to capture the positional information of the input audio.
the theory behind this positional encoding techniques is derived from the paper "Attention is all you need"

PE(pos,2i) = sin(pos/(1000^(2i/d_model))
PE(pos,2i+1) = cos(pos/(1000^(2i/d_model))

In [24]:
# positional encoding :
from torchsummary import summary

class PositionalEncoding( nn.Module ):
    """
    a nn.Module wrapper to extract Position embeddings with a specific dimnesionality
    """
    def __init__(self,d_model, max_length):
        """:param max_length: number of columns of input embedding [not used for now]
        :param d_model: number of rows to represent input features of the transformer"""
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model
        
    def forward(self, x):
        # get the length of sequency from input
        seq_len = x.shape[1]
        
        even_i = torch.arange(0, self.d_model, 2).float()
        even_den = pow(10000, even_i/self.d_model)
        pos = torch.arange(seq_len).float().unsqueeze(1)
        
        even_pe = torch.sin(pos/even_den)
        odd_pe = torch.cos(pos/even_den)
        stacked = torch.stack([even_pe, odd_pe], dim=2)
        stacked = torch.flatten(stacked, start_dim=1, end_dim=2)
        return stacked
 
pe = PositionalEncoding(128,40)


Next, we will build the Transformer encoder blocks, which consists of multi-headed attention layer, followed by a feed-forward layer (as in the paper "Attention is all you need")


In [25]:
import torch
import torch.nn as nn


class MultiHeadSplit(nn.Module):
    """split the input data into multiple heads using linear layers"""
    def __init__(self, d_model:int, d_k:int, heads:int= 8, bias=False):
        """
        :param d_model: input feature dimension
        :param d_k: dimension of each head 
        :param heads: number of heads to split each input sample into
        :param bias: whether to apply bias term into linear layer"""
        super().__init__()
        # copy params into local attributes of the class
        self.d_model = d_model
        self.d_k = d_k
        self.heads = heads

        # assert that numbers check
        assert(self.d_k * self.heads == self.d_model), "heads number isn't compatible with dimesions!"
        
        
        # create a linear layer to create the different heads
        self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
        
    def forward(self, x):
        """x is of shape [-1, seq_length, d_model]"""
        # change the view of the input so that trainable features (d_model) is the last
        shape = x.shape[:-1]
        # split x into multiple heads
        x = self.linear(x)
        
        
        # split last dimension into two 
        x = x.view(*shape, self.heads, self.d_k)
        x = torch.transpose(x, -2, -3)
        

        # return output
        return x
    

In [26]:
mh = MultiHeadSplit(128, 16)
summary(mh, (40, 128))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1              [-1, 40, 128]          16,384
Total params: 16,384
Trainable params: 16,384
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.02
Forward/backward pass size (MB): 0.04
Params size (MB): 0.06
Estimated Total Size (MB): 0.12
----------------------------------------------------------------


next, we will create the scaled dot-product attention layer, which will be used to extract meaningful relations between different parts of the audio track

In [27]:
class SelfAttention(nn.Module):
    def __init__(self, d_model, heads, d_k, bias=False, dropout=0.1): # dropout is set randomly 
        super().__init__()
        
        self.d_model = d_model
        self.heads = heads
        self.d_k = d_k
        
        # split input into three matrecies : q: query, k: key, v: value using linear layers
        self.query = MultiHeadSplit(d_model, d_k, heads, bias)
        self.key = MultiHeadSplit(d_model, d_k, heads, bias)
        self.value = MultiHeadSplit(d_model, d_k, heads, bias)
        
        # define necessery activations
        self.softmax = nn.Softmax(dim= -1) # the input to the softmax has shape (t_dim, t_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = 1 / (self.d_k)**(0.5)
        
        output = nn
        
    def get_att(self, q, k):
        """
        q shape (B, h, q_len, d)
        k shape (B, h, k_len, d)
        
        :return : output of shape (B, h, q_len, k_len)
            """
        out = torch.einsum('bhqd,bhkd->bhqk', q, k)
        return out
    
    
    def forward(self, x, mask=None):
        """
        :param x : input of shape (B, seq_length, d_model) 
        :param mask : mask of shape (B, seq_length)
        
        :return output of shape (B, seq_length, d_v)
        """
                
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        
        # apply matmul
        att = self.get_att(q, k)
        
        # apply scale
        att = att * self.scale
        
      
        
        # apply mask
        if mask is not None:
            att = att.masked_fill(mask==0, -1 * torch.inf)
            
        # apply softmax
        att = self.softmax(att)
        
        # apply matmul
        att = torch.einsum('bhqk,bhkd->bqhd', att, v)
        
        # concatenate all heads in one (bhqd -> bq(d*h)
        shape = att.shape[:-2]

        out = att.reshape(*shape, att.shape[-1] * att.shape[-2])

        
        return out
    
  
        
from torchsummary import summary 


        

In [29]:
# next we will implement the feed forward layer 

class FeedForward(nn.Module):
    def __init__(self, d_model, n_hidden, dropout=0.1):
        """
        :param n_hidden number of feed forward features in the hidden layer"""
        super().__init__()
        self.d_model = d_model
        self.n_hidden = n_hidden
        
        # First linear layer
        self.linear1 = nn.Linear(d_model, n_hidden)
        
        #Second Linear layer
        self.linear2 = nn.Linear(n_hidden, d_model)
        
        #dropout layer
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # apply first linear
        x = self.linear1(x)
        x = nn.functional.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x
    

ff = FeedForward(128, 1024)
summary(ff, (40, 128))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1             [-1, 40, 1024]         132,096
           Dropout-2             [-1, 40, 1024]               0
            Linear-3              [-1, 40, 128]         131,200
Total params: 263,296
Trainable params: 263,296
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.02
Forward/backward pass size (MB): 0.66
Params size (MB): 1.00
Estimated Total Size (MB): 1.69
----------------------------------------------------------------


In [31]:
class LayerNorm(nn.Module):
    def __init__(self, input_shape):
        super().__init__()
        self.norm = nn.LayerNorm(normalized_shape=input_shape)
    
    def forward(self, x):
        return self.norm(x)
    
n = LayerNorm((128))
summary(n, (40,128))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         LayerNorm-1              [-1, 40, 128]             256
Total params: 256
Trainable params: 256
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.02
Forward/backward pass size (MB): 0.04
Params size (MB): 0.00
Estimated Total Size (MB): 0.06
----------------------------------------------------------------


In [32]:
# now we implement the encoder block 

class EncoderBlock(nn.Module):
    """
    This layer consists of self attention layer, followed by add and norm layer, then a feed forward layer, followed by add and norm layer """
    def __init__(self, d_model, d_k, heads, n_hidden=1024, dropout=0.1, bias=False):
        super().__init__()
        self.d_model = d_model
        self.heads = heads
        self.d_k = d_k
        self.bias = bias
        
        # define a self attention layer
        self.self_attention = SelfAttention(d_model, heads, d_k, bias, dropout)
        
        # norm layer
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        
        # feed forward layer
        self.ff = FeedForward(d_model, n_hidden=n_hidden,dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        
    
    def forward(self, x):
        """
        x has a shape of (B, seq_length, d_model)"""
        
        att = self.self_attention(x)
        
        x = self.norm1(x + att)
        
        x = self.dropout(x)
        ff = self.ff(x)
        
        x = self.norm2(x + ff)
        x = self.dropout(x)
        
        return x
        
        
        
enc = EncoderBlock(128, 16, 8)

summary(enc, (40, 128))
        

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1              [-1, 40, 128]          16,384
    MultiHeadSplit-2            [-1, 8, 40, 16]               0
            Linear-3              [-1, 40, 128]          16,384
    MultiHeadSplit-4            [-1, 8, 40, 16]               0
            Linear-5              [-1, 40, 128]          16,384
    MultiHeadSplit-6            [-1, 8, 40, 16]               0
           Softmax-7            [-1, 8, 40, 40]               0
     SelfAttention-8              [-1, 40, 128]               0
         LayerNorm-9              [-1, 40, 128]             256
        LayerNorm-10              [-1, 40, 128]               0
          Dropout-11              [-1, 40, 128]               0
           Linear-12             [-1, 40, 1024]         132,096
          Dropout-13             [-1, 40, 1024]               0
           Linear-14              [-1, 

In [34]:
# the encoder sonsists of a positional embedding block + sequence of n encoder blocks 

import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, max_length, d_model, d_k, heads,n_encoders=1, n_hidden=1024, dropout=0.1, bias=False ):
        super().__init__()
        
        # self.pe = PositionalEncoding(d_model, max_length)
      
        self.enc_blocks = nn.ModuleList([
            EncoderBlock(d_model, d_k, heads, n_hidden, dropout, bias) for i in range(n_encoders)
        ])
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, x): 
        """x is of shape (B, seq_length, d_model)
        :return output of shape (B, seq_length, d_model)"""

        seq_length = x[1]
        #pe = self.pe()[:seq_length,:]
        # add positional information 
        x = x #+ pe
        
        # inject new input to encoder blocks 
        
        for layer in self.enc_blocks:
            x = layer(x)        
        return x
        
enc = Encoder(40, 128, 16,8)
summary(enc, (40,128))
        

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1              [-1, 40, 128]          16,384
    MultiHeadSplit-2            [-1, 8, 40, 16]               0
            Linear-3              [-1, 40, 128]          16,384
    MultiHeadSplit-4            [-1, 8, 40, 16]               0
            Linear-5              [-1, 40, 128]          16,384
    MultiHeadSplit-6            [-1, 8, 40, 16]               0
           Softmax-7            [-1, 8, 40, 40]               0
     SelfAttention-8              [-1, 40, 128]               0
         LayerNorm-9              [-1, 40, 128]             256
        LayerNorm-10              [-1, 40, 128]               0
          Dropout-11              [-1, 40, 128]               0
           Linear-12             [-1, 40, 1024]         132,096
          Dropout-13             [-1, 40, 1024]               0
           Linear-14              [-1, 

In [35]:
# the full architechture consists of : input embedding + encoder + global average 
# we will experiment with global averaging and cls token [like BERT]
import torch 
import torch.nn as nn

class Classifier(nn.Module):
    def __init__(self, n_classes, d_model, seq_length):
        super().__init__()
        self.average_pooling = nn.AvgPool1d(kernel_size=seq_length, stride=seq_length)
        self.flat = nn.Flatten(start_dim = 1, end_dim=2)
        self.linear = nn.Linear(d_model, n_classes)
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self, x):
       
        # add channel dimension 
        x = x.unsqueeze(dim=1)
      
        # transpose input 
        x = x.transpose(-1,-2)
        
        # flatten input 
        x = self.flat(x)
    
        # apply avgPool
        x = self.average_pooling(x)
        
        # remove added dimension
        x = x.squeeze(dim=-1)
        
        # apply linear layer 
        x = self.linear(x)
        
        # apply softmax 
        x = self.softmax(x)
        
        return x
    
cls = Classifier(5, 128, 40)
summary(cls, (40, 128))
        

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1              [-1, 128, 40]               0
         AvgPool1d-2               [-1, 128, 1]               0
            Linear-3                    [-1, 5]             645
           Softmax-4                    [-1, 5]               0
Total params: 645
Trainable params: 645
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.02
Forward/backward pass size (MB): 0.04
Params size (MB): 0.00
Estimated Total Size (MB): 0.06
----------------------------------------------------------------


In [146]:
# the full arch consists of : input embedding + encoder + global average 
# we will experiment with global averaging and cls token [like BERT]
import torch 
import torch.nn as nn

class Classifier2(nn.Module):
    def __init__(self, n_classes, d_model, seq_length):
        super().__init__()
        self.linear = nn.Linear(d_model, n_classes)
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self, x):
        # apply linear layer 
        x = self.linear(x)
        
        # apply softmax 
        x = self.softmax(x)
        
        return x
    
cls2 = Classifier2(10, 128, 40)
my_set[0][0].shape



(64, 414)

In [52]:
# we will encapsulate the whole work under one class 

class SERT(nn.Module):
    def __init__(self,d_input,
                 max_length,
                 d_model,
                 d_k,
                 heads,
                 n_classes,
                 n_encoders=1,
                 n_hidden=1024,
                 dropout=0.1,
                 bias=False,
                 stride=10,
                 kernel_size=16 ):
        super().__init__()
        self.input = input_embedding(d_input, d_model, stride, kernel_size)
        self.enc_layers = nn.ModuleList([
            Encoder(max_length,
                    d_model,
                    d_k,
                    heads,
                    n_encoders=1,
                    n_hidden=1024,
                    dropout=0.1,
                    bias=False) for _ in range(n_encoders)
        ])
        self.output= Classifier(n_classes, d_model, max_length)
    
    def forward(self, x):
        x = self.input(x)
        for layer in self.enc_layers:
            x = layer(x)
            
        x = self.output(x)
        return x
        
sert_model = SERT(d_input=64, max_length=35, d_model=128,d_k=16,heads=8,n_classes=9)

        

# setting up the dataloader, the optimizer, and the training loop



In [48]:
# first we will split the dataset into train_test_validation sets 
from torch.utils.data import DataLoader, Dataset
import torch.utils.data as torchdata 


def train_test_val_split(dataset, train_test,train_val):
    """splits dataset into 3 groups:
    1- train dataset (size = train_test*train_val)
    2 - test dataset (size = (1-train_test))
    3- validation dataset (size = (train_test * (1-train_val))
    """

    train_ds, test_ds = torchdata.random_split(dataset, [int(train_test * len(dataset)), len(dataset) - int(train_test * len(dataset))])
    train_ds, val_ds = torchdata.random_split(train_ds, [int(train_val * len(train_ds)), len(train_ds) - int(train_val * len(train_ds))])
    train_size = train_test * train_val
    test_size = 1 - train_size
    val_size = train_test * (1 - train_val)
    print("dataset is split as follows:\n\ntrain : {}({})\n\ntest : {}({})\n\nvalidation : {}({})".format(len(train_ds), round(train_size,2), len(test_ds), round(test_size, 2), len(val_ds), round(val_size, 2)))
    return train_ds, test_ds, val_ds


train,test,val = train_test_val_split(my_set, 0.8, 0.8)

# after splitting, we will define the dataloader
train_dataloader = torchdata.DataLoader(dataset=train, batch_size=64, shuffle=True)
validation_dataloader = torchdata.DataLoader(dataset=val, batch_size=64, shuffle=True)
test_dataloader = torchdata.DataLoader(dataset=test, batch_size=64, shuffle=True)


dataset is split as follows:

train : 4566(0.64)

test : 1427(0.36)

validation : 1142(0.16)


In [53]:
# first we will split the dataset into train_test_validation sets 
from torch.utils.data import DataLoader, Dataset
import torch.utils.data as torchdata 
import torch.optim
import torch.nn as nn
import torch.nn.functional as F


def train_test_val_split(dataset, train_test,train_val):
    """splits dataset into 3 groups:
    1- train dataset (size = train_test*train_val)
    2 - test dataset (size = (1-train_test))
    3- validation dataset (size = (train_test * (1-train_val))
    """

    train_ds, test_ds = torchdata.random_split(dataset, [int(train_test * len(dataset)), len(dataset) - int(train_test * len(dataset))])
    train_ds, val_ds = torchdata.random_split(train_ds, [int(train_val * len(train_ds)), len(train_ds) - int(train_val * len(train_ds))])
    train_size = train_test * train_val
    test_size = 1 - train_size
    val_size = train_test * (1 - train_val)
    print("dataset is split as follows:\n\ntrain : {}({})\n\ntest : {}({})\n\nvalidation : {}({})".format(len(train_ds), round(train_size,2), len(test_ds), round(test_size, 2), len(val_ds), round(val_size, 2)))
    return train_ds, test_ds, val_ds


train,test,val = train_test_val_split(my_set, 0.8, 0.8)

# after splitting, we will define the dataloader
train_dataloader = torchdata.DataLoader(dataset=train, batch_size=32, shuffle=True)
validation_dataloader = torchdata.DataLoader(dataset=val, batch_size=32, shuffle=True)
test_dataloader = torchdata.DataLoader(dataset=test, batch_size=32, shuffle=True)


# specify the optimizer (adam) and the loss function
optimizer = torch.optim.Adam(sert_model.parameters(), lr=0.0001)


loss_fn = nn.CrossEntropyLoss()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

sert_model = sert_model.to(device)

batch = next(iter(train_dataloader))
y = batch[1].to(device)
batch = batch[0]
batch.shape
y_pred = sert_model(batch.to(device))
y_pred
predicted = torch.argmax(y_pred, dim=1)
print(predicted)
(y == predicted).sum().item()

dataset is split as follows:

train : 4566(0.64)

test : 1427(0.36)

validation : 1142(0.16)
cpu
tensor([3, 3, 3, 0, 3, 3, 5, 3, 0, 0, 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 0, 0, 5,
        3, 3, 3, 8, 5, 3, 0, 3])


5

## building training loop:

In [55]:
from tqdm.auto import  tqdm
import torch.nn.functional as F
from torchmetrics.classification import MulticlassAccuracy
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/log_sert_model")
# set number of epochs (keep it small for initial investigation)
num_epochs = 10

torch.manual_seed(24)

# add a timer 

for epoch in tqdm(range(num_epochs)):
    print(f"epoch : {epoch}\n")
    
    acc_loss = 0.0
    num_correct = 0
    # loop through batches
    for i, (X,y) in enumerate(train_dataloader):
        
        # move input to device
        X = X.to(device);
        y = y.to(device);
        # set the model to train mode
        sert_model.train();
        
        # get the predictions (forward pass)
        y_pred = sert_model(X);
        
        # apply loss
        current_loss = F.cross_entropy(y_pred, y);
        
        
        
        predicted = torch.argmax(y_pred, dim=1)
        
        # accumulate loss
        acc_loss += current_loss;
        
        # get number of correct predictions of each batch
        num_correct += (y == predicted).sum().item()
        
        # zero_grad the optimizer 
        optimizer.zero_grad();
        
        # step the loss
        current_loss.backward();
        
        # step the optimizer
        optimizer.step();
        
        if (i+1) % 15 == 0:
            print(f"Epoch[{epoch + 1}/{num_epochs}] batch[{i}/{len(train_dataloader)}] Loss[{acc_loss/15:.4f}]")
            writer.add_scalar('training loss', acc_loss / 15, epoch * len(train_dataloader) + i)
            writer.add_scalar('training accuracy', num_correct / 15, epoch * len(train_dataloader) + i)
            acc_loss = 0
            num_correct = 0
            
            
            # average loss per epoch
acc_loss /= len(train_dataloader)

### testing 
test_loss = 0
test_acc = 0
sert_model.eval()
with torch.inference_mode():
    for i, (X,y) in enumerate(validation_dataloader):
        X = X.to(device)
        y = y.to(device)
        # get the prediction 
        y_pred = sert_model(X)

        # calculate the loss
        test_loss += loss(y_pred, y)

    # show resutls 
    test_loss /= len(validation_dataloader)
print(f" train_losss : {acc_loss:4f} | test_loss: {test_loss:.4f}")



  0%|                                                    | 0/10 [00:00<?, ?it/s]

epoch : 0

Epoch[1/10] batch[14/143] Loss[1.9360]
Epoch[1/10] batch[29/143] Loss[1.9490]
Epoch[1/10] batch[44/143] Loss[1.9633]
Epoch[1/10] batch[59/143] Loss[1.9694]
Epoch[1/10] batch[74/143] Loss[1.9395]
Epoch[1/10] batch[89/143] Loss[1.9461]
Epoch[1/10] batch[104/143] Loss[1.9295]
Epoch[1/10] batch[119/143] Loss[1.9907]
Epoch[1/10] batch[134/143] Loss[1.9882]


 10%|████▍                                       | 1/10 [00:23<03:28, 23.14s/it]

epoch : 1

Epoch[2/10] batch[14/143] Loss[1.9543]
Epoch[2/10] batch[29/143] Loss[1.9593]
Epoch[2/10] batch[44/143] Loss[1.9410]
Epoch[2/10] batch[59/143] Loss[1.9616]
Epoch[2/10] batch[74/143] Loss[1.9576]
Epoch[2/10] batch[89/143] Loss[1.9305]
Epoch[2/10] batch[104/143] Loss[1.9487]
Epoch[2/10] batch[119/143] Loss[1.9288]
Epoch[2/10] batch[134/143] Loss[1.9336]


 20%|████████▊                                   | 2/10 [00:46<03:07, 23.42s/it]

epoch : 2

Epoch[3/10] batch[14/143] Loss[1.9261]
Epoch[3/10] batch[29/143] Loss[1.9276]
Epoch[3/10] batch[44/143] Loss[1.9266]
Epoch[3/10] batch[59/143] Loss[1.9258]
Epoch[3/10] batch[74/143] Loss[1.9466]
Epoch[3/10] batch[89/143] Loss[1.9322]
Epoch[3/10] batch[104/143] Loss[1.9698]
Epoch[3/10] batch[119/143] Loss[1.9721]
Epoch[3/10] batch[134/143] Loss[1.9556]


 30%|█████████████▏                              | 3/10 [01:09<02:42, 23.24s/it]

epoch : 3

Epoch[4/10] batch[14/143] Loss[1.8929]
Epoch[4/10] batch[29/143] Loss[1.9578]
Epoch[4/10] batch[44/143] Loss[1.9222]
Epoch[4/10] batch[59/143] Loss[1.9343]
Epoch[4/10] batch[74/143] Loss[1.9127]
Epoch[4/10] batch[89/143] Loss[1.9573]
Epoch[4/10] batch[104/143] Loss[1.9330]
Epoch[4/10] batch[119/143] Loss[1.9451]
Epoch[4/10] batch[134/143] Loss[1.9461]


 40%|█████████████████▌                          | 4/10 [01:32<02:18, 23.09s/it]

epoch : 4

Epoch[5/10] batch[14/143] Loss[1.9153]
Epoch[5/10] batch[29/143] Loss[1.9250]
Epoch[5/10] batch[44/143] Loss[1.9040]
Epoch[5/10] batch[59/143] Loss[1.9096]
Epoch[5/10] batch[74/143] Loss[1.9059]
Epoch[5/10] batch[89/143] Loss[1.9399]
Epoch[5/10] batch[104/143] Loss[1.9567]
Epoch[5/10] batch[119/143] Loss[1.9518]
Epoch[5/10] batch[134/143] Loss[1.9607]


 50%|██████████████████████                      | 5/10 [01:55<01:55, 23.05s/it]

epoch : 5



 50%|██████████████████████                      | 5/10 [01:56<01:56, 23.28s/it]


KeyboardInterrupt: 

In [None]:
# experiment with tensorboard
from torch.utils.tensorboard import SummaryWriter
import sys

writer = SummaryWriter("runs/SER")
train_it = iter(train_dataloader)

batch,_ = next(train_it)
writer.add_graph(sert_model, batch.to(device))
writer.close()
torch.cuda.is_available()


1. get model hyperparameters into a dict
2. train the model, storing all loss values and accuracies along the way in a list
3. save the best performing model in an external dir.

In [57]:
import time 
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter

def model_train(model,train_dataset, val_dataset, num_epochs, device, loss_fn, metric, optimizer, ext_dir):
    """returns information about the model, the results of training
    :param ext_dir: directory to save the state of the best performing model according to the metric function
    """
    
    summary = {
        'model name': model.__class__.__name__,
        'device': device,
        'loss_fn': loss_fn,
        'metric': metric, 
        'optimizer':optimizer,
        'num_epochs' : num_epochs,
    }
    
    batch_size = train_dataset.batch_size
    
    print(f"start training with :\n total epochs: {num_epochs}\n batch size: {train_dataset.batch_size}\n total batches: {len(train_dataset)}")
    
    #set a timer to measure the training time 
    start_timer = time.time()
    
    
    model = model.to(device)
    
    # log stride = 10% of the number of batches to loop through, this is used to print results while training 
    log_stride = int(0.1 * len(train_dataset))
    
    # add model graph to tensorboard
    batch, _ = next(iter(train_dataset))
    writer = SummaryWriter(f"runs/log_sert_model")
    writer.add_graph(model, batch.to(device))
    
    
    # save the best performing model
    max_acc = 0.0
    
    
    for epoch in tqdm(range(num_epochs)):
        # we will calculate average loss per epoch, average accuracy per epoch
        loss_epoch = 0.0
        accuracy_epoch = 0.0
        
        # average metrics per log_stride
        loss_stride = 0.0
        correct_stride = 0
        
        # save metrics for logging
        train_loss =[]
        train_acc = []
        eval_loss = []
        eval_acc = []
            
        model.train()
        for it, (X, y) in enumerate(train_dataset):
            X = X.to(device)
            y = y.to(device)
            
            y_pred = model(X)
            
            # calculate the loss
            loss =loss_fn(y_pred, y)
            loss_stride += loss
            train_loss.append((loss, epoch, it))
            
            
            # get correct predictions
            predictions = torch.argmax(y_pred, dim=1)
           
            
            correct_stride += (y == predictions).sum().item()
            train_acc.append(((correct_stride)/batch_size, epoch, it))
            
            # zero grad the optim.
            optimizer.zero_grad()
            
            loss.backward()
            
            optimizer.step()
            
            if (it + 1) % log_stride == 0:
                # print results
                print(f"epoch[{epoch + 1}/{num_epochs}] | batch[{it + 1}/{len(train_dataset)}] train_loss: {loss_stride/ (log_stride):.4f} | accuracy: {correct_stride/(log_stride * batch_size)}")
                
                # send scalars to tensorboard
                writer.add_scalar('train_loss',loss_stride/ (log_stride), epoch * len(train_dataset) + it)
                writer.add_scalar('train accuracy', correct_stride/(log_stride * batch_size), epoch * len(train_dataset) + it)
                
                # reset parameters
                loss_epoch += loss_stride
                accuracy_epoch += correct_stride
                loss_stride = 0.0
                correct_stride = 0
        
        print("validation:")
        model.eval()
        loss_val = 0.0
        acc_val = 0.0
        
        
        with torch.inference_mode() :
            correct = 0
            for (X_test, y_test) in val_dataset:
                X_test = X_test.to(device)
                y_test = y_test.to(device)
                
                y_pred = model(X_test)
                loss_val += loss_fn(y_pred, y_test)
                predictions = torch.argmax(y_pred, dim=1)
                
                correct += (predictions == y_test).sum().item()
                
        eval_loss.append((loss_val / len(val_dataset), epoch))
        eval_acc.append(((correct/(len(val_dataset) * batch_size)), epoch))
            
        train_accuracy = accuracy_epoch/(batch_size * len(train_dataset))
        print(f"epoch: {epoch} summary:\n train_loss: {loss_epoch / len(train_dataset)}, validation loss: {loss_val / len(val_dataset)}, train accuracy: {accuracy_epoch/(len(train_dataset) * batch_size)}")
        
        summary['train_loss'] = loss_epoch / len(train_dataset)
        summary['validation_loss'] = loss_val / len(val_dataset)
        summary['validation_accuracy'] = correct/(len(val_dataset) * batch_size)
        writer.add_scalar('validation_loss',eval_loss[-1][0],  epoch)
        writer.add_scalar('validation_accuracy',eval_acc[-1][0],  epoch)
        
        if train_accuracy > max_acc:
            max_acc = train_accuracy
            torch.save(model.state_dict(), ext_dir)
    
    total_time = (time.time() - start_timer)
    summary['total_time'] = total_time
    writer.close()
    return summary, train_accuracy, train_loss, eval_acc, eval_loss

In [58]:
model = SERT(64,40, d_model=128,d_k= 16, heads=8,n_classes=9,n_encoders=1,dropout=0, n_hidden=768)
train,test,val = train_test_val_split(my_set, 0.8, 0.8)

# after splitting, we will define the dataloader
train_dataloader = torchdata.DataLoader(dataset=train, batch_size=64)
validation_dataloader = torchdata.DataLoader(dataset=val, batch_size=64, shuffle=True)
test_dataloader = torchdata.DataLoader(dataset=test, batch_size=32, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.00001)
summary = model_train(model, train_dataloader, validation_dataloader, num_epochs=10, device=device, loss_fn=loss_fn, metric=acc_loss, optimizer=optimizer, ext_dir='./model')


dataset is split as follows:

train : 4566(0.64)

test : 1427(0.36)

validation : 1142(0.16)
start training with :
 total epochs: 10
 batch size: 64
 total batches: 72


  0%|                                                    | 0/10 [00:00<?, ?it/s]

epoch[1/10] | batch[7/72] train_loss: 2.1969 | accuracy: 0.11160714285714286
epoch[1/10] | batch[14/72] train_loss: 2.1926 | accuracy: 0.15625
epoch[1/10] | batch[21/72] train_loss: 2.1837 | accuracy: 0.20758928571428573
epoch[1/10] | batch[28/72] train_loss: 2.1761 | accuracy: 0.19419642857142858
epoch[1/10] | batch[35/72] train_loss: 2.1679 | accuracy: 0.22321428571428573
epoch[1/10] | batch[42/72] train_loss: 2.1595 | accuracy: 0.21205357142857142
epoch[1/10] | batch[49/72] train_loss: 2.1546 | accuracy: 0.203125
epoch[1/10] | batch[56/72] train_loss: 2.1500 | accuracy: 0.21875
epoch[1/10] | batch[63/72] train_loss: 2.1428 | accuracy: 0.25669642857142855
epoch[1/10] | batch[70/72] train_loss: 2.1335 | accuracy: 0.25
validation:


 10%|████▍                                       | 1/10 [00:21<03:15, 21.73s/it]

epoch: 0 summary:
 train_loss: 2.105602741241455, validation loss: 2.13472056388855, train accuracy: 0.1976996527777778
epoch[2/10] | batch[7/72] train_loss: 2.1178 | accuracy: 0.26339285714285715
epoch[2/10] | batch[14/72] train_loss: 2.1168 | accuracy: 0.27232142857142855
epoch[2/10] | batch[21/72] train_loss: 2.1202 | accuracy: 0.23660714285714285
epoch[2/10] | batch[28/72] train_loss: 2.1119 | accuracy: 0.24776785714285715
epoch[2/10] | batch[35/72] train_loss: 2.1109 | accuracy: 0.2544642857142857
epoch[2/10] | batch[42/72] train_loss: 2.1106 | accuracy: 0.25
epoch[2/10] | batch[49/72] train_loss: 2.1127 | accuracy: 0.22098214285714285
epoch[2/10] | batch[56/72] train_loss: 2.1150 | accuracy: 0.22991071428571427
epoch[2/10] | batch[63/72] train_loss: 2.1045 | accuracy: 0.2611607142857143
epoch[2/10] | batch[70/72] train_loss: 2.0972 | accuracy: 0.25223214285714285
validation:


 20%|████████▊                                   | 2/10 [00:47<03:13, 24.17s/it]

epoch: 1 summary:
 train_loss: 2.053112030029297, validation loss: 2.103925943374634, train accuracy: 0.2419704861111111
epoch[3/10] | batch[7/72] train_loss: 2.0841 | accuracy: 0.28348214285714285
epoch[3/10] | batch[14/72] train_loss: 2.0839 | accuracy: 0.2767857142857143
epoch[3/10] | batch[21/72] train_loss: 2.0915 | accuracy: 0.25
epoch[3/10] | batch[28/72] train_loss: 2.0826 | accuracy: 0.29017857142857145
epoch[3/10] | batch[35/72] train_loss: 2.0871 | accuracy: 0.296875
epoch[3/10] | batch[42/72] train_loss: 2.0888 | accuracy: 0.265625
epoch[3/10] | batch[49/72] train_loss: 2.0934 | accuracy: 0.25223214285714285
epoch[3/10] | batch[56/72] train_loss: 2.0993 | accuracy: 0.25
epoch[3/10] | batch[63/72] train_loss: 2.0865 | accuracy: 0.3013392857142857
epoch[3/10] | batch[70/72] train_loss: 2.0783 | accuracy: 0.29910714285714285
validation:


 30%|█████████████▏                              | 3/10 [01:16<03:03, 26.23s/it]

epoch: 2 summary:
 train_loss: 2.0295567512512207, validation loss: 2.0850465297698975, train accuracy: 0.2688802083333333
epoch[4/10] | batch[7/72] train_loss: 2.0670 | accuracy: 0.3125
epoch[4/10] | batch[14/72] train_loss: 2.0639 | accuracy: 0.3325892857142857
epoch[4/10] | batch[21/72] train_loss: 2.0715 | accuracy: 0.31026785714285715
epoch[4/10] | batch[28/72] train_loss: 2.0635 | accuracy: 0.32589285714285715
epoch[4/10] | batch[35/72] train_loss: 2.0710 | accuracy: 0.3325892857142857
epoch[4/10] | batch[42/72] train_loss: 2.0711 | accuracy: 0.31473214285714285
epoch[4/10] | batch[49/72] train_loss: 2.0761 | accuracy: 0.29017857142857145
epoch[4/10] | batch[56/72] train_loss: 2.0848 | accuracy: 0.25892857142857145
epoch[4/10] | batch[63/72] train_loss: 2.0668 | accuracy: 0.3325892857142857
epoch[4/10] | batch[70/72] train_loss: 2.0639 | accuracy: 0.33482142857142855
validation:


 40%|█████████████████▌                          | 4/10 [01:43<02:40, 26.76s/it]

epoch: 3 summary:
 train_loss: 2.0124616622924805, validation loss: 2.068875312805176, train accuracy: 0.3057725694444444
epoch[5/10] | batch[7/72] train_loss: 2.0519 | accuracy: 0.34375
epoch[5/10] | batch[14/72] train_loss: 2.0476 | accuracy: 0.34598214285714285
epoch[5/10] | batch[21/72] train_loss: 2.0536 | accuracy: 0.36160714285714285
epoch[5/10] | batch[28/72] train_loss: 2.0455 | accuracy: 0.36160714285714285
epoch[5/10] | batch[35/72] train_loss: 2.0561 | accuracy: 0.34151785714285715
epoch[5/10] | batch[42/72] train_loss: 2.0549 | accuracy: 0.328125
epoch[5/10] | batch[49/72] train_loss: 2.0603 | accuracy: 0.31026785714285715
epoch[5/10] | batch[56/72] train_loss: 2.0704 | accuracy: 0.27901785714285715
epoch[5/10] | batch[63/72] train_loss: 2.0448 | accuracy: 0.3638392857142857
epoch[5/10] | batch[70/72] train_loss: 2.0523 | accuracy: 0.32142857142857145
validation:


 50%|██████████████████████                      | 5/10 [02:12<02:17, 27.57s/it]

epoch: 4 summary:
 train_loss: 1.9966875314712524, validation loss: 2.0543887615203857, train accuracy: 0.3263888888888889
epoch[6/10] | batch[7/72] train_loss: 2.0350 | accuracy: 0.3705357142857143
epoch[6/10] | batch[14/72] train_loss: 2.0333 | accuracy: 0.36830357142857145
epoch[6/10] | batch[21/72] train_loss: 2.0397 | accuracy: 0.38392857142857145
epoch[6/10] | batch[28/72] train_loss: 2.0299 | accuracy: 0.3794642857142857
epoch[6/10] | batch[35/72] train_loss: 2.0446 | accuracy: 0.3482142857142857
epoch[6/10] | batch[42/72] train_loss: 2.0413 | accuracy: 0.35267857142857145
epoch[6/10] | batch[49/72] train_loss: 2.0497 | accuracy: 0.31026785714285715
epoch[6/10] | batch[56/72] train_loss: 2.0570 | accuracy: 0.29910714285714285
epoch[6/10] | batch[63/72] train_loss: 2.0284 | accuracy: 0.3861607142857143
epoch[6/10] | batch[70/72] train_loss: 2.0440 | accuracy: 0.33705357142857145
validation:


 60%|██████████████████████████▍                 | 6/10 [02:42<01:52, 28.12s/it]

epoch: 5 summary:
 train_loss: 1.9836055040359497, validation loss: 2.0445990562438965, train accuracy: 0.34375
epoch[7/10] | batch[7/72] train_loss: 2.0244 | accuracy: 0.38392857142857145
epoch[7/10] | batch[14/72] train_loss: 2.0229 | accuracy: 0.37723214285714285
epoch[7/10] | batch[21/72] train_loss: 2.0310 | accuracy: 0.38392857142857145
epoch[7/10] | batch[28/72] train_loss: 2.0198 | accuracy: 0.3950892857142857
epoch[7/10] | batch[35/72] train_loss: 2.0374 | accuracy: 0.3549107142857143
epoch[7/10] | batch[42/72] train_loss: 2.0328 | accuracy: 0.36160714285714285
epoch[7/10] | batch[49/72] train_loss: 2.0423 | accuracy: 0.31473214285714285
epoch[7/10] | batch[56/72] train_loss: 2.0481 | accuracy: 0.31026785714285715
epoch[7/10] | batch[63/72] train_loss: 2.0197 | accuracy: 0.3861607142857143
epoch[7/10] | batch[70/72] train_loss: 2.0355 | accuracy: 0.3392857142857143
validation:


 70%|██████████████████████████████▊             | 7/10 [03:11<01:25, 28.53s/it]

epoch: 6 summary:
 train_loss: 1.9749565124511719, validation loss: 2.0365703105926514, train accuracy: 0.3506944444444444
epoch[8/10] | batch[7/72] train_loss: 2.0161 | accuracy: 0.3794642857142857
epoch[8/10] | batch[14/72] train_loss: 2.0141 | accuracy: 0.38392857142857145
epoch[8/10] | batch[21/72] train_loss: 2.0227 | accuracy: 0.3794642857142857
epoch[8/10] | batch[28/72] train_loss: 2.0109 | accuracy: 0.40625
epoch[8/10] | batch[35/72] train_loss: 2.0327 | accuracy: 0.359375
epoch[8/10] | batch[42/72] train_loss: 2.0259 | accuracy: 0.3705357142857143
epoch[8/10] | batch[49/72] train_loss: 2.0363 | accuracy: 0.3236607142857143
epoch[8/10] | batch[56/72] train_loss: 2.0421 | accuracy: 0.33482142857142855
epoch[8/10] | batch[63/72] train_loss: 2.0128 | accuracy: 0.39285714285714285
epoch[8/10] | batch[70/72] train_loss: 2.0291 | accuracy: 0.35044642857142855
validation:


 80%|███████████████████████████████████▏        | 8/10 [03:40<00:57, 28.57s/it]

epoch: 7 summary:
 train_loss: 1.9680355787277222, validation loss: 2.03070330619812, train accuracy: 0.3578559027777778
epoch[9/10] | batch[7/72] train_loss: 2.0098 | accuracy: 0.3861607142857143
epoch[9/10] | batch[14/72] train_loss: 2.0084 | accuracy: 0.38839285714285715
epoch[9/10] | batch[21/72] train_loss: 2.0163 | accuracy: 0.37276785714285715
epoch[9/10] | batch[28/72] train_loss: 2.0035 | accuracy: 0.4107142857142857
epoch[9/10] | batch[35/72] train_loss: 2.0298 | accuracy: 0.359375
epoch[9/10] | batch[42/72] train_loss: 2.0208 | accuracy: 0.36607142857142855
epoch[9/10] | batch[49/72] train_loss: 2.0318 | accuracy: 0.33035714285714285
epoch[9/10] | batch[56/72] train_loss: 2.0369 | accuracy: 0.33482142857142855
epoch[9/10] | batch[63/72] train_loss: 2.0077 | accuracy: 0.38839285714285715
epoch[9/10] | batch[70/72] train_loss: 2.0233 | accuracy: 0.3549107142857143
validation:


 90%|███████████████████████████████████████▌    | 9/10 [04:06<00:28, 28.01s/it]

epoch: 8 summary:
 train_loss: 1.9627538919448853, validation loss: 2.026277780532837, train accuracy: 0.3589409722222222
epoch[10/10] | batch[7/72] train_loss: 2.0052 | accuracy: 0.3950892857142857
epoch[10/10] | batch[14/72] train_loss: 2.0033 | accuracy: 0.39732142857142855
epoch[10/10] | batch[21/72] train_loss: 2.0103 | accuracy: 0.375
epoch[10/10] | batch[28/72] train_loss: 1.9992 | accuracy: 0.40625
epoch[10/10] | batch[35/72] train_loss: 2.0266 | accuracy: 0.359375
epoch[10/10] | batch[42/72] train_loss: 2.0171 | accuracy: 0.37723214285714285
epoch[10/10] | batch[49/72] train_loss: 2.0268 | accuracy: 0.33035714285714285
epoch[10/10] | batch[56/72] train_loss: 2.0325 | accuracy: 0.328125
epoch[10/10] | batch[63/72] train_loss: 2.0037 | accuracy: 0.39285714285714285
epoch[10/10] | batch[70/72] train_loss: 2.0188 | accuracy: 0.35714285714285715
validation:


100%|███████████████████████████████████████████| 10/10 [04:33<00:00, 27.33s/it]

epoch: 9 summary:
 train_loss: 1.9583990573883057, validation loss: 2.0220940113067627, train accuracy: 0.3615451388888889





In [178]:
summary1 = model_train(model, train_dataloader, validation_dataloader, num_epochs=10, device=device, loss_fn=loss_fn, metric=acc_loss, optimizer=optimizer, ext_dir='./model')


start training with :
 total epochs: 10
 batch size: 32
 total batches: 143


  0%|                                                    | 0/10 [00:00<?, ?it/s]

epoch[1/10] | batch[14/143] train_loss: 1.9815 | accuracy: 0.390625
epoch[1/10] | batch[28/143] train_loss: 1.9821 | accuracy: 0.38839285714285715
epoch[1/10] | batch[42/143] train_loss: 1.9700 | accuracy: 0.41294642857142855
epoch[1/10] | batch[56/143] train_loss: 1.9368 | accuracy: 0.4486607142857143
epoch[1/10] | batch[70/143] train_loss: 1.9655 | accuracy: 0.4107142857142857
epoch[1/10] | batch[84/143] train_loss: 1.9938 | accuracy: 0.37276785714285715
epoch[1/10] | batch[98/143] train_loss: 1.9332 | accuracy: 0.4575892857142857
epoch[1/10] | batch[112/143] train_loss: 1.9581 | accuracy: 0.421875
epoch[1/10] | batch[126/143] train_loss: 1.9503 | accuracy: 0.4419642857142857
epoch[1/10] | batch[140/143] train_loss: 1.9496 | accuracy: 0.41964285714285715
validation:


 10%|████▍                                       | 1/10 [00:23<03:27, 23.06s/it]

epoch: 0 summary:
 train_loss: 1.92090904712677, validation loss: 1.992280125617981, train accuracy: 0.40777972027972026
epoch[2/10] | batch[14/143] train_loss: 1.9378 | accuracy: 0.4419642857142857
epoch[2/10] | batch[28/143] train_loss: 1.9476 | accuracy: 0.43080357142857145
epoch[2/10] | batch[42/143] train_loss: 1.9632 | accuracy: 0.4174107142857143
epoch[2/10] | batch[56/143] train_loss: 1.9671 | accuracy: 0.40848214285714285
epoch[2/10] | batch[70/143] train_loss: 1.9364 | accuracy: 0.45982142857142855
epoch[2/10] | batch[84/143] train_loss: 1.9705 | accuracy: 0.4174107142857143
epoch[2/10] | batch[98/143] train_loss: 1.9910 | accuracy: 0.375
epoch[2/10] | batch[112/143] train_loss: 1.9540 | accuracy: 0.4107142857142857
epoch[2/10] | batch[126/143] train_loss: 1.9753 | accuracy: 0.40401785714285715
epoch[2/10] | batch[140/143] train_loss: 1.9698 | accuracy: 0.4174107142857143
validation:


 20%|████████▊                                   | 2/10 [00:56<03:52, 29.07s/it]

epoch: 1 summary:
 train_loss: 1.9201096296310425, validation loss: 1.990226149559021, train accuracy: 0.40952797202797203
epoch[3/10] | batch[14/143] train_loss: 1.9954 | accuracy: 0.3705357142857143
epoch[3/10] | batch[28/143] train_loss: 1.9799 | accuracy: 0.4017857142857143
epoch[3/10] | batch[42/143] train_loss: 1.9594 | accuracy: 0.4375
epoch[3/10] | batch[56/143] train_loss: 1.9687 | accuracy: 0.41294642857142855
epoch[3/10] | batch[70/143] train_loss: 1.9409 | accuracy: 0.42857142857142855
epoch[3/10] | batch[84/143] train_loss: 1.9395 | accuracy: 0.4375
epoch[3/10] | batch[98/143] train_loss: 1.9481 | accuracy: 0.44642857142857145
epoch[3/10] | batch[112/143] train_loss: 1.9766 | accuracy: 0.38839285714285715
epoch[3/10] | batch[126/143] train_loss: 1.9349 | accuracy: 0.4486607142857143
epoch[3/10] | batch[140/143] train_loss: 1.9738 | accuracy: 0.3861607142857143
validation:


 30%|█████████████▏                              | 3/10 [01:28<03:34, 30.67s/it]

epoch: 2 summary:
 train_loss: 1.920564889907837, validation loss: 1.9880449771881104, train accuracy: 0.4071241258741259
epoch[4/10] | batch[14/143] train_loss: 1.9496 | accuracy: 0.43080357142857145
epoch[4/10] | batch[28/143] train_loss: 1.9688 | accuracy: 0.39955357142857145
epoch[4/10] | batch[42/143] train_loss: 1.9853 | accuracy: 0.390625
epoch[4/10] | batch[56/143] train_loss: 1.9833 | accuracy: 0.3861607142857143
epoch[4/10] | batch[70/143] train_loss: 1.9178 | accuracy: 0.46875
epoch[4/10] | batch[84/143] train_loss: 1.9402 | accuracy: 0.45535714285714285
epoch[4/10] | batch[98/143] train_loss: 1.9556 | accuracy: 0.41294642857142855
epoch[4/10] | batch[112/143] train_loss: 1.9610 | accuracy: 0.41517857142857145
epoch[4/10] | batch[126/143] train_loss: 1.9610 | accuracy: 0.41294642857142855
epoch[4/10] | batch[140/143] train_loss: 1.9553 | accuracy: 0.43080357142857145
validation:


 30%|█████████████▏                              | 3/10 [01:57<04:34, 39.22s/it]


KeyboardInterrupt: 

# 1. problem: fluctuating loss

## possible causes:
- dropout rate ( easy )
- small dataset ( hard )
- unnormalized data ( easy )
- small batch size ( easy )



# 2. problem: small accuracy:
## possible causes:

- needs fine-tuning hyperparameters ( takes time )
- features are not suitable ( takes experiments )





In [171]:

model2 = SERT(d_input= 64, max_length= 40, d_model= 128,d_k=16, heads=8, n_classes=9,dropout=0 )