In [1]:
import IPython.display as ipd
import glob
from scipy.io import wavfile
import numpy as np
import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib as plt
import matplotlib.pyplot as plt
import struct
from scipy.io import wavfile as wav
import os
from datetime import datetime 
from sklearn import metrics 
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import Adam
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint 

In [12]:
file_name_data='mfcc_label.xlsx'
dffeatures=pd.read_excel(file_name_data)

In [13]:
dffeatures

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
2,0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
4,0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71911,-80,-0.035597,-80.0,-8.098465,-8.980048,-15.847272,-10.322305,-80.0,-11.243167,-36.083233,...,-15.291732,-12.942391,-36.452793,-17.051119,-60.872829,-80.000000,-58.338337,-41.416588,-59.643997,0
71912,-80,-0.094599,-80.0,-7.325050,-11.943776,-17.820580,-8.446717,-80.0,-11.624997,-12.545907,...,-19.534180,-16.903601,-36.371616,-16.394320,-42.433369,-59.528172,-61.072720,-60.711777,-40.487423,0
71913,-80,-0.152973,-80.0,-6.007671,-13.862395,-17.086370,-8.125309,-80.0,-11.100876,-13.036957,...,-24.564272,-39.630966,-80.000000,-38.364044,-43.945827,-17.266998,-39.512138,-80.000000,-39.440868,0
71914,-80,-0.184401,-80.0,-6.244070,-38.443878,-58.667614,-9.242589,-80.0,-10.375988,-13.404880,...,-16.364969,-35.572617,-36.778931,-16.553949,-64.693108,-59.632282,-43.261234,-80.000000,-80.000000,0


In [14]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=6):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        ## x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        
        ## (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs) 
        
        ## (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs) 
        
        ## (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  
        ## (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  
        ## (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  
        ## (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  
        ## (batch_size, seq_len, embed_dim)
        return output

In [15]:
class TransformerBlock(layers.Layer):
    ## For calling multihead attention on embedded data and arranging it sequentially and adding other layers.
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.01):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        
    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [16]:
class TokenAndPositionEmbedding(layers.Layer):
    ## For preliminary token generation and embedding
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [19]:
dffeatures=dffeatures.dropna(axis=0)
X = dffeatures.to_numpy()
[m,n]=X.shape
features=[]
for i in range(0,m):
    features.append([X[i,0:n-2],X[i,n-1]])

In [20]:
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])
featuresdf

Unnamed: 0,feature,class_label
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
...,...,...
71911,"[-80.0, -0.03559684753417969, -80.0, -8.098464...",0.0
71912,"[-80.0, -0.0945994034409523, -80.0, -7.3250498...",0.0
71913,"[-80.0, -0.1529731750488281, -80.0, -6.0076708...",0.0
71914,"[-80.0, -0.1844005584716797, -80.0, -6.2440700...",0.0


In [21]:
featuresdf=featuresdf.dropna(axis=0)
print(featuresdf)
X = np.array(featuresdf.feature.tolist())
X=X*100000
min_X=-min([min(element) for element in X])
x=X+min_X
x=x.astype(int)
max_len=max([max(element) for element in x])
print(max_len)
# Getting label size
y = np.array(featuresdf.class_label.tolist())
print(x,x.shape)
print(y,y.shape)

                                                 feature  class_label
0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...          0.0
1      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...          0.0
2      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...          0.0
3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...          0.0
4      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...          0.0
...                                                  ...          ...
71911  [-80.0, -0.03559684753417969, -80.0, -8.098464...          0.0
71912  [-80.0, -0.0945994034409523, -80.0, -7.3250498...          0.0
71913  [-80.0, -0.1529731750488281, -80.0, -6.0076708...          0.0
71914  [-80.0, -0.1844005584716797, -80.0, -6.2440700...          0.0
71915  [-80.0, -0.1889057159423828, -80.0, -6.2168769...          0.0

[71916 rows x 2 columns]
8000000
[[8000000 8000000 8000000 ... 8000000 8000000 8000000]
 [8000000 8000000 8000000 ... 8000000 8000000 8000000]
 [8000000 800000

In [22]:
m,n=x.shape
print(m,n)
print(y.shape)
x_train=x[0:int(m*9/10),:]
x_test=x[int(m*9/10):m,:]
y_train=y[0:int(m*9/10)]
y_test=y[int(m*9/10):m]

71916 39
(71916,)


In [23]:
m,n=x_test.shape
print(m,n)

## Converting form and reshaping
TestX=x_test
TestY=y_test
testy=np.reshape(TestY,(m,))

## Changing datatypes
testx=np.empty((m,),object)
for i in range (0,m):
    testx[i]=list(int(v) for v in TestX[i])
    testy[i]=testy[i].astype(int)

## Printing data-types - relevant to transformer input
print(type(testx))
print(type(testx[m-1][n-1]))
print(type(testx[m-1]))
print(type(testy[m-1]))
print(type(testy))

## Converting Train Data and Getting size of data
m,n=x_train.shape
print(m,n)

## Converting form and reshaping
TrainX=x_train
TrainY=y_train
trainy=np.reshape(TrainY,(m,))

## Changing datatypes
trainx=np.empty((m,),object)
for i in range (0,m):
    trainx[i]=list(int(v) for v in TrainX[i])
    trainy[i]=TrainY[i].astype(int)
    
## Printing data-types - relevant to transformer input
print(type(trainx))
print(type(trainx[m-1][n-1]))
print(type(trainx[m-1]))
print(type(trainy[m-1]))
print(type(trainy))
print(trainx.shape,trainy.shape,testx.shape,testy.shape)

7192 39
<class 'numpy.ndarray'>
<class 'int'>
<class 'list'>
<class 'numpy.float64'>
<class 'numpy.ndarray'>
64724 39
<class 'numpy.ndarray'>
<class 'int'>
<class 'list'>
<class 'numpy.float64'>
<class 'numpy.ndarray'>
(64724,) (64724,) (7192,) (7192,)


In [24]:
vocab_size = max_len+1
maxlen = 40
print(len(trainx), "Training sequences")
print(len(testx), "Validation sequences")

## Converting to padded tensor sequence
trainx = keras.preprocessing.sequence.pad_sequences(trainx,maxlen=maxlen)
testx = keras.preprocessing.sequence.pad_sequences(testx,maxlen=maxlen)
print(trainx.shape,trainy.shape,testx.shape,testy.shape)
print(trainx,trainy,testx,testy)

64724 Training sequences
7192 Validation sequences
(64724, 40) (64724,) (7192, 40) (7192,)
[[      0 8000000 8000000 ... 8000000 8000000 8000000]
 [      0 8000000 8000000 ... 8000000 8000000 8000000]
 [      0 8000000 8000000 ... 8000000 8000000 8000000]
 ...
 [      0       0 7941571 ... 2195945 6350569 6301741]
 [      0       0 7996376 ... 2171862 6221349 5976445]
 [      0       0 7981751 ... 6309285 3797697 3917446]] [0. 0. 0. ... 0. 0. 0.] [[      0       0 7952578 ... 6435506 4035311 6186128]
 [      0       0 7981283 ... 6499866 1902946 4320971]
 [      0       0 7969698 ... 4392839 2075936 6544528]
 ...
 [      0       0 7984702 ... 6273300 4048786       0]
 [      0       0 7981559 ... 2036771 3673876       0]
 [      0       0 7981109 ...       0 5156304 2078697]] [0. 0. 0. ... 0. 0. 0.]


In [25]:
vocab_size=8000001
maxlen=40
embed_dim = 30  ## Embedding size for each token
num_heads = 6  ## Number of attention heads
ff_dim = 30  ## Hidden layer size in feed forward network inside transformer

## Tokenizing input data with max dimension and embedding it
inputs = layers.Input(shape=(maxlen,))
#x = keras.Sequential()
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)

## Adding Sequential layer to the embedded data and attention layers too.
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)

## Add other layers
x = layers.Conv1D(6,3,padding="same")(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.MaxPool1D(pool_size=2, strides=2)(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Flatten()(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(30, activation="relu")(x)
x = layers.Dropout(0.5)(x)

## Producing general softmax layer for classification
outputs = layers.Dense(8, activation="softmax")(x)

## Generating model
model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 40)]              0         
_________________________________________________________________
token_and_position_embedding (None, 40, 30)            240001230 
_________________________________________________________________
transformer_block (Transform (None, 40, 30)            5700      
_________________________________________________________________
conv1d (Conv1D)              (None, 40, 6)             546       
_________________________________________________________________
dense_6 (Dense)              (None, 40, 30)            210       
_________________________________________________________________
dropout_2 (Dropout)          (None, 40, 30)            0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 20, 30)            0     

In [26]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(trainx, trainy, batch_size=100, epochs=3,validation_data=(testx, testy))

Epoch 1/3
  4/648 [..............................] - ETA: 27:22 - loss: 2.2548 - accuracy: 0.0477

KeyboardInterrupt: 

In [None]:
score = model.evaluate(trainx, trainy, verbose=0)
print("Training Performance",score)
score = model.evaluate(testx, testy, verbose=0)
print("Testing Performanr",score)

In [None]:
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
del model
keras.backend.clear_session()