## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import librosa
import librosa.display

## Importing Model, Encoder & Scaler

In [2]:
import tensorflow as tf
import joblib

In [3]:
# Load the saved Keras model (HDF5 file); it is used for prediction further down.
model = tf.keras.models.load_model('models/latest_model.h5')

In [4]:
# Load the label encoder persisted with joblib; its inverse_transform is used later
# to turn model outputs back into emotion labels.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
encoder = joblib.load('models/one_hot_encoder.pkl')

In [5]:
# Load the feature scaler persisted with joblib; it is applied to the extracted
# features before prediction (presumably the scaler fitted at training time - confirm).
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
scaler = joblib.load('models/scaler.pkl')

## Importing Dataset

In [6]:
# Read the CSV index of audio clips, then expose its rows as a plain NumPy array.
dataset = pd.read_csv('data_path.csv')

X = dataset.to_numpy()

In [7]:
# Show the (label, path) rows; NumPy abbreviates the middle of long arrays with "...".
print(X)

[['neutral'
  'dataset/revdess/audio_speech_actors_01-24/Actor_01/03-01-01-01-01-01-01.wav']
 ['neutral'
  'dataset/revdess/audio_speech_actors_01-24/Actor_01/03-01-01-01-01-02-01.wav']
 ['neutral'
  'dataset/revdess/audio_speech_actors_01-24/Actor_01/03-01-01-01-02-01-01.wav']
 ...
 ['surprise' 'dataset/savee/ALL/KL_su13.wav']
 ['surprise' 'dataset/savee/ALL/KL_su14.wav']
 ['surprise' 'dataset/savee/ALL/KL_su15.wav']]


In [8]:
# Pick a single dataset row (index 87) as a candidate sample; the row holds the
# emotion label followed by the audio file path.
X_test = X[87]

# Display the selected row.
X_test

array(['sad',
       'dataset/revdess/audio_speech_actors_01-24/Actor_02/03-01-04-02-02-02-02.wav'],
      dtype=object)

In [9]:
# Audio file to classify. To classify the dataset row picked above instead, use:
# path = X_test[1]
path = 'test/anger.mp3'


## Feature Extraction

In [10]:
def noise(data):
    """Return a copy of ``data`` with random Gaussian noise mixed in.

    The noise amplitude is a random fraction (at most 0.035) of the largest
    sample value in ``data``. The input array itself is not modified.

    Parameters
    ----------
    data : np.ndarray
        1-D array of audio samples.

    Returns
    -------
    np.ndarray
        New array of the same length as ``data``.
    """
    # The uniform draw happens before the normal draw, so a seeded NumPy RNG
    # yields the same sequence of values every time.
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Time-stretch an audio signal via ``librosa.effects.time_stretch``.

    Parameters
    ----------
    data : np.ndarray
        Audio samples to stretch.
    rate : float, optional
        Stretch factor handed to librosa unchanged (default 0.8).

    Returns
    -------
    Whatever ``librosa.effects.time_stretch`` returns for these arguments.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched

def shift(data):
    """Circularly rotate ``data`` by a random number of samples.

    The offset is a uniform draw between -5 and 5, scaled by 1000 and
    truncated to an integer. Samples pushed past one end of the array wrap
    around to the other end (``np.roll``).

    Parameters
    ----------
    data : np.ndarray
        Audio samples to rotate.

    Returns
    -------
    np.ndarray
        Rotated copy of ``data`` with the same shape.
    """
    offset = int(1000 * np.random.uniform(low=-5, high=5))
    rolled = np.roll(data, offset)
    return rolled

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift an audio signal via ``librosa.effects.pitch_shift``.

    Parameters
    ----------
    data : np.ndarray
        Audio samples to shift.
    sampling_rate : int or float
        Sampling rate of ``data``; forwarded as librosa's ``sr``.
    pitch_factor : float, optional
        Forwarded as librosa's ``n_steps`` (default 0.7).

    Returns
    -------
    Whatever ``librosa.effects.pitch_shift`` returns for these arguments.
    """
    shifted = librosa.effects.pitch_shift(
        data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted

# Load the audio at `path` as an example clip. librosa.load returns the samples
# and their sampling rate, which become the notebook-level `data` and `sample_rate`.
data, sample_rate = librosa.load(path)

In [11]:
def extract_features(data, sr=None):
    """Build one feature vector for an audio signal.

    Concatenates, in this order, the means (one value per row of each librosa
    feature matrix) of: zero-crossing rate, chroma STFT, MFCC, RMS and mel
    spectrogram.

    Parameters
    ----------
    data : np.ndarray
        Audio samples, e.g. as returned by ``librosa.load``.
    sr : int or float, optional
        Sampling rate of ``data``. When omitted, the notebook-level
        ``sample_rate`` variable is used, so existing
        ``extract_features(data)`` calls behave exactly as before.

    Returns
    -------
    np.ndarray
        1-D float array holding the concatenated feature means.
    """
    if sr is None:
        # Legacy fallback: rely on the notebook-level value. Prefer passing
        # ``sr`` explicitly so the rate always matches ``data``.
        sr = sample_rate

    # ZCR
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)

    # Chroma_stft, computed from the magnitude of the STFT
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr).T, axis=0)

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)

    # MelSpectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)

    # The leading empty float64 array keeps the output dtype the same as when
    # the vector was grown step by step from ``np.array([])``.
    return np.hstack((np.array([]), zcr, chroma_stft, mfcc, rms, mel))


def get_features(path):
    """Load an audio file and return features for it and two augmented copies.

    Rows of the result, in order:
      0. the clip as loaded,
      1. the clip with random noise added (``noise``),
      2. the clip time-stretched and then pitch-shifted (``stretch`` + ``pitch``).

    The sampling rate returned by ``librosa.load`` for this file is passed to
    every feature extraction call, so the features never depend on a
    ``sample_rate`` left over from another cell.

    NOTE(review): row 1 uses NumPy's global RNG, so it differs between runs
    unless the RNG is seeded.

    Parameters
    ----------
    path : str
        Path of the audio file to load.

    Returns
    -------
    np.ndarray
        2-D array with one feature vector per row (three rows).
    """
    # duration and offset skip the silence at the start and at the end of each
    # audio file.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # without augmentation
    res1 = extract_features(data, sr=sample_rate)

    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data, sr=sample_rate)

    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch, sr=sample_rate)

    # stacking vertically: one row per variant
    return np.vstack((np.array(res1), res2, res3))

In [12]:
# Extract features for the chosen audio file: per get_features, one row for the
# clip as loaded plus one row per augmented variant.
feature = get_features(path = path)

In [13]:
# Collect the rows of `feature` (one feature vector per variant) into a list.
X = list(feature)

In [14]:
# Put the feature vectors into a DataFrame: one row per list element of X.
Features = pd.DataFrame(X)


In [15]:
# Apply the loaded scaler to the features. Note that `X` is rebound here and now
# holds the scaled feature matrix.
X = scaler.transform(Features)

## Predicting Result

In [16]:
# Run the loaded model on the scaled features.
pred = model.predict(X)

In [17]:
# Map the model outputs back to label strings with the loaded encoder.
pred_value =encoder.inverse_transform(pred)

# One label per feature row (clip as loaded first, then the augmented variants).
print(pred_value)

# Label predicted for the first row, i.e. the un-augmented clip.
print(pred_value[0][0])

[['angry']
 ['angry']
 ['angry']]
angry
