In [None]:
import librosa
import numpy as np
import pickle
import pandas as pd

In [None]:
# Attach Google Drive to the Colab VM; the model, scaler, encoder and audio
# files used further down are all read from paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
def extract_features(data, sample_rate=22050):
    """Build one flat feature vector from an audio signal.

    Five librosa features are computed, each is averaged over its time
    frames, and the per-feature means are concatenated in this order:
    zero-crossing rate, chroma STFT, MFCC, RMS energy, mel spectrogram.

    Parameters
    ----------
    data : np.ndarray
        Audio time series (handed to librosa as ``y``).
    sample_rate : int, optional
        Sampling rate of ``data``. It is an explicit argument so the function
        does not depend on a module-level ``sample_rate`` variable having been
        defined by some other cell. The default of 22050 matches the rate
        ``librosa.load`` resamples to when no ``sr`` is given.

    Returns
    -------
    np.ndarray
        1-D float64 array holding the concatenated per-feature means.
    """
    def frame_mean(feature_matrix):
        # librosa returns (n_coefficients, n_frames); average across frames.
        return np.mean(feature_matrix.T, axis=0)

    magnitude = np.abs(librosa.stft(data))
    parts = [
        # ZCR
        frame_mean(librosa.feature.zero_crossing_rate(y=data)),
        # Chroma_stft
        frame_mean(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)),
        # MFCC
        frame_mean(librosa.feature.mfcc(y=data, sr=sample_rate)),
        # Root Mean Square Value
        frame_mean(librosa.feature.rms(y=data)),
        # MelSpectogram
        frame_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate)),
    ]
    # The empty float64 seed keeps the result float64 even when librosa
    # returns float32 features, matching the previous incremental stacking.
    return np.hstack([np.array([]), *parts])


def get_features(path):
    """Load an audio file and return features for it and two augmented copies.

    Only a 2.5 second window starting 0.6 seconds into the file is loaded, so
    the beginning and the end of the recording are skipped.

    Parameters
    ----------
    path : str
        Path of the audio file to load.

    Returns
    -------
    np.ndarray
        2-D array with three rows, in this order: features of the loaded
        signal, of the signal with noise added, and of the signal after
        time-stretching followed by pitch-shifting.
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # without augmentation
    plain = extract_features(data, sample_rate=sample_rate)

    # data with noise
    noisy = extract_features(noise(data), sample_rate=sample_rate)

    # data with stretching and pitching
    stretched_pitched = pitch(stretch(data), sample_rate)
    warped = extract_features(stretched_pitched, sample_rate=sample_rate)

    return np.vstack((plain, noisy, warped))

In [None]:
def noise(data):
  """Return `data` with random Gaussian noise added.

  The noise amplitude is 0.035 times a uniform random draw from [0, 1)
  times the maximum value of `data`; one standard-normal sample is
  generated per element along the first axis.
  """
  peak = np.amax(data)
  amplitude = 0.035 * np.random.uniform() * peak
  jitter = np.random.normal(size=data.shape[0])
  return data + amplitude * jitter

def stretch(data, rate = 0.8):
  """Time-stretch an audio signal with librosa.

  Parameters
  ----------
  data : np.ndarray
      Audio time series.
  rate : float, optional
      Stretch factor forwarded to ``librosa.effects.time_stretch``
      (default 0.8).

  Returns
  -------
  The value returned by ``librosa.effects.time_stretch``.
  """
  # `rate` is keyword-only in librosa >= 0.10; passing it by name also works
  # on older releases, whereas a positional argument raises TypeError on new ones.
  return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
  """Circularly shift `data` by a random number of positions.

  The offset is a uniform draw from [-5, 5) scaled by 1000 and truncated to
  an integer; `np.roll` wraps elements that fall off one end to the other.
  """
  offset = int(1000 * np.random.uniform(low=-5, high=5))
  rolled = np.roll(data, offset)
  return rolled

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift an audio signal with librosa.

    Parameters
    ----------
    data : np.ndarray
        Audio time series.
    sampling_rate : int
        Sampling rate of ``data``; forwarded as ``sr``.
    pitch_factor : float, optional
        Forwarded as ``n_steps`` to ``librosa.effects.pitch_shift``
        (default 0.7).

    Returns
    -------
    The value returned by ``librosa.effects.pitch_shift``.
    """
    # `sr` and `n_steps` are keyword-only in librosa >= 0.10; naming them keeps
    # the call valid on both old and new releases.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

In [None]:
from keras.models import model_from_json

# Rebuild the network architecture from its JSON description. The context
# manager closes the file even if reading raises.
with open('/content/drive/MyDrive/AudioEmotionDetection/model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("/content/drive/MyDrive/AudioEmotionDetection/model.h5")
print("Loaded model from disk")

Loaded model from disk


In [None]:
### Script for using model for prediction
predict_path = '/content/drive/MyDrive/AudioEmotionDetection/data/audio_speech_actors_01-24/Actor_01/03-01-08-01-01-01-01.wav'

# Binds the module-level names `data` and `sample_rate`; kept because other
# cells may read them.
data, sample_rate = librosa.load(predict_path)

# get_features returns one row per variant of the clip (original plus
# augmented copies), so the frame can be built from it directly.
feature = get_features(predict_path)
X = pd.DataFrame(feature)

# SECURITY: pickle.load can execute arbitrary code; only load scaler/encoder
# files that come from a trusted source.
# Context managers make sure the pickle files are closed after loading.
with open('/content/drive/MyDrive/AudioEmotionDetection/scaler.pkl','rb') as scaler_file:
    scaler = pickle.load(scaler_file)
X = scaler.transform(X)
# Append a trailing axis of size 1 before handing the batch to the model.
X = np.expand_dims(X, axis=2)

pred_test = loaded_model.predict(X)
with open('/content/drive/MyDrive/AudioEmotionDetection/encoder.pkl','rb') as encoder_file:
    encoder = pickle.load(encoder_file)
# Map the model outputs back to label names; one prediction per feature row.
y_pred = encoder.inverse_transform(pred_test)
print(y_pred)

[['surprise']
 ['surprise']
 ['surprise']]
