In [14]:
import pickle
import tensorflow as tf
from tensorflow.keras import layers as L
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
import librosa
import numpy as np

class Voice_confidence:
    def __init__(self) -> None:
        self.model_weights = 'CNN_model_weights.weights.h5'
        self.model_encoder = 'encoder2.pickle'
        self.model_scaler = 'scaler2.pickle'
        self.expected_length  = 4
        self.emotions1 = {1: 'Neutral', 2: 'Calm', 3: 'Happy', 4: 'Sad', 5: 'Angry', 6: 'Fear', 7: 'Disgust', 8: 'Surprise'}
        
        # Define the model architecture
        self.model = tf.keras.Sequential([
            L.Conv1D(512, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(1620, 1)),
            L.BatchNormalization(),
            L.MaxPool1D(pool_size=5, strides=2, padding='same'),
            L.Conv1D(512, kernel_size=5, strides=1, padding='same', activation='relu'),
            L.BatchNormalization(),
            L.MaxPool1D(pool_size=5, strides=2, padding='same'),
            Dropout(0.2),
            L.Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'),
            L.BatchNormalization(),
            L.MaxPool1D(pool_size=5, strides=2, padding='same'),
            L.Conv1D(256, kernel_size=3, strides=1, padding='same', activation='relu'),
            L.BatchNormalization(),
            L.MaxPool1D(pool_size=5, strides=2, padding='same'),
            Dropout(0.2),
            L.Conv1D(128, kernel_size=3, strides=1, padding='same', activation='relu'),
            L.BatchNormalization(),
            L.MaxPool1D(pool_size=3, strides=2, padding='same'),
            Dropout(0.2),
            L.Flatten(),
            L.Dense(512, activation='relu'),
            L.BatchNormalization(),
            L.Dense(8, activation='softmax')
        ])
        
        # Load the model weights
        self.model.load_weights(self.model_weights)

    def load_pickle_file(self, pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)

    def zcr(self, data, frame_length, hop_length):
        zcr = librosa.feature.zero_crossing_rate(y=data, frame_length=frame_length, hop_length=hop_length)
        return np.squeeze(zcr)

    def rmse(self, data, frame_length=2048, hop_length=512):
        rmse = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)
        return np.squeeze(rmse)

    def mfcc(self, data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
        mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13, hop_length=hop_length)
        return np.squeeze(mfcc.T) if not flatten else np.ravel(mfcc.T)
    
    def preprocess_audio(self, audio_data,sample_rate):
        # Ensure audio is exactly `expected_length` seconds
        target_length = int(sample_rate * self.expected_length)
        if len(audio_data) < target_length:
            # Pad with zeros to reach `expected_length` seconds
            audio_data = np.pad(audio_data, (0, target_length - len(audio_data)), mode='constant')
        elif len(audio_data) > target_length:
            # Trim to `expected_length` seconds
            audio_data = audio_data[:target_length]
        return audio_data

    def extract_features(self, data, sr=22050, frame_length=2048, hop_length=512):
        # Extract individual features
        zcr_result = self.zcr(data, frame_length, hop_length)
        rmse_result = self.rmse(data, frame_length, hop_length)
        mfcc_result = self.mfcc(data, sr, frame_length, hop_length, flatten=True)

        # Concatenate features
        combined_features = np.hstack((zcr_result, rmse_result, mfcc_result))

        # Check total size
        total_features = len(combined_features)

        # Pad or truncate to ensure exactly 1620 features
        if total_features < 1620:
            combined_features = np.pad(combined_features, (0, 1620 - total_features), mode='constant')
        elif total_features > 1620:
            combined_features = combined_features[:1620]

        # Reshape to (1, 1620)
        combined_features = np.reshape(combined_features, newshape=(1, 1620))

        return combined_features

    def get_predict_feat(self, file_path):
        data, sr = librosa.load(file_path, sr=None)
        #Calculate duration of loaded audio in seconds
        duration = len(data) / sr
        # Preprocess audio to `expected_length`
        data = self.preprocess_audio(data,sr)
        features = self.extract_features(data, sr)
        scaler = self.load_pickle_file(self.model_scaler)
        scaled_features = scaler.transform(features)
        return scaled_features

    def prediction(self, path1):
        res = self.get_predict_feat(path1)
        # Reshape the features for the Conv1D model
        res = res.reshape(res.shape[0], res.shape[1], 1)
        # Make predictions
        predictions = self.model.predict(res)
        # Load the encoder to decode the prediction
        encoder2 = self.load_pickle_file(self.model_encoder)
        y_pred = encoder2.inverse_transform(predictions)
        return y_pred[0]

if __name__ == '__main__':
    voice = Voice_confidence()
    audio = '03-01-07-01-01-01-01.wav'
    print(len(audio))
    print(voice.prediction('03-01-07-01-01-01-01.wav'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


24
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
['fear']


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
import pyaudio

ModuleNotFoundError: No module named 'pyaudio'

In [4]:
audio = '03-01-07-01-01-01-01.wav'
data,sr = librosa.load(audio)

In [5]:
len(data)

85345

In [6]:
int(22050 * 2)

44100