In [2]:
#pip install librosa

Collecting librosa
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.58.1-cp311-cp311-win_amd64.whl.metadata (2.8 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.12.1-py2.py3-none-win_amd64.whl (1.0 MB)
     ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
     - -------------------------------------- 0.0/1.0 MB 435.7 kB/s eta 0:00:03
     --- ------------------------------------ 0.1/1.0 MB 1.1 MB/s eta 0:00:01
     --------- ------------------------------ 0.2/1.0 MB 1.8 MB/s eta 0:00:01
     ------------------- -------------------- 0.5/1.0 MB 2.6 MB/s eta 0:00:01
     ---------------------------- ----------- 0.7/1.0 MB 3.1 MB/s eta 0:00:01
     -------------------------------------- - 1.0/1.0 MB 3.5 MB/s eta 0:00:01
     ---------------------------------------- 

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import librosa
import os

In [2]:
dataset_folder_path = 'AudioWAV'

# Function to extract MFCC features from audio files
def extract_features(file_path, mfcc=True, chroma=True, mel=True):
    with tf.device('/device:GPU:0'):  # Use GPU if available
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
    return result

In [20]:
#pip install resampy

Collecting resampy
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
     ---------------------------------------- 0.0/3.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/3.1 MB ? eta -:--:--
      --------------------------------------- 0.1/3.1 MB 812.7 kB/s eta 0:00:04
     -- ------------------------------------- 0.2/3.1 MB 1.6 MB/s eta 0:00:02
     ----- ---------------------------------- 0.4/3.1 MB 2.3 MB/s eta 0:00:02
     --------- ------------------------------ 0.7/3.1 MB 3.2 MB/s eta 0:00:01
     ------------ --------------------------- 1.0/3.1 MB 3.7 MB/s eta 0:00:01
     ---------------- ----------------------- 1.3/3.1 MB 4.0 MB/s eta 0:00:01
     ------------------- -------------------- 1.5/3.1 MB 4.1 MB/s eta 0:00:01
     --------------------- ------------------ 1.7/3.1 MB 4.1 MB/s eta 0:00:01
     ------------------------ --------------- 1.9/3.1 MB 4.1 MB/s eta 0:00:01
     -------------------------- ------------- 2.0/3.1 MB 4.1 MB/s eta 0:00:01
   

In [3]:
data = []
labels = []

for file in os.listdir(dataset_folder_path):
    if file.endswith('.wav') or file.endswith('.mp3'):  # Adjust file extensions as needed
        file_path = os.path.join(dataset_folder_path, file)
        feature = extract_features(file_path)
        data.append(feature)
        labels.append(file.split('_')[0])  # Assuming file names are in the format 'emotion_filename.wav'


  return pitch_tuning(


In [6]:
le = LabelEncoder()
labels = le.fit_transform(labels)

# Convert data and labels to numpy arrays
data = np.array(data)
labels = np.array(labels)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Normalize the data
X_train = (X_train - np.min(X_train)) / (np.max(X_train) - np.min(X_train))
X_test = (X_test - np.min(X_test)) / (np.max(X_test) - np.min(X_test))

# Reshape the data for input to a Convolutional Neural Network (CNN)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Convert labels to one-hot encoding
y_train = to_categorical(y_train, num_classes=len(le.classes_))
y_test = to_categorical(y_test, num_classes=len(le.classes_))

In [11]:
model = Sequential()
model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Conv1D(64, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Conv1D(128, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(len(le.classes_), activation='softmax'))


In [12]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1b16eb2d950>

In [14]:
accuracy = model.evaluate(X_test, y_test)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 1.34%


In [15]:
model.save('voice_analysis_model.h5')

  saving_api.save_model(


In [16]:
model_json = model.to_json()
with open("voice.json","w") as json_file:
    json_file.write(model_json)