In [3]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import soundfile
import glob
import os
from sklearn.model_selection import train_test_split

In [4]:
def extract_feature(file_name, mfcc, chroma, mel,tonnetz):
    
    with soundfile.SoundFile(file_name) as sound_file:
        
        # X is the sound_file data
        
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        
        result=np.array([])
        
        # if chroma is True then we get the Short-Time Fourier Transform of X
        
        if chroma:
            stft=np.abs(librosa.stft(X))
    
        # each of the three feautres are checked if present, took the mean value and pushed into result stack
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
            result = np.hstack((result, tonnetz))
            
    return result

In [5]:
gender={
    0:'male',
    1:'female'
}

In [6]:
def load_data(test_size=0.2):
    x,y=[],[]
    for file in glob.glob("C:\\Users\\vasavi\\Downloads\\ravdess-data\\Actor_*\\*.wav"):
        ##C:\Users\vasavi\Downloads\ravdess-data
        file_name=os.path.basename(file)
        gr=file_name.split('.')[0].split('-')
        #print(gr[6])
        g=gender[int(gr[6])%2]
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True,tonnetz=True)
        x.append(feature)
        y.append(g)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [7]:
# Training the model ## 3 

## Split the dataset

x_train,x_test,y_train,y_test=load_data(test_size=0.25)
print(x_train.shape[0],x_test.shape[0])

from sklearn.neural_network import MLPClassifier

# Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=600)

1080 360


In [8]:
model.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=600, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [9]:
y_pred=model.predict(x_test)

In [10]:
from sklearn.metrics import accuracy_score

# Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 99.44%


In [20]:
x_file="C:\\Users\\vasavi\\Downloads\\test_audio.wav"
f=extract_feature(x_file, mfcc=True, chroma=True, mel=True,tonnetz=True).reshape(1,-1)

In [24]:
result = model.predict(f)[0]
result

'female'