In [15]:
!pip install sounddevice

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting sounddevice
  Downloading sounddevice-0.5.1-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading sounddevice-0.5.1-py3-none-win_amd64.whl (363 kB)
   ---------------------------------------- 0.0/363.6 kB ? eta -:--:--
   --------------------------------------- 363.6/363.6 kB 11.4 MB/s eta 0:00:00
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.1


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
import librosa as lb
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler
import pickle

def extract_features(main_dir, list_of_features):
    features_list = []
    labels = []

    for subfolder in os.listdir(main_dir):
        subfolder_path = os.path.join(main_dir, subfolder)
        if os.path.isdir(subfolder_path):
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.wav'):
                    file_path = os.path.join(subfolder_path, filename)
                    y, sr = lb.load(file_path)

                    features = []
                    if 'mfcc' in list_of_features:
                        mfcc = lb.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                        features.extend(mfcc.mean(axis=1))

                    if 'chroma' in list_of_features:
                        stft = np.abs(lb.stft(y))
                        chroma = lb.feature.chroma_stft(S=stft, sr=sr)
                        features.extend(chroma.mean(axis=1))

                    if 'melspectrogram' in list_of_features:
                        mel_spect = lb.feature.melspectrogram(y=y, sr=sr)
                        mel_spect_db = lb.power_to_db(mel_spect, ref=np.max)
                        features.extend(mel_spect_db.mean(axis=1))

                    features_list.append(features)
                    labels.append(filename[7])  # Adjust based on your label extraction logic

    return features_list, labels

def load_data(test_size1=0.2):
    filename = r'C:\Users\pashu\OneDrive\Desktop\non-academics\soc\speech_recong_project\Audio_Speech_Actors_01-24'
    filename1 = r'C:\Users\pashu\OneDrive\Desktop\non-academics\soc\speech_recong_project\Audio_Song_Actors_01-24'
    listf = ['mfcc', 'chroma', 'melspectrogram']

    features, labels = extract_features(main_dir=filename, list_of_features=listf)
    features1, labels1 = extract_features(main_dir=filename1, list_of_features=listf)

    features_array = np.array(features, dtype=np.float32)
    labels_array = np.array(labels)
    features_array1 = np.array(features1, dtype=np.float32)
    labels_array1 = np.array(labels1)

    features_array2 = np.concatenate((features_array, features_array1), axis=0)
    labels_array2 = np.concatenate((labels_array, labels_array1), axis=0)

    scaler = StandardScaler()
    features_array2 = scaler.fit_transform(features_array2)

    lb = LabelBinarizer()
    labels_one_hot = lb.fit_transform(labels_array2)

    # Save the LabelBinarizer instance
    with open('label_binarizer.pkl', 'wb') as f:
        pickle.dump(lb, f)

    X_train, X_test, y_train, y_test = train_test_split(features_array2, labels_one_hot, test_size=test_size1, random_state=42)
    return X_train, X_test, y_train, y_test, scaler

X_train, X_test, y_train, y_test, scaler = load_data()

model = tf.keras.Sequential([
    Dense(300, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(y_train.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=200, batch_size=256, validation_split=0.1)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Accuracy: {test_accuracy*100:.4f}')

# Load the LabelBinarizer instance (lb) used during training
with open('label_binarizer.pkl', 'rb') as f:
    lb = pickle.load(f)

# Function to predict emotion
def predict_emotion(audio):
    # Extract features from audio
    features = extract_features_from_audio(audio)

    # Standardize features (apply the same scaler used during training)
    features_scaled = scaler.transform([features])  # Reshape and scale

    # Predict emotion
    prediction = model.predict(features_scaled)

    # Use LabelBinarizer to decode the prediction
    emotion_label = lb.inverse_transform(prediction)
    print(f'Predicted Emotion: {emotion_label[0]}')

# Replace `record_audio(duration=5)` and `extract_features_from_audio(audio)` with your actual recording and extraction methods.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 87ms/step - accuracy: 0.1829 - loss: 2.1689 - val_accuracy: 0.3249 - val_loss: 1.7426
Epoch 2/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.3709 - loss: 1.6575 - val_accuracy: 0.3706 - val_loss: 1.5862
Epoch 3/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.4658 - loss: 1.4308 - val_accuracy: 0.4721 - val_loss: 1.4527
Epoch 4/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.5196 - loss: 1.3219 - val_accuracy: 0.5279 - val_loss: 1.3773
Epoch 5/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.5829 - loss: 1.2287 - val_accuracy: 0.5584 - val_loss: 1.3126
Epoch 6/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.6130 - loss: 1.1562 - val_accuracy: 0.5635 - val_loss: 1.2613
Epoch 7/200
[1m7/7[0m [32m━━━━━━━━━━━

In [2]:
emotions1 = [ 'sad','anxious', 'angry', 'surprised','fearful' , 'neutral', 'disgusted','happy','grateful']
emotions = [ 'sad','surprised', 'surprised', 'happy','surprised' , 'surprised', 'happy','happy','happy']

In [7]:
import numpy as np
import tensorflow as tf
import sounddevice as sd
import librosa
import joblib  # Use joblib to load the scaler
import tkinter as tk
from tkinter import messagebox
import pickle

# Load the LabelBinarizer instance
with open('label_binarizer.pkl', 'rb') as f:
    lb = pickle.load(f)

# Load your trained model
model = tf.keras.models.load_model('emotion_model.h5')

# Load the scaler
scaler = joblib.load('scaler.pkl')  # Load the scaler

def record_audio(duration=5, fs=44100):
    print("Recording...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float64')
    sd.wait()  # Wait until recording is finished
    print("Recording complete.")
    return audio.flatten()

def extract_features_from_audio(audio):
    y = audio
    sr = 44100  # Sample rate
    features = []
    
    # Extract MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    features.extend(mfcc.mean(axis=1))

    # Extract Chroma
    stft = np.abs(librosa.stft(y))
    chroma = librosa.feature.chroma_stft(S=stft, sr=sr)
    features.extend(chroma.mean(axis=1))

    # Extract Mel Spectrogram
    mel_spect = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spect_db = librosa.power_to_db(mel_spect, ref=np.max)
    features.extend(mel_spect_db.mean(axis=1))

    return np.array(features)

def predict_emotion():
    audio = record_audio(duration=5)  # Record 5 seconds of audio
    features = extract_features_from_audio(audio)

    # Standardize features (apply the same scaler used during training)
    features_scaled = scaler.transform([features])  # Reshape and scale
    
    # Predict emotion
    prediction = model.predict(features_scaled)
    
    # Use LabelBinarizer to decode the prediction
    emotion_label = lb.inverse_transform(prediction)
    emotion=int(emotion_label[0])
    emotion1=emotions[emotion]
    messagebox.showinfo("Predicted Emotion", f'Predicted Emotion: {emotion1}')

# Create the GUI
app = tk.Tk()
app.title("Emotion Detection from Voice")
app.geometry("300x200")

record_button = tk.Button(app, text="Record and Predict Emotion", command=predict_emotion, height=2, width=30)
record_button.pack(pady=50)

app.mainloop()




Recording...
Recording complete.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
