In [None]:
import os
import cv2
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array
import threading
import time


In [None]:
import os
import zipfile
import librosa
import numpy as np
import pickle

# Unpack the RAVDESS archive into the folder the rest of the notebook reads from
extracted_path = '/content/RAVDESS'
with zipfile.ZipFile('RAVDESS.zip', 'r') as archive:
    archive.extractall(extracted_path)

# Two-digit emotion code (third field of a RAVDESS file name) -> emotion name
emotion_labels = {
    str(code).rjust(2, '0'): name
    for code, name in enumerate(
        ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'],
        start=1,
    )
}

# Accumulators for the MFCC matrices and their zero-based integer labels
X_audio = []
y_audio = []

# Every MFCC matrix is padded/truncated to this many frames (columns)
fixed_length = 173

# Walk the extracted tree and turn each .wav file into a fixed-size MFCC matrix
for folder, _subfolders, filenames in os.walk(extracted_path):
    for filename in filenames:
        if not filename.endswith('.wav'):
            continue
        name_parts = filename.split('-')
        if len(name_parts) <= 2:
            continue  # no emotion field in the name

        wav_path = os.path.join(folder, filename)
        try:
            # Third field is the 1-based emotion code, e.g. 02-01-06-01-02-01-12.wav -> 5
            label = int(name_parts[2]) - 1
            signal, sample_rate = librosa.load(wav_path, sr=None)  # sr=None keeps the file's own rate
            features = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)

            # Zero-pad short clips on the right; drop trailing frames of long clips
            missing_frames = fixed_length - features.shape[1]
            if missing_frames > 0:
                features = np.pad(features, ((0, 0), (0, missing_frames)), mode='constant')
            else:
                features = features[:, :fixed_length]

            X_audio.append(features)
            y_audio.append(label)
        except Exception as e:
            # Report the problem file and carry on with the remaining ones
            print(f"Error processing {wav_path}: {e}")

# Stack into arrays: X_audio -> (num_samples, 13, 173), y_audio -> (num_samples,)
X_audio = np.array(X_audio)
y_audio = np.array(y_audio)

print("Feature extraction completed.")


In [3]:
import os

# One output folder per emotion name
for emotion_label in emotion_labels.values():
    os.makedirs(f'/content/audio_features/{emotion_label}', exist_ok=True)

# Pickle each MFCC matrix into the folder of its emotion; the file name is the sample index
for i, (mfcc, label_index) in enumerate(zip(X_audio, y_audio)):
    # Labels are zero-based, while emotion_labels is keyed by the 1-based two-digit code
    emotion = emotion_labels[str(label_index + 1).rjust(2, '0')]
    feature_path = f'/content/audio_features/{emotion}/{i}.pkl'
    with open(feature_path, 'wb') as feature_file:
        pickle.dump(mfcc, feature_file)

print("Features saved based on emotions.")


Features saved based on emotions.


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D

# Append a channel axis so every MFCC matrix looks like a one-channel image
X_audio = X_audio.reshape(X_audio.shape[:3] + (1,))

# Two conv/pool/dropout stages followed by a dense classifier over the emotion classes
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(13, 173, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.2))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(emotion_labels), activation='softmax'))

# Integer class labels -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_audio,
    y_audio,
    test_size=0.2,
    random_state=42,
)

# Fit for 30 epochs, using the held-out split as validation data
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
)

# Report accuracy on the held-out split
evaluation = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_acc = evaluation
print('\nTest accuracy:', test_acc)

In [6]:
# X_train / X_test are 4D here: (num_samples, 13 MFCC coefficients, 173 frames, 1 channel).
# Collapse everything except the sample axis into one flat feature vector per sample.
X_train_reshaped = np.reshape(X_train, (X_train.shape[0], -1))
X_test_reshaped = np.reshape(X_test, (X_test.shape[0], -1))


In [7]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# X_train / X_test have shape (num_samples, n_mfcc, n_frames, 1); drop the channel axis.
X_train_reshaped = X_train.squeeze(-1)
X_test_reshaped = X_test.squeeze(-1)


def _scale_per_coefficient(batch, fitted_scaler):
    """Standardise a (samples, n_mfcc, n_frames) batch with an already fitted scaler.

    Every frame of every sample becomes one row with n_mfcc columns, so the
    scaler applies a single mean/std per MFCC coefficient. The result is
    returned in the original (samples, n_mfcc, n_frames) layout.
    """
    n_samples, n_mfcc, n_frames = batch.shape
    frames = batch.transpose(0, 2, 1).reshape(-1, n_mfcc)
    scaled = fitted_scaler.transform(frames)
    return scaled.reshape(n_samples, n_frames, n_mfcc).transpose(0, 2, 1)


# Fit the statistics ONCE, on the training split only. Re-fitting the scaler on
# every sample would leave it holding the statistics of the last training
# sample, and the test split would then be scaled with those.
scaler = StandardScaler()
scaler.fit(
    X_train_reshaped.transpose(0, 2, 1).reshape(-1, X_train_reshaped.shape[1])
)

X_train_normalized = _scale_per_coefficient(X_train_reshaped, scaler)
X_test_normalized = _scale_per_coefficient(X_test_reshaped, scaler)

# NOTE(review): a model trained on these arrays needs the same fitted `scaler`
# applied to the MFCCs at prediction time.

# Check the new shape of the data
print(f"Shape of X_train_normalized: {X_train_normalized.shape}")
print(f"Shape of X_test_normalized: {X_test_normalized.shape}")


Shape of X_train_normalized: (2304, 13, 173)
Shape of X_test_normalized: (576, 13, 173)


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input

# LSTM classifier over the MFCC matrices.
# The labels are integer class indices (0..len(emotion_labels)-1), so the head
# must be a softmax over all classes trained with sparse categorical
# cross-entropy; a single sigmoid unit with binary cross-entropy can only
# separate two classes.
# NOTE(review): with Input(shape=(13, 173)) the LSTM iterates over the 13 MFCC
# coefficients and sees the 173 frames as features. Confirm this is intended;
# recurrence over time would need the last two axes swapped.
model = Sequential([
    Input(shape=(13, 173)),  # Explicitly define the input shape using Input layer
    LSTM(64, return_sequences=False),  # Only the final hidden state feeds the classifier
    Dense(64, activation='relu'),
    Dense(len(emotion_labels), activation='softmax'),  # One probability per emotion class
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D

# Make sure the MFCC stack has shape (num_samples, n_mfcc, n_frames, 1)
X_audio = X_audio.reshape(X_audio.shape[:3] + (1,))

# Fresh CNN: two conv/pool/dropout stages, then a dense softmax classifier
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(13, 173, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.2))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(emotion_labels), activation='softmax'))

# Integer class labels -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# Fit on the normalised MFCCs, validating on the normalised held-out split
history = model.fit(
    X_train_normalized,
    y_train,
    validation_data=(X_test_normalized, y_test),
    epochs=30,
    batch_size=32,
)

# Report accuracy on the held-out split
evaluation = model.evaluate(X_test_normalized, y_test, verbose=2)
test_loss, test_acc = evaluation
print('\nTest accuracy:', test_acc)


Epoch 1/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 35ms/step - accuracy: 0.1356 - loss: 2.1148 - val_accuracy: 0.1858 - val_loss: 1.9895
Epoch 2/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.2484 - loss: 1.8975 - val_accuracy: 0.2708 - val_loss: 1.8707
Epoch 3/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.2851 - loss: 1.7787 - val_accuracy: 0.2917 - val_loss: 1.8263
Epoch 4/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.3208 - loss: 1.7409 - val_accuracy: 0.3229 - val_loss: 1.7717
Epoch 5/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.3525 - loss: 1.6446 - val_accuracy: 0.3490 - val_loss: 1.7351
Epoch 6/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.3947 - loss: 1.5935 - val_accuracy: 0.3715 - val_loss: 1.7457
Epoch 7/30
[1m72/72[0m [32m━━━━

In [11]:
# Persist the trained voice model to disk
voice_model_path = '/content/voice_emotion_model.keras'
model.save(voice_model_path)
print("Model saved.")


Model saved.


In [12]:
import tensorflow as tf

# Reload the voice model from disk so later cells use the persisted weights
model = tf.keras.models.load_model('/content/voice_emotion_model.keras')
# This cell loads the model; report that, not a save.
print("voice_emotion_model loaded")


voice_emotion_model saved


In [13]:
pip install sounddevice


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [14]:
import sounddevice as sd
import librosa
import numpy as np

# Model output index -> emotion name
emotion_labels = dict(enumerate([
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
]))

# Recording length in seconds
duration = 3
# Number of MFCC frames (columns) the features are padded/truncated to
fixed_length = 173

def predict_emotion(audio_data, sample_rate):
    """Classify the emotion of an audio clip with the notebook-level `model`.

    Args:
        audio_data: Audio samples handed to librosa's MFCC extractor.
        sample_rate: Sampling rate of `audio_data` in Hz.

    Returns:
        The `emotion_labels` entry of the highest-scoring class.
    """
    # NOTE(review): the MFCCs go to the model unscaled, while the training cell
    # fits on `X_train_normalized`; confirm whether scaling is needed here.
    features = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)

    # Zero-pad short clips on the right; drop trailing frames of long clips
    shortfall = fixed_length - features.shape[1]
    if shortfall > 0:
        features = np.pad(features, ((0, 0), (0, shortfall)), mode='constant')
    else:
        features = features[:, :fixed_length]

    # (n_mfcc, fixed_length) -> (1, n_mfcc, fixed_length, 1): batch and channel axes
    batch = features[np.newaxis, :, :, np.newaxis]

    scores = model.predict(batch)
    return emotion_labels[np.argmax(scores)]

def record_and_predict():
    """Record `duration` seconds from the microphone and print the predicted emotion."""
    sample_rate = 44100  # recording rate in Hz, also passed to the feature extractor
    print("Recording...")
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # block until the recording buffer is full
    # sd.rec returns a (frames, channels) array; the predictor gets it flattened to 1D
    emotion = predict_emotion(recording.flatten(), sample_rate)
    print(f"Detected Emotion: {emotion}")

if __name__ == "__main__":
    # Record and classify on every prompt until the user types 'exit'
    keep_running = True
    while keep_running:
        user_input = input("Press Enter to record or type 'exit' to quit: ")
        keep_running = user_input.lower() != 'exit'
        if keep_running:
            record_and_predict()
        else:
            print("Exiting...")


Press Enter to record or type 'exit' to quit:  enter


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
Detected Emotion: disgust


Press Enter to record or type 'exit' to quit:  


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Detected Emotion: angry


Press Enter to record or type 'exit' to quit:  


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Detected Emotion: disgust


Press Enter to record or type 'exit' to quit:  


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Detected Emotion: angry


Press Enter to record or type 'exit' to quit:  


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Detected Emotion: happy


Press Enter to record or type 'exit' to quit:  


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Detected Emotion: fearful


Press Enter to record or type 'exit' to quit:  


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Detected Emotion: sad


Press Enter to record or type 'exit' to quit:  


Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Detected Emotion: fearful


Press Enter to record or type 'exit' to quit:  exit


Exiting...


In [None]:
# Face emotion model

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Read the CSV: one row per image, pixels stored as a space-separated string
data = pd.read_csv('fer2013.csv')

# Parse every pixel string into a float32 vector and collect the matching labels
X = np.array([
    np.array(pixel_string.split(' '), 'float32')
    for pixel_string in data['pixels']
])
y = np.array(list(data['emotion']))

# Show which label values occur in the file
unique_labels = np.unique(y)
print("Unique labels in y:", unique_labels)

# Keep only samples whose label is one of the 7 expected classes (0-6)
valid_indices = [i for i, label in enumerate(y) if 0 <= label <= 6]
X = X[valid_indices]
y = y[valid_indices]

# Scale pixels to [0, 1] and restore the 48x48 single-channel image layout
X = (X / 255.0).reshape(len(X), 48, 48, 1)

# One-hot encode the labels for categorical cross-entropy
y = to_categorical(y, num_classes=7)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the face-emotion CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Two conv/pool/dropout stages, then a dense classifier over the 7 emotion classes
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(48, 48, 1)),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(7, activation='softmax'),
])

# Labels are one-hot encoded, hence (non-sparse) categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 30 epochs, validating on the held-out split
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=64,
)

# Persist the trained model in HDF5 format
model.save('/content/face_emotion_model.h5')
print("face_emotion_model saved")


Unique labels in y: [0 1 2 3 4 5 6 7]


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 101ms/step - accuracy: 0.5743 - loss: 1.4701 - val_accuracy: 0.6740 - val_loss: 1.4023
Epoch 2/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.6466 - loss: 1.3006 - val_accuracy: 0.6740 - val_loss: 1.4070
Epoch 3/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.6626 - loss: 1.2797 - val_accuracy: 0.6740 - val_loss: 1.3983
Epoch 4/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.6499 - loss: 1.2743 - val_accuracy: 0.6740 - val_loss: 1.2767
Epoch 5/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.6546 - loss: 1.2381 - val_accuracy: 0.6740 - val_loss: 1.2666
Epoch 6/30
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.6424 - loss: 1.1791 - val_accuracy: 0.6740 - val_loss: 1.1194
Epoch 7/30
[1m12/12[0m [32m━━━



face_emotion_model saved


In [None]:
import cv2
from tensorflow.keras.models import load_model
import numpy as np

# Load the Haar-cascade face detector and the trained emotion classifier.
# The model is read from the path the training cell saved it to.
face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
emotion_model = load_model('/content/face_emotion_model.h5')

# Class index -> display name, in the order of the model's output units
emotion_labels = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

# Start video capture from the default camera
cap = cv2.VideoCapture(0)

try:
    while True:
        # Capture frame-by-frame; stop cleanly if the camera yields nothing
        ret, frame = cap.read()
        if not ret:
            print("Could not read a frame from the camera; stopping.")
            break

        # The detector and the classifier both work on grayscale images
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Detect faces
        faces = face_detector.detectMultiScale(
            gray,
            scaleFactor=1.1,
            minNeighbors=5,
            minSize=(30, 30),
            flags=cv2.CASCADE_SCALE_IMAGE,
        )

        # Box names are prefixed with `f` so the notebook-level label array `y` is not overwritten
        for (fx, fy, fw, fh) in faces:
            # Crop the face, resize to the 48x48 model input and scale to [0, 1]
            roi_gray = gray[fy:fy + fh, fx:fx + fw]
            roi_gray = cv2.resize(roi_gray, (48, 48))
            roi_gray = roi_gray.astype('float32') / 255.0
            roi_gray = np.reshape(roi_gray, (1, 48, 48, 1))

            # Predict emotion (verbose=0: no progress bar for every frame)
            prediction = emotion_model.predict(roi_gray, verbose=0)
            max_index = np.argmax(prediction[0])
            predicted_emotion = emotion_labels[max_index]

            # Draw a rectangle around the face and label the emotion
            cv2.rectangle(frame, (fx, fy), (fx + fw, fy + fh), (255, 0, 0), 2)
            cv2.putText(frame, predicted_emotion, (fx, fy - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

        # Display the resulting frame
        cv2.imshow('Emotion Recognition', frame)

        # Break the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if a step above raised
    cap.release()
    cv2.destroyAllWindows()




In [21]:
pip install librosa tensorflow numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import tensorflow as tf
import os

# Fail fast with a clear message if either saved model is missing
if not os.path.exists('/content/voice_emotion_model.keras'):
    raise FileNotFoundError("Voice emotion model file not found.")
if not os.path.exists('/content/face_emotion_model.h5'):
    raise FileNotFoundError("Face emotion model file not found.")

# Load the saved models
voice_model = tf.keras.models.load_model('/content/voice_emotion_model.keras')
face_model = tf.keras.models.load_model('/content/face_emotion_model.h5')

# Re-compile with the loss each model was trained with:
#  - the voice model uses integer class labels -> sparse categorical cross-entropy
#  - the face model uses one-hot labels        -> categorical cross-entropy
voice_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
face_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print("Models loaded and compiled successfully.")


In [None]:
import sounddevice as sd
import librosa
import numpy as np
import cv2
import os
import tensorflow as tf

# Voice model output index -> emotion name
voice_emotion_labels = dict(enumerate([
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
]))

# Face model output index -> emotion name
face_emotion_labels = dict(enumerate([
    'angry',
    'disgust',
    'fear',
    'happy',
    'sad',
    'surprise',
    'neutral',
]))

# Recording length in seconds
duration = 3

# Frontal-face Haar cascade shipped with OpenCV
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Stop early with a clear message if a saved model is missing
for required_path, missing_message in (
    ('/content/voice_emotion_model.keras', "Voice emotion model file not found."),
    ('/content/face_emotion_model.h5', "Face emotion model file not found."),
):
    if not os.path.exists(required_path):
        raise FileNotFoundError(missing_message)

# Both files exist: load the trained models
voice_model = tf.keras.models.load_model('/content/voice_emotion_model.keras')
face_model = tf.keras.models.load_model('/content/face_emotion_model.h5')

def predict_voice_emotion(audio_data, sample_rate):
    """Classify the emotion of an audio clip with `voice_model`.

    Args:
        audio_data: Audio samples handed to librosa's MFCC extractor.
        sample_rate: Sampling rate of `audio_data` in Hz.

    Returns:
        The `voice_emotion_labels` entry of the highest-scoring class.
    """
    desired_columns = 173  # frame count the model input expects

    features = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)

    # Zero-pad short clips on the right; drop trailing frames of long clips
    frame_count = features.shape[1]
    if frame_count < desired_columns:
        features = np.pad(features, ((0, 0), (0, desired_columns - frame_count)), mode='constant')
    else:
        features = features[:, :desired_columns]

    # (13, 173) -> (1, 13, 173, 1): add the batch and channel axes
    batch = features[np.newaxis, :, :, np.newaxis]

    scores = voice_model.predict(batch)
    return voice_emotion_labels[np.argmax(scores)]

def predict_face_emotion(frame):
    """Classify the emotion of the first face detected in a BGR frame.

    Args:
        frame: Colour image in OpenCV's BGR channel order.

    Returns:
        The `face_emotion_labels` entry of the highest-scoring class, or the
        string "No face detected" when the cascade finds no face.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    detections = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(48, 48))
    if len(detections) == 0:
        return "No face detected"

    # Only the first detection is classified
    left, top, width, height = detections[0]
    crop = cv2.resize(gray[top:top + height, left:left + width], (48, 48))
    crop = crop.astype('float32') / 255

    # (48, 48) -> (1, 48, 48, 1): add the batch and channel axes
    batch = crop[np.newaxis, :, :, np.newaxis]

    scores = face_model.predict(batch)
    return face_emotion_labels[np.argmax(scores)]

def record_and_predict():
    """Record a short clip and grab one webcam frame, then print both emotions.

    Prints the voice emotion, the face emotion (when a frame could be
    captured) and a combined "final" emotion. When the camera yields no frame
    the face emotion is reported as "No face detected" instead of raising a
    NameError on the unassigned variable.
    """
    sample_rate = 44100  # recording rate in Hz, also passed to the feature extractor

    # Record audio
    print("Recording...")
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # Wait until the recording is finished
    audio_data = audio_data.flatten()  # Flatten the audio data to 1D array
    voice_emotion = predict_voice_emotion(audio_data, sample_rate)
    print(f"Detected Voice Emotion: {voice_emotion}")

    # Capture a single video frame; release the camera even if the read raises
    cap = cv2.VideoCapture(0)
    try:
        ret, frame = cap.read()
    finally:
        cap.release()

    if ret:
        face_emotion = predict_face_emotion(frame)
        print(f"Detected Face Emotion: {face_emotion}")
    else:
        print("Could not capture a frame from the camera.")
        face_emotion = "No face detected"

    # The two models spell some emotions differently ('fearful' vs 'fear',
    # 'surprised' vs 'surprise'); compare on the face model's spelling.
    voice_as_face_label = {'fearful': 'fear', 'surprised': 'surprise'}.get(voice_emotion, voice_emotion)

    # Combine predictions
    if voice_as_face_label == face_emotion:
        final_emotion = voice_emotion
    else:
        final_emotion = f"Voice: {voice_emotion}, Face: {face_emotion}"

    print(f"Final Detected Emotion: {final_emotion}")

if __name__ == "__main__":
    # Record and classify on every prompt until the user types 'q'
    keep_running = True
    while keep_running:
        user_input = input("Press Enter to record and capture (or type 'q' to quit)... ")
        keep_running = user_input.lower() != 'q'
        if keep_running:
            record_and_predict()
        else:
            print("Exiting...")


In [None]:
import sounddevice as sd
import librosa
import numpy as np
import cv2
import os
import tensorflow as tf
import threading

# Voice model output index -> emotion name
voice_emotion_labels = dict(enumerate([
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
]))

# Face model output index -> emotion name
face_emotion_labels = dict(enumerate([
    'angry',
    'disgust',
    'fear',
    'happy',
    'sad',
    'surprise',
    'neutral',
]))

# Recording length in seconds
duration = 3

# Frontal-face Haar cascade shipped with OpenCV
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Stop early with a clear message if a saved model is missing
for required_path, missing_message in (
    ('/content/voice_emotion_model.keras', "Voice emotion model file not found."),
    ('/content/face_emotion_model.h5', "Face emotion model file not found."),
):
    if not os.path.exists(required_path):
        raise FileNotFoundError(missing_message)

# Both files exist: load the trained models
voice_model = tf.keras.models.load_model('/content/voice_emotion_model.keras')
face_model = tf.keras.models.load_model('/content/face_emotion_model.h5')

def predict_voice_emotion(audio_data, sample_rate):
    """Classify the emotion of an audio clip with `voice_model`.

    Args:
        audio_data: Audio samples handed to librosa's MFCC extractor.
        sample_rate: Sampling rate of `audio_data` in Hz.

    Returns:
        The `voice_emotion_labels` entry of the highest-scoring class.
    """
    desired_columns = 173  # frame count the model input expects

    features = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)

    # Zero-pad short clips on the right; drop trailing frames of long clips
    frame_count = features.shape[1]
    if frame_count < desired_columns:
        features = np.pad(features, ((0, 0), (0, desired_columns - frame_count)), mode='constant')
    else:
        features = features[:, :desired_columns]

    # (13, 173) -> (1, 13, 173, 1): add the batch and channel axes
    batch = features[np.newaxis, :, :, np.newaxis]

    scores = voice_model.predict(batch)
    return voice_emotion_labels[np.argmax(scores)]

def predict_face_emotion(frame):
    """Classify the emotion of the first face detected in a BGR frame.

    Args:
        frame: Colour image in OpenCV's BGR channel order.

    Returns:
        The `face_emotion_labels` entry of the highest-scoring class, or the
        string "No face detected" when the cascade finds no face.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    detections = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(48, 48))
    if len(detections) == 0:
        return "No face detected"

    # Only the first detection is classified
    left, top, width, height = detections[0]
    crop = cv2.resize(gray[top:top + height, left:left + width], (48, 48))
    crop = crop.astype('float32') / 255

    # (48, 48) -> (1, 48, 48, 1): add the batch and channel axes
    batch = crop[np.newaxis, :, :, np.newaxis]

    scores = face_model.predict(batch)
    return face_emotion_labels[np.argmax(scores)]

# State shared between the audio and video threads.
voice_emotion_display = None    # latest voice prediction, drawn onto the video frames
stop_event = threading.Event()  # set when the video loop ends so the audio loop ends too


def record_audio():
    """Repeatedly record `duration`-second clips and publish the predicted emotion.

    Each prediction is printed and stored in the global
    `voice_emotion_display`, which `capture_video` overlays on the frames.
    The loop ends once `stop_event` is set; because the flag is only checked
    between recordings, shutdown can lag by up to one clip.
    """
    global voice_emotion_display
    while not stop_event.is_set():
        audio_data = sd.rec(int(duration * 44100), samplerate=44100, channels=1, dtype='float32')
        sd.wait()  # Wait until the recording is finished
        audio_data = audio_data.flatten()  # Flatten the audio data to 1D array
        voice_emotion = predict_voice_emotion(audio_data, 44100)
        print(f"Detected Voice Emotion: {voice_emotion}")
        voice_emotion_display = voice_emotion


def capture_video():
    """Show the webcam feed annotated with the face and voice emotions.

    Runs until 'q' is pressed, the camera stops delivering frames, or
    `stop_event` is set. On exit it always sets `stop_event`, releases the
    camera and closes the window, even if a prediction raised.
    """
    cap = cv2.VideoCapture(0)
    try:
        while not stop_event.is_set():
            ret, frame = cap.read()
            if not ret:
                break

            face_emotion = predict_face_emotion(frame)
            if face_emotion != "No face detected":
                cv2.putText(frame, f"Face Emotion: {face_emotion}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

            # Only draw the voice line once the audio thread has produced a prediction
            if voice_emotion_display:
                cv2.putText(frame, f"Voice Emotion: {voice_emotion_display}", (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)

            cv2.imshow('Webcam', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        stop_event.set()  # tell the audio thread to finish as well
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    # Reset the shared state so the cell can be re-run in the same kernel
    voice_emotion_display = None
    stop_event.clear()

    audio_thread = threading.Thread(target=record_audio)
    video_thread = threading.Thread(target=capture_video)

    audio_thread.start()
    video_thread.start()

    # Both joins return: the video loop sets stop_event, which ends the audio loop
    video_thread.join()
    audio_thread.join()


In [4]:
import sounddevice as sd
import librosa
import numpy as np
import cv2
import os
import tensorflow as tf
import threading
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Voice model output index -> emotion name
voice_emotion_labels = dict(enumerate([
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
]))

# Face model output index -> emotion name
face_emotion_labels = dict(enumerate([
    'angry',
    'disgust',
    'fear',
    'happy',
    'sad',
    'surprise',
    'neutral',
]))

# Predefined recommendations: category -> emotion -> candidate items.
# Emotion keys use the face model's spelling ('fear', not 'fearful').
recommendations = {
    'music': {
        'happy': ["Happy Song 1", "Happy Song 2", "Happy Song 3"],
        'sad': ["Uplifting Song 1", "Uplifting Song 2", "Uplifting Song 3"],
        'fear': ["Calm Song 1", "Calm Song 2", "Calm Song 3"],
        'disgust': ["Relaxing Song 1", "Relaxing Song 2", "Relaxing Song 3"]
    },
    'jokes': {
        'happy': ["Happy Joke 1", "Happy Joke 2", "Happy Joke 3"],
        'sad': ["Cheerful Joke 1", "Cheerful Joke 2", "Cheerful Joke 3"],
        'fear': ["Funny Joke 1", "Funny Joke 2", "Funny Joke 3"],
        'disgust': ["Silly Joke 1", "Silly Joke 2", "Silly Joke 3"]
    },
    'deep_breathe': {
        'happy': ["Deep Breathe Exercise 1", "Deep Breathe Exercise 2", "Deep Breathe Exercise 3"],
        'sad': ["Deep Breathe Exercise 4", "Deep Breathe Exercise 5", "Deep Breathe Exercise 6"],
        'fear': ["Deep Breathe Exercise 7", "Deep Breathe Exercise 8", "Deep Breathe Exercise 9"],
        'disgust': ["Deep Breathe Exercise 10", "Deep Breathe Exercise 11", "Deep Breathe Exercise 12"]
    },
    'books': {
        'happy': ["Book 1", "Book 2", "Book 3"],
        'sad': ["Book 4", "Book 5", "Book 6"],
        'fear': ["Book 7", "Book 8", "Book 9"],
        'disgust': ["Book 10", "Book 11", "Book 12"]
    },
    'yoga': {
        'happy': ["Yoga 1", "Yoga 2", "Yoga 3"],
        'sad': ["Yoga 4", "Yoga 5", "Yoga 6"],
        'fear': ["Yoga 7", "Yoga 8", "Yoga 9"],
        'disgust': ["Yoga 10", "Yoga 11", "Yoga 12"]
    },
    'videos': {
        'happy': ["Video 1", "Video 2", "Video 3"],
        'sad': ["Video 4", "Video 5", "Video 6"],
        'fear': ["Video 7", "Video 8", "Video 9"],
        'disgust': ["Video 10", "Video 11", "Video 12"]
    },
    'puzzles': {
        'happy': ["Puzzle 1", "Puzzle 2", "Puzzle 3"],
        'sad': ["Puzzle 4", "Puzzle 5", "Puzzle 6"],
        'fear': ["Puzzle 7", "Puzzle 8", "Puzzle 9"],
        'disgust': ["Puzzle 10", "Puzzle 11", "Puzzle 12"]
    }
}

# The voice model spells some emotions differently from the face model; map
# them onto the spelling used as keys in `recommendations`.
_EMOTION_ALIASES = {'fearful': 'fear', 'surprised': 'surprise'}


def recommend_content(emotion):
    """Pick one random recommendation per category for the given emotion.

    Args:
        emotion: Emotion name from either model (e.g. 'happy', 'fear', 'fearful').

    Returns:
        A dict mapping each category that has entries for the emotion to one
        randomly chosen item; an empty dict when no category covers it.
    """
    key = _EMOTION_ALIASES.get(emotion, emotion)
    return {
        category: random.choice(options[key])
        for category, options in recommendations.items()
        if options.get(key)  # skip categories with no (or empty) entry for this emotion
    }

# Recording length in seconds
duration = 3

# Frontal-face Haar cascade shipped with OpenCV
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Stop early with a clear message if a saved model is missing
for required_path, missing_message in (
    ('/content/voice_emotion_model.keras', "Voice emotion model file not found."),
    ('/content/face_emotion_model.h5', "Face emotion model file not found."),
):
    if not os.path.exists(required_path):
        raise FileNotFoundError(missing_message)

# Both files exist: load the trained models
voice_model = tf.keras.models.load_model('/content/voice_emotion_model.keras')
face_model = tf.keras.models.load_model('/content/face_emotion_model.h5')

def predict_voice_emotion(audio_data, sample_rate):
    """Classify the emotion of an audio clip with `voice_model`.

    Args:
        audio_data: Audio samples handed to librosa's MFCC extractor.
        sample_rate: Sampling rate of `audio_data` in Hz.

    Returns:
        The `voice_emotion_labels` entry of the highest-scoring class.
    """
    desired_columns = 173  # frame count the model input expects

    features = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=13)

    # Zero-pad short clips on the right; drop trailing frames of long clips
    frame_count = features.shape[1]
    if frame_count < desired_columns:
        features = np.pad(features, ((0, 0), (0, desired_columns - frame_count)), mode='constant')
    else:
        features = features[:, :desired_columns]

    # (13, 173) -> (1, 13, 173, 1): add the batch and channel axes
    batch = features[np.newaxis, :, :, np.newaxis]

    scores = voice_model.predict(batch)
    return voice_emotion_labels[np.argmax(scores)]

def predict_face_emotion(frame):
    """Classify the emotion of the first face detected in a BGR frame.

    Args:
        frame: Colour image in OpenCV's BGR channel order.

    Returns:
        The `face_emotion_labels` entry of the highest-scoring class, or the
        string "No face detected" when the cascade finds no face.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    detections = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(48, 48))
    if len(detections) == 0:
        return "No face detected"

    # Only the first detection is classified
    left, top, width, height = detections[0]
    crop = cv2.resize(gray[top:top + height, left:left + width], (48, 48))
    crop = crop.astype('float32') / 255

    # (48, 48) -> (1, 48, 48, 1): add the batch and channel axes
    batch = crop[np.newaxis, :, :, np.newaxis]

    scores = face_model.predict(batch)
    return face_emotion_labels[np.argmax(scores)]

def record_and_predict():
    """Record a clip, grab one webcam frame and print emotion-based recommendations.

    The face emotion decides the recommendations when a face is detected;
    otherwise the voice emotion is used. A failed camera read is treated as
    "No face detected", so the voice emotion still produces recommendations
    instead of the function silently doing nothing.
    """
    sample_rate = 44100  # recording rate in Hz, also passed to the feature extractor

    # Record audio
    print("Recording...")
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # Wait until the recording is finished
    audio_data = audio_data.flatten()  # Flatten the audio data to 1D array
    voice_emotion = predict_voice_emotion(audio_data, sample_rate)
    print(f"Detected Voice Emotion: {voice_emotion}")

    # Capture a single frame; free the camera right away, even if the read raises
    cap = cv2.VideoCapture(0)
    try:
        ret, frame = cap.read()
    finally:
        cap.release()

    if ret:
        face_emotion = predict_face_emotion(frame)
        print(f"Detected Face Emotion: {face_emotion}")
    else:
        print("Could not capture a frame from the camera; using the voice emotion only.")
        frame = None
        face_emotion = "No face detected"

    # Display recommendations based on combined emotion (the face wins when available).
    # No cv2.destroyAllWindows() here: this function opens no window.
    final_emotion = face_emotion if face_emotion != "No face detected" else voice_emotion
    content = recommend_content(final_emotion)
    display_recommendations(content, frame, face_emotion, voice_emotion)

def display_recommendations(content, frame, face_emotion, voice_emotion):
    """Print the final emotion followed by one recommendation line per category.

    Args:
        content: Dict of category -> recommended item; missing categories are
            printed as 'No recommendation'.
        frame: Captured video frame (not used by this function).
        face_emotion: Face prediction, or 'No face detected'.
        voice_emotion: Voice prediction, used when no face was detected.
    """
    final_emotion = voice_emotion if face_emotion == 'No face detected' else face_emotion
    print(f"Final Emotion: {final_emotion}")

    # (printed heading, key in `content`) in display order
    sections = (
        ("Music", 'music'),
        ("Joke", 'jokes'),
        ("Deep Breathe", 'deep_breathe'),
        ("Book", 'books'),
        ("Yoga", 'yoga'),
        ("Video", 'videos'),
        ("Puzzle", 'puzzles'),
    )
    for heading, category in sections:
        print(f"Recommended {heading}: {content.get(category, 'No recommendation')}")

# Hand-written placeholder labels: the scores below demonstrate the metric
# calls only and are not computed from either model.
true_emotions = ['happy', 'sad', 'fear', 'happy', 'disgust', 'happy']
predicted_emotions = ['happy', 'sad', 'fear', 'happy', 'disgust', 'sad']

# Accuracy plus support-weighted precision, recall and F1
accuracy = accuracy_score(true_emotions, predicted_emotions)
precision, recall, f1 = (
    metric(true_emotions, predicted_emotions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Overall Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")

# Run one record -> predict -> recommend cycle
record_and_predict()




Overall Accuracy: 0.8333333333333334
Precision: 0.9166666666666666
Recall: 0.8333333333333334
F1 Score: 0.8444444444444446
Recording...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Detected Voice Emotion: disgust
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Detected Face Emotion: happy
Final Emotion: happy
Recommended Music: Happy Song 1
Recommended Joke: Happy Joke 1
Recommended Deep Breathe: Deep Breathe Exercise 2
Recommended Book: Book 1
Recommended Yoga: Yoga 2
Recommended Video: Video 1
Recommended Puzzle: Puzzle 3
