In [None]:
!pip install sounddevice


In [None]:
import cv2
import sounddevice as sd
import numpy as np
import tensorflow as tf

# Load your pre-trained models for video and audio
video_model = tf.keras.models.load_model('EmotionDetectionImageModel.keras')
audio_model = tf.keras.models.load_model('AudioDetectionEmotionModel.keras')

# Parameters for audio
SAMPLE_RATE = 16000  # Hz
DURATION = 1  # Seconds (e.g., 1 second of audio)

def get_video_prediction(frame):
    frame = cv2.flip(frame, 1)
    # Preprocess the frame (resize, convert to grayscale, normalize, etc.)
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    resized_frame = cv2.resize(gray, (48, 48))  # Resize to match your model's input size (48x48)
    normalized_frame = resized_frame / 255.0  # Normalize pixel values

    # Reshape the frame to match model input (1, 48, 48, 1) for grayscale images
    input_frame = np.expand_dims(normalized_frame, axis=0)
    input_frame = np.expand_dims(input_frame, axis=-1)

    # Make a prediction
    prediction = model.predict(input_frame)
    emotion_label = np.argmax(prediction)  # Get the label with the highest probability
    
    # Add text to the frame
    emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad','Surprise']  # Update with your model's labels
    label_text = emotions[emotion_label]    
    # Display the frame with the predicted emotion
    cv2.imshow('Emotion Detection', frame)
    return prediction

def get_audio_prediction(audio_data):
    # Preprocess audio for the audio model
    audio_data_normalized = audio_data / np.max(np.abs(audio_data))  # Normalize
    audio_input = np.expand_dims(audio_data_normalized, axis=0)

    # Get audio model prediction
    audio_prediction = audio_model.predict(audio_input)
    return audio_prediction

def audio_callback(indata, frames, time, status):
    """Callback function to capture audio."""
    if status:
        print(status)
    audio_data = indata[:, 0]  # Use the first channel (mono)

    # Get prediction for the audio
    audio_prediction = get_audio_prediction(audio_data)
    print(f"Audio Prediction: {audio_prediction}")

# Open video capture (Webcam)
cap = cv2.VideoCapture(0)

# Start audio stream
with sd.InputStream(channels=1, samplerate=SAMPLE_RATE, callback=audio_callback):
    while True:
        # Capture frame-by-frame from webcam
        ret, frame = cap.read()
        if not ret:
            break

        # Show the video frame
        # Get prediction for the current video frame
        video_prediction = get_video_prediction(frame)
        print(f"Video Prediction: {video_prediction}")

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

# Release the webcam and close windows
cap.release()
cv2.destroyAllWindows()


In [5]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model  # or the relevant library if you used something else
import tensorflow as tf
import sounddevice as sd
import threading
import time

model_video = load_model("EmotionDetectionImageModel.keras")
model_audio = load_model('AudioDetectionEmotionModel.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [6]:


# Global variables for audio and video predictions
audio_probs = np.zeros(7)  # Placeholder for 7 emotion probabilities
video_probs = np.zeros(7)  # Placeholder for 7 emotion probabilities
running = True  # Flag to control running state

# Emotion labels (Update according to your model)
emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']

# Function to capture and preprocess audio
def get_audio_features():
    duration = 1  # seconds
    sample_rate = 16000
    audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()  # Wait for the recording to complete
    audio_data = np.mean(audio_data, axis=1)  # Convert stereo to mono if needed
    audio_data = audio_data / np.max(np.abs(audio_data))  # Normalize
    audio_data = np.expand_dims(audio_data, axis=0)  # Match model input shape
    return audio_data

# Function to predict audio emotion in a separate thread
def audio_emotion_detection():
    global audio_probs, running

    while running:
        audio_features = get_audio_features()
        prediction_audio = model_audio.predict(audio_features)[0]
        audio_probs = prediction_audio  # Store the raw probabilities
        time.sleep(2)  # Reduce audio processing frequency

# Start audio detection thread
audio_thread = threading.Thread(target=audio_emotion_detection)
audio_thread.daemon = True
audio_thread.start()

# Initialize the webcam
cap = cv2.VideoCapture(0)  # 0 is the default webcam
frame_skip = 5  # Only predict every 5th frame
frame_count = 0

while running:
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)

    if frame_count % frame_skip == 0:
        # Preprocess the video frame
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        resized_frame = cv2.resize(gray, (48, 48))  # Resize to match your model's input size (48x48)
        normalized_frame = resized_frame / 255.0  # Normalize pixel values

        # Reshape the frame to match model input (1, 48, 48, 1)
        input_frame = np.expand_dims(normalized_frame, axis=0)
        input_frame = np.expand_dims(input_frame, axis=-1)

        # Make video model prediction
        prediction_video = model_video.predict(input_frame)[0]
        video_probs = prediction_video  # Store the raw probabilities

    frame_count += 1

    # Combine the probabilities from audio and video
    combined_probs = audio_probs + video_probs

    # Get indices of top 3 emotions
    top_3_indices = np.argsort(combined_probs)[-3:][::-1]  # Sort and get top 3 in descending order
    top_3_emotions = [(emotions[i], combined_probs[i]) for i in top_3_indices]

    # Display the top 3 emotions on the video frame
    label_text = f"1st: {top_3_emotions[0][0]} ({top_3_emotions[0][1]:.2f}), " \
                 f"2nd: {top_3_emotions[1][0]} ({top_3_emotions[1][1]:.2f}), " \
                 f"3rd: {top_3_emotions[2][0]} ({top_3_emotions[2][1]:.2f})"
    cv2.putText(frame, label_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2, cv2.LINE_AA)

    # Display the frame with the top 3 predicted emotions
    cv2.imshow('Emotion Detection', frame)

    # Break the loop if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        running = False  # Stop both audio and video when 'q' is pressed

# Wait for the audio thread to finish
audio_thread.join()

# Release the webcam and close windows
cap.release()
cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/