In [1]:
!pip install opencv-python deepface transformers librosa torch tensorflow
!pip install deepface opencv-python-headless

Collecting deepface
  Downloading deepface-0.0.93-py3-none-any.whl.metadata (30 kB)
Collecting flask-cors>=4.0.1 (from deepface)
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting mtcnn>=0.1.0 (from deepface)
  Downloading mtcnn-1.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting retina-face>=0.0.1 (from deepface)
  Downloading retina_face-0.0.17-py3-none-any.whl.metadata (10 kB)
Collecting fire>=0.4.0 (from deepface)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gunicorn>=20.1.0 (from deepface)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downl

In [2]:
import cv2
from deepface import DeepFace
from collections import Counter

def detect_video_emotion(video_path, num_frames=100):
    """Detect dominant emotion from video frames using DeepFace."""
    emotions = []  # Store detected emotions for each frame
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    step = max(1, frame_count // num_frames)  # Sample frames evenly

    print("Analyzing video frames for emotions...")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        if frame_number % step == 0:  # Process every 'step'-th frame
            try:
                analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)

                # Adjust handling for the response format
                if isinstance(analysis, list):
                    # For newer DeepFace versions returning a list of results
                    emotions.append(analysis[0]['dominant_emotion'])
                else:
                    # For older versions returning a dictionary
                    emotions.append(analysis['dominant_emotion'])

                print(f"Frame {frame_number}: {emotions[-1]}")
            except Exception as e:
                print(f"Error analyzing frame {frame_number}: {e}")

    cap.release()

    # Calculate the most frequent emotion across all analyzed frames
    if emotions:
        dominant_emotion = Counter(emotions).most_common(1)[0][0]
        return dominant_emotion
    else:
        return "Unknown"

# Example Usage
video_path = "/content/drive/MyDrive/data/6038291_Woman_Young_3840x2160.mp4"  # Replace with the path to your video file
dominant_emotion = detect_video_emotion(video_path)
print(f"Dominant Emotion in Video: {dominant_emotion}")


25-03-11 05:58:40 - Directory /root/.deepface has been created
25-03-11 05:58:40 - Directory /root/.deepface/weights has been created
Analyzing video frames for emotions...
Dominant Emotion in Video: Unknown


In [3]:
!pip install speechbrain opensmile librosa torch transformers
!pip install speechbrain torch librosa opensmile
!pip install speechbrain torchaudio

Collecting speechbrain
  Downloading speechbrain-1.0.2-py3-none-any.whl.metadata (23 kB)
Collecting opensmile
  Downloading opensmile-2.5.1-py3-none-manylinux_2_17_x86_64.whl.metadata (15 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting audobject>=0.6.1 (from opensmile)
  Downloading audobject-0.7.11-py3-none-any.whl.metadata (2.6 kB)
Collecting audinterface>=0.7.0 (from opensmile)
  Downloading audinterface-1.2.3-py3-none-any.whl.metadata (4.2 kB)
Collecting audeer>=2.1.1 (from audinterface>=0.7.0->opensmile)
  Downloading audeer-2.2.1-py3-none-any.whl.metadata (4.1 kB)
Collecting audformat<2.0.0,>=1.0.1 (from audinterface>=0.7.0->opensmile)
  Downloading audformat-1.3.1-py3-none-any.whl.metadata (4.6 kB)
Collecting audiofile>=1.3.0 (from audinterface>=0.7.0->opensmile)
  Downloading audiofile-1.5.1-py3-none-any.whl.metadata (4.9 kB)
Collecting audmath>=1.4.1 (from audinterface>=0.7.0->opensmile)
  Downloading 

In [5]:

import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import seaborn as sns
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras import regularizers
import os
import glob
import pandas as pd
import IPython.display as ipd
import plotly.express as px
import scipy.io.wavfile
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
import torch
import torchaudio
import librosa
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Load a fully trained emotion classification model
model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Load the classroom audio file
audio_path = "/content/drive/MyDrive/data/kids-laugh-45357.mp3"
waveform, sample_rate = torchaudio.load(audio_path)

# Ensure correct sampling rate (16kHz required)
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

# Convert to numpy and normalize
waveform = waveform.numpy().flatten()

# Process the audio
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)

# Perform emotion classification
with torch.no_grad():
    logits = model(**inputs).logits

# Get the predicted emotion label
predicted_index = torch.argmax(logits, dim=-1).item()
emotion_labels = ["neutral", "happy", "sad", "angry", "fear", "disgust", "surprise"]
predicted_emotion = emotion_labels[predicted_index]
print(f"Predicted Emotion in Audio: {predicted_emotion}")

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/661M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Failed to open the input "/content/drive/MyDrive/data/kids-laugh-45357.mp3" (No such file or directory).
Exception raised from get_input_format_context at /__w/audio/audio/pytorch/audio/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7c9c61506446 in /usr/local/lib/python3.11/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7c9c614b06e4 in /usr/local/lib/python3.11/dist-packages/torch/lib/libc10.so)
frame #2: <unknown function> + 0x42134 (0x7c9c606ca134 in /usr/local/lib/python3.11/dist-packages/torio/lib/libtorio_ffmpeg4.so)
frame #3: torio::io::StreamingMediaDecoder::StreamingMediaDecoder(std::string const&, std::optional<std::string> const&, std::optional<std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > > const&) + 0x14 (0x7c9c606ccb34 in /usr/local/lib/python3.11/dist-packages/torio/lib/libtorio_ffmpeg4.so)
frame #4: <unknown function> + 0x3a8de (0x7c9bdb44a8de in /usr/local/lib/python3.11/dist-packages/torio/lib/_torio_ffmpeg4.so)
frame #5: <unknown function> + 0x323ee (0x7c9bdb4423ee in /usr/local/lib/python3.11/dist-packages/torio/lib/_torio_ffmpeg4.so)
frame #6: /usr/bin/python3() [0x55559b]
frame #7: _PyObject_MakeTpCall + 0x27c (0x52f67c in /usr/bin/python3)
frame #8: /usr/bin/python3() [0x58536d]
frame #9: /usr/bin/python3() [0x56e229]
frame #10: /usr/bin/python3() [0x52fa60]
frame #11: <unknown function> + 0xfc8b (0x7c9c6146cc8b in /usr/local/lib/python3.11/dist-packages/torchaudio/lib/_torchaudio.so)
frame #12: _PyObject_MakeTpCall + 0x27c (0x52f67c in /usr/bin/python3)
frame #13: _PyEval_EvalFrameDefault + 0x6bf (0x53d7ff in /usr/bin/python3)
frame #14: _PyFunction_Vectorcall + 0x173 (0x5661a3 in /usr/bin/python3)
frame #15: /usr/bin/python3() [0x56deb6]
frame #16: _PyObject_MakeTpCall + 0x23b (0x52f63b in /usr/bin/python3)
frame #17: _PyEval_EvalFrameDefault + 0x6bf (0x53d7ff in /usr/bin/python3)
frame #18: /usr/bin/python3() [0x6135e4]
frame #19: PyEval_EvalCode + 0x97 (0x612c47 in /usr/bin/python3)
frame #20: /usr/bin/python3() [0x62ca33]
frame #21: _PyEval_EvalFrameDefault + 0x390f (0x540a4f in /usr/bin/python3)
frame #22: /usr/bin/python3() [0x6284b0]
frame #23: _PyEval_EvalFrameDefault + 0x3485 (0x5405c5 in /usr/bin/python3)
frame #24: /usr/bin/python3() [0x6284b0]
frame #25: _PyEval_EvalFrameDefault + 0x3485 (0x5405c5 in /usr/bin/python3)
frame #26: /usr/bin/python3() [0x6284b0]
frame #27: /usr/bin/python3() [0x62aaec]
frame #28: _PyEval_EvalFrameDefault + 0x3a9d (0x540bdd in /usr/bin/python3)
frame #29: /usr/bin/python3() [0x585a87]
frame #30: /usr/bin/python3() [0x58526e]
frame #31: PyObject_Call + 0xf4 (0x570704 in /usr/bin/python3)
frame #32: _PyEval_EvalFrameDefault + 0x4a8f (0x541bcf in /usr/bin/python3)
frame #33: /usr/bin/python3() [0x6284b0]
frame #34: _PyEval_EvalFrameDefault + 0x3485 (0x5405c5 in /usr/bin/python3)
frame #35: /usr/bin/python3() [0x6284b0]
frame #36: _PyEval_EvalFrameDefault + 0x3485 (0x5405c5 in /usr/bin/python3)
frame #37: /usr/bin/python3() [0x6284b0]
frame #38: _PyEval_EvalFrameDefault + 0x3485 (0x5405c5 in /usr/bin/python3)
frame #39: /usr/bin/python3() [0x6284b0]
frame #40: _PyEval_EvalFrameDefault + 0x3485 (0x5405c5 in /usr/bin/python3)
frame #41: /usr/bin/python3() [0x6284b0]
frame #42: <unknown function> + 0x745f (0x7c9d62c2b45f in /usr/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so)
frame #43: /usr/bin/python3() [0x553a1f]
frame #44: /usr/bin/python3() [0x4d0bc0]
frame #45: /usr/bin/python3() [0x4e94f3]
frame #46: /usr/bin/python3() [0x54b25b]
frame #47: _PyEval_EvalFrameDefault + 0x9129 (0x546269 in /usr/bin/python3)
frame #48: /usr/bin/python3() [0x6135e4]
frame #49: PyEval_EvalCode + 0x97 (0x612c47 in /usr/bin/python3)
frame #50: /usr/bin/python3() [0x62ca33]
frame #51: /usr/bin/python3() [0x54b25b]
frame #52: PyObject_Vectorcall + 0x35 (0x54b145 in /usr/bin/python3)
frame #53: _PyEval_EvalFrameDefault + 0x6bf (0x53d7ff in /usr/bin/python3)
frame #54: _PyFunction_Vectorcall + 0x173 (0x5661a3 in /usr/bin/python3)
frame #55: /usr/bin/python3() [0x63e860]
frame #56: Py_RunMain + 0x13c (0x63e1bc in /usr/bin/python3)
frame #57: Py_BytesMain + 0x2d (0x603f2d in /usr/bin/python3)
frame #58: <unknown function> + 0x29d90 (0x7c9d63361d90 in /lib/x86_64-linux-gnu/libc.so.6)
frame #59: __libc_start_main + 0x80 (0x7c9d63361e40 in /lib/x86_64-linux-gnu/libc.so.6)
frame #60: _start + 0x25 (0x603db5 in /usr/bin/python3)
