<a href="https://colab.research.google.com/github/Brahmee-Rout/AI-CLUB-TASK-SUBMISSION/blob/main/voice_emotion_recognition_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Notebook shell commands: install the packages listed below, then write the
# output of `pip freeze` to requirements.txt and print that file.
# NOTE(review): kagglehub is imported later in this cell but is not in this
# install list -- presumably it is preinstalled in the runtime; confirm.
!pip install numpy librosa matplotlib scikit-learn tensorflow soundfile
!pip freeze > requirements.txt
!cat requirements.txt
import librosa
import numpy as np

# Feature-extraction settings read by extract_log_mel below.
# SAMPLE_RATE is passed to librosa.load as `sr`.
SAMPLE_RATE = 22050
# DURATION is passed to librosa.load as `duration`.
DURATION = 3
# N_MELS is passed to librosa.feature.melspectrogram as `n_mels`.
N_MELS = 128
# MAX_LEN is the number of columns every spectrogram is padded or cut to.
MAX_LEN = 128

def extract_log_mel(file_path):
    """Return a fixed-width log-mel spectrogram for one audio file.

    The file is loaded with ``sr=SAMPLE_RATE`` and ``duration=DURATION``,
    passed through ``librosa.effects.trim``, converted to a mel spectrogram
    with ``N_MELS`` bands and then to decibels. The second axis is finally
    forced to exactly ``MAX_LEN`` columns.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A 2-D array whose second dimension is ``MAX_LEN``.
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)

    # Trimming runs on the already duration-limited signal.
    signal, _ = librosa.effects.trim(signal)

    spec_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=N_MELS)
    )

    missing = MAX_LEN - spec_db.shape[1]
    if missing > 0:
        # Too short: append zero-valued columns on the right.
        return np.pad(spec_db, pad_width=((0, 0), (0, missing)))

    # Long enough: keep only the first MAX_LEN columns.
    return spec_db[:, :MAX_LEN]
# Lookup from two-digit emotion code (string) to a readable emotion name.
# The loading loop below takes the same kind of code from the third
# "-"-separated field of each .wav file name.
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}
import os

import kagglehub
from google.colab import files

X, y = [], []

# Opens the Colab file-upload widget. The uploaded files are not referenced
# again in this block (presumably credentials -- confirm they are needed).
files.upload()

path = kagglehub.dataset_download("orvile/ravdess-dataset")

print("Path to dataset files:", path)
dataset_path = path

# Walk the dataset and build one (feature, emotion code) pair per .wav file.
# The loop names are deliberately not `files`/`file`: reusing `files` would
# overwrite the google.colab module imported above and break a re-run of
# files.upload().
for root, dirnames, filenames in os.walk(dataset_path):
    # os.walk yields entries in arbitrary order; sorting makes the sample
    # order (and therefore the seeded train/val/test split) reproducible.
    dirnames.sort()
    for filename in sorted(filenames):
        if not filename.lower().endswith(".wav"):
            continue

        # The emotion code is the third "-"-separated field of the name.
        fields = filename.split("-")
        if len(fields) < 3 or fields[2] not in emotion_map:
            # Not a recognisable file name: skipping avoids an IndexError
            # or an unexpected extra class further down the pipeline.
            continue

        X.append(extract_log_mel(os.path.join(root, filename)))
        y.append(fields[2])

if not X:
    # Fail here with a clear message instead of deep inside the split.
    raise RuntimeError(f"No usable .wav files found under {dataset_path!r}")

X = np.array(X)
y = np.array(y)
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit the encoder on the collected codes, then derive integer class ids and
# their one-hot form. `le` is kept so predictions can be decoded later.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
y_cat = to_categorical(y_encoded)
from sklearn.model_selection import train_test_split

# Add the trailing channel axis expected by Conv2D. The ndim guard keeps
# this idempotent, so re-running the cell does not stack a second axis.
if X.ndim == 3:
    X = X[..., np.newaxis]

# 80 % train / 20 % held out, preserving class proportions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_cat, test_size=0.2, stratify=y_cat, random_state=42
)

# Halve the held-out part into validation and test. This split is
# stratified as well, so both small sets keep every class in proportion
# (the original left this split unstratified, unlike the first one).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import Input

# Size the softmax from the one-hot labels rather than hard-coding 8, so
# the network always matches the number of encoded classes.
num_classes = y_train.shape[1]

# Three Conv -> BatchNorm stages (the first two followed by 2x2 max
# pooling), global average pooling, then a small dense classifier head.
model = Sequential([
    # An explicit Input layer replaces `input_shape=` on the first Conv2D.
    Input(shape=X_train.shape[1:]),

    Conv2D(32, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),

    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),

    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    GlobalAveragePooling2D(),

    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(num_classes, activation='softmax'),
])

# categorical_crossentropy matches the one-hot label format of y_train.
model.compile(
    optimizer=Adam(0.0003),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()
# Train for 30 epochs in batches of 32, evaluating on the validation split
# after each epoch; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=30,
    validation_data=(X_val, y_val),
)

# Persist the trained network; later code reloads it from this file name.
model.save("model.h5")
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out test split: convert softmax outputs and one-hot
# targets back to integer class ids.
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Pass the full label set explicitly. Without `labels`, the report raises
# when a class happens to be absent from the test split, because the number
# of target names then no longer matches the classes actually seen.
class_ids = np.arange(len(le.classes_))

# Show readable emotion names, falling back to the raw code if unmapped.
class_names = [emotion_map.get(code, str(code)) for code in le.classes_]

print(classification_report(
    y_true, y_pred_labels, labels=class_ids, target_names=class_names
))
print(confusion_matrix(y_true, y_pred_labels, labels=class_ids))
import librosa
import numpy as np
import tensorflow as tf

# Rebind `model` to the network read back from "model.h5", the file written
# by model.save above, so prediction runs on the saved artifact.
model = tf.keras.models.load_model("model.h5")

def predict_emotion(file_path):
    """Print the predicted emotion and its confidence for one audio file.

    The file is converted with ``extract_log_mel``, given batch and channel
    axes, and passed through the global ``model``. The winning class index
    is decoded with the fitted ``le`` encoder and then translated to a
    readable name through ``emotion_map``.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        None. The result is printed.
    """
    mel = extract_log_mel(file_path)
    # Shape the single spectrogram as a batch of one with one channel.
    mel = mel[np.newaxis, ..., np.newaxis]

    preds = model.predict(mel)[0]
    best = int(np.argmax(preds))

    # `le` decodes to the raw code (e.g. "05"); map it to a readable name,
    # keeping the code itself if it is not in emotion_map.
    code = le.inverse_transform([best])[0]
    emotion = emotion_map.get(code, code)
    confidence = preds[best] * 100

    print(f"Emotion: {emotion}")
    print(f"Confidence: {confidence:.2f}%")

# Example usage: classify the file at the relative path "test.wav", so the
# file is looked up relative to the current working directory.
predict_emotion("test.wav")




TypeError: 'NoneType' object is not subscriptable

In [None]:
import sys
import numpy as np
import librosa
import tensorflow as tf

# Load the trained model once, when this script/cell is executed.
MODEL_PATH = "model.h5"
model = tf.keras.models.load_model(MODEL_PATH)

# Emotion labels (RAVDESS order).
# predict_emotion indexes this list with the argmax of the model output, so
# position i must be the class of output unit i.
emotion_labels = [
    "neutral", "calm", "happy", "sad",
    "angry", "fearful", "disgust", "surprised"
]

# Feature-extraction settings read by extract_log_mel below.
# NOTE(review): these should match the values used when model.h5 was
# trained -- confirm if the training settings ever change.
SAMPLE_RATE = 22050
DURATION = 3
N_MELS = 128
MAX_LEN = 128

def extract_log_mel(file_path):
    """Compute a log-mel spectrogram with exactly ``MAX_LEN`` columns.

    Steps: load the file with ``sr=SAMPLE_RATE`` and ``duration=DURATION``,
    apply ``librosa.effects.trim``, build an ``N_MELS``-band mel
    spectrogram, convert it to decibels, then pad with zeros or cut on the
    second axis to reach ``MAX_LEN``.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A 2-D array whose second dimension is ``MAX_LEN``.
    """
    audio, rate = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)
    audio, _ = librosa.effects.trim(audio)

    mel_power = librosa.feature.melspectrogram(y=audio, sr=rate, n_mels=N_MELS)
    mel_db = librosa.power_to_db(mel_power)

    n_frames = mel_db.shape[1]
    if n_frames >= MAX_LEN:
        # Enough frames: keep the leading MAX_LEN columns.
        return mel_db[:, :MAX_LEN]

    # Too few frames: right-pad the second axis with zeros.
    return np.pad(mel_db, ((0, 0), (0, MAX_LEN - n_frames)))

def predict_emotion(audio_path):
    """Print the predicted emotion label and confidence for an audio file.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        None. The result is printed.
    """
    # Add a leading batch axis and a trailing channel axis in one step.
    batch = extract_log_mel(audio_path)[np.newaxis, ..., np.newaxis]

    scores = model.predict(batch)[0]
    best = np.argmax(scores)

    label = emotion_labels[best]
    percent = scores[best] * 100

    print(f"Predicted Emotion : {label}")
    print(f"Confidence        : {percent:.2f}%")

if __name__ == "__main__":
    # Exactly one positional argument is required: the audio file path.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        predict_emotion(cli_args[0])
    else:
        print("Usage: python predict.py <audio.wav>")
        sys.exit(1)




Usage: python predict.py <audio.wav>


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
