In [53]:
!pip install numpy librosa tensorflow scikit-learn




In [54]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, sr=None, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        return mfccs_scaled
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")
        return None

# Define the base directory containing actor folders
base_dir = '/content/drive/MyDrive/RAVIDASS'

X = []
y = []

# Iterate over each actor folder
for actor_folder in os.listdir(base_dir):
    actor_path = os.path.join(base_dir, actor_folder)

    # Check if it's a directory
    if os.path.isdir(actor_path):
        for filename in os.listdir(actor_path):
            if filename.endswith('.wav'):
                file_path = os.path.join(actor_path, filename)
                features = extract_features(file_path)
                if features is not None:
                    X.append(features)
                    # Extract emotion label from the filename, assuming it follows a pattern
                    # Example filename: 03-01-01-01-01-01-01.wav
                    label = int(filename.split('-')[2]) - 1
                    y.append(label)

X = np.array(X)
y = np.array(y)

# Check if data is loaded
print(f"Number of samples: {len(X)}")
if len(X) > 0:
    print(f"Sample feature shape: {X[0].shape}")
    print(f"Sample label: {y[0]}")
else:
    print("No data available.")

# Split data into training and validation sets
if len(X) > 0:
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    num_classes = 8
    y_train = to_categorical(y_train, num_classes=num_classes)
    y_val = to_categorical(y_val, num_classes=num_classes)
else:
    print("Cannot proceed with training as no data is available.")


Number of samples: 1440
Sample feature shape: (40,)
Sample label: 0


In [35]:

print(os.listdir(data_dir))


['audio_speech_actors_01-24', 'Actor_23', 'Actor_24', 'Actor_22', 'Actor_21', 'Actor_18', 'Actor_19', 'Actor_20', 'Actor_17', 'Actor_16', 'Actor_14', 'Actor_12', 'Actor_13', 'Actor_15', 'Actor_11', 'Actor_10', 'Actor_05', 'Actor_09', 'Actor_08', 'Actor_07', 'Actor_06', 'Actor_02', 'Actor_04', 'Actor_01', 'Actor_03']


In [55]:
def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, sr=None, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        return mfccs_scaled
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")
        return None


In [56]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Bidirectional

def build_cnn_rnn_model(input_shape):
    model = Sequential([
        # Expand dims to add channel dimension for Conv1D
        Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),

        Conv1D(128, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),

        Conv1D(256, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),

        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),

        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define input shape (features dimension, 1 channel)
input_shape = (40, 1)
num_classes = 8
model = build_cnn_rnn_model(input_shape)


In [57]:
X = np.expand_dims(X, axis=-1)  # Add channel dimension


In [58]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1
)


Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.1324 - loss: 5.1981 - val_accuracy: 0.2639 - val_loss: 1.9854
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2046 - loss: 2.0621 - val_accuracy: 0.2222 - val_loss: 1.9608
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2313 - loss: 1.9970 - val_accuracy: 0.2153 - val_loss: 1.9385
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2032 - loss: 1.9842 - val_accuracy: 0.2465 - val_loss: 1.9216
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2315 - loss: 1.9733 - val_accuracy: 0.2708 - val_loss: 1.9287
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2253 - loss: 1.9442 - val_accuracy: 0.2778 - val_loss: 1.8982
Epoch 7/50
[1m36/36[0m [32m━━━━━━━━━

In [59]:
# Define emotion labels
emotion_labels = [
    'Anger',    # Index 0
    'Disgust',  # Index 1
    'Fear',     # Index 2
    'Happiness',# Index 3
    'Sadness',  # Index 4
    'Surprise', # Index 5
    'Neutral',  # Index 6
    'Other'     # Index 7
]

def get_emotion_label(predicted_index):
    return emotion_labels[predicted_index]

# Function to extract features
def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, sr=None, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        return mfccs_scaled
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")
        return None



In [60]:
def predict_emotion(file_name, model):
    features = extract_features(file_name)
    if features is not None:
        processed_features = np.expand_dims(features, axis=0)  # Add batch dimension
        processed_features = np.expand_dims(processed_features, axis=-1)  # Add channel dimension
        predictions = model.predict(processed_features)
        predicted_index = np.argmax(predictions, axis=1)[0]  # Get the index of the highest probability

        # Map the predicted index to an emotion label
        predicted_emotion = get_emotion_label(predicted_index)
        return predicted_emotion
    else:
        return None

# Example usage
file_path = '/content/drive/MyDrive/RAVIDASS/Actor_04/03-01-03-02-01-02-04.wav'
predicted_emotion = predict_emotion(file_path, model)
print(f"Predicted Emotion: {predicted_emotion}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step
Predicted Emotion: Fear
