In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import librosa
import os
# Set the path to the directory containing the audio files
audio_dir = "/Users/apple/Desktop/speech/Audio_Speech_Actors_01-24/Actor_01"  # Update with the path to your audio files directory

# Set the maximum length for padding/truncating the audio features
max_length = 500  # Update with your desired maximum length

# Create empty lists to store the features and labels
features = []
labels = []

# Loop through each audio file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        # Load the audio file
        file_path = os.path.join(audio_dir, filename)
        audio, sr = librosa.load(file_path, sr=None)
        
        # Extract audio features (e.g., Mel-frequency cepstral coefficients - MFCCs)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        
        # Pad or truncate the audio features to the maximum length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        # Flatten the MFCCs to obtain a feature vector
        audio_features = mfccs.flatten()
        
        # Append the features and labels to the respective lists
        features.append(audio_features)
        
        # Extract the emotion label from the file name
        label = filename.split("_")[0]  # Assumes the file name follows a specific format
        labels.append(label)

# Convert the lists to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Preprocess the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Scale the training features
X_test_scaled = scaler.transform(X_test)  # Scale the testing features

# Train the machine learning model
model = SVC(kernel='linear')  # Choose an appropriate algorithm
model.fit(X_train_scaled, y_train)  # Train the model

# Evaluate the model
train_accuracy = model.score(X_train_scaled, y_train)  # Evaluate training accuracy
test_accuracy = model.score(X_test_scaled, y_test)  # Evaluate testing accuracy
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Save the trained model
# Assuming you want to save the model as a file 'emotion_detection_model.pkl'
# import pickle
# with open("emotion_detection_model.pkl", "wb") as file:
#     pickle.dump(model, file)


In [1]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Define the path to the directory containing the audio files
audio_dir = "/Volumes/Time Machine Backups/Desktop/speech/Audio_Speech_Actors_01-24/all"

# Define the maximum length for padding/truncating the audio features
max_length = 500

# Create empty lists to store the features and labels
features = []
labels = []

# Emotions in the dataset
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

observed_emotions = ["calm", "happy", "fearful", "disgust"]

# Loop through each audio file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        # Load the audio file
        file_path = os.path.join(audio_dir, filename)
        audio, sr = librosa.load(file_path, sr=None)
        
        # Extract audio features (MFCCs)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40, n_fft=1024)
        
        # Pad or truncate the audio features to the maximum length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        # Flatten the MFCCs to obtain a feature vector
        audio_features = mfccs.flatten()
        
        # Extract the emotion label from the file name
        parts = filename.split("-")
        if len(parts) < 3:
            print(f"Skipping file with invalid name: {filename}")
            continue
        emotion = emotions.get(parts[2], "unknown")
        
        if emotion not in observed_emotions:
            continue
        
        # Append the features and labels to the respective lists
        features.append(audio_features)
        labels.append(emotion)

# Convert the lists to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=9)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the machine learning model (SVM)
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

# Evaluate the model
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# # Save the trained model if needed
# import pickle
# with open("emotion_detection_model_new_4_emo.pkl", "wb") as file:
#    pickle.dump(model, file)



Training Accuracy: 1.0
Testing Accuracy: 0.8020833333333334


In [1]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Define the path to the directory containing the audio files
audio_dir = "/Volumes/Time Machine Backups/Desktop/speech/Audio_Speech_Actors_01-24/all"

# Define the maximum length for padding/truncating the audio features
max_length = 500

# Create empty lists to store the features and labels
features = []
labels = []

# Emotions in the dataset
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

observed_emotions = list(emotions.values())  # Use all emotions from the dictionary

# Loop through each audio file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        # Load the audio file
        file_path = os.path.join(audio_dir, filename)
        audio, sr = librosa.load(file_path, sr=None)
        
        # Extract audio features (MFCCs)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40, n_fft=1024)
        
        # Pad or truncate the audio features to the maximum length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        # Flatten the MFCCs to obtain a feature vector
        audio_features = mfccs.flatten()
        
        # Extract the emotion label from the file name
        parts = filename.split("-")
        if len(parts) < 3:
            print(f"Skipping file with invalid name: {filename}")
            continue
        emotion = emotions.get(parts[2], "unknown")
        
        if emotion not in observed_emotions:
            continue
        
        # Append the features and labels to the respective lists
        features.append(audio_features)
        labels.append(emotion)

# Convert the lists to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=9)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the machine learning model (SVM)
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

# Evaluate the model
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# # Save the trained model if needed
import pickle
with open("emotion_detection_model_new_all_emo.pkl", "wb") as file:
     pickle.dump(model, file)



Training Accuracy: 1.0
Testing Accuracy: 0.6194444444444445


In [1]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define the path to the directory containing the audio files
audio_dir = "/Volumes/Time Machine Backups/Desktop/speech/Audio_Speech_Actors_01-24/all"

# Define the maximum length for padding/truncating the audio features
max_length = 500

# Create empty lists to store the features and labels
features = []
labels = []

# Emotions in the dataset
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

observed_emotions = ["calm", "happy", "fearful", "disgust"]

# Loop through each audio file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        # Load the audio file
        file_path = os.path.join(audio_dir, filename)
        audio, sr = librosa.load(file_path, sr=None)
        
        # Extract audio features (MFCCs)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40, n_fft=1024)
        
        # Pad or truncate the audio features to the maximum length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        # Flatten the MFCCs to obtain a feature vector
        audio_features = mfccs.flatten()
        
        # Extract the emotion label from the file name
        parts = filename.split("-")
        if len(parts) < 3:
            print(f"Skipping file with invalid name: {filename}")
            continue
        emotion = emotions.get(parts[2], "unknown")
        
        if emotion not in observed_emotions:
            continue
        
        # Append the features and labels to the respective lists
        features.append(audio_features)
        labels.append(emotion)

# Convert the lists to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=9)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Random Forest model
model = RandomForestClassifier()

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Evaluate the best model
train_accuracy = best_model.score(X_train_scaled, y_train)
test_accuracy = best_model.score(X_test_scaled, y_test)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Predictions and classification report
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


Training Accuracy: 1.0
Testing Accuracy: 0.7083333333333334
              precision    recall  f1-score   support

        calm       0.75      0.85      0.80        54
     disgust       0.70      0.70      0.70        44
     fearful       0.76      0.60      0.67        53
       happy       0.60      0.66      0.63        41

    accuracy                           0.71       192
   macro avg       0.71      0.70      0.70       192
weighted avg       0.71      0.71      0.71       192



In [2]:
import pickle
with open("emotion_detection_model_new_4.70_emo.pkl", "wb") as file:
     pickle.dump(model, file)

In [1]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define the path to the directory containing the audio files
audio_dir = "/Volumes/Time Machine Backups/Desktop/speech/Audio_Speech_Actors_01-24/all"

# Define the maximum length for padding/truncating the audio features
max_length = 500

# Create empty lists to store the features and labels
features = []
labels = []

# Emotions in the dataset
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# Loop through each audio file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        # Load the audio file
        file_path = os.path.join(audio_dir, filename)
        audio, sr = librosa.load(file_path, sr=None)
        
        # Extract audio features (MFCCs)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40, n_fft=1024)
        
        # Pad or truncate the audio features to the maximum length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        # Flatten the MFCCs to obtain a feature vector
        audio_features = mfccs.flatten()
        
        # Extract the emotion label from the file name
        parts = filename.split("-")
        if len(parts) < 3:
            print(f"Skipping file with invalid name: {filename}")
            continue
        emotion = emotions.get(parts[2], "unknown")
        
        # Append the features and labels to the respective lists
        features.append(audio_features)
        labels.append(emotion)

# Convert the lists to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=9)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Random Forest model
model = RandomForestClassifier()

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Evaluate the best model
train_accuracy = best_model.score(X_train_scaled, y_train)
test_accuracy = best_model.score(X_test_scaled, y_test)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Predictions and classification report
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


Training Accuracy: 1.0
Testing Accuracy: 0.49444444444444446
              precision    recall  f1-score   support

       angry       0.69      0.55      0.61        53
        calm       0.54      0.75      0.63        53
     disgust       0.52      0.58      0.55        52
     fearful       0.49      0.45      0.47        42
       happy       0.42      0.27      0.33        51
     neutral       0.29      0.26      0.28        19
         sad       0.26      0.20      0.23        44
   surprised       0.52      0.70      0.59        46

    accuracy                           0.49       360
   macro avg       0.47      0.47      0.46       360
weighted avg       0.49      0.49      0.48       360



In [2]:
import pickle
with open("emotion_detection_model_new_all.49_emo.pkl", "wb") as file:
     pickle.dump(model, file)

In [4]:
import os
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define the path to the directory containing the audio files
audio_dir = "/Volumes/Time Machine Backups/Desktop/speech/Audio_Speech_Actors_01-24/all"

# Define the maximum length for padding/truncating the audio features
max_length = 500

# Create empty lists to store the features and labels
features = []
labels = []

# Emotions in the dataset
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# Loop through each audio file in the directory
for filename in os.listdir(audio_dir):
    if filename.endswith(".wav"):
        # Load the audio file
        file_path = os.path.join(audio_dir, filename)
        audio, sr = librosa.load(file_path, sr=None)
        
        # Extract audio features (MFCCs)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40, n_fft=1024)
        
        # Pad or truncate the audio features to the maximum length
        if mfccs.shape[1] < max_length:
            pad_width = max_length - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='constant')
        else:
            mfccs = mfccs[:, :max_length]
        
        # Append the features and labels to the respective lists
        features.append(mfccs)
        
        # Extract the emotion label from the file name
        parts = filename.split("-")
        if len(parts) < 3:
            print(f"Skipping file with invalid name: {filename}")
            continue
        emotion = emotions.get(parts[2], "unknown")
        
        labels.append(emotion)

# Convert the lists to NumPy arrays
features = np.array(features)
labels = np.array(labels)

# Convert emotion labels to integers
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels_encoded, test_size=0.25, random_state=9)

# Reshape features for CNN input
X_train = X_train[..., np.newaxis]  # Add channel dimension
X_test = X_test[..., np.newaxis]

# Convert labels to one-hot encoding
y_train_onehot = to_categorical(y_train)
y_test_onehot = to_categorical(y_test)

# Data Augmentation
datagen = ImageDataGenerator(
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    shear_range=0.1,
    fill_mode='nearest'
)

# Initialize the CNN model
model = models.Sequential([
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(40, max_length, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(8, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model with data augmentation
history = model.fit(datagen.flow(X_train, y_train_onehot, batch_size=32),
                    steps_per_epoch=len(X_train) / 32, epochs=50,
                    validation_data=(X_test, y_test_onehot),
                    callbacks=[callbacks.EarlyStopping(patience=5)])

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test_onehot)
print("Testing Loss:", test_loss)
print("Testing Accuracy:", test_acc)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Testing Loss: 1.9982143640518188
Testing Accuracy: 0.21944443881511688
