In [None]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Data Augmentation: Add White Noise
def augment_data(data, sr=None, ratio=0.005):
    """Return ``data`` with low-amplitude white Gaussian noise added.

    The noise is scaled by ``ratio * max(data) * u`` where ``u`` is a single
    draw from a uniform distribution on [0, 1), so every call uses a different
    (small) noise level.

    Args:
        data: Array-like of audio samples.
        sr: Sampling rate. Not used by the computation; it is accepted so that
            the ``augment_data(audio, sr=sr)`` calls made by the dataset
            loaders in this notebook do not raise ``TypeError``.
        ratio: Upper bound of the noise scale, relative to the signal maximum.

    Returns:
        A new array with the same shape as ``data``; the input is not modified.
    """
    data = np.asarray(data)
    if data.size == 0:
        # np.amax raises on empty input; there is nothing to perturb anyway.
        return data.copy()

    max_val = np.amax(data)
    r_uniform = np.random.uniform()
    noise_factor = ratio * max_val * r_uniform
    noise = np.random.randn(*data.shape)
    return data + noise_factor * noise

# Log-Mel Spectrogram Extraction
def extract_log_mel_spectrogram(audio, sr=22050, n_mels=128):
    """Compute the mel spectrogram of ``audio`` and convert it to decibels.

    Args:
        audio: Audio samples passed to ``librosa.feature.melspectrogram`` as ``y``.
        sr: Sampling rate forwarded to librosa.
        n_mels: Number of mel bands forwarded to librosa.

    Returns:
        The result of ``librosa.power_to_db`` on the mel spectrogram, using the
        spectrogram's maximum (``ref=np.max``) as the reference power.
    """
    return librosa.power_to_db(
        librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels),
        ref=np.max,
    )


# CNN+LSTM Model
class CNNLSTM(nn.Module):
    """CNN + LSTM classifier for log-mel spectrograms.

    Four local-feature-learning blocks (Conv2d -> BatchNorm2d -> ELU ->
    MaxPool2d -> Dropout) are followed by a 2-layer LSTM that runs over the
    *time* axis of the CNN feature map; the LSTM output at the last time step
    is projected to 8 class scores.

    Input:  ``(batch, 1, n_mels, frames)`` -- the layout produced by
            ``AudioDataset`` in this notebook.
    Output: ``(batch, 8)`` raw, unnormalised logits.

    ``forward`` deliberately returns logits: ``nn.CrossEntropyLoss`` (used by
    ``train_model``) applies log-softmax itself, so applying Softmax here as
    well would squash the gradients. ``argmax`` over logits equals ``argmax``
    over probabilities, so class predictions are unaffected. Use
    ``predict_proba`` when probabilities are needed.
    """

    def __init__(self):
        super(CNNLSTM, self).__init__()

        # LFLB Block 1
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.elu1 = nn.ELU()
        self.max_pool1 = nn.MaxPool2d(kernel_size=2)
        self.dropout1 = nn.Dropout(0.3)

        # LFLB Block 2
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(128)
        self.elu2 = nn.ELU()
        self.max_pool2 = nn.MaxPool2d(kernel_size=2)
        self.dropout2 = nn.Dropout(0.3)

        # LFLB Block 3
        self.conv3 = nn.Conv2d(128, 256, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(256)
        self.elu3 = nn.ELU()
        self.max_pool3 = nn.MaxPool2d(kernel_size=2)
        self.dropout3 = nn.Dropout(0.3)

        # LFLB Block 4
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3)
        self.bn4 = nn.BatchNorm2d(512)
        self.elu4 = nn.ELU()
        self.max_pool4 = nn.MaxPool2d(kernel_size=2)
        self.dropout4 = nn.Dropout(0.3)

        # LSTM and FC Layers
        self.lstm = nn.LSTM(input_size=512, hidden_size=128, num_layers=2, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(128, 8)  # 8 classes for emotions
        # Not applied in forward(); kept as an attribute and used by predict_proba().
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _lflb(x, conv, bn, act, pool, drop):
        """Apply one Conv -> BatchNorm -> ELU -> MaxPool -> Dropout block."""
        return drop(pool(act(bn(conv(x)))))

    def forward(self, x):
        """Return class logits of shape ``(batch, 8)`` for a spectrogram batch."""
        x = self._lflb(x, self.conv1, self.bn1, self.elu1, self.max_pool1, self.dropout1)
        x = self._lflb(x, self.conv2, self.bn2, self.elu2, self.max_pool2, self.dropout2)
        x = self._lflb(x, self.conv3, self.bn3, self.elu3, self.max_pool3, self.dropout3)
        x = self._lflb(x, self.conv4, self.bn4, self.elu4, self.max_pool4, self.dropout4)

        # x is (batch, 512, freq', time'). Average over the pooled frequency
        # axis so each remaining time step is a 512-dim vector matching the
        # LSTM input size. The previous reshape + [:, :, :512] slice kept only
        # the first time frame and iterated the LSTM over frequency rows.
        x = x.mean(dim=2)                       # (batch, 512, time')
        x = x.permute(0, 2, 1).contiguous()     # (batch, time', 512)

        x, _ = self.lstm(x)
        return self.fc(x[:, -1, :])  # logits from the last time step

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits from ``forward``)."""
        return self.softmax(self.forward(x))

# Function to load RAVDESS dataset from subdirectories
def load_ravdess_data(data_dir, n_mels=128, augment=False, max_length=1440):
    """Load RAVDESS ``.wav`` files as fixed-width log-mel spectrograms.

    ``data_dir`` is expected to contain one sub-directory per actor. The label
    is the third dash-separated field of the file name minus one (so it is
    zero-based).

    Files whose name cannot be parsed are reported and skipped *before* any
    audio is loaded, which keeps the returned lists aligned. Non-directory
    entries in ``data_dir`` are ignored, and directory listings are sorted so
    the sample order is reproducible.

    Args:
        data_dir: Root directory holding the actor sub-directories.
        n_mels: Number of mel bands.
        augment: If true, add white noise to each clip via ``augment_data``.
        max_length: Number of time frames every spectrogram is truncated or
            right-padded to.

    Returns:
        ``(audio_files, labels)``: a list of ``(n_mels, max_length)`` arrays
        and a list of integer labels of the same length.
    """
    audio_files = []
    labels = []

    for actor_name in sorted(os.listdir(data_dir)):
        actor_path = os.path.join(data_dir, actor_name)
        if not os.path.isdir(actor_path):
            continue  # stray file next to the actor folders

        for file in sorted(os.listdir(actor_path)):
            if not file.endswith('.wav'):
                continue

            # Parse the label first: a sample is only stored once both its
            # label and its spectrogram are available.
            try:
                label = int(file.split('-')[2]) - 1
            except (IndexError, ValueError):
                print(f"Issue with file: {file}. Check filename structure.")
                continue

            audio, sr = librosa.load(os.path.join(actor_path, file))
            if augment:
                audio = augment_data(audio)

            S_DB = extract_log_mel_spectrogram(audio, sr=sr, n_mels=n_mels)

            # Truncate or right-pad along the time axis to a fixed width.
            # NOTE(review): padding uses 0, which in a dB-relative-to-peak
            # scale is the loudest value rather than silence -- confirm intent.
            n_frames = S_DB.shape[1]
            if n_frames > max_length:
                S_DB = S_DB[:, :max_length]
            else:
                S_DB = np.pad(S_DB, ((0, 0), (0, max_length - n_frames)), mode='constant')

            audio_files.append(S_DB)
            labels.append(label)

    print("Loaded RAVDESS audio files:", len(audio_files))
    print("Loaded RAVDESS labels:", len(labels))

    return audio_files, labels


def load_savee_data(data_dir, n_mels=128, augment=False, max_length=1440):
    """Load SAVEE ``.wav`` files as fixed-width log-mel spectrograms.

    Files are read from ``<data_dir>/ALL`` and are expected to be named
    ``<speaker>_<emotion code><number>.wav``. The emotion code is the full
    alphabetic prefix of the second underscore-separated field, so the
    two-letter codes ``sa`` and ``su`` are recognised (taking only the first
    character silently dropped every sad and surprise clip).

    Labels use the same index space as the RAVDESS loader and the
    ``emotion_labels`` list used for decoding predictions
    (0 neutral, 1 calm, 2 happy, 3 sad, 4 angry, 5 fearful, 6 disgust,
    7 surprised), because both datasets are concatenated for training.
    SAVEE has no "calm" class, so index 1 is never produced here.

    Args:
        data_dir: Directory containing the ``ALL`` folder.
        n_mels: Number of mel bands.
        augment: If true, add white noise to each clip via ``augment_data``.
        max_length: Number of time frames every spectrogram is truncated or
            right-padded to.

    Returns:
        ``(audio_files, labels)``: a list of ``(n_mels, max_length)`` arrays
        and a list of integer labels of the same length.
    """
    audio_files = []
    labels = []

    emotion_dict = {
        'n': 0,   # neutral
        'h': 2,   # happy
        'sa': 3,  # sad
        'a': 4,   # angry
        'f': 5,   # fear
        'd': 6,   # disgust
        'su': 7,  # surprise
    }

    all_files_path = os.path.join(data_dir, 'ALL')
    for file in sorted(os.listdir(all_files_path)):
        if not file.endswith('.wav'):
            continue

        name_parts = file.split('_')
        if len(name_parts) < 2:
            print(f"Issue with file: {file}. Check filename structure.")
            continue

        # "sa01.wav" -> "sa01" -> "sa"
        emotion_key = os.path.splitext(name_parts[1])[0].rstrip('0123456789')
        if emotion_key not in emotion_dict:
            continue
        emotion_label = emotion_dict[emotion_key]

        audio, sr = librosa.load(os.path.join(all_files_path, file))
        if augment:
            audio = augment_data(audio)

        S_DB = extract_log_mel_spectrogram(audio, sr=sr, n_mels=n_mels)

        # Truncate or right-pad along the time axis to a fixed width.
        n_frames = S_DB.shape[1]
        if n_frames > max_length:
            S_DB = S_DB[:, :max_length]
        else:
            S_DB = np.pad(S_DB, ((0, 0), (0, max_length - n_frames)), mode='constant')

        audio_files.append(S_DB)
        labels.append(emotion_label)

    print("Loaded SAVEE audio files:", len(audio_files))
    print("Loaded SAVEE labels:", len(labels))

    return audio_files, labels


def load_data(data_dir, dataset="RAVDESS", n_mels=128, augment=False):
    """Dispatch to the loader for the requested dataset.

    Args:
        data_dir: Dataset root directory, forwarded to the loader.
        dataset: Either ``"RAVDESS"`` or ``"SAVEE"``.
        n_mels: Number of mel bands, forwarded to the loader.
        augment: Augmentation flag, forwarded to the loader.

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If ``dataset`` is neither ``"RAVDESS"`` nor ``"SAVEE"``.
    """
    # Guard first so the happy path below is a single call.
    if dataset != "RAVDESS" and dataset != "SAVEE":
        raise ValueError("Unsupported dataset. Please choose either 'RAVDESS' or 'SAVEE'.")

    loader = load_ravdess_data if dataset == "RAVDESS" else load_savee_data
    return loader(data_dir, n_mels=n_mels, augment=augment)


# Custom Dataset Class
class AudioDataset(Dataset):
    """Map-style dataset pairing spectrogram arrays with integer labels.

    Each item is returned as ``(features, target)`` where ``features`` is a
    float tensor with a leading channel dimension added and ``target`` is a
    one-element long tensor.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        features = torch.FloatTensor(sample)
        # unsqueeze(0) inserts the channel axis expected by Conv2d.
        return features.unsqueeze(0), torch.LongTensor([target])

# Split data into train and validation sets
def split_data(X, y, test_size=0.2):
    """Split features and labels into train and validation parts.

    Delegates to ``train_test_split`` with a fixed ``random_state`` of 42.

    Returns:
        ``(X_train, X_val, y_train, y_val)`` as a tuple.
    """
    split_options = {"test_size": test_size, "random_state": 42}
    train_X, val_X, train_y, val_y = train_test_split(X, y, **split_options)
    return (train_X, val_X, train_y, val_y)

# Train Model Function
def train_model(model, train_loader, val_loader, device, num_epochs=30, learning_rate=0.001, label_smoothing=0.1):
    """Train ``model`` and evaluate it on the validation loader after every epoch.

    Uses cross-entropy with label smoothing, Adam, and a ReduceLROnPlateau
    scheduler stepped on the validation loss. Labels are expected with a
    trailing singleton dimension, which is squeezed away per batch.

    Args:
        model: Network to optimise (updated in place).
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        device: Device the batches are moved to.
        num_epochs: Number of passes over the training loader.
        learning_rate: Initial Adam learning rate.
        label_smoothing: Smoothing factor passed to ``nn.CrossEntropyLoss``.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``, four
        lists with one entry per epoch.
    """
    loss_fn = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    adam = optim.Adam(model.parameters(), lr=learning_rate)
    plateau = ReduceLROnPlateau(adam, mode='min', patience=5, factor=0.5, min_lr=1e-5)

    def run_pass(loader, update, desc=None):
        """Run one pass over ``loader``; return (mean loss, accuracy)."""
        batches = tqdm(loader, desc=desc, unit='batch', ncols=80) if update else loader
        loss_sum = 0.0
        hits = 0
        seen = 0

        for features, targets in batches:
            features, targets = features.to(device), targets.to(device).squeeze(1)
            if update:
                adam.zero_grad()
            scores = model(features)
            batch_loss = loss_fn(scores, targets)
            if update:
                batch_loss.backward()
                adam.step()

            # Weight by batch size so the final division yields a per-sample mean.
            loss_sum += batch_loss.item() * features.size(0)
            guesses = torch.max(scores, 1)[1]
            seen += targets.size(0)
            hits += (guesses == targets).sum().item()

        return loss_sum / len(loader.dataset), hits / seen

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    for epoch_no in range(1, num_epochs + 1):
        model.train()
        train_loss, train_acc = run_pass(train_loader, True, f'Epoch {epoch_no}/{num_epochs}')
        train_losses.append(train_loss)
        train_accuracies.append(train_acc)

        model.eval()
        with torch.no_grad():
            val_loss, val_acc = run_pass(val_loader, False)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        print(f'Epoch {epoch_no}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        plateau.step(val_loss)  # lower the learning rate when validation loss stalls

    return train_losses, val_losses, train_accuracies, val_accuracies


# SFS-guided WOA Optimization Function
def sfs_guided_woa(maxiter, n, w1_range=(0, 0.5), w2_range=(0, 1), w3_range=(0, 1), fitness_fn=None):
    """Search a 2-D unit box with a guided Whale Optimization style update.

    A population of ``n`` candidate vectors is updated for ``maxiter``
    generations; the candidate with the highest fitness seen so far is
    returned.

    Args:
        maxiter: Number of generations.
        n: Population size. Must be at least 3 because the guided update
            samples three distinct individuals.
        w1_range, w2_range, w3_range: ``(low, high)`` bounds from which the
            guided-update weights are redrawn every generation.
        fitness_fn: Callable mapping a candidate vector to a scalar to be
            maximised. Defaults to ``np.sum``, the placeholder objective this
            function has always used.

    Returns:
        A NumPy array of length 2: an independent copy of the best candidate.

    Raises:
        ValueError: If ``n`` is smaller than 3.
    """
    if n < 3:
        raise ValueError("Population size n must be at least 3 to sample three distinct individuals.")
    if fitness_fn is None:
        fitness_fn = np.sum  # placeholder objective; supply a real one via fitness_fn

    # Initialize the population and evaluate it, so the starting "best" is a
    # real best rather than an arbitrary row chosen from all-zero fitness.
    population = np.random.random((n, 2))
    fitness = np.array([fitness_fn(candidate) for candidate in population], dtype=float)

    # WOA parameters
    a = 0.5
    A = 2
    C = 2 * np.random.random() - 1
    l = np.random.random()

    # Guided-WOA parameters
    w1 = np.random.uniform(*w1_range)
    w2 = np.random.uniform(*w2_range)
    w3 = np.random.uniform(*w3_range)

    # Copy the row: indexing a 2-D array with an integer returns a view, which
    # would be overwritten as soon as that individual is updated below.
    best_index = int(np.argmax(fitness))
    best_solution = population[best_index].copy()
    best_fitness = fitness[best_index]

    for t in range(maxiter):
        z = 1 - (t / maxiter) ** 2

        for i in range(n):
            r3 = np.random.random()

            if r3 < 0.5:
                if np.abs(A) < 1:
                    # Encircle the current best.
                    D = np.abs(C * best_solution - population[i])
                    population[i] = best_solution - A * D
                else:
                    # Guided exploration using three distinct random individuals.
                    rand_indices = np.random.choice(np.arange(n), size=3, replace=False)
                    Wrand1, Wrand2, Wrand3 = population[rand_indices]

                    population[i] = w1 * Wrand1 + z * w2 * (Wrand2 - Wrand3) + (1 - z) * w3 * (best_solution - Wrand1)
            else:
                # Spiral-style move around the best solution (formula kept as written).
                b = 1
                D = np.exp(b * l) * np.cos(2 * np.pi * l)
                population[i] = D * np.exp(a * l) * np.cos(2 * np.pi * l) + best_solution

            # Keep the candidate inside the unit box.
            population[i] = np.clip(population[i], 0, 1)

            # NOTE(review): applying a sigmoid after clipping to [0, 1] confines
            # every updated candidate to about [0.5, 0.73]; kept as-is -- confirm
            # whether a thresholding step was intended here.
            population[i] = 1 / (1 + np.exp(-population[i]))

            fitness[i] = fitness_fn(population[i])

        # Keep the best candidate seen so far, compared by its stored fitness.
        generation_best = int(np.argmax(fitness))
        if fitness[generation_best] > best_fitness:
            best_fitness = fitness[generation_best]
            best_solution = population[generation_best].copy()

        # Update WOA parameters (randomize for diversity)
        A = 2 * np.random.random() - 1
        C = 2 * np.random.random() - 1
        l = np.random.random()
        w1 = np.random.uniform(*w1_range)
        w2 = np.random.uniform(*w2_range)
        w3 = np.random.uniform(*w3_range)

    return best_solution

# Main function: load both datasets, build the data loaders, train the model, and plot the training curves
def main():
    """Load RAVDESS and SAVEE, train ``CNNLSTM`` on the combined data, and plot the curves.

    The dataset directories default to the notebook's original Windows paths
    and can be overridden with the ``RAVDESS_DIR`` and ``SAVEE_DIR``
    environment variables.

    Returns:
        The trained model, so later cells can reuse it for inference.

    Raises:
        RuntimeError: If neither dataset yields any sample.
    """
    # Parameters
    data_dir_ravdess = os.environ.get("RAVDESS_DIR", "C:\\Users\\User\\Downloads\\ravdess")
    data_dir_savee = os.environ.get("SAVEE_DIR", "C:\\Users\\User\\Downloads\\savee")
    num_epochs = 5

    # Load RAVDESS data
    X_ravdess, y_ravdess = load_data(data_dir_ravdess, dataset='RAVDESS')
    print()

    # Load SAVEE data
    X_savee, y_savee = load_data(data_dir_savee, dataset='SAVEE')
    print()

    # Combine data and labels. Joining the Python lists before stacking also
    # works when one of the datasets is empty, unlike concatenating a 3-D
    # array with an empty 1-D one.
    spectrograms = list(X_ravdess) + list(X_savee)
    targets = list(y_ravdess) + list(y_savee)
    if not spectrograms:
        raise RuntimeError("No audio files were loaded; check the dataset directories.")
    X = np.stack(spectrograms, axis=0)
    y = np.asarray(targets)

    print(f"Combined Data - Total Samples: {len(X)}")
    print(f"RAVDESS Data Samples: {len(X_ravdess)}")
    print(f"SAVEE Data Samples: {len(X_savee)}")

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = split_data(X, y, test_size=0.2)

    # Create DataLoader for training and validation sets
    train_loader = DataLoader(AudioDataset(X_train, y_train), batch_size=32, shuffle=True)
    val_loader = DataLoader(AudioDataset(X_val, y_val), batch_size=32, shuffle=False)

    # Initialize and train the model with the default hyperparameters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CNNLSTM().to(device)
    train_losses, val_losses, train_accuracies, val_accuracies = train_model(
        model, train_loader, val_loader, device,
        num_epochs=num_epochs,
        learning_rate=0.001,
        label_smoothing=0.1,
    )

    # Plot loss and accuracy curves side by side
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

    loss_ax.plot(train_losses, label='Training Loss')
    loss_ax.plot(val_losses, label='Validation Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.set_title('Training and Validation Losses')

    acc_ax.plot(train_accuracies, label='Training Accuracy')
    acc_ax.plot(val_accuracies, label='Validation Accuracy')
    acc_ax.set_xlabel('Epochs')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.legend()
    acc_ax.set_title('Training and Validation Accuracies')

    fig.tight_layout()
    plt.show()

    return model

# Run the whole pipeline when this cell/script is executed directly
# (the condition also holds inside a notebook kernel).
if __name__ == "__main__":
    main()

In [None]:
# NOTE: this constructs a brand-new CNNLSTM with randomly initialised weights.
# It is not the network that main() trained, so the prediction made below
# comes from an untrained model unless trained weights are loaded into it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNNLSTM().to(device)

# Inference helper: predict the emotion class index for a single audio file
def infer_emotion(model, audio_file, max_length=1440):
    """Predict the emotion class index for one audio file.

    The clip is preprocessed the same way the dataset loaders in this notebook
    prepare training samples: log-mel spectrogram, then truncation or
    zero right-padding to ``max_length`` frames. No additional scaling is
    applied, because the training data is not scaled either.

    Args:
        model: Trained classifier; it is switched to eval mode.
        audio_file: Path of the audio file to classify.
        max_length: Number of time frames the spectrogram is truncated or
            padded to; should match the value used when loading training data.

    Returns:
        The predicted class index as a Python ``int``.
    """
    # Load audio and extract log mel spectrogram
    audio, sr = librosa.load(audio_file)
    log_mel_spectrogram = extract_log_mel_spectrogram(audio, sr=sr)

    # Match the fixed width used for the training spectrograms.
    n_frames = log_mel_spectrogram.shape[1]
    if n_frames > max_length:
        log_mel_spectrogram = log_mel_spectrogram[:, :max_length]
    else:
        log_mel_spectrogram = np.pad(
            log_mel_spectrogram, ((0, 0), (0, max_length - n_frames)), mode='constant'
        )

    # Add batch and channel dimensions: (1, 1, n_mels, max_length)
    spectrogram_tensor = torch.tensor(log_mel_spectrogram).unsqueeze(0).unsqueeze(0).float()

    # Run on whatever device the model's parameters live on
    device = next(model.parameters()).device
    spectrogram_tensor = spectrogram_tensor.to(device)

    # Perform inference
    model.eval()
    with torch.no_grad():
        outputs = model(spectrogram_tensor)
        _, predicted = torch.max(outputs, 1)

    # Return predicted emotion index
    return predicted.item()


# Example usage of plotting functions and inference
# Classify one RAVDESS clip and print the decoded label.
# NOTE: the path is an absolute, machine-specific location; adjust it locally.
audio_file = 'C:\\Users\\User\\Downloads\\ravdess\\Actor_13\\03-01-08-01-02-02-13.wav'
emotion = infer_emotion(model, audio_file)
# Index -> name table used to decode the predicted class index.
emotion_labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
predicted_emotion_label = emotion_labels[emotion]
print(f'Predicted Emotion: {predicted_emotion_label}')

# Plot mel spectrogram
def plot_mel_spectrogram(audio_file):
    """Load ``audio_file`` and display its log-mel spectrogram.

    Args:
        audio_file: Path of the audio file to visualise.
    """
    # librosa.display is a submodule that a plain `import librosa` does not
    # load on every librosa version; import it explicitly before use.
    import librosa.display

    audio, sr = librosa.load(audio_file)
    log_mel_spectrogram = extract_log_mel_spectrogram(audio, sr=sr)

    fig, ax = plt.subplots(figsize=(10, 4))
    mesh = librosa.display.specshow(log_mel_spectrogram, sr=sr, x_axis='time', y_axis='mel', ax=ax)
    fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
    ax.set_title('Mel spectrogram')
    fig.tight_layout()
    plt.show()

# Visualise the same clip that was classified above.
plot_mel_spectrogram(audio_file)
