In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import librosa
import numpy as np

In [2]:
# Model definitions: Conformer-style encoder block and the classifier built from it

class ConformerBlock(nn.Module):
    """Encoder block made of a convolution, a self-attention and a feed-forward stage.

    Every stage adds its result to the stage's own input and then applies a
    LayerNorm to the sum. Input and output both have the layout
    ``(batch, time, hidden_dim)``.

    Args:
        hidden_dim: Feature width used by every layer in the block. It has to
            be divisible by 4, the number of attention heads.
        dropout: Dropout probability used inside the feed-forward stack and on
            the attention / feed-forward outputs before they are added back.
    """

    def __init__(self, hidden_dim, dropout=0.3):
        super(ConformerBlock, self).__init__()
        # kernel_size=3 with padding=1 keeps the time length unchanged.
        self.conv1 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads=4, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 4, hidden_dim)
        )
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.norm3 = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the three stages on ``x`` of shape ``(batch, time, hidden_dim)``.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Conv block: Conv1d expects (batch, channels, time), so the time and
        # feature axes are swapped before the convolutions and swapped back after.
        residual = x
        x = x.transpose(1, 2)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.transpose(1, 2)
        x = self.norm1(x + residual)

        # Self-attention block: queries, keys and values are all the same
        # sequence; the attention weights are not needed and are discarded.
        attn_output, _ = self.attention(x, x, x)
        x = self.norm2(x + self.dropout(attn_output))

        # Feedforward block
        ff_output = self.ffn(x)
        x = self.norm3(x + self.dropout(ff_output))

        return x

class ConformerModel(nn.Module):
    """Sequence classifier: linear projection, a stack of ConformerBlocks, mean pooling, linear head.

    Args:
        input_dim: Number of features per time step in the input.
        num_classes: Size of the output layer.
        num_blocks: How many ConformerBlock layers to stack.
        hidden_dim: Feature width used after the input projection.
        dropout: Dropout probability forwarded to every ConformerBlock.
    """

    def __init__(self, input_dim, num_classes, num_blocks=8, hidden_dim=384, dropout=0.3):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, hidden_dim)

        encoder_layers = []
        for _ in range(num_blocks):
            encoder_layers.append(ConformerBlock(hidden_dim, dropout))
        self.conformer_blocks = nn.ModuleList(encoder_layers)

        self.global_norm = nn.LayerNorm(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, input_dim, time)`` to ``(batch, num_classes)`` scores.

        The returned values are the raw outputs of the final linear layer; no
        softmax is applied here.
        """
        # Move features to the last axis -> (batch, time, input_dim), then project.
        features = self.input_proj(x.transpose(1, 2))

        for layer in self.conformer_blocks:
            features = layer(features)

        # Normalize, then average over the time axis to get one vector per item.
        pooled = self.global_norm(features).mean(dim=1)
        return self.fc_out(pooled)

In [3]:
# input_dim has to equal the n_mels used by preprocess_audio (512 by default),
# and num_classes has to equal the number of entries in label_map.
input_dim = 512
num_classes = 5

# Build the network and restore the trained weights on CPU.
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot run arbitrary code while it is being loaded.
model = ConformerModel(input_dim=input_dim, num_classes=num_classes)
model.load_state_dict(
    torch.load(
        "best_model_Conformer_Crema.pth",
        map_location=torch.device("cpu"),
        weights_only=True,
    )
)
# Switch to inference mode (disables dropout); as the last expression of the
# cell this also displays the model summary.
model.eval()

ConformerModel(
  (input_proj): Linear(in_features=512, out_features=384, bias=True)
  (conformer_blocks): ModuleList(
    (0-7): 8 x ConformerBlock(
      (conv1): Conv1d(384, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
      )
      (ffn): Sequential(
        (0): Linear(in_features=384, out_features=1536, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.3, inplace=False)
        (3): Linear(in_features=1536, out_features=384, bias=True)
      )
      (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
  )
  (global_norm): LayerNorm((384,), eps=1e-05, elementwise

In [4]:
def preprocess_audio(file_path, sample_rate=16000, n_mels=512):
    """Load an audio file and convert it to a normalized log-mel spectrogram tensor.

    The file is read with torchaudio, resampled to ``sample_rate`` when its
    native rate differs, averaged across channels, converted to a mel power
    spectrogram with librosa, expressed in dB relative to its peak, and finally
    standardized to zero mean and unit variance over the whole spectrogram.

    Args:
        file_path: Path of the audio file to read.
        sample_rate: Sampling rate the waveform is brought to before analysis.
        n_mels: Number of mel bands requested from librosa.

    Returns:
        A float32 ``torch.Tensor`` with a leading batch axis of size 1. Going by
        librosa's documented output layout this is ``(1, n_mels, frames)``.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    waveform, sr = torchaudio.load(file_path)
    if waveform.numel() == 0:
        # Fail early with a clear message instead of an obscure error further down.
        raise ValueError(f"No audio samples found in {file_path!r}")
    if sr != sample_rate:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)(waveform)

    # Collapse all channels into a single one and hand the samples to librosa.
    waveform = waveform.mean(dim=0)
    waveform = waveform.numpy()

    # Compute mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalization; the small epsilon avoids dividing by zero for a constant spectrogram.
    mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-6)

    # Pin the dtype to float32 so it always matches the model's weights,
    # whatever dtype the numpy pipeline produced.
    return torch.tensor(mel_spec_db, dtype=torch.float32).unsqueeze(0)

In [5]:
def predict_emotion(audio_tensor, net=None):
    """Return the index of the highest-scoring class for one input.

    Args:
        audio_tensor: Input passed unchanged to the network. It must hold a
            batch of exactly one item, because the result is converted to a
            Python scalar with ``.item()``.
        net: Optional callable/model to use. When omitted, the notebook-level
            ``model`` is used, which is the original behaviour.

    Returns:
        The predicted class index as a Python ``int``.
    """
    # Fall back to the global model only when no explicit network is supplied,
    # so the function can also be used without relying on notebook state.
    network = model if net is None else net
    with torch.no_grad():
        output = network(audio_tensor)
        # argmax over dim=1, i.e. across the class scores of each batch item.
        predicted_class = output.argmax(dim=1).item()
    return predicted_class

In [6]:
# Class index -> emotion name. Assumed to follow the label encoding used when
# the checkpoint was trained -- TODO confirm against the training notebook.
label_map = {0: "Anger", 1: "Fear", 2: "Happy", 3: "Neutral", 4: "Sad"}

# Turn the clip into model features, then classify it.
audio_path = "03-01-05-01-02-02-18.wav"
audio_tensor = preprocess_audio(audio_path)
emotion_label = predict_emotion(audio_tensor)

# Report the raw class index first, then its readable name.
print(f"Predicted Emotion Class: {emotion_label}")
print(f"Predicted Emotion: {label_map.get(emotion_label, 'Unknown')}")

Predicted Emotion Class: 0
Predicted Emotion: Anger


In [11]:
# Class index -> emotion name. Assumed to follow the label encoding used when
# the checkpoint was trained -- TODO confirm against the training notebook.
label_map = {0: "Anger", 1: "Fear", 2: "Happy", 3: "Neutral", 4: "Sad"}

# Same pipeline as the previous inference cell, run on a different clip.
audio_path = "03-01-01-01-01-01-04.wav"
audio_tensor = preprocess_audio(audio_path)
emotion_label = predict_emotion(audio_tensor)

# Report the raw class index first, then its readable name.
print(f"Predicted Emotion Class: {emotion_label}")
print(f"Predicted Emotion: {label_map.get(emotion_label, 'Unknown')}")

Predicted Emotion Class: 1
Predicted Emotion: Fear
