In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import torch
import torch.nn as nn
import numpy as np
import librosa

import opensmile
from transformers import Wav2Vec2Processor, Wav2Vec2Model



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ----- Audio feature-extraction functions -----
def extract_wav2vec_embedding(waveform, model, processor, num_layers=6, sampling_rate=16000):
    """Return the first ``num_layers`` hidden states of a wav2vec2 model for one utterance.

    Args:
        waveform: Mono audio samples in a form accepted by ``processor``.
        model: Model that is called with ``output_hidden_states=True`` and whose
            output exposes a ``hidden_states`` sequence; must have a ``device``
            attribute.
        processor: Callable that turns the waveform into an object with an
            ``input_values`` tensor.
        num_layers: How many leading entries of ``hidden_states`` to keep
            (default 6, matching the previous hard-coded value).
        sampling_rate: Sampling rate reported to the processor (default 16000,
            matching the previous hard-coded value).

    Returns:
        A tensor of shape (num_layers, T, D) stacked along a new leading axis.
        NOTE(review): in Hugging Face models index 0 of ``hidden_states`` is
        normally the embedding output rather than a transformer layer -- confirm
        that including it is intended.
    """
    inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    input_values = inputs.input_values.to(model.device)
    with torch.no_grad():
        hidden_states = model(input_values, output_hidden_states=True).hidden_states

    # Select the wanted layers *before* stacking so the discarded deeper layers
    # are never copied into the stacked tensor.
    selected = hidden_states[:num_layers]

    # squeeze(0) drops the batch axis; this assumes a single utterance per call.
    return torch.stack([layer.squeeze(0) for layer in selected], dim=0)  # (num_layers, T, D)

import opensmile
def extract_egemaps_feature(y, sr=16000, device=None):
    """Extract eGeMAPSv02 low-level descriptors and average adjacent frame pairs.

    Args:
        y: Audio signal passed to ``opensmile.Smile.process_signal``.
        sr: Sampling rate of ``y`` (default 16000).
        device: Target torch device for the result. When ``None`` the tensor is
            placed on CUDA if it is available and on the CPU otherwise, so the
            function no longer crashes on machines without a GPU.

    Returns:
        A float32 tensor of shape (n_frames // 2, n_features). Every two
        consecutive descriptor frames are averaged into one; a trailing unpaired
        frame is dropped. With fewer than two frames the result has zero rows.
        NOTE(review): the pairwise averaging presumably aligns the descriptor
        frame rate with the wav2vec2 frame rate -- confirm.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
    )
    frames = smile.process_signal(y, sampling_rate=sr).to_numpy().astype(np.float32)

    n_pairs = frames.shape[0] // 2
    if n_pairs > 0:
        # Keep an even number of frames, then average each consecutive pair.
        frames = frames[: 2 * n_pairs].reshape(n_pairs, 2, frames.shape[1]).mean(axis=1)
    else:
        # Zero or one frame: nothing to pair up, return an empty (0, n_features) array.
        frames = frames[:0]

    return torch.tensor(frames, dtype=torch.float32).to(device)

In [3]:

class FusionModel(nn.Module):
    """Classifier that fuses multi-layer wav2vec2 features with openSMILE descriptors.

    Each wav2vec2 layer gets its own pointwise (kernel_size=1) projection; the
    projected layers are concatenated and reduced, merged with the projected
    openSMILE stream, mean-pooled over time and classified.

    Args:
        w2v_dim: Channel size of each wav2vec2 layer.
        os_dim: Channel size of the openSMILE feature stream.
        hidden_dim: Channel size used by every internal projection.
        num_layers: Number of wav2vec2 layers the model expects.
        num_classes: Number of output classes.
        dropout: Dropout probability used in the projection stacks.
    """

    def __init__(self, w2v_dim=768, os_dim=25, hidden_dim=128, num_layers=6, num_classes=4, dropout=0.2):
        super().__init__()

        # 1. One independent pointwise projection per wav2vec2 layer
        #    (num_layers x (w2v_dim -> hidden_dim)).
        per_layer_blocks = []
        for _ in range(num_layers):
            per_layer_blocks.append(self._pointwise_block(w2v_dim, hidden_dim, dropout))
        self.w2v_layer_convs = nn.ModuleList(per_layer_blocks)

        # 2. Reduce the concatenated layers back to hidden_dim
        #    (num_layers * hidden_dim -> hidden_dim).
        self.w2v_reduce = self._pointwise_block(num_layers * hidden_dim, hidden_dim, dropout)

        # 3. openSMILE stream (os_dim -> hidden_dim).
        self.os_proj = self._pointwise_block(os_dim, hidden_dim, dropout)

        # 4. Fuse both streams with a pointwise conv, then classify.
        self.final_conv = nn.Sequential(
            nn.Conv1d(2 * hidden_dim, hidden_dim, kernel_size=1),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(hidden_dim, num_classes)

    @staticmethod
    def _pointwise_block(in_channels, out_channels, dropout):
        """Build a Conv1d(k=1) -> BatchNorm1d -> ReLU -> Dropout stack."""
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=1),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

    def forward(self, w2v_layers, os_input):
        """Compute class logits.

        Args:
            w2v_layers: Tensor of shape (B, num_layers, T, w2v_dim).
            os_input: Tensor of shape (B, T, os_dim).

        Returns:
            Logits of shape (B, num_classes).
        """
        # Project every layer in channels-first layout: (B, T, C) -> (B, C, T).
        projected = [
            self.w2v_layer_convs[idx](layer.transpose(1, 2))   # (B, hidden_dim, T)
            for idx, layer in enumerate(w2v_layers.unbind(dim=1))
        ]
        stacked_channels = torch.cat(projected, dim=1)          # (B, num_layers*hidden_dim, T)
        w2v_feat = self.w2v_reduce(stacked_channels)            # (B, hidden_dim, T)

        os_feat = self.os_proj(os_input.transpose(1, 2))        # (B, hidden_dim, T)

        fused = self.final_conv(torch.cat((w2v_feat, os_feat), dim=1))  # (B, hidden_dim, T)

        # Average over the time axis, then classify.
        time_avg = fused.mean(dim=-1)                           # (B, hidden_dim)
        return self.classifier(time_avg)                        # (B, num_classes)


In [4]:


# ----- Load the models and the processor -----
# (wav2vec2 feature extractor, FusionModel classifier)

# Pick the device once; fall back to the CPU so the cell also runs on machines
# without a CUDA GPU. On a CUDA machine this is identical to the previous
# hard-coded "cuda".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
W2V_CHECKPOINT = "Kkonjeong/wav2vec2-base-korean"

wav2vec_model = Wav2Vec2Model.from_pretrained(W2V_CHECKPOINT, output_hidden_states=True).to(DEVICE)
processor = Wav2Vec2Processor.from_pretrained(W2V_CHECKPOINT)


# Build the classifier with the configuration the checkpoint is loaded into.
model = FusionModel(
    w2v_dim=768,
    os_dim=25,
    hidden_dim=128,
    num_classes=4,
    dropout=0.2
).to(DEVICE)


# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints from a trusted source; consider weights_only=True if the
# checkpoint contains nothing but tensors and plain containers.
checkpoint = torch.load("epoch20_final.pth", map_location=DEVICE)
model.load_state_dict(checkpoint["model_state_dict"])
# Inference mode: disables dropout and uses the BatchNorm running statistics.
model.eval()


# ----- Audio emotion prediction -----
def predict_emotion(wav_path):
    """Predict the emotion label of a single audio file.

    Relies on the notebook-level ``wav2vec_model``, ``processor`` and ``model``
    objects as well as ``extract_wav2vec_embedding`` / ``extract_egemaps_feature``.

    Args:
        wav_path: Path of the audio file; it is loaded as 16 kHz mono.

    Returns:
        A tuple ``(pred_emotion, probs)`` where ``pred_emotion`` is the label
        with the highest probability and ``probs`` is a NumPy array with one
        softmax probability per class.
    """
    target_sr = 16000
    y, sr = librosa.load(wav_path, sr=target_sr, mono=True)

    # wav2vec2 features
    w2v = extract_wav2vec_embedding(y, wav2vec_model, processor)  # (n_layers, T_w2v, w2v_dim)

    # openSMILE features
    os_ = extract_egemaps_feature(y, sr=target_sr)  # (T_os, os_dim)

    # Zero-pad both streams to a common number of frames. Buffer sizes are taken
    # from the extracted features and the device from the classifier itself, so
    # nothing here has to be kept in sync with hard-coded 6 / 768 / 25 / "cuda".
    device = next(model.parameters()).device
    n_frames = max(w2v.shape[1], os_.shape[0])
    w2v_tensor = torch.zeros(1, w2v.shape[0], n_frames, w2v.shape[2], device=device)
    os_tensor = torch.zeros(1, n_frames, os_.shape[1], device=device)
    w2v_tensor[0, :, :w2v.shape[1], :] = w2v
    os_tensor[0, :os_.shape[0], :] = os_

    # Inference
    with torch.no_grad():
        logits = model(w2v_tensor, os_tensor)
        probs = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()

    # NOTE(review): the label order is assumed to match the class indices used
    # when the checkpoint was trained -- confirm against the training code.
    emotion_list = ['angry', 'happiness', 'sadness', 'neutral']
    pred_label = int(np.argmax(probs))
    pred_emotion = emotion_list[pred_label]

    return pred_emotion, probs




In [5]:


# ----- Input path -----
wav_path = "./neutral1.wav"

# Run inference, then report the winning label and the per-class
# probabilities rendered with three decimals.
emotion, probs = predict_emotion(wav_path)
formatted_probs = [format(p, ".3f") for p in probs]

print(f"Predicted Emotion: {emotion}")
print(f"Class Probabilities: {formatted_probs}")


Predicted Emotion: neutral
Class Probabilities: ['0.381', '0.160', '0.069', '0.390']
