In [None]:
import pandas as pd
import pickle

# Quick look at how many samples each EMOTION class has.
# NOTE(review): this reads '/VATT_AudioSet.csv' (filesystem root), while the
# same file name is read again further down through a relative path --
# confirm which location is intended.
df = pd.read_csv('/VATT_AudioSet.csv')
# Printed explicitly: a notebook only echoes a bare expression when it is the
# last statement of its cell, and more statements follow this one, so the
# counts would otherwise be computed and silently discarded.
print(df['EMOTION'].value_counts())

import torch
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Fix the RNG seed so randomly initialised tensors are reproducible.
torch.manual_seed(42)

# Load the dataset and separate the target column from the feature columns.
# 'Unnamed: 0' is dropped together with the target, so neither ends up in
# the feature matrix.
df = pd.read_csv('VATT_AudioSet.csv')
labels = df['EMOTION']
features = df.drop(['Unnamed: 0', 'EMOTION'], axis=1)

# Label encoding: fit the encoder on the emotion labels, then map every
# label to its integer class id.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

# DataFrame -> NumPy array -> float32 tensor.
features_numpy = features.values
features_tensor = torch.FloatTensor(features_numpy)

class AudioFeatureExpander(nn.Module):
    """Widen audio feature vectors and add a learned offset vector.

    Every input row goes through one linear layer and is then shifted by a
    learnable vector. That vector has shape ``(1, output_dim)``, so the same
    offset is broadcast onto every row of the projected input.

    Args:
        input_dim: Size of the last dimension of the incoming features.
        output_dim: Size of the last dimension of the result (default 768).
    """

    def __init__(self, input_dim: int, output_dim: int = 768) -> None:
        super().__init__()

        # Creation order matters for reproducibility: the order in which
        # parameters are created decides which random draws each one gets
        # under a fixed seed, so the linear layer stays first.
        self.linear = nn.Linear(input_dim, output_dim)
        self.position_embedding = nn.Parameter(torch.randn(1, output_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``linear(x) + position_embedding``.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            A new tensor whose last dimension equals ``output_dim``.
        """
        projected = self.linear(x)
        return projected + self.position_embedding

# Number of feature columns. Per the original note: 32 features =
# Chromagram (feature_0~11) + MFCC (feature_12~31).
input_dim = features_tensor.size(1)

# Build the expander with its default output width.
expander = AudioFeatureExpander(input_dim=input_dim)

# Run the whole feature matrix through the expander.
transformer_input = expander(features_tensor)

# Report the shapes of the raw features, the expanded features and the labels.
for caption, shape in (
    ("원본:", features_tensor.shape),
    ("Transformer input 용:", transformer_input.shape),
    ("Label:", encoded_labels.shape),
):
    print(caption, shape)


In [None]:
# Persist the expanded features. `transformer_input` is the output of a module
# forward pass, so it carries autograd history; detach() stores it as plain
# data. Without it the reloaded tensor comes back with requires_grad=True even
# though the module that produced it is not saved alongside it.
torch.save(transformer_input.detach(), 'transformer_input.pt')

# Persist the fitted label encoder so the same label mapping can be reused.
# NOTE(review): only the encoder is written here, not the encoded labels.
with open('audio_label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Round trip: read both artifacts back from disk.
loaded_input = torch.load('transformer_input.pt')
# pickle.load can execute arbitrary code; this is acceptable only because the
# file was written just above. Do not point it at untrusted files.
with open('audio_label_encoder.pkl', 'rb') as f:
    loaded_encoder = pickle.load(f)