In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import torchaudio
import os
from tqdm import tqdm
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

def extract_wav2vec_features(audio_path, processor, model, positional_encoder, device):
    """Wav2Vec 2.0과 Positional Encoding을 사용하여 음성 특징 추출"""
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
        
        inputs = processor(
            waveform.squeeze().numpy(),
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )
        input_values = inputs.input_values.to(device)

        with torch.no_grad():
            outputs = model(input_values)
            features = outputs.last_hidden_state
            features = positional_encoder(features)
            features = features.mean(dim=1).squeeze().cpu().numpy()
        
        return features
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

def create_vatt_features_csv(audio_dir, emotion_labels, output_path):
    """음성 파일들의 특징을 추출하여 CSV 파일 생성"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Wav2Vec 2.0 모델 로드
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
    model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)
    model.eval()

    positional_encoder = PositionalEncoding(d_model=768).to(device)
    
    features_list = []
    wav_ids = []
    emotions = []

    for wav_id, emotion in tqdm(emotion_labels.items()):
        # .wav 확장자 추가
        audio_path = os.path.join(audio_dir, f"{wav_id}.wav")
        features = extract_wav2vec_features(audio_path, processor, model, positional_encoder, device)
        
        if features is not None:
            features_list.append(features)
            wav_ids.append(wav_id)
            emotions.append(emotion)
    
    features_array = np.array(features_list)
    feature_columns = [f'feature_{i}' for i in range(features_array.shape[1])]

    df = pd.DataFrame(features_array, columns=feature_columns)
    df.insert(0, 'wav_id', wav_ids)
    df['emotion'] = emotions

    df.to_csv(output_path, index=False)
    return df

def main():
    # 감정 레이블 불러오기 (예: CSV 파일로부터 로드)
    emotion_labels = pd.read_csv('VATT_Audio_cleaned.csv').set_index('wav_id')['상황'].to_dict()

    df = create_vatt_features_csv(
        audio_dir='data',
        emotion_labels=emotion_labels,
        output_path='vatt_audio_features.csv'
    )
    
    print("CSV 파일 생성 완료")
    print(df.head())

if __name__ == "__main__":
    main()


100%|████████████████████████████████████████████████████████████████████████████| 43975/43975 [33:31<00:00, 21.86it/s]


CSV 파일 생성 완료
                     wav_id  feature_0  feature_1  feature_2  feature_3  \
0  5e258fd1305bcf3ad153a6a4   0.116951   1.097171  -0.021984   1.307806   
1  5e258fe2305bcf3ad153a6a5   0.061960   1.062623  -0.082103   1.291594   
2  5e258ff5305bcf3ad153a6a6   0.065121   1.129697  -0.145383   1.214148   
3  5e25902f305bcf3ad153a6a9   0.108897   1.214930  -0.077743   1.276002   
4  5e27f90b5807b852d9e0157b   0.072057   1.093633   0.116496   1.357680   

   feature_4  feature_5  feature_6  feature_7  feature_8  ...  feature_759  \
0   0.402940   0.785558   0.366777   0.842524   0.053977  ...     1.038114   
1   0.344941   0.791830   0.375266   0.782226  -0.075762  ...     1.025738   
2   0.375381   0.814593   0.335034   0.794488   0.130037  ...     0.948327   
3   0.315453   0.777725   0.391046   0.919713   0.050814  ...     1.048030   
4   0.232550   1.120988   0.266333   0.874760  -0.087891  ...     1.000806   

   feature_760  feature_761  feature_762  feature_763  feature_764 

In [5]:
df = pd.read_csv('vatt_audio_features.csv')

In [6]:
df = df[df['emotion'] != 'neutral']
df

Unnamed: 0,wav_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767,emotion
0,5e258fd1305bcf3ad153a6a4,0.116951,1.097171,-0.021984,1.307806,0.402940,0.785558,0.366777,0.842524,0.053977,...,1.038114,0.026707,1.005453,-0.485218,1.057157,-0.173923,1.343480,0.277802,0.578118,anger
1,5e258fe2305bcf3ad153a6a5,0.061960,1.062623,-0.082103,1.291594,0.344941,0.791830,0.375266,0.782226,-0.075762,...,1.025738,0.064325,0.877379,-0.405910,1.020559,-0.126246,1.270666,0.311409,0.628683,anger
2,5e258ff5305bcf3ad153a6a6,0.065121,1.129697,-0.145383,1.214148,0.375381,0.814593,0.335034,0.794488,0.130037,...,0.948327,-0.090791,0.981710,-0.664306,0.995173,-0.121721,1.382615,0.314245,0.562532,anger
3,5e25902f305bcf3ad153a6a9,0.108897,1.214930,-0.077743,1.276002,0.315453,0.777725,0.391046,0.919713,0.050814,...,1.048030,-0.056587,0.960514,-0.509469,0.957783,-0.191861,1.298677,0.206973,0.615054,anger
4,5e27f90b5807b852d9e0157b,0.072057,1.093633,0.116496,1.357680,0.232550,1.120988,0.266333,0.874760,-0.087891,...,1.000806,-0.331375,0.713429,-0.489550,0.953741,-0.160698,1.340417,0.145461,0.669466,sad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43970,5fbe313c44697678c497c05a,-0.035033,1.260395,-0.124864,1.289381,0.494645,0.848422,0.150974,0.685800,0.004787,...,1.124438,-0.035512,1.030388,-0.329507,0.911616,-0.036360,1.222697,0.535841,0.657236,fear
43971,5fbe251044697678c497bfb8,-0.094076,1.183450,-0.093830,1.373462,0.491172,0.866548,0.229189,0.801480,-0.028140,...,1.124917,-0.121186,1.080586,-0.259393,0.966213,-0.125562,1.256359,0.379296,0.543748,anger
43972,5fbe31584c55eb78bd7cee7f,-0.005337,1.108996,0.023202,1.302338,0.466410,0.875766,0.182545,0.777587,-0.111704,...,1.028257,-0.043521,1.006352,-0.355941,0.924228,-0.106112,1.139934,0.427693,0.646497,fear
43973,5fbe2f8544697678c497c047,-0.007707,1.178137,-0.034027,1.332217,0.486676,0.909362,0.187587,0.788998,-0.014973,...,1.071261,-0.072408,1.096629,-0.346822,0.915297,-0.123593,1.194311,0.445144,0.793738,happiness


In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# CSV 파일 로드
# LabelEncoder를 사용하여 emotion을 정수로 인코딩
label_encoder = LabelEncoder()
df['emotion'] = label_encoder.fit_transform(df['emotion'])

# 인코딩된 emotion 값 확인
print("인코딩된 emotion 값:", df['emotion'].unique())

# 변환된 DataFrame을 새로운 CSV 파일로 저장
output_path = 'vatt_audio_features_encoded.csv'
df.to_csv(output_path, index=False)
print(f"변환된 CSV 파일이 {output_path}에 저장되었습니다.")


인코딩된 emotion 값: [0 4 2 1 3 5]
변환된 CSV 파일이 vatt_audio_features_encoded.csv에 저장되었습니다.


In [12]:
dff = pd.read_csv('vatt_audio_features_encoded.csv')
dff

Unnamed: 0,wav_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767,emotion
0,5e258fd1305bcf3ad153a6a4,0.116951,1.097171,-0.021984,1.307806,0.402940,0.785558,0.366777,0.842524,0.053977,...,1.038114,0.026707,1.005453,-0.485218,1.057157,-0.173923,1.343480,0.277802,0.578118,0
1,5e258fe2305bcf3ad153a6a5,0.061960,1.062623,-0.082103,1.291594,0.344941,0.791830,0.375266,0.782226,-0.075762,...,1.025738,0.064325,0.877379,-0.405910,1.020559,-0.126246,1.270666,0.311409,0.628683,0
2,5e258ff5305bcf3ad153a6a6,0.065121,1.129697,-0.145383,1.214148,0.375381,0.814593,0.335034,0.794488,0.130037,...,0.948327,-0.090791,0.981710,-0.664306,0.995173,-0.121721,1.382615,0.314245,0.562532,0
3,5e25902f305bcf3ad153a6a9,0.108897,1.214930,-0.077743,1.276002,0.315453,0.777725,0.391046,0.919713,0.050814,...,1.048030,-0.056587,0.960514,-0.509469,0.957783,-0.191861,1.298677,0.206973,0.615054,0
4,5e27f90b5807b852d9e0157b,0.072057,1.093633,0.116496,1.357680,0.232550,1.120988,0.266333,0.874760,-0.087891,...,1.000806,-0.331375,0.713429,-0.489550,0.953741,-0.160698,1.340417,0.145461,0.669466,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40708,5fbe313c44697678c497c05a,-0.035033,1.260395,-0.124864,1.289381,0.494645,0.848422,0.150974,0.685800,0.004787,...,1.124438,-0.035512,1.030388,-0.329507,0.911616,-0.036360,1.222697,0.535841,0.657236,2
40709,5fbe251044697678c497bfb8,-0.094076,1.183450,-0.093830,1.373462,0.491172,0.866548,0.229189,0.801480,-0.028140,...,1.124917,-0.121186,1.080586,-0.259393,0.966213,-0.125562,1.256359,0.379296,0.543748,0
40710,5fbe31584c55eb78bd7cee7f,-0.005337,1.108996,0.023202,1.302338,0.466410,0.875766,0.182545,0.777587,-0.111704,...,1.028257,-0.043521,1.006352,-0.355941,0.924228,-0.106112,1.139934,0.427693,0.646497,2
40711,5fbe2f8544697678c497c047,-0.007707,1.178137,-0.034027,1.332217,0.486676,0.909362,0.187587,0.788998,-0.014973,...,1.071261,-0.072408,1.096629,-0.346822,0.915297,-0.123593,1.194311,0.445144,0.793738,3


In [13]:
dff['emotion'].value_counts()

emotion
4    13986
0    11633
1     4660
3     4548
2     4131
5     1755
Name: count, dtype: int64

In [11]:
df = pd.read_csv('vatt_audio_features.csv')
df

Unnamed: 0,wav_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767,emotion
0,5e258fd1305bcf3ad153a6a4,0.116951,1.097171,-0.021984,1.307806,0.402940,0.785558,0.366777,0.842524,0.053977,...,1.038114,0.026707,1.005453,-0.485218,1.057157,-0.173923,1.343480,0.277802,0.578118,anger
1,5e258fe2305bcf3ad153a6a5,0.061960,1.062623,-0.082103,1.291594,0.344941,0.791830,0.375266,0.782226,-0.075762,...,1.025738,0.064325,0.877379,-0.405910,1.020559,-0.126246,1.270666,0.311409,0.628683,anger
2,5e258ff5305bcf3ad153a6a6,0.065121,1.129697,-0.145383,1.214148,0.375381,0.814593,0.335034,0.794488,0.130037,...,0.948327,-0.090791,0.981710,-0.664306,0.995173,-0.121721,1.382615,0.314245,0.562532,anger
3,5e25902f305bcf3ad153a6a9,0.108897,1.214930,-0.077743,1.276002,0.315453,0.777725,0.391046,0.919713,0.050814,...,1.048030,-0.056587,0.960514,-0.509469,0.957783,-0.191861,1.298677,0.206973,0.615054,anger
4,5e27f90b5807b852d9e0157b,0.072057,1.093633,0.116496,1.357680,0.232550,1.120988,0.266333,0.874760,-0.087891,...,1.000806,-0.331375,0.713429,-0.489550,0.953741,-0.160698,1.340417,0.145461,0.669466,sad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43970,5fbe313c44697678c497c05a,-0.035033,1.260395,-0.124864,1.289381,0.494645,0.848422,0.150974,0.685800,0.004787,...,1.124438,-0.035512,1.030388,-0.329507,0.911616,-0.036360,1.222697,0.535841,0.657236,fear
43971,5fbe251044697678c497bfb8,-0.094076,1.183450,-0.093830,1.373462,0.491172,0.866548,0.229189,0.801480,-0.028140,...,1.124917,-0.121186,1.080586,-0.259393,0.966213,-0.125562,1.256359,0.379296,0.543748,anger
43972,5fbe31584c55eb78bd7cee7f,-0.005337,1.108996,0.023202,1.302338,0.466410,0.875766,0.182545,0.777587,-0.111704,...,1.028257,-0.043521,1.006352,-0.355941,0.924228,-0.106112,1.139934,0.427693,0.646497,fear
43973,5fbe2f8544697678c497c047,-0.007707,1.178137,-0.034027,1.332217,0.486676,0.909362,0.187587,0.788998,-0.014973,...,1.071261,-0.072408,1.096629,-0.346822,0.915297,-0.123593,1.194311,0.445144,0.793738,happiness


In [14]:
df['emotion'].value_counts()

emotion
sad          13986
anger        11633
disgust       4660
happiness     4548
fear          4131
neutral       3262
surprise      1755
Name: count, dtype: int64

- 0 : angry
- 1 : disgust
- 2 : fear
- 3 : happiness
- 4 : sad
- 5 : surprise