#### **오디오 데이터 추출**

##### **사전 준비**

라이브러리 import

In [1]:
import os
import torch
import torchaudio
import numpy as np
from transformers import ASTFeatureExtractor, ASTModel

사전 학습 모델 import

In [2]:
# Checkpoint identifier passed to both from_pretrained calls below, so the
# feature extractor and the model are loaded from the same checkpoint.
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
# Preprocessor applied to each waveform segment before it is fed to the model.
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)
# Base AST model; later cells read `last_hidden_state` from its output.
model = ASTModel.from_pretrained(pretrained_model)

GPU 사용 확인 및 모델 설정

In [None]:
# Report whether PyTorch can see a CUDA device, and which one.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Only query the device name when a device exists: get_device_name(0)
    # raises on a machine without CUDA, which would abort this check cell.
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; the cells below call .cuda() and need a GPU.")

In [None]:
# Inference only: switch the model to eval mode and move it to the GPU.
# Both calls return the module itself, so they are chained into one expression.
model.eval().cuda()

#### **1. .wav 파일 로드**

In [None]:
import soundfile as sf

def load_audio(path):
    """Read an audio file and return a 1-D float32 waveform with its sample rate.

    Args:
        path: Path of the audio file, handed to ``soundfile.read``.

    Returns:
        tuple: ``(waveform, sr)`` where ``waveform`` is a 1-D ``torch.Tensor``
        holding the first channel of the file and ``sr`` is the sample rate
        reported by soundfile.
    """
    # always_2d=True gives (frames, channels) for every file. Relying on
    # squeeze() instead left stereo files 2-D (so later padding hit the channel
    # axis) and collapsed a one-sample mono file to a 0-dim tensor.
    samples, sr = sf.read(path, dtype='float32', always_2d=True)
    # Keep only the first channel so downstream code always sees shape (frames,).
    waveform = torch.from_numpy(samples[:, 0])
    return waveform, sr

##### **1-1. waveform segmenting**

In [None]:
import torch.nn.functional as F

def split_waveform(waveform, num_segments=18):
    """Split a 1-D waveform into ``num_segments`` equally long pieces.

    The waveform is zero-padded at the end up to the next multiple of
    ``num_segments`` so that every piece has the same length.

    Args:
        waveform: 1-D ``torch.Tensor`` of samples. Padding is applied to the
            last dimension, so a multi-dimensional input is not supported.
        num_segments: Number of pieces to produce; must be positive.

    Returns:
        tuple of ``num_segments`` tensors, each ``ceil(T / num_segments)``
        samples long, where ``T`` is the input length.

    Raises:
        ValueError: If ``num_segments`` is not positive or the waveform is empty.
    """
    if num_segments <= 0:
        raise ValueError(f"num_segments must be positive, got {num_segments}")

    T = waveform.shape[0]
    if T == 0:
        # An empty input cannot be cut into num_segments non-empty pieces;
        # fail loudly instead of returning a single empty segment.
        raise ValueError("cannot split an empty waveform")

    # Ceiling division in exact integer arithmetic (no float round-trip).
    segment_length = -(-T // num_segments)
    pad_len = segment_length * num_segments - T
    if pad_len > 0:
        waveform = F.pad(waveform, (0, pad_len))
    return waveform.split(segment_length)

#### **2. Spectrogram 변환**

In [None]:
def extract_spectrogram_segments(segments, sr=16000):
    """Run the module-level feature extractor on each segment and move the result to the GPU.

    Args:
        segments: Iterable of waveform segments.
        sr: Sampling rate forwarded to the feature extractor as ``sampling_rate``.

    Returns:
        list with one dict per segment, mapping each key of the extractor's
        output to its tensor moved to CUDA.
    """

    def _encode_on_gpu(segment):
        # Preprocess one segment into PyTorch tensors, then move every tensor to the GPU.
        encoded = feature_extractor(segment, sampling_rate=sr, return_tensors="pt")
        return {name: tensor.cuda() for name, tensor in encoded.items()}

    return [_encode_on_gpu(segment) for segment in segments]

#### **3. ASTModel에 입력 & 4. [CLS] 임베딩 추출**

In [None]:
@torch.no_grad()
def forward_ast_sequence(spec_input_list):
    """Run all segments through the module-level model in one batch.

    Args:
        spec_input_list: Sequence of dicts, each with an ``"input_values"`` tensor.

    Returns:
        Tensor holding, for every batch row, the hidden state at position 0 of
        the model's ``last_hidden_state`` (the notebook refers to this as the
        [CLS] embedding).
    """
    # Concatenate the per-segment tensors along dim 0 to form a single batch.
    per_segment = [entry["input_values"] for entry in spec_input_list]
    batch = torch.cat(per_segment, dim=0)
    hidden_states = model(batch).last_hidden_state
    # Keep only the first token of every sequence.
    return hidden_states[:, 0, :]

#### **5. 128-dim Linear Projection & 6. .npy 저장**

In [None]:
# Linear map from 768 to 128 features applied to the AST embeddings.
# Nothing in this file trains the layer, so its initial weights *are* the
# projection. With the default unseeded initialisation every kernel session
# produced a different projection, making features saved in different runs
# incomparable. Re-draw the weights from a dedicated, fixed-seed generator so
# the projection is reproducible without touching the global RNG seed.
projector = torch.nn.Linear(768, 128)
_projector_rng = torch.Generator().manual_seed(0)
# Same range nn.Linear documents for its default initialisation:
# U(-1/sqrt(in_features), 1/sqrt(in_features)).
_projector_bound = projector.in_features ** -0.5
with torch.no_grad():
    projector.weight.uniform_(-_projector_bound, _projector_bound, generator=_projector_rng)
    projector.bias.uniform_(-_projector_bound, _projector_bound, generator=_projector_rng)
projector = projector.cuda()
projector.eval()

def project_sequence_feature(patch_sequence):
    """Apply the module-level ``projector`` and return the result as a NumPy array.

    Args:
        patch_sequence: Tensor passed to ``projector``.

    Returns:
        numpy.ndarray: The projected features, detached from autograd and
        copied to host memory.
    """
    projected = projector(patch_sequence)
    # detach() drops the autograd graph; cpu() is required before numpy().
    return projected.detach().cpu().numpy()

##### **피처 추출 및 딕셔너리 구성**

In [None]:
import os
import numpy as np

def process_and_save_features(audio_dir, target_classes):
    """Extract projected AST features for every .wav file of the selected classes.

    For each entry of ``audio_dir`` whose name is in ``target_classes``, the
    ``18frames`` sub-directory is scanned for ``.wav`` files. Each file goes
    through load -> split into 18 segments -> spectrogram -> AST -> projection.
    Despite the name, nothing is written to disk here; the caller saves the
    returned dictionary.

    Args:
        audio_dir: Root directory containing one sub-directory per class.
        target_classes: Container of class-directory names to process.

    Returns:
        dict mapping ``(class_name, video_id)`` to the array returned by
        ``project_sequence_feature``; ``video_id`` is the file name without
        its extension. Files that raise during processing are reported and
        left out.
    """
    results = {}

    # sorted() makes the processing order (and the dict order) deterministic;
    # os.listdir returns entries in arbitrary order.
    for class_name in sorted(os.listdir(audio_dir)):
        if class_name not in target_classes:
            continue

        input_class_dir = os.path.join(audio_dir, class_name, '18frames')
        # isdir rather than exists: a stray file named "18frames" would
        # otherwise make os.listdir raise outside the per-file error handling.
        if not os.path.isdir(input_class_dir):
            continue

        for filename in sorted(os.listdir(input_class_dir)):
            if not filename.endswith(".wav"):
                continue

            audio_path = os.path.join(input_class_dir, filename)
            video_id = os.path.splitext(filename)[0]
            try:
                waveform, sr = load_audio(audio_path)
                segments = split_waveform(waveform, num_segments=18)
                # Forward the file's real sample rate instead of a hard-coded
                # 16000, so the feature extractor can reject audio recorded at
                # a different rate rather than silently mis-processing it.
                inputs = extract_spectrogram_segments(segments, sr=sr)
                segments_embedding = forward_ast_sequence(inputs)
                features = project_sequence_feature(segments_embedding)
            except Exception as e:
                # Best-effort batch job: report the failing file and move on.
                print(f"[ERROR] {audio_path}: {e}")
                continue

            key = (class_name, video_id)
            results[key] = features
            print(f"처리 완료: {class_name}/{video_id}, 피처 shape: {features.shape}")

    return results

In [None]:
if __name__ == '__main__':
    
    '''
    실행 시 로컬 환경과 경로를 맞춰주세요.
    target_class는 실험 시 데이터 범위에 따라 임의로 설정해주세요.
    '''
    # English rendering of the note above: adjust the paths to your local
    # environment before running, and choose target_classes to match the data
    # range used in the experiment.
    # Root directory scanned for <class>/18frames/*.wav files; root_rgb is
    # bound to the same path.
    audio_dir = root_rgb = r"D:\Audio\training"
    # Class directories to process; every other entry of audio_dir is skipped.
    target_classes = [
            "adult+female+singing", "adult+female+speaking", "adult+male+singing",
            "adult+male+speaking", "applauding", "ascending", "asking", "assembling",
            "autographing", "baking", "balancing", "barbecuing", "barking", "bending",
            "bicycling", "biting", "blowing", "boarding", "boating", "boiling"
        ]
    
    results = process_and_save_features(audio_dir, target_classes)
    # results is a dict, so np.save stores it as a pickled object; it is read
    # back elsewhere in this file with allow_pickle=True and .item().
    np.save("audio_filtered.npy", results)

##### train/test/val split

In [None]:
import numpy as np

'''
실행 시 로컬 환경과 경로를 맞춰주세요.
'''
# Adjust the paths below to your local environment before running.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
audio_dict = np.load("audio_filtered.npy", allow_pickle=True).item()
train_rgb = np.load(r"D:\Video-Feature\training\3.18frames-audio유효-split-feature\rgb_filtered_train.npy", allow_pickle=True).item()
val_rgb = np.load(r"D:\Video-Feature\training\3.18frames-audio유효-split-feature\rgb_filtered_val.npy", allow_pickle=True).item()
test_rgb = np.load(r"D:\Video-Feature\training\3.18frames-audio유효-split-feature\rgb_filtered_test.npy", allow_pickle=True).item()

# The RGB dictionaries define the split: their keys say which split a sample belongs to.
train_keys, val_keys, test_keys = set(train_rgb), set(val_rgb), set(test_rgb)

# Keep the audio entries whose key appears in the matching RGB split,
# preserving the order of audio_dict.
train_audio = {key: audio_dict[key] for key in audio_dict if key in train_keys}
val_audio = {key: audio_dict[key] for key in audio_dict if key in val_keys}
test_audio = {key: audio_dict[key] for key in audio_dict if key in test_keys}

np.save("audio_filtered_train.npy", train_audio)
np.save("audio_filtered_val.npy", val_audio)
np.save("audio_filtered_test.npy", test_audio)

print(f"분할 완료: train={len(train_audio)}, val={len(val_audio)}, test={len(test_audio)}")


#### waveform 추출

In [None]:
import os
import numpy as np
import torch
import torch.nn.functional as F
import soundfile as sf

def load_audio(path):
    """Read an audio file and return its first channel plus the sample rate.

    Args:
        path: Path of the audio file, handed to ``soundfile.read``.

    Returns:
        tuple: ``(waveform, sr)`` with ``waveform`` a 1-D float32
        ``torch.Tensor`` and ``sr`` the sample rate reported by soundfile.
    """
    # always_2d=True yields (frames, channels) for mono and multi-channel files alike.
    samples, sr = sf.read(path, dtype='float32', always_2d=True)
    first_channel = samples[:, 0]
    return torch.from_numpy(first_channel), sr

def pad_waveform_ceil_18(waveform, num_segments=18):
    """Zero-pad the end of a waveform up to the next multiple of ``num_segments``.

    Args:
        waveform: ``torch.Tensor`` whose first dimension is the sample count;
            padding is applied to the last dimension.
        num_segments: The length is rounded up to a multiple of this value.

    Returns:
        The padded tensor, or the input tensor itself when its length is
        already a multiple of ``num_segments``.
    """
    # Number of samples missing to reach the next multiple of num_segments.
    shortfall = (-waveform.shape[0]) % num_segments
    if shortfall <= 0:
        return waveform
    return F.pad(waveform, (0, shortfall))

def save_waveform(audio_dir, target_classes):
    """Collect the padded waveform of every .wav file of the selected classes.

    Looks into the ``18frames`` sub-directory of each entry of ``audio_dir``
    whose name is in ``target_classes``. Nothing is written to disk here; the
    caller saves the returned dictionary.

    Args:
        audio_dir: Root directory containing one sub-directory per class.
        target_classes: Container of class-directory names to process.

    Returns:
        dict mapping ``(class_name, video_id)`` to the padded waveform, where
        ``video_id`` is the file name without its extension. Files that raise
        while loading or padding are reported and left out.
    """
    results = {}

    for class_name in os.listdir(audio_dir):
        input_class_dir = os.path.join(audio_dir, class_name, '18frames')
        # Skip classes that were not requested or that have no 18frames folder.
        if class_name not in target_classes or not os.path.exists(input_class_dir):
            continue

        wav_names = [name for name in os.listdir(input_class_dir) if name.endswith(".wav")]
        for filename in wav_names:
            audio_path = os.path.join(input_class_dir, filename)
            video_id, _ = os.path.splitext(filename)
            try:
                waveform, sr = load_audio(audio_path)
                waveform = pad_waveform_ceil_18(waveform)
            except Exception as e:
                # Report the failing file and carry on with the rest.
                print(f"[ERROR] {audio_path}: {e}")
            else:
                results[(class_name, video_id)] = waveform
                print(f"저장 완료: {class_name}/{video_id}, padded length: {waveform.shape[0]}")

    return results


In [None]:
if __name__ == '__main__':
    
    '''
    실행 시 로컬 환경과 경로를 맞춰주세요.
    '''
    # English rendering of the note above: adjust the paths to your local
    # environment before running.
    # Root directory scanned for <class>/18frames/*.wav files; root_rgb is
    # bound to the same path.
    audio_dir = root_rgb = r"D:\Audio\training"
    # Class directories to process; every other entry of audio_dir is skipped.
    target_classes = [
            "adult+female+singing", "adult+female+speaking", "adult+male+singing",
            "adult+male+speaking", "applauding", "ascending", "asking", "assembling",
            "autographing", "baking", "balancing", "barbecuing", "barking", "bending",
            "bicycling", "biting", "blowing", "boarding", "boating", "boiling"
        ]
    
    results = save_waveform(audio_dir, target_classes)
    # results is a dict, so np.save stores it as a pickled object; it is read
    # back elsewhere in this file with allow_pickle=True and .item().
    np.save("audio_waveform.npy", results)

In [None]:
from collections import Counter
import numpy as np

# Load the saved waveform dictionary.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
waveform_dict = np.load("audio_waveform.npy", allow_pickle=True).item()

# Count how many waveforms share each (padded) length.
lengths = [wave.shape[0] for wave in waveform_dict.values()]
length_counts = Counter(lengths)

# Print the histogram in ascending order of length.
for length in sorted(length_counts):
    print(f"Length: {length}, Count: {length_counts[length]}")

In [None]:
import numpy as np

'''
실행 시 로컬 환경과 경로를 맞춰주세요.
'''
# Adjust the paths below to your local environment before running.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
audio_dict = np.load("audio_waveform.npy", allow_pickle=True).item()
train_rgb = np.load(r"D:\Video-Feature\training\3.18frames-audio유효-split-feature\rgb_filtered_train.npy", allow_pickle=True).item()
val_rgb = np.load(r"D:\Video-Feature\training\3.18frames-audio유효-split-feature\rgb_filtered_val.npy", allow_pickle=True).item()
test_rgb = np.load(r"D:\Video-Feature\training\3.18frames-audio유효-split-feature\rgb_filtered_test.npy", allow_pickle=True).item()

# The RGB dictionaries define the split: their keys say which split a sample belongs to.
train_keys, val_keys, test_keys = set(train_rgb), set(val_rgb), set(test_rgb)

# Keep the waveforms whose key appears in the matching RGB split,
# preserving the order of audio_dict.
train_audio = {key: audio_dict[key] for key in audio_dict if key in train_keys}
val_audio = {key: audio_dict[key] for key in audio_dict if key in val_keys}
test_audio = {key: audio_dict[key] for key in audio_dict if key in test_keys}

np.save("audio_waveform_train.npy", train_audio)
np.save("audio_waveform_val.npy", val_audio)
np.save("audio_waveform_test.npy", test_audio)

print(f"분할 완료: train={len(train_audio)}, val={len(val_audio)}, test={len(test_audio)}")