### LSTM audio feature generation  

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the preprocessed .npy feature files

base_path = 'data/' if '__file__' not in globals() else os.path.join(os.path.dirname(__file__), 'data')


def _load_feature_dict(file_name):
    """Load the pickled dict stored in ``file_name`` under ``base_path``."""
    return np.load(os.path.join(base_path, file_name), allow_pickle=True).item()


# Audio features
audio_train = _load_feature_dict("audio_filtered_train.npy")
audio_val = _load_feature_dict("audio_filtered_val.npy")
audio_test = _load_feature_dict("audio_filtered_test.npy")

# RGB features
rgb_train = _load_feature_dict("rgb_filtered_train.npy")
rgb_val = _load_feature_dict("rgb_filtered_val.npy")
rgb_test = _load_feature_dict("rgb_filtered_test.npy")

# Optical-flow features
flow_train = _load_feature_dict("flow_filtered_train.npy")
flow_val = _load_feature_dict("flow_filtered_val.npy")
flow_test = _load_feature_dict("flow_filtered_test.npy")


In [None]:
# Dataset definition

class VideoToAudioDataset(Dataset):
    """Pair concatenated RGB+flow features with the audio features of the same key.

    Samples follow the key order of ``audio_dict``; ``rgb_dict`` and
    ``flow_dict`` are looked up with those same keys.
    """

    def __init__(self, rgb_dict, flow_dict, audio_dict):
        self.rgb_dict = rgb_dict
        self.flow_dict = flow_dict
        self.audio_dict = audio_dict
        self.keys = [*audio_dict.keys()]

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(video_features, audio_features, key)`` for sample ``idx``."""
        sample_key = self.keys[idx]
        # Join both visual modalities along the feature (last) axis.
        # Original author's shape notes: rgb/flow (18, 768) -> (18, 1536).
        video = np.concatenate(
            (self.rgb_dict[sample_key], self.flow_dict[sample_key]),
            axis=-1,
        )
        audio = self.audio_dict[sample_key]  # noted as (18, 128)
        return (
            torch.tensor(video, dtype=torch.float32),
            torch.tensor(audio, dtype=torch.float32),
            sample_key,
        )


In [None]:
# Build the data loaders (only the training loader shuffles)

train_loader = DataLoader(
    dataset=VideoToAudioDataset(rgb_dict=rgb_train, flow_dict=flow_train, audio_dict=audio_train),
    batch_size=4,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=VideoToAudioDataset(rgb_dict=rgb_val, flow_dict=flow_val, audio_dict=audio_val),
    batch_size=4,
)
test_loader = DataLoader(
    dataset=VideoToAudioDataset(rgb_dict=rgb_test, flow_dict=flow_test, audio_dict=audio_test),
    batch_size=4,
)

In [None]:
# LSTM-based generator

class AudioFeatureGenerator(nn.Module):
    """Sequence-to-sequence regressor: a stacked LSTM followed by a per-step linear layer."""

    def __init__(self, input_dim=1536, hidden_dim=512, num_layers=2, output_dim=128):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.3,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map ``(B, T, input_dim)`` inputs to ``(B, T, output_dim)`` predictions."""
        hidden_states, _state = self.lstm(x)
        return self.fc(hidden_states)

In [None]:
# Training & Validation Functions

def sequence_mse_loss(pred, target):
    """Return the mean of the squared element-wise differences of ``pred`` and ``target``."""
    residual = pred - target
    squared = residual ** 2
    return squared.mean()

def train(model, loader, optimizer, criterion, device):
    """Run one optimisation epoch and return the per-sample average loss.

    Every batch loss is weighted by its batch size, so a smaller final batch
    does not skew the epoch average.
    """
    model.train()
    running = 0
    for inputs, targets, _keys in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)  # one prediction per time step
        batch_loss = criterion(outputs, targets)  # element-wise comparison
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item() * inputs.size(0)
    n_samples = len(loader.dataset)
    return running / n_samples

def validate(model, loader, criterion, device, visualize=False, num_samples=3):
    """Evaluate sample by sample and return the mean per-sample loss.

    Iterates ``loader.dataset`` directly (one sample at a time) instead of the
    loader's batches. When ``visualize`` is true, the first ``num_samples``
    ``(key, prediction, target)`` triples are handed to the plotting helper.
    """
    model.eval()
    dataset = loader.dataset
    loss_sum = 0
    to_plot = []

    with torch.no_grad():
        for index in range(len(dataset)):
            features, target, key = dataset[index]
            features = features.unsqueeze(0).to(device)  # add the batch axis
            target = target.unsqueeze(0).to(device)
            output = model(features)
            loss_sum += criterion(output, target).item()

            if not visualize or len(to_plot) >= num_samples:
                continue
            to_plot.append(
                (key, output.squeeze(0).cpu().numpy(), target.squeeze(0).cpu().numpy())
            )

    if visualize:
        visualize_audio_similarity_samples(to_plot)

    return loss_sum / len(dataset)

def visualize_audio_similarity_samples(samples):
    """Plot per-frame cosine similarity and MSE between prediction and ground truth.

    Args:
        samples: iterable of ``(key, pred, gt)`` triples where ``pred`` and
            ``gt`` are 2-D arrays indexed as ``[frame, feature]``.

    One figure is shown per sample; nothing is returned.
    """
    for key, pred, gt in samples:
        # Use the sample's own length instead of assuming exactly 18 frames.
        num_frames = min(len(pred), len(gt))
        cosine_sim = [cosine_similarity(pred[i:i+1], gt[i:i+1])[0, 0] for i in range(num_frames)]
        mse = [np.mean((pred[i] - gt[i]) ** 2) for i in range(num_frames)]

        fig, ax1 = plt.subplots()
        ax1.set_title(f"Similarity & MSE for video {key}")
        sim_line, = ax1.plot(cosine_sim, label='Cosine Similarity', marker='o', color='blue')
        # Cosine similarity can be negative; only widen the lower bound when it
        # is, so such points are not silently clipped out of the plot.
        ax1.set_ylim(min(0, min(cosine_sim, default=0)), 1)
        # The x label must go on ax1: the twin axis created below hides its
        # own x-axis, so a label set there would never be drawn.
        ax1.set_xlabel('Frame')
        ax1.set_ylabel('Cosine Similarity', color='blue')

        ax2 = ax1.twinx()
        mse_line, = ax2.plot(mse, label='MSE', marker='x', color='red')
        ax2.set_ylabel('MSE', color='red')

        # One legend covering both curves, placed on the top-most axes.
        ax2.legend(handles=[sim_line, mse_line])
        # Lay out only after every label exists so none is cut off.
        fig.tight_layout()
        plt.show()

In [None]:
# Helper that generates audio features and saves them

def generate_and_save_audio_features(model, dataset, device, save_path):
    """Predict audio features for every dataset item and save them as one dict.

    The saved object maps each sample key to the model output for that sample
    (batch axis removed) and is written with ``np.save``.
    """
    model.eval()
    generated = {}
    with torch.no_grad():
        for index in range(len(dataset)):
            features, _target, key = dataset[index]
            batch = features.unsqueeze(0).to(device)  # add the batch axis
            output = model(batch)
            generated[key] = output.cpu().squeeze(0).numpy()
    np.save(save_path, generated)
    print(f"✅ Saved: {save_path} with {len(generated)} samples")

In [None]:
# Run training

device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
model = AudioFeatureGenerator().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = sequence_mse_loss

for epoch in range(1, 11):
    train_loss = train(
        model=model,
        loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss = validate(
        model,
        val_loader,
        criterion,
        device,
        visualize=epoch == 10,  # plot sample predictions after the final epoch only
        num_samples=3,
    )
    print(f"[Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


In [None]:
# Final test evaluation

def test(model, loader, criterion, device):
    """Return the per-sample average loss over ``loader`` without updating the model."""
    model.eval()
    accumulated = 0
    with torch.no_grad():
        for inputs, targets, _keys in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_loss = criterion(model(inputs), targets)
            # Weight by batch size so the result is a true per-sample mean.
            accumulated += batch_loss.item() * inputs.size(0)
    return accumulated / len(loader.dataset)

final_test_loss = test(model=model, loader=test_loader, criterion=criterion, device=device)
print(f"\n✅ Final Test Loss: {final_test_loss:.4f}")

def test_audio_similarity(model, dataset, device, num_samples=3):
    """Plot prediction-vs-target similarity for the first ``num_samples`` dataset items."""
    model.eval()
    collected = []
    with torch.no_grad():
        for index in range(len(dataset)):
            features, target, key = dataset[index]
            features = features.unsqueeze(0).to(device)  # add the batch axis
            target = target.unsqueeze(0).to(device)
            output = model(features)

            # Stop as soon as enough samples have been gathered.
            if len(collected) >= num_samples:
                break
            collected.append(
                (key, output.squeeze(0).cpu().numpy(), target.squeeze(0).cpu().numpy())
            )

    visualize_audio_similarity_samples(collected)

test_audio_similarity(model, test_loader.dataset, device, num_samples=3)


In [None]:
# Generate and save the audio features for every split.
# The files are written under `base_path` because the action-recognition cell
# further down loads 'gen_audio_*.npy' from `base_path`; saving to the bare
# file name would put them in the current working directory instead.

generate_and_save_audio_features(model, train_loader.dataset, device, os.path.join(base_path, 'gen_audio_train.npy'))
generate_and_save_audio_features(model, val_loader.dataset, device, os.path.join(base_path, 'gen_audio_val.npy'))
generate_and_save_audio_features(model, test_loader.dataset, device, os.path.join(base_path, 'gen_audio_test.npy'))

### LSTM generated audio based Action Recognition

In [None]:
# Action Recognition Dataset

class ActionRecognitionDataset(torch.utils.data.Dataset):
    """Video features (RGB + flow, optionally + audio) with an integer class label.

    Samples follow the key order of ``rgb_dict``. The class name is read from
    position 0 of each key (``key[0]``) and mapped to an integer label.

    Args:
        rgb_dict: mapping key -> RGB feature array.
        flow_dict: mapping key -> optical-flow feature array.
        audio_dict: optional mapping key -> audio feature array; when given,
            it is concatenated after the visual features.
        label_map: optional ``{class_name: index}`` mapping. Pass the training
            split's ``label_map`` to the val/test datasets so every split uses
            the same indices even if a class is absent from one of them. When
            omitted, the mapping is built from this dataset's own keys
            (sorted class names), which is the previous behaviour.
    """

    def __init__(self, rgb_dict, flow_dict, audio_dict=None, label_map=None):
        self.keys = list(rgb_dict.keys())
        self.rgb_dict = rgb_dict
        self.flow_dict = flow_dict
        self.audio_dict = audio_dict  # None if audio is not used
        self.use_audio = audio_dict is not None
        if label_map is None:
            class_names = sorted({k[0] for k in self.keys})
            label_map = {cls: i for i, cls in enumerate(class_names)}
        self.label_map = label_map

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        Raises:
            KeyError: if the key is missing from a feature dict or its class
                name is missing from ``label_map``.
        """
        key = self.keys[idx]
        rgb = self.rgb_dict[key]      # noted by the author as (18, 768)
        flow = self.flow_dict[key]    # noted by the author as (18, 768)
        x = np.concatenate([rgb, flow], axis=-1)  # features joined on the last axis

        if self.use_audio:
            audio = self.audio_dict[key]  # noted by the author as (18, 128)
            x = np.concatenate([x, audio], axis=-1)

        label = self.label_map[key[0]]  # class_name -> int label
        return torch.tensor(x, dtype=torch.float32), label


In [None]:
# Action-recognition datasets and loaders

# Generated audio features saved by the generator cells
gen_audio_train = np.load(
    os.path.join(base_path, 'gen_audio_train.npy'), allow_pickle=True
).item()
gen_audio_val = np.load(
    os.path.join(base_path, 'gen_audio_val.npy'), allow_pickle=True
).item()
gen_audio_test = np.load(
    os.path.join(base_path, 'gen_audio_test.npy'), allow_pickle=True
).item()

# Datasets without audio
no_audio_train = ActionRecognitionDataset(rgb_dict=rgb_train, flow_dict=flow_train)
no_audio_val = ActionRecognitionDataset(rgb_dict=rgb_val, flow_dict=flow_val)
no_audio_test = ActionRecognitionDataset(rgb_dict=rgb_test, flow_dict=flow_test)

# Datasets with generated audio appended
gen_audio_train_ds = ActionRecognitionDataset(rgb_dict=rgb_train, flow_dict=flow_train, audio_dict=gen_audio_train)
gen_audio_val_ds = ActionRecognitionDataset(rgb_dict=rgb_val, flow_dict=flow_val, audio_dict=gen_audio_val)
gen_audio_test_ds = ActionRecognitionDataset(rgb_dict=rgb_test, flow_dict=flow_test, audio_dict=gen_audio_test)

# Loaders (only the training loaders shuffle)
no_audio_train_loader = DataLoader(dataset=no_audio_train, batch_size=16, shuffle=True)
no_audio_val_loader = DataLoader(dataset=no_audio_val, batch_size=16)
no_audio_test_loader = DataLoader(dataset=no_audio_test, batch_size=16)

gen_audio_train_loader = DataLoader(dataset=gen_audio_train_ds, batch_size=16, shuffle=True)
gen_audio_val_loader = DataLoader(dataset=gen_audio_val_ds, batch_size=16)
gen_audio_test_loader = DataLoader(dataset=gen_audio_test_ds, batch_size=16)

In [None]:
# Action Recognition Classifier

class ActionClassifier(nn.Module):
    """Single-layer LSTM classifier that predicts from the last time step.

    Args:
        input_dim: feature size of every time step.
        hidden_dim: LSTM hidden size.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim=256, num_classes=20):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer a non-zero value has no effect and merely raises a
        # UserWarning, so no dropout is requested here.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, num_layers=1)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)`` for ``(B, T, input_dim)`` input."""
        out, _ = self.lstm(x)        # (B, T, hidden_dim)
        last_hidden = out[:, -1, :]  # (B, hidden_dim): output at the final time step
        return self.fc(last_hidden)  # (B, num_classes)


In [None]:
# 학습 및 평가 함수

def train_classifier(model, loader, optimizer, criterion, device):
    """Train for one epoch; return ``(mean loss per sample, accuracy)``."""
    model.train()
    loss_sum = 0
    n_correct = 0
    for features, labels in loader:
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch loss is a per-sample mean.
        loss_sum += batch_loss.item() * features.size(0)
        n_correct += (logits.argmax(1) == labels).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, n_correct / n_samples

def evaluate_classifier(model, loader, criterion, device):
    """Evaluate without gradient tracking; return ``(mean loss per sample, accuracy)``."""
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        for features, labels in loader:
            features = features.to(device)
            labels = labels.to(device)
            logits = model(features)
            # Weight by batch size so the result is a per-sample mean.
            loss_sum += criterion(logits, labels).item() * features.size(0)
            n_correct += (logits.argmax(1) == labels).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, n_correct / n_samples

In [None]:
# No Audio (action recognition from the video features only)

model = ActionClassifier(input_dim=1536).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, 11):
    train_loss, train_acc = train_classifier(
        model=model,
        loader=no_audio_train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss, val_acc = evaluate_classifier(
        model=model, loader=no_audio_val_loader, criterion=criterion, device=device
    )
    print(f"[Epoch {epoch}] Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

test_loss, test_acc = evaluate_classifier(
    model=model, loader=no_audio_test_loader, criterion=criterion, device=device
)
print(f"\n✅ Final Test Accuracy (No Audio): {test_acc:.4f}")

In [None]:
# Gen Audio (action recognition with the LSTM-generated audio features)

model = ActionClassifier(input_dim=1664).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, 11):
    train_loss, train_acc = train_classifier(
        model=model,
        loader=gen_audio_train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss, val_acc = evaluate_classifier(
        model=model, loader=gen_audio_val_loader, criterion=criterion, device=device
    )
    print(f"[Epoch {epoch}] Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

test_loss, test_acc = evaluate_classifier(
    model=model, loader=gen_audio_test_loader, criterion=criterion, device=device
)
print(f"\n✅ Final Test Accuracy (Gen Audio): {test_acc:.4f}")


In [None]:
# Load the datasets used to evaluate action recognition with ground-truth (GT) audio

audio_train_gt = np.load(
    os.path.join(base_path, "audio_filtered_train.npy"), allow_pickle=True
).item()
audio_val_gt = np.load(
    os.path.join(base_path, "audio_filtered_val.npy"), allow_pickle=True
).item()
audio_test_gt = np.load(
    os.path.join(base_path, "audio_filtered_test.npy"), allow_pickle=True
).item()

gt_audio_train_ds = ActionRecognitionDataset(rgb_dict=rgb_train, flow_dict=flow_train, audio_dict=audio_train_gt)
gt_audio_val_ds = ActionRecognitionDataset(rgb_dict=rgb_val, flow_dict=flow_val, audio_dict=audio_val_gt)
gt_audio_test_ds = ActionRecognitionDataset(rgb_dict=rgb_test, flow_dict=flow_test, audio_dict=audio_test_gt)

# Only the training loader shuffles.
gt_audio_train_loader = DataLoader(dataset=gt_audio_train_ds, batch_size=16, shuffle=True)
gt_audio_val_loader = DataLoader(dataset=gt_audio_val_ds, batch_size=16)
gt_audio_test_loader = DataLoader(dataset=gt_audio_test_ds, batch_size=16)

In [None]:
# Example: train the ActionClassifier with GT audio
model = ActionClassifier(input_dim=1664).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, 11):
    train_loss, train_acc = train_classifier(
        model=model,
        loader=gt_audio_train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss, val_acc = evaluate_classifier(
        model=model, loader=gt_audio_val_loader, criterion=criterion, device=device
    )
    print(f"[Epoch {epoch}] Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

test_loss, test_acc = evaluate_classifier(
    model=model, loader=gt_audio_test_loader, criterion=criterion, device=device
)
print(f"✅ Final Test Accuracy (GT Audio): {test_acc:.4f}")


### AST (Pretrained Multi-Label Prediction)

In [None]:
pip install transformers torchaudio librosa

In [None]:
import numpy as np
import torch
from tqdm import tqdm
from transformers import ASTFeatureExtractor, ASTForAudioClassification

# 1. Load the pretrained AST model and its feature extractor
feature_extractor = ASTFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path="MIT/ast-finetuned-audioset-10-10-0.4593"
)
model = ASTForAudioClassification.from_pretrained(
    pretrained_model_name_or_path="MIT/ast-finetuned-audioset-10-10-0.4593"
)
# Switch to inference mode (kept as the cell's last expression).
model.eval()

In [None]:
# 2. Load the waveform dictionary
waveform_dict = np.load("audio_waveform_test.npy", allow_pickle=True).item()

# 3. Containers for the results
pred_dict = {}
skipped_keys = []

# 4. Run the predictions
for key, waveform in tqdm(waveform_dict.items()):
    try:
        # Drop singleton axes so the waveform is a plain sample array.
        waveform = np.array(waveform).squeeze()

        if len(waveform) >= 400:
            # Feature extraction followed by the model prediction
            inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
            with torch.no_grad():
                logits = model(**inputs).logits
                pred_dict[key] = torch.sigmoid(logits).squeeze().numpy()  # [527]
        else:
            # Waveforms that are too short are skipped.
            skipped_keys.append((key, "Too short"))

    except Exception as e:
        # Best effort: record the failure and move on to the next clip.
        skipped_keys.append((key, str(e)))

# 5. Save the results
np.save("ast_pred_audio_test.npy", pred_dict)
np.save("ast_pred_audio_test_skipped.npy", np.array(skipped_keys, dtype=object))


In [None]:
# 인덱스-의미 매핑
from transformers import ASTForAudioClassification
import numpy as np

# Load the model (only its index -> label mapping is used here)
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
id2label = model.config.id2label  # {int: str}

# Load the saved predictions
pred_dict = np.load("ast_pred_audio_test.npy", allow_pickle=True).item()

# Map the top-k predictions of every clip to label names
topk = 5
mapped_dict = {}

for key, probs in pred_dict.items():
    ranked = probs.argsort()[-topk:][::-1]  # indices ordered from highest score down
    mapped_dict[key] = {
        "topk_labels": [id2label[i] for i in ranked],
        "topk_scores": [float(probs[i]) for i in ranked],
    }


import json

# JSON keys must be strings, so the first two key elements are joined with "|".
json_mapped_dict = dict(
    (f"{k[0]}|{k[1]}", v) for k, v in mapped_dict.items()
)

with open("ast_pred_test_top5.json", "w") as out_file:
    json.dump(json_mapped_dict, out_file, indent=2)

# Keep a pickled copy that preserves the original keys.
np.save("ast_pred_test_top5.npy", mapped_dict)


### IOU (Intersection over Union)

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import jaccard_score
from collections import defaultdict

# 1. Load the input files
with open("data/pred_test.json", "r") as f:
    pred_dict = json.load(f)

savld_df = pd.read_csv("data/SVD.csv")

# 2. Build the SAVLD dictionary: {class_name: [audio_label1, audio_label2, ...]}
savld_dict = {
    video_label.strip(): [part.strip() for part in audio_field.split(";")]
    for video_label, audio_field in zip(savld_df["Video Label"], savld_df["Audio Labels"])
}

# 3. Collect every label that occurs in the predictions or in the ground truth
all_labels = sorted(
    set().union(
        *(pred["topk_labels"] for pred in pred_dict.values()),
        *savld_dict.values(),
    )
)

mlb = MultiLabelBinarizer(classes=all_labels)
mlb.fit([all_labels])  # fit on the full label set

# 4. Compute the IoU of every prediction, grouped by class
class_ious = defaultdict(list)

for key, pred_info in pred_dict.items():
    # Keys look like "<class_name>|<rest>"; anything else is ignored.
    class_name, separator, _ = key.partition("|")
    if not separator or class_name not in savld_dict:
        continue

    # Binarize prediction and ground truth over the shared label space.
    pred_vec, gt_vec = mlb.transform([pred_info["topk_labels"], savld_dict[class_name]])
    class_ious[class_name].append(jaccard_score(gt_vec, pred_vec))

# 5. Average the IoU per class
class_avg_ious = {}
for cls, iou_list in class_ious.items():
    class_avg_ious[cls] = np.mean(iou_list)

# 6. Print, highest average first
print("[INFO] Class-wise average IOU:")
for cls, iou in sorted(class_avg_ious.items(), key=lambda item: item[1], reverse=True):
    print(f"  {cls:30s} : {iou:.4f}")


In [None]:
import numpy as np
import os

# Base directory (change to an absolute path if needed)
base_path = "data"

# Classes to keep
selected_classes = [
    'adult+female+singing',
    'adult+female+speaking',
    'adult+male+speaking',
    'adult+male+singing',
    'applauding'
]

# File-name components
modalities = ["audio", "rgb", "flow"]
splits = ["train", "val", "test"]

for modality, split in ((m, s) for m in modalities for s in splits):
    # Load the original file
    source_path = os.path.join(base_path, f"{modality}_filtered_{split}.npy")
    data = np.load(source_path, allow_pickle=True).item()

    # Keep only entries whose class name (first key element) is selected
    filtered_data = {}
    for k, v in data.items():
        if k[0] in selected_classes:
            filtered_data[k] = v

    # Save under a new file name
    new_path = os.path.join(base_path, f"{modality}_filtered_{split}_selected.npy")
    np.save(new_path, filtered_data)
    print(f"[SAVED] {new_path} ({len(filtered_data)} samples)")


### LSTM audio feature generation (filtered.ver)

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the preprocessed .npy feature files (selected classes only)

base_path = 'data/' if '__file__' not in globals() else os.path.join(os.path.dirname(__file__), 'data')


def _load_feature_dict(file_name):
    """Load the pickled dict stored in ``file_name`` under ``base_path``."""
    return np.load(os.path.join(base_path, file_name), allow_pickle=True).item()


# Audio features
audio_train = _load_feature_dict("audio_filtered_train_selected.npy")
audio_val = _load_feature_dict("audio_filtered_val_selected.npy")
audio_test = _load_feature_dict("audio_filtered_test_selected.npy")

# RGB features
rgb_train = _load_feature_dict("rgb_filtered_train_selected.npy")
rgb_val = _load_feature_dict("rgb_filtered_val_selected.npy")
rgb_test = _load_feature_dict("rgb_filtered_test_selected.npy")

# Optical-flow features
flow_train = _load_feature_dict("flow_filtered_train_selected.npy")
flow_val = _load_feature_dict("flow_filtered_val_selected.npy")
flow_test = _load_feature_dict("flow_filtered_test_selected.npy")


In [None]:
# Dataset definition

class VideoToAudioDataset(Dataset):
    """Pair concatenated RGB+flow features with the audio features of the same key.

    Samples follow the key order of ``audio_dict``; ``rgb_dict`` and
    ``flow_dict`` are looked up with those same keys.
    """

    def __init__(self, rgb_dict, flow_dict, audio_dict):
        self.rgb_dict = rgb_dict
        self.flow_dict = flow_dict
        self.audio_dict = audio_dict
        self.keys = [*audio_dict.keys()]

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(video_features, audio_features, key)`` for sample ``idx``."""
        sample_key = self.keys[idx]
        # Join both visual modalities along the feature (last) axis.
        # Original author's shape notes: rgb/flow (18, 768) -> (18, 1536).
        video = np.concatenate(
            (self.rgb_dict[sample_key], self.flow_dict[sample_key]),
            axis=-1,
        )
        audio = self.audio_dict[sample_key]  # noted as (18, 128)
        return (
            torch.tensor(video, dtype=torch.float32),
            torch.tensor(audio, dtype=torch.float32),
            sample_key,
        )


In [None]:
# Build the data loaders (only the training loader shuffles)

train_loader = DataLoader(
    dataset=VideoToAudioDataset(rgb_dict=rgb_train, flow_dict=flow_train, audio_dict=audio_train),
    batch_size=4,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=VideoToAudioDataset(rgb_dict=rgb_val, flow_dict=flow_val, audio_dict=audio_val),
    batch_size=4,
)
test_loader = DataLoader(
    dataset=VideoToAudioDataset(rgb_dict=rgb_test, flow_dict=flow_test, audio_dict=audio_test),
    batch_size=4,
)


In [None]:
# LSTM-based generator

class AudioFeatureGenerator(nn.Module):
    """Sequence-to-sequence regressor: a stacked LSTM followed by a per-step linear layer."""

    def __init__(self, input_dim=1536, hidden_dim=512, num_layers=2, output_dim=128):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.3,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map ``(B, T, input_dim)`` inputs to ``(B, T, output_dim)`` predictions."""
        hidden_states, _state = self.lstm(x)
        return self.fc(hidden_states)

In [None]:
# Training & Validation Functions

def sequence_mse_loss(pred, target):
    """Return the mean of the squared element-wise differences of ``pred`` and ``target``."""
    residual = pred - target
    squared = residual ** 2
    return squared.mean()

def train(model, loader, optimizer, criterion, device):
    """Run one optimisation epoch and return the per-sample average loss.

    Every batch loss is weighted by its batch size, so a smaller final batch
    does not skew the epoch average.
    """
    model.train()
    running = 0
    for inputs, targets, _keys in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)  # one prediction per time step
        batch_loss = criterion(outputs, targets)  # element-wise comparison
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item() * inputs.size(0)
    n_samples = len(loader.dataset)
    return running / n_samples

def validate(model, loader, criterion, device, visualize=False, num_samples=3):
    """Evaluate sample by sample and return the mean per-sample loss.

    Iterates ``loader.dataset`` directly (one sample at a time) instead of the
    loader's batches. When ``visualize`` is true, the first ``num_samples``
    ``(key, prediction, target)`` triples are handed to the plotting helper.
    """
    model.eval()
    dataset = loader.dataset
    loss_sum = 0
    to_plot = []

    with torch.no_grad():
        for index in range(len(dataset)):
            features, target, key = dataset[index]
            features = features.unsqueeze(0).to(device)  # add the batch axis
            target = target.unsqueeze(0).to(device)
            output = model(features)
            loss_sum += criterion(output, target).item()

            if not visualize or len(to_plot) >= num_samples:
                continue
            to_plot.append(
                (key, output.squeeze(0).cpu().numpy(), target.squeeze(0).cpu().numpy())
            )

    if visualize:
        visualize_audio_similarity_samples(to_plot)

    return loss_sum / len(dataset)

def visualize_audio_similarity_samples(samples):
    """Plot per-frame cosine similarity and MSE between prediction and ground truth.

    Args:
        samples: iterable of ``(key, pred, gt)`` triples where ``pred`` and
            ``gt`` are 2-D arrays indexed as ``[frame, feature]``.

    One figure is shown per sample; nothing is returned.
    """
    for key, pred, gt in samples:
        # Use the sample's own length instead of assuming exactly 18 frames.
        num_frames = min(len(pred), len(gt))
        cosine_sim = [cosine_similarity(pred[i:i+1], gt[i:i+1])[0, 0] for i in range(num_frames)]
        mse = [np.mean((pred[i] - gt[i]) ** 2) for i in range(num_frames)]

        fig, ax1 = plt.subplots()
        ax1.set_title(f"Similarity & MSE for video {key}")
        sim_line, = ax1.plot(cosine_sim, label='Cosine Similarity', marker='o', color='blue')
        # Cosine similarity can be negative; only widen the lower bound when it
        # is, so such points are not silently clipped out of the plot.
        ax1.set_ylim(min(0, min(cosine_sim, default=0)), 1)
        # The x label must go on ax1: the twin axis created below hides its
        # own x-axis, so a label set there would never be drawn.
        ax1.set_xlabel('Frame')
        ax1.set_ylabel('Cosine Similarity', color='blue')

        ax2 = ax1.twinx()
        mse_line, = ax2.plot(mse, label='MSE', marker='x', color='red')
        ax2.set_ylabel('MSE', color='red')

        # One legend covering both curves, placed on the top-most axes.
        ax2.legend(handles=[sim_line, mse_line])
        # Lay out only after every label exists so none is cut off.
        fig.tight_layout()
        plt.show()

In [None]:
# Helper that generates audio features and saves them

def generate_and_save_audio_features(model, dataset, device, save_path):
    """Predict audio features for every dataset item and save them as one dict.

    The saved object maps each sample key to the model output for that sample
    (batch axis removed) and is written with ``np.save``.
    """
    model.eval()
    generated = {}
    with torch.no_grad():
        for index in range(len(dataset)):
            features, _target, key = dataset[index]
            batch = features.unsqueeze(0).to(device)  # add the batch axis
            output = model(batch)
            generated[key] = output.cpu().squeeze(0).numpy()
    np.save(save_path, generated)
    print(f"✅ Saved: {save_path} with {len(generated)} samples")

In [None]:
# Run training

device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
model = AudioFeatureGenerator().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = sequence_mse_loss

for epoch in range(1, 11):
    train_loss = train(
        model=model,
        loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss = validate(
        model,
        val_loader,
        criterion,
        device,
        visualize=epoch == 10,  # plot sample predictions after the final epoch only
        num_samples=3,
    )
    print(f"[Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


In [None]:
# Final test evaluation

def test(model, loader, criterion, device):
    """Return the per-sample average loss over ``loader`` without updating the model."""
    model.eval()
    accumulated = 0
    with torch.no_grad():
        for inputs, targets, _keys in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_loss = criterion(model(inputs), targets)
            # Weight by batch size so the result is a true per-sample mean.
            accumulated += batch_loss.item() * inputs.size(0)
    return accumulated / len(loader.dataset)

final_test_loss = test(model=model, loader=test_loader, criterion=criterion, device=device)
print(f"\n✅ Final Test Loss: {final_test_loss:.4f}")

def test_audio_similarity(model, dataset, device, num_samples=3):
    """Plot prediction-vs-target similarity for the first ``num_samples`` dataset items."""
    model.eval()
    collected = []
    with torch.no_grad():
        for index in range(len(dataset)):
            features, target, key = dataset[index]
            features = features.unsqueeze(0).to(device)  # add the batch axis
            target = target.unsqueeze(0).to(device)
            output = model(features)

            # Stop as soon as enough samples have been gathered.
            if len(collected) >= num_samples:
                break
            collected.append(
                (key, output.squeeze(0).cpu().numpy(), target.squeeze(0).cpu().numpy())
            )

    visualize_audio_similarity_samples(collected)

test_audio_similarity(model, test_loader.dataset, device, num_samples=3)


### LSTM generated audio based Action Recognition (filtered.ver)

In [None]:
# Generate and save the audio features for every split (IOU-filtered version).
# The files are written under `base_path` because the dataset-loader cell
# further down loads 'gen_audio_*_selected.npy' from `base_path`; saving to
# the bare file name would put them in the current working directory instead.
generate_and_save_audio_features(model, train_loader.dataset, device, os.path.join(base_path, 'gen_audio_train_selected.npy'))
generate_and_save_audio_features(model, val_loader.dataset, device, os.path.join(base_path, 'gen_audio_val_selected.npy'))
generate_and_save_audio_features(model, test_loader.dataset, device, os.path.join(base_path, 'gen_audio_test_selected.npy'))

In [None]:
# Action Recognition Dataset

class ActionRecognitionDataset(torch.utils.data.Dataset):
    """Video features (RGB + flow, optionally + audio) with an integer class label.

    Samples follow the key order of ``rgb_dict``. The class name is read from
    position 0 of each key (``key[0]``) and mapped to an integer label.

    Args:
        rgb_dict: mapping key -> RGB feature array.
        flow_dict: mapping key -> optical-flow feature array.
        audio_dict: optional mapping key -> audio feature array; when given,
            it is concatenated after the visual features.
        label_map: optional ``{class_name: index}`` mapping. Pass the training
            split's ``label_map`` to the val/test datasets so every split uses
            the same indices even if a class is absent from one of them. When
            omitted, the mapping is built from this dataset's own keys
            (sorted class names), which is the previous behaviour.
    """

    def __init__(self, rgb_dict, flow_dict, audio_dict=None, label_map=None):
        self.keys = list(rgb_dict.keys())
        self.rgb_dict = rgb_dict
        self.flow_dict = flow_dict
        self.audio_dict = audio_dict  # None if audio is not used
        self.use_audio = audio_dict is not None
        if label_map is None:
            class_names = sorted({k[0] for k in self.keys})
            label_map = {cls: i for i, cls in enumerate(class_names)}
        self.label_map = label_map

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        Raises:
            KeyError: if the key is missing from a feature dict or its class
                name is missing from ``label_map``.
        """
        key = self.keys[idx]
        rgb = self.rgb_dict[key]      # noted by the author as (18, 768)
        flow = self.flow_dict[key]    # noted by the author as (18, 768)
        x = np.concatenate([rgb, flow], axis=-1)  # features joined on the last axis

        if self.use_audio:
            audio = self.audio_dict[key]  # noted by the author as (18, 128)
            x = np.concatenate([x, audio], axis=-1)

        label = self.label_map[key[0]]  # class_name -> int label
        return torch.tensor(x, dtype=torch.float32), label


In [None]:
# Action-recognition datasets and loaders (selected classes)

# Generated audio features saved by the generator cells
gen_audio_train = np.load(
    os.path.join(base_path, 'gen_audio_train_selected.npy'), allow_pickle=True
).item()
gen_audio_val = np.load(
    os.path.join(base_path, 'gen_audio_val_selected.npy'), allow_pickle=True
).item()
gen_audio_test = np.load(
    os.path.join(base_path, 'gen_audio_test_selected.npy'), allow_pickle=True
).item()

# Datasets without audio
no_audio_train = ActionRecognitionDataset(rgb_dict=rgb_train, flow_dict=flow_train)
no_audio_val = ActionRecognitionDataset(rgb_dict=rgb_val, flow_dict=flow_val)
no_audio_test = ActionRecognitionDataset(rgb_dict=rgb_test, flow_dict=flow_test)

# Datasets with generated audio appended
gen_audio_train_ds = ActionRecognitionDataset(rgb_dict=rgb_train, flow_dict=flow_train, audio_dict=gen_audio_train)
gen_audio_val_ds = ActionRecognitionDataset(rgb_dict=rgb_val, flow_dict=flow_val, audio_dict=gen_audio_val)
gen_audio_test_ds = ActionRecognitionDataset(rgb_dict=rgb_test, flow_dict=flow_test, audio_dict=gen_audio_test)

# Loaders (only the training loaders shuffle)
no_audio_train_loader = DataLoader(dataset=no_audio_train, batch_size=16, shuffle=True)
no_audio_val_loader = DataLoader(dataset=no_audio_val, batch_size=16)
no_audio_test_loader = DataLoader(dataset=no_audio_test, batch_size=16)

gen_audio_train_loader = DataLoader(dataset=gen_audio_train_ds, batch_size=16, shuffle=True)
gen_audio_val_loader = DataLoader(dataset=gen_audio_val_ds, batch_size=16)
gen_audio_test_loader = DataLoader(dataset=gen_audio_test_ds, batch_size=16)

In [None]:
# Action Recognition Classifier

class ActionClassifier(nn.Module):
    """Single-layer LSTM classifier that predicts from the last time step.

    Args:
        input_dim: feature size of every time step.
        hidden_dim: LSTM hidden size.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim=256, num_classes=20):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer a non-zero value has no effect and merely raises a
        # UserWarning, so no dropout is requested here.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, num_layers=1)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)`` for ``(B, T, input_dim)`` input."""
        out, _ = self.lstm(x)        # (B, T, hidden_dim)
        last_hidden = out[:, -1, :]  # (B, hidden_dim): output at the final time step
        return self.fc(last_hidden)  # (B, num_classes)


In [None]:
# 학습 및 평가 함수

def train_classifier(model, loader, optimizer, criterion, device):
    """Train for one epoch; return ``(mean loss per sample, accuracy)``."""
    model.train()
    loss_sum = 0
    n_correct = 0
    for features, labels in loader:
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(features)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch loss is a per-sample mean.
        loss_sum += batch_loss.item() * features.size(0)
        n_correct += (logits.argmax(1) == labels).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, n_correct / n_samples

def evaluate_classifier(model, loader, criterion, device):
    """Evaluate without gradient tracking; return ``(mean loss per sample, accuracy)``."""
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        for features, labels in loader:
            features = features.to(device)
            labels = labels.to(device)
            logits = model(features)
            # Weight by batch size so the result is a per-sample mean.
            loss_sum += criterion(logits, labels).item() * features.size(0)
            n_correct += (logits.argmax(1) == labels).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, n_correct / n_samples

In [None]:
# Gen Audio (classifier trained with the generated audio appended)

model = ActionClassifier(input_dim=1664).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, 11):
    train_loss, train_acc = train_classifier(
        model=model,
        loader=gen_audio_train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss, val_acc = evaluate_classifier(
        model=model, loader=gen_audio_val_loader, criterion=criterion, device=device
    )
    print(f"[Epoch {epoch}] Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

test_loss, test_acc = evaluate_classifier(
    model=model, loader=gen_audio_test_loader, criterion=criterion, device=device
)
print(f"\n✅ Final Test Accuracy (Gen Audio): {test_acc:.4f}")