In [None]:
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Must match the embedding size used when the checkpoint was trained.
embedding_dim = 128

# Build the network on the target device, then restore the trained weights.
model = AudioEditingTransformer(embedding_dim=embedding_dim)
model = model.to(device)

# NOTE(review): depending on the PyTorch version, torch.load may unpickle
# arbitrary objects -- only load checkpoints that come from a trusted source.
checkpoint_state = torch.load('audio_editing_transformer.pth', map_location=device)
model.load_state_dict(checkpoint_state)

# Switch to evaluation mode for inference.
model.eval()

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Encode the text prompt and save the embedding to disk for the inference cells.
# The encoder gets its own name so it does not replace the `model` variable
# that holds the trained editing network used for the final prediction.
text_encoder = SentenceTransformer("BAAI/bge-large-en-v1.5")
prompt = "add rain to this audio"

# normalize_embeddings=True asks the encoder for a normalized vector.
prompt_embedding = text_encoder.encode(prompt, normalize_embeddings=True)
np.save("test_prompt_emb.npy", prompt_embedding)


In [None]:
import torch
import torchaudio
from torchvggish import vggish, vggish_input
import numpy as np

# Compute a single VGGish embedding for the test clip and save it to disk.
# The extractor gets its own name so it does not replace the `model` variable
# that holds the trained editing network used for the final prediction.
vggish_model = vggish()
vggish_model.eval()

audio_path = "test_audio.wav"
waveform, sr = torchaudio.load(audio_path)

# Down-mix multi-channel audio to mono by averaging the channels.
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Resample to 16 kHz, the rate the clip is saved at below.
if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

# Zero-pad clips shorter than 16000 samples (one second at 16 kHz).
if waveform.shape[1] < 16000:
    pad_len = 16000 - waveform.shape[1]
    waveform = torch.nn.functional.pad(waveform, (0, pad_len))

# Round-trip through a wav file because the example builder reads from a path.
torchaudio.save("tmp.wav", waveform, 16000)
examples = vggish_input.wavfile_to_examples("tmp.wav")

with torch.no_grad():
    # as_tensor accepts an ndarray or an existing tensor without the redundant
    # copy (and warning) that torch.tensor() produces for tensor input.
    frame_embeddings = vggish_model(torch.as_tensor(examples))
    # Average over the first axis (one row per example) to get one clip-level vector.
    audio_embedding = frame_embeddings.mean(dim=0).cpu().numpy()

np.save("test_audio_emb.npy", audio_embedding)


In [None]:
import numpy as np


def _load_pooled_embedding(path):
    """Load a saved embedding and return it as a batched float32 tensor.

    Arrays with more than one dimension are averaged over axis 0 first. The
    result gets a leading batch dimension of size 1 and is moved to `device`.
    """
    vector = np.load(path)
    if vector.ndim > 1:
        vector = vector.mean(axis=0)
    batched = torch.tensor(vector, dtype=torch.float32).unsqueeze(0)
    return batched.to(device)


# Text prompt embedding. The original notes give 1024 features per vector;
# a 2-D file is pooled down to one vector -- TODO confirm the saved shape.
prompt_emb = _load_pooled_embedding("test_prompt_emb.npy")

# Audio embedding of the input clip; the original note gives shape (128,).
input_audio_emb = _load_pooled_embedding("test_audio_emb.npy")


In [None]:
# Predict the edited-audio embedding from the prompt and input-audio embeddings.
# In a notebook session the name `model` is easy to rebind (e.g. by a cell that
# loads a different network), so fail fast with a clear message instead of
# calling the wrong network.
if not isinstance(model, AudioEditingTransformer):
    raise TypeError(
        f"`model` is a {type(model).__name__}, not an AudioEditingTransformer; "
        "re-run the cell that loads the editing model before running inference."
    )

with torch.no_grad():  # inference only: no autograd graph is needed
    predicted_output_emb = model(prompt_emb, input_audio_emb)  # expected shape per original note: (1, 128)