# 0. Load model

In [1]:
import json
import pytorch_lightning as pl
from einops import rearrange
import torch
import torchaudio
from torch.utils.data import DataLoader
from safetensors.torch import load_file
import shutil
from collections import OrderedDict
import safetensors
from datetime import datetime
import os

from stable_audio_tools import create_model_from_config, replace_audio, save_audio
from stable_audio_tools.data.dataset import VideoFeatDataset, collation_fn
from stable_audio_tools.training.training_wrapper import DiffusionCondTrainingWrapper
from stable_audio_tools.inference.generation import generate_diffusion_cond, generate_diffusion_cond_from_path


# Location of the JSON file that describes the model.
model_config_file = './stable_audio_tools/configs/model_config.json'

# Parse the configuration once; the resulting dict is also consulted later
# for the training settings.
with open(model_config_file) as config_fp:
    model_config = json.load(config_fp)

# Instantiate the model described by the configuration. Weights are loaded
# separately in the next cell.
model = create_model_from_config(model_config)

# Sample rate shared by dataset loading, generation and audio saving.
sample_rate = model_config["sample_rate"]
sample_rate

  from .autonotebook import tqdm as notebook_tqdm


44100

In [2]:
# Checkpoint selection: `model_dir` is the run directory and `model_name` is
# the checkpoint file stem (no extension).
model_dir = './weight/StableAudio/2024-07-09 12:11:40'
model_name = 'epoch=3-step=1018'
# Notes on previous runs (directory, checkpoint(s), observations):
# ./weight/StableAudio/lightning_logs/version_3/checkpoints         epoch=9-step=14620
# ./weight/StableAudio/2024-07-02 19:49:04                  epoch=2-step=427
# ./weight/StableAudio/2024-07-03 15:16:01               with trainable attn_bias weights; epoch=5 and epoch=9 show some time-alignment behaviour
# ./weight/StableAudio/2024-07-03 21:46:54               with trainable attn_bias weights; epoch=18 handles multiple objects fairly well
# ./weight/StableAudio/2024-07-04 11:41:24               non-trainable attn_bias weights ranging from 0.5 to 1; epoch=39-step=59  epoch=3-step=55  epoch=21-step=57
# ./weight/StableAudio/2024-07-06 10:28:13               added pos_emb, epoch=30-step=58; time alignment and generation quality are fairly good (best)
# ./weight/StableAudio/2024-07-06 21:34:56               same as the previous line, plus VGG added for training; epoch=3-step=1023
# ./weight/StableAudio/2024-07-09 12:11:40               epoch=3, epoch=2 and epoch=1 all seem acceptable

safetensors_path = f'{model_dir}/{model_name}.safetensors'
ckpt_path = f'{model_dir}/{model_name}.ckpt'

# Use the cached .safetensors export when it exists; otherwise convert the
# .ckpt once and cache the result. An explicit existence check (instead of a
# bare `except:`) means a corrupt cache file or a KeyboardInterrupt is no
# longer silently treated as "cache missing".
if os.path.exists(safetensors_path):
    state_dict = load_file(safetensors_path)
else:
    # map_location='cpu' keeps the checkpoint off the GPU it was saved from.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    # Drop the first dotted component of every key (presumably the attribute
    # name under which the training wrapper holds the model - TODO confirm).
    state_dict = OrderedDict(
        (key.partition('.')[2], value)
        for key, value in checkpoint['state_dict'].items()
    )
    safetensors.torch.save_file(state_dict, safetensors_path)

# strict=False: missing/unexpected keys do not raise; the returned result
# (displayed as the cell output) lists any mismatches.
model.load_state_dict(state_dict, strict=False)



<All keys matched successfully>

# 2. Train

In [9]:
# Training data: feature/info directories and the matching audio directories.
info_dirs = ['./dataset/feature/train/AudioSet/10']
audio_dirs = ['/home/chengxin/chengxin/AudioSet/generated_audios/train/10']
# Alternative: train on AudioSet and VGGSound together.
# info_dirs = ['./dataset/feature/train/AudioSet/10', './dataset/feature/train/VGGSound/10']
# audio_dirs = ['/home/chengxin/chengxin/AudioSet/generated_audios/train/10', '/home/chengxin/chengxin/VGGSound/generated_audios/train/10']

# Keyword arguments forwarded to VideoFeatDataset.
ds_config = {
    'info_dirs' : info_dirs,
    'audio_dirs' : audio_dirs,
    'exts':'wav',
    'sample_rate':sample_rate, 
    'force_channels':"stereo"
}
# Keyword arguments forwarded to torch.utils.data.DataLoader.
dl_config = {
    'batch_size':25, 
    # Reshuffle the training samples every epoch; with shuffle=False each
    # epoch would visit the files in exactly the same order.
    'shuffle':True,
    'num_workers':4, 
    'persistent_workers':True, 
    'pin_memory':True, 
    'drop_last':False, 
}


dataset = VideoFeatDataset(**ds_config)
dataloader = DataLoader(dataset=dataset,  collate_fn=collation_fn, **dl_config)

Found 22050 files


In [10]:
# Training hyper-parameters come from the "training" section of the model
# config. Fail early with a clear message if it is absent, instead of an
# AttributeError on None at the first `.get` call below.
training_config = model_config.get('training')
if training_config is None:
    raise KeyError("model_config has no 'training' section; cannot build the training wrapper")

# Wrap the model for training; each setting falls back to the default shown
# when the config does not provide it.
training_wrapper = DiffusionCondTrainingWrapper(
            model=model, 
            lr=training_config.get("learning_rate", None),
            optimizer_configs=training_config.get("optimizer_configs", None),
            pre_encoded=training_config.get("pre_encoded", False),
            cfg_dropout_prob = training_config.get("cfg_dropout_prob", 0.1),
            timestep_sampler = training_config.get("timestep_sampler", "uniform"),
        )

# Single-node run on the GPU with index 1, for two epochs.
trainer = pl.Trainer(
    devices=[1],
    accelerator="gpu",
    num_nodes = 1,
    max_epochs=2,
)
trainer.fit(training_wrapper, dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA H100 PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | diffusion | ConditionedDiffusionModelWrapper | 1.2 B 
1 | losses    | MultiLoss                        | 0     
---------------------------------------------------------------
1.1 B     Trainable params
156 M     Non-trainable params
1.2 B     Total params
4,880.205 Total estimated model params size (MB)


Epoch 0:   1%|          | 11/882 [00:40<53:34,  0.27it/s, v_num=5] 

Traceback (most recent call last):
  File "/home/chengxin/chengxin/anaconda3/envs/stableaudio/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/chengxin/chengxin/anaconda3/envs/stableaudio/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/chengxin/chengxin/anaconda3/envs/stableaudio/lib/python3.8/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/chengxin/chengxin/anaconda3/envs/stableaudio/lib/python3.8/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/chengxin/chengxin/anaconda3/envs/stableaudio/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/home/chengxin/chengxin/anaconda3/envs/stableaudio/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/h

: 

: 

# 3. Test

In [3]:

# Test-time inputs: only the feature/info directory is passed; the audio
# directory stays disabled below.
info_dirs = ['./dataset/feature/test/VGGSound/10']
# audio_dirs = ['/home/chengxin/chengxin/VGGSound/generated_audios/test/10']

# Keyword arguments forwarded to VideoFeatDataset.
ds_config = dict(
    info_dirs=info_dirs,
    # audio_dirs=audio_dirs,
    exts='wav',
    sample_rate=sample_rate,
    force_channels="stereo",
    limit_num=50,  # presumably caps the number of files used - TODO confirm
)

# Keyword arguments forwarded to torch.utils.data.DataLoader.
dl_config = dict(
    batch_size=25,
    shuffle=False,
    num_workers=4,
    persistent_workers=True,
    pin_memory=True,
    drop_last=False,
)

dataset = VideoFeatDataset(**ds_config)
dataloader = DataLoader(dataset, collate_fn=collation_fn, **dl_config)

Found 50 files


In [4]:
device = 0
duration = 10  # generate 10 secs
output_dir = f"./demo/{model_name}"
max_iter = 4  # stop after this many batches
os.makedirs(output_dir, exist_ok=True)
print(output_dir)

# Move the model to the target device once (loop-invariant) and switch it to
# evaluation mode so train-only behaviour such as dropout is disabled while
# sampling.
model = model.to(device).eval()

for count, conditioning in enumerate(dataloader, start=1):
    batch_size = len(conditioning['feature'])
    output = generate_diffusion_cond(
        model=model,
        steps=150,
        cfg_scale=7,
        conditioning=conditioning,
        sample_size=sample_rate*duration,
        batch_size=batch_size,
        sigma_min=0.3,
        sigma_max=500,
        sampler_type="dpmpp-3m-sde",
        device=device
    )

    for idx in range(batch_size):
        video_path = conditioning['video_path'][idx].replace('../../', './')
        # Derive output names from the file stem rather than string-replacing
        # ".mp4": this also works for other extensions and cannot touch a
        # ".mp4" that happens to appear in a directory name.
        video_name = os.path.basename(video_path)
        stem, ext = os.path.splitext(video_name)

        # Save generated audio
        audio_path = f"{output_dir}/{stem}.wav"
        save_audio(output[idx:idx + 1], audio_path, sample_rate)

        # Replace the audio of original video to generated one
        moved_video_path = f"{output_dir}/{video_name}"
        shutil.copy(video_path, moved_video_path)
        generated_video_path = f"{output_dir}/{stem}_GEN{ext}"
        replace_audio(moved_video_path, audio_path, generated_video_path)

    # Stop once `max_iter` batches have been processed.
    if count >= max_iter:
        break
    

./demo/epoch=3-step=1018
1304458688


100%|██████████| 150/150 [00:20<00:00,  7.37it/s]


3820658675


100%|██████████| 150/150 [00:19<00:00,  7.54it/s]


In [14]:
# device = 0
# duration = 10  # generate 10 secs
# output_dir = "./demo"


# for audio, conditioning in dataloader:
#     for idx in range(dl_config['batch_size']):
#         # Save generated audio
#         video_path = conditioning['video_path'][idx].replace('../../', './')
#         audio_path = f"/home/chengxin/chengxin/AudioSet/generated_audios/specvqgan/10/{video_path.split('/')[-1].replace('.mp4', '.wav')}"
            
#         # Replace the audio of original video to generated one
#         moved_video_path = f"{output_dir}/{video_path.split('/')[-1]}"
#         shutil.copy(video_path, moved_video_path)
#         generated_video_path = moved_video_path.replace(".mp4","_GEN.mp4")
#         replace_audio(moved_video_path, audio_path, generated_video_path)

#     break

# 4. Sample

In [8]:
device = 0
duration = 10  # length of each generated clip, in seconds
video_paths = ['/home/chengxin/chengxin/FoleyCrafter/examples/sora/0.mp4', '/home/chengxin/chengxin/FoleyCrafter/examples/sora/1.mp4']
output_dir = "./demo"
# Create the output directory here so this cell also works on a fresh kernel,
# without relying on an earlier cell having created it.
os.makedirs(output_dir, exist_ok=True)

# Evaluation mode disables train-only behaviour such as dropout while sampling.
model = model.to(device).eval()

output = generate_diffusion_cond_from_path(
    model=model,
    video_paths=video_paths,
    steps=100,
    cfg_scale=7,
    sample_rate=sample_rate,
    sample_size=sample_rate*duration,
    batch_size=len(video_paths),
    sigma_min=0.3,
    sigma_max=500,
    sampler_type="dpmpp-3m-sde",
    device=device
)

for idx, video_path in enumerate(video_paths):
    # Derive output names from the file stem rather than string-replacing
    # ".mp4": this also works for other extensions and cannot touch a ".mp4"
    # that happens to appear in a directory name.
    video_name = os.path.basename(video_path)
    stem, ext = os.path.splitext(video_name)

    # Save generated audio
    audio_path = f"{output_dir}/{stem}.wav"
    save_audio(output[idx:idx + 1], audio_path, sample_rate)

    # Replace the audio of original video to generated one
    moved_video_path = f"{output_dir}/{video_name}"
    shutil.copy(video_path, moved_video_path)
    generated_video_path = f"{output_dir}/{stem}_GEN{ext}"
    replace_audio(moved_video_path, audio_path, generated_video_path)

2592135618


Extracting features from video:/home/chengxin/chengxin/FoleyCrafter/examples/sora/0.mp4
Extracting features from video:/home/chengxin/chengxin/FoleyCrafter/examples/sora/1.mp4


100%|██████████| 100/100 [00:02<00:00, 33.39it/s]
