In [None]:
import torch
from model.wan.pipeline_wan_ttv import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video
from model.wan.wan_transformer_for_video import WanTransformer3DModel


# Smoke test: load only the project-local WanTransformer3DModel from the local
# checkpoint folder, in full precision.
# Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
load_dtype = torch.float32
model_id = './assets/Wan2.1-T2V-1.3B-Diffusers'

transformer = WanTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=load_dtype,
    local_files_only=True,          # never reach out to the Hub
    # low_cpu_mem_usage=False,
    use_safetensors=True,
    ignore_mismatched_sizes=True,   # Setting for model structure changes
)

In [4]:
import torch
from diffusers.utils import export_to_video
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

# Baseline text-to-video run using the WanPipeline shipped with diffusers.
# Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
model_id = "./assets/Wan2.1-T2V-1.3B-Diffusers"

# NOTE(review): the run below is 480x832, for which the hint on this line suggests 3.0 — confirm 5.0 is intended.
flow_shift = 5.0 # 5.0 for 720P, 3.0 for 480P

vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float16)
scheduler = UniPCMultistepScheduler(
    prediction_type='flow_prediction',
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=flow_shift,
)
pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.float16)
pipe.scheduler = scheduler  # replace the pipeline's default scheduler with the one built above
pipe.to("cuda")

prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

# Generate one clip and keep the frames of the first (only) video in the batch.
output = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=480,
    width=832,
    num_frames=81,
    guidance_scale=5.0,
).frames[0]
export_to_video(output, "output.mp4", fps=16)

Loading checkpoint shards: 100%|██████████| 5/5 [00:01<00:00,  3.48it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  8.21it/s]s/it]
Loading pipeline components...: 100%|██████████| 5/5 [00:02<00:00,  2.30it/s]
100%|██████████| 50/50 [02:30<00:00,  3.01s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'output.mp4'

In [None]:
import gc
import os
from tqdm import tqdm
from diffusers import AutoModel
from diffusers.training_utils import cast_training_params, free_memory
import torch
from torch.cuda.amp import autocast

from diffusers.utils.torch_utils import randn_tensor
from diffusers.video_processor import VideoProcessor
from omegaconf import OmegaConf
from model.wan.wan_transformer_for_video import WanTransformer3DModel
from model.wan.pipeline_wan_ttv import FlowMatchEulerDiscreteScheduler, AutoencoderKLWan, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from utils.text_encoding import get_t5_prompt_embeds, encode_prompt
from diffusers.utils import check_min_version, convert_unet_state_dict_to_peft, export_to_video, is_wandb_available



# Load the text-to-video config and every component needed to run Wan inference
# by hand (without WanPipeline): VAE, tokenizer, transformer and text encoder.
args = OmegaConf.load("/home/chengxin/chengxin/vagen/config/ttv_wan.yaml")
device = "cuda:1"
load_dtype = torch.float32    # dtype the weights are loaded in
infer_dtype = torch.float16   # dtype used for latents / autocast during sampling

vae = AutoencoderKLWan.from_pretrained(
    args.pretrained_model_name_or_path,
    subfolder="vae",
    torch_dtype=load_dtype,
).to(device)

# The tokenizer is loaded through AutoModel and is not moved to a device.
tokenizer = AutoModel.from_pretrained(
    args.pretrained_model_name_or_path,
    subfolder="tokenizer",
    local_files_only=True,
    use_safetensors=True,
)

transformer = WanTransformer3DModel.from_pretrained(
    args.pretrained_model_name_or_path,
    subfolder="transformer",
    torch_dtype=load_dtype,
    local_files_only=True,
    low_cpu_mem_usage=False,
    use_safetensors=True,
    ignore_mismatched_sizes=True,      # Setting for model structure changes
).to(device)

text_encoder = AutoModel.from_pretrained(
    args.pretrained_model_name_or_path,
    subfolder="text_encoder",
    torch_dtype=load_dtype,
    local_files_only=True,
    use_safetensors=True,
).to(device)

# Flow-matching UniPC sampler; the shift value comes from the validation config.
scheduler = UniPCMultistepScheduler(
    prediction_type='flow_prediction',
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=args.validation.flow_shift,
)
video_processor = VideoProcessor(vae_scale_factor=vae.config.scale_factor_spatial)





# Manual validation pass: for each prompt, encode the text, run the denoising
# loop by hand, decode the final latents with the VAE and export one .mp4 per
# generated video.
config = args.validation
with torch.no_grad():
    print("Start log_validation")
    prompt_list = config.prompt.split(config.prompt_separator)
    negative_prompt = config.negetive_prompt
    # Fall back to a local folder when the config provides no usable `save_dir`.
    # (The previous fallback was the float 5.0, which os.makedirs cannot accept.)
    output_dir = getattr(config, "save_dir", None) or "./validation_output"
    os.makedirs(output_dir, exist_ok=True)

    # Loop-invariant settings, resolved once instead of on every step.
    use_cfg = config.guidance_scale > 1.0
    current_guidance_scale = config.guidance_scale
    fps = config.fps if hasattr(config, "fps") else 16
    # If the transformer is wrapped and exposes the real model as `.module`, unwrap it.
    current_model = transformer.module if hasattr(transformer, "module") else transformer
    num_channels = current_model.config.in_channels
    num_latent_frames = (config.num_frames - 1) // vae.config.scale_factor_temporal + 1
    shape = (
        config.num_videos_per_prompt,
        num_channels,
        num_latent_frames,
        config.height // vae.config.scale_factor_spatial,
        config.width // vae.config.scale_factor_spatial,
    )

    for prompt_idx, prompt in enumerate(prompt_list):

        # Release as much memory as possible before each generation.
        torch.cuda.empty_cache()
        gc.collect()
        free_memory()

        # Optionally reuse a pre-computed negative prompt embedding stored on disk.
        prompt_embeds, negative_prompt_embeds = None, None
        if config.negetive_prompt_embed is not None:
            negative_prompt_embeds = torch.load(config.negetive_prompt_embed).to(device).unsqueeze(0)

        prompt_embeds, negative_prompt_embeds = encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            tokenizer = tokenizer,
            text_encoder = text_encoder,
            do_classifier_free_guidance=use_cfg,
            num_videos_per_prompt=config.num_videos_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_sequence_length=512,
            device=device,
            dtype = load_dtype,
        ) # type: ignore

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast context manager.
        with torch.autocast(device_type="cuda", dtype=infer_dtype):
            latents = randn_tensor(shape, device=device, dtype=infer_dtype)
            scheduler.set_timesteps(config.num_inference_steps, device=device)
            timesteps = scheduler.timesteps

            # Wrap the timesteps themselves (not enumerate) so tqdm knows the total.
            for t in tqdm(timesteps):
                latent_model_input = latents
                timestep = t.expand(latents.shape[0])

                with current_model.cache_context("cond"):
                    noise_pred = current_model(
                        hidden_states=latent_model_input,
                        timestep=timestep,
                        encoder_hidden_states=prompt_embeds,
                        attention_kwargs=None,
                        return_dict=False,
                    )[0]

                if use_cfg:
                    with current_model.cache_context("uncond"):
                        noise_uncond = current_model(
                            hidden_states=latent_model_input,
                            timestep=timestep,
                            encoder_hidden_states=negative_prompt_embeds,
                            attention_kwargs=None,
                            return_dict=False,
                        )[0]
                    # Classifier-free guidance: push the prediction away from the unconditional one.
                    noise_pred = noise_uncond + current_guidance_scale * (noise_pred - noise_uncond)
                latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]

            # Undo the latent normalisation before decoding. `latents_std` holds the
            # reciprocal of the configured std, so dividing by it multiplies by std.
            latents = latents.to(vae.dtype)
            latents_mean = torch.tensor(vae.config.latents_mean).view(1, vae.config.z_dim, 1, 1, 1).to(vae.device, vae.dtype)
            latents_std = 1 / torch.tensor(vae.config.latents_std).view(1, vae.config.z_dim, 1, 1, 1).to(vae.device, vae.dtype)
            latents = latents / latents_std + latents_mean
            video = vae.decode(latents, return_dict=False)[0]
            video = video_processor.postprocess_video(video, output_type='np')
            for i in range(config.num_videos_per_prompt):
                export_to_video(video[i], f"{output_dir}/output{prompt_idx}_{i}.mp4", fps=fps)
            

: 

In [2]:
import gc
import os
from tqdm import tqdm
from diffusers import AutoModel
from diffusers.training_utils import cast_training_params, free_memory
import torch
from torch.cuda.amp import autocast

from diffusers.utils.torch_utils import randn_tensor
from diffusers.video_processor import VideoProcessor
from omegaconf import OmegaConf
from model.wan.wan_transformer_for_video import WanTransformer3DModel
from model.wan.pipeline_wan_ttv import FlowMatchEulerDiscreteScheduler, AutoencoderKLWan, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from utils.text_encoding import get_t5_prompt_embeds, encode_prompt
from diffusers.utils import check_min_version, convert_unet_state_dict_to_peft, export_to_video, is_wandb_available
from peft import LoraConfig, get_peft_model



# Load the tuning config and the Wan transformer on CPU, ahead of LoRA wrapping.
args = OmegaConf.load("/home/chengxin/chengxin/vagen/config/tta_tuning.yaml")
device = "cpu"
load_dtype = torch.float32
infer_dtype = torch.float16


transformer = WanTransformer3DModel.from_pretrained(
    args.pretrained_model_name_or_path,
    subfolder="transformer",
    torch_dtype=load_dtype,
    local_files_only=True,
    low_cpu_mem_usage=False,
    use_safetensors=True,
    ignore_mismatched_sizes=True,      # Setting for model structure changes
).to(device)



def set_requires_grad(transformer, target_params, print_param = False):
    """Enable gradients on every parameter whose name contains a target substring.

    Parameters that match no target are left untouched (they are not frozen).

    Args:
        transformer: Object exposing ``named_parameters()``.
        target_params: Iterable of substrings matched against parameter names.
        print_param: When True, print the matching target once for every
            (parameter, target) hit, separated by spaces.

    Returns:
        The same ``transformer`` object, modified in place.
    """
    for param_name, parameter in transformer.named_parameters():
        hits = [pattern for pattern in target_params if pattern in param_name]
        if not hits:
            continue
        parameter.requires_grad = True  # mark the parameter as trainable
        if print_param:
            for pattern in hits:
                print(f"{pattern}", end = " ")
    # Always terminates the (possibly empty) listing with a blank line.
    print("\n")
    return transformer

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.87it/s]


In [7]:
# Attach LoRA adapters to the Wan transformer, then additionally unfreeze a few
# non-LoRA parameter groups by name.
# NOTE: this cell reassigns `transformer`; re-running it without re-running the
# loading cell passes the already-wrapped model to get_peft_model again.
lora_config = LoraConfig(
    r=args.rank,                  # LoRA rank; typical values are 4, 8, 16 or 32
    lora_alpha=args.lora_alpha,   # scaling factor controlling the magnitude of the LoRA update
    target_modules=[              # modules LoRA is applied to (e.g. the Transformer's query and value projections)
        "to_q",
        "to_k",
        "to_v",
        "ffn.net.0.proj",
        "ffn.net.2",
    ],
    lora_dropout=0.1,             # dropout probability
    bias="none",                  # whether bias terms are adapted
    # task_type="CAUSAL_LM"       # task type, e.g. CAUSAL_LM or SEQ_CLS
)


transformer = get_peft_model(transformer, lora_config)
transformer = set_requires_grad(
    transformer,
    ['patch_embedding', 'proj_out', 'scale_shift_table', 'text_embedder'],
    True,
)
transformer.print_trainable_parameters()  # show the number of trainable parameters


# print("=== 可训练参数 ===")
# trainable_params = 0
# total_params = 0
# for name, param in transformer.named_parameters():
#     if param.requires_grad:
#         print(f"Layer: {name}, Shape: {param.shape}")
#         trainable_params += param.numel()
#     total_params += param.numel()



scale_shift_table patch_embedding patch_embedding text_embedder text_embedder text_embedder text_embedder scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table scale_shift_table proj_out proj_out 

trainable params: 160,519,744 || all params: 1,570,384,960 || trainable%: 10.2217


# Stable Audio

In [3]:
from diffusers import AutoModel
from diffusers.pipelines.stable_audio.modeling_stable_audio import StableAudioProjectionModel
from model.stable_audio.stable_audio_transformer import StableAudioDiTModel
import torch
from diffusers.utils.torch_utils import randn_tensor
from diffusers.models.embeddings import get_1d_rotary_pos_embed
import inspect
from tqdm import tqdm
import soundfile as sf


# Load every Stable Audio Open component from the local checkpoint folder.
load_dtype = torch.float16
device = "cuda:0"
stable_audio_dir = '/home/chengxin/chengxin/vagen/assets/stable-audio-open-1.0'

transformer = StableAudioDiTModel.from_pretrained(
    stable_audio_dir,
    subfolder="transformer",
    torch_dtype = load_dtype,
    local_files_only=True,                  # From pretrained
    low_cpu_mem_usage=False,
    ignore_mismatched_sizes=True,
    # num_layers=10,
    use_safetensors=True,
).to(device)


projection_model = StableAudioProjectionModel.from_pretrained(
    stable_audio_dir,
    subfolder="projection_model",
    torch_dtype=load_dtype,
    local_files_only=True,
    use_safetensors=True,
).to(device)

vae = AutoModel.from_pretrained(
    stable_audio_dir,
    subfolder="vae",
    torch_dtype = load_dtype,
    local_files_only=True,
    use_safetensors=True,
).to(device)


text_encoder = AutoModel.from_pretrained(
    stable_audio_dir,
    subfolder="text_encoder",
    torch_dtype=load_dtype,
    local_files_only=True,
    use_safetensors=True,
).to(device)

# The tokenizer and scheduler are also loaded through AutoModel and stay off-device.
# NOTE(review): presumably AutoModel resolves their classes from the checkpoint's
# pipeline index — confirm against the installed diffusers version.
tokenizer = AutoModel.from_pretrained(
    stable_audio_dir,
    subfolder="tokenizer",
    local_files_only=True,
    use_safetensors=True,
)

scheduler = AutoModel.from_pretrained(
    stable_audio_dir,
    subfolder="scheduler",
)


In [6]:
from utils.text_encoding import encode_prompt_sd, encode_duration_sd, prepare_extra_step_kwargs


# Text-to-audio sampling with the pretrained Stable Audio components, written
# out step by step instead of going through a pipeline object.
prompt = ["The sharp, resonant sound of a bowling ball striking the pins, followed by the clattering and scattering of the pins in a chaotic dance, fills the air with a mix of impact and rolling echoes."]
negative_prompt = ["Low quality."]
batch_size = len(prompt)

do_classifier_free_guidance = True
num_waveforms_per_prompt = 1
num_inference_steps = 200
eta = 0.0
guidance_scale = 7
negative_prompt_embeds = None

audio_start_in_s = 0.0
audio_end_in_s   = 5.0

with torch.no_grad():
    prompt_embeds = encode_prompt_sd(
        prompt,
        tokenizer,
        text_encoder,
        projection_model,
        device,
        do_classifier_free_guidance,
        negative_prompt,
    )

    # Encode duration
    seconds_start_hidden_states, seconds_end_hidden_states = encode_duration_sd(
        projection_model,
        audio_start_in_s,
        audio_end_in_s,
        device,
        do_classifier_free_guidance and (negative_prompt is not None or negative_prompt_embeds is not None),
        batch_size,
    )

    # Create text_audio_duration_embeds and audio_duration_embeds
    text_audio_duration_embeds = torch.cat([prompt_embeds, seconds_start_hidden_states, seconds_end_hidden_states], dim=1)
    audio_duration_embeds = torch.cat([seconds_start_hidden_states, seconds_end_hidden_states], dim=2)

    # In case of classifier free guidance without negative prompt, we need to create
    # all-zero unconditional embeddings and duplicate the duration embeddings so both
    # tensors cover the doubled batch.
    if do_classifier_free_guidance and negative_prompt_embeds is None and negative_prompt is None:
        negative_text_audio_duration_embeds = torch.zeros_like(text_audio_duration_embeds, device=text_audio_duration_embeds.device)
        text_audio_duration_embeds = torch.cat([negative_text_audio_duration_embeds, text_audio_duration_embeds], dim=0)
        audio_duration_embeds = torch.cat([audio_duration_embeds, audio_duration_embeds], dim=0)

    bs_embed, seq_len, hidden_size = text_audio_duration_embeds.shape
    # duplicate audio_duration_embeds and text_audio_duration_embeds for each generation per prompt, using mps friendly method
    text_audio_duration_embeds = text_audio_duration_embeds.repeat(1, num_waveforms_per_prompt, 1)
    text_audio_duration_embeds = text_audio_duration_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len, hidden_size)

    # # print(audio_duration_embeds.shape)
    audio_duration_embeds = audio_duration_embeds.repeat(1, num_waveforms_per_prompt, 1)
    audio_duration_embeds = audio_duration_embeds.view(bs_embed * num_waveforms_per_prompt, -1, audio_duration_embeds.shape[-1])
    # # print(audio_duration_embeds.shape)

    # 4. Prepare timesteps
    scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_vae = transformer.config.in_channels
    waveform_length = int(transformer.config.sample_size)
    # waveform_length = int(audio_end_in_s * 22.5)
    shape = (batch_size * num_waveforms_per_prompt, num_channels_vae, waveform_length)
    # Seeded generator on the same device as the latents, so changing `device`
    # cannot leave the generator and the sampled tensor on different GPUs.
    generator = torch.Generator(device).manual_seed(0)
    # generator = None
    latents = randn_tensor(shape, generator=generator, device=device, dtype=load_dtype)

    # 6. Prepare extra step kwargs and rotary_embed_dim
    extra_step_kwargs = prepare_extra_step_kwargs(generator, eta, scheduler)
    rotary_embed_dim = transformer.config.attention_head_dim // 2
    rotary_embedding = get_1d_rotary_pos_embed(
        rotary_embed_dim,
        latents.shape[2] + audio_duration_embeds.shape[1],
        use_real=True,
        repeat_interleave_real=False,
    )

    # 8. Denoising loop
    # tqdm wraps the timesteps directly so the progress bar knows the total.
    for t in tqdm(timesteps):
        # expand the latents if we are doing classifier free guidance
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
        latent_model_input = scheduler.scale_model_input(latent_model_input, t)
        # One timestep entry per row of the model input. Kept separate from `t`
        # so the scheduler below still receives the scalar timestep.
        timestep_batch = torch.stack([t] * latent_model_input.shape[0])

        # predict the noise residual
        noise_pred = transformer(
            latent_model_input,
            timestep_batch,
            encoder_hidden_states=text_audio_duration_embeds,
            global_hidden_states=audio_duration_embeds,
            rotary_embedding=rotary_embedding,
            return_dict=False,
        )[0]

        # perform guidance
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
        latents = scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

    audio = vae.decode(latents).sample

import torchaudio

# Write one wav per generated waveform.
# NOTE(review): 44100 Hz is hard-coded; presumably it matches the VAE's output rate — confirm.
for i, waveform in enumerate(audio):
    torchaudio.save(f'test{i}.wav', waveform.to(torch.float32).cpu(), 44100)

0it [00:00, ?it/s]

200it [00:06, 28.58it/s]


# Stable Audio FT

In [1]:
from diffusers import AutoModel
from diffusers.pipelines.stable_audio.modeling_stable_audio import StableAudioProjectionModel
from model.stable_audio.stable_audio_transformer import StableAudioDiTModel
import torch
from diffusers.utils.torch_utils import randn_tensor
from diffusers.models.embeddings import get_1d_rotary_pos_embed
import inspect
from tqdm import tqdm
import soundfile as sf
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler


# Load the fine-tuned transformer ("transformer_ft" subfolder) together with the
# remaining Stable Audio Open components from the same local checkpoint folder.
load_dtype = torch.float16
device = "cuda:0"
stable_audio_dir = '/home/chengxin/chengxin/vagen/assets/stable-audio-open-1.0'

transformer = StableAudioDiTModel.from_pretrained(
    stable_audio_dir,
    subfolder="transformer_ft",
    torch_dtype = load_dtype,
    local_files_only=True,                  # From pretrained
    low_cpu_mem_usage=False,
    ignore_mismatched_sizes=True,
    # num_layers=10,
    use_safetensors=True,
).to(device)

# sd_tta_pt_16 ckpt1
projection_model = StableAudioProjectionModel.from_pretrained(
    stable_audio_dir,
    subfolder="projection_model",
    torch_dtype=load_dtype,
    local_files_only=True,
    use_safetensors=True,
).to(device)

vae = AutoModel.from_pretrained(
    stable_audio_dir,
    subfolder="vae",
    torch_dtype = load_dtype,
    local_files_only=True,
    use_safetensors=True,
).to(device)


text_encoder = AutoModel.from_pretrained(
    stable_audio_dir,
    subfolder="text_encoder",
    torch_dtype=load_dtype,
    local_files_only=True,
    use_safetensors=True,
).to(device)

# The tokenizer is loaded through AutoModel as well and stays off-device.
tokenizer = AutoModel.from_pretrained(
    stable_audio_dir,
    subfolder="tokenizer",
    local_files_only=True,
    use_safetensors=True,
)

# Flow-matching UniPC sampler built directly, instead of loading the checkpoint's scheduler.
step_scheduler = UniPCMultistepScheduler(
    prediction_type='flow_prediction',
    use_flow_sigmas=True,
    num_train_timesteps=1000,
)


  from .autonotebook import tqdm as notebook_tqdm
  WeightNorm.apply(module, name, dim)


In [6]:
from utils.text_encoding import encode_prompt_sd, encode_duration_sd, prepare_extra_step_kwargs
import json
import torchaudio


# Text-to-audio sampling with the fine-tuned transformer and the UniPC flow scheduler.
do_classifier_free_guidance = True
num_waveforms_per_prompt = 1
num_inference_steps = 200
eta = 0.0
guidance_scale = 7
negative_prompt_embeds = None
audio_start_in_s = 0.0
audio_end_in_s   = 10.0


with open('/home/chengxin/chengxin/vagen/data/tta/test_avsync_recap.json', 'r') as f:
    data = json.load(f)

# NOTE: debug setup — the label-derived prompt is immediately replaced by the
# hard-coded list below, and the `break` at the end stops after the first entry.
for path, info in data.items():
    prompt = [info['label']]
    # Lions Roaring Deeply, Bowling striking
    prompt = [ "Sharpen knife", "chicken crowing", "Bowling rolling and striking", "hammering", "Lions Roaring Deeply", 'Frog Croaking'] # ['Lions Roaring Deeply', 'Frog Croaking', 'Dog barking', 'Bowling striking']
            #  [ "Sharpen knife", "chicken crowing", "toilet flushing", "hammering"]
    negative_prompt = ["" for _ in range(len(prompt))]
    batch_size = len(prompt)

    with torch.no_grad():
        prompt_embeds = encode_prompt_sd(
            prompt,
            tokenizer,
            text_encoder,
            projection_model,
            device,
            do_classifier_free_guidance,
            negative_prompt,
        )

        # 5. Prepare latent variables
        num_channels_vae = transformer.config.in_channels
        # waveform_length = int(transformer.config.sample_size)
        # NOTE(review): 22.5 looks like latent frames per second of audio — confirm.
        waveform_length = int(audio_end_in_s * 22.5)
        shape = (batch_size * num_waveforms_per_prompt, num_channels_vae, waveform_length)
        # generator = torch.Generator("cuda").manual_seed(0)
        # No generator is passed, so the initial noise differs between runs.
        latents = randn_tensor(shape, device=device, dtype=load_dtype)

        # 6. Prepare extra step kwargs and rotary_embed_dim
        rotary_embed_dim = transformer.config.attention_head_dim // 2
        rotary_embedding = get_1d_rotary_pos_embed(
            rotary_embed_dim,
            latents.shape[2] + 1,
            use_real=True,
            repeat_interleave_real=False,
        )

        # 8. Denoising loop
        # TODO: Check scheduler, t = 1 for noise;  UniPCMultistepScheduler timesteps [1-1000]
        step_scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = step_scheduler.timesteps
        # tqdm wraps the timesteps directly so the progress bar knows the total.
        for t in tqdm(timesteps):
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = step_scheduler.scale_model_input(latent_model_input, t)
            # One timestep entry per row of the model input. `t` itself stays a
            # scalar so it can be handed to the scheduler unchanged.
            timestep_batch = torch.stack([t] * latent_model_input.shape[0])

            # predict the noise residual
            noise_pred = transformer(
                latent_model_input,
                timestep_batch,
                encoder_hidden_states=prompt_embeds, # text_audio_duration_embeds,
                # global_hidden_states=audio_duration_embeds,
                rotary_embedding=rotary_embedding,
                return_dict=False,
            )[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
            latents = step_scheduler.step(noise_pred, t, latents).prev_sample

        audio = vae.decode(latents).sample


    for i, waveform in enumerate(audio):
        # torchaudio.save(f"/home/chengxin/chengxin/vagen/log/predict/{path.split('/')[-1][:-4]}.wav", audio[i].to(torch.float32).cpu(), 44100)
        torchaudio.save(f"./test{i}.wav", waveform.to(torch.float32).cpu(), 44100)

    break

0it [00:00, ?it/s]

200it [00:55,  3.58it/s]


In [None]:
import os, shutil
import torchaudio

# Replace each audio file in `target_path` with the same-named file from
# `gen_path`, truncated to the first `duration` seconds.
i = 0  # number of files trimmed successfully
duration = 5.4  # length to keep, in seconds
failed_files = []  # files that could not be loaded or saved

# Earlier source, superseded by the assignment below:
# gen_path = '/home/chengxin/chengxin/vagen/log/sd_tta_pt_16/logging/10000'    # /home/chengxin/chengxin/vagen/log/sd_tta_pt_16/checkpoints/checkpoint_1
gen_path = '/home/chengxin/chengxin/vagen/log/sd_tta_ft_16/logging/6500'     # /home/chengxin/chengxin/vagen/log/sd_tta_ft_16/checkpoints/checkpoint_13   6500 7000 7500 
# gen_path = '/home/chengxin/chengxin/vagen/log_backup/sd_tta_ft_recap_16/logging/5500'     # /home/chengxin/chengxin/vagen/log/sd_tta_ft_recap_16/checkpoints/   5500 4500   7000 8000
# gen_path = '/home/chengxin/chengxin/vagen/log/wan_sd_ttva_55_bi_new/logging/1500'     

target_path = '/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/veo3/avsync'


for file in os.listdir(target_path):
    input_file = f'{gen_path}/{file}'
    output_file = f'{target_path}/{file}'
    try:
        waveform, sample_rate = torchaudio.load(input_file)
        num_samples = int(duration * sample_rate)
        waveform = waveform[:, :num_samples]
        # waveform[:, num_samples:] = 0

        torchaudio.save(output_file, waveform, sample_rate)
        # os.remove(input_file)
    except Exception as exc:
        # Still best-effort: skip the file, but remember it instead of hiding
        # the failure (and let KeyboardInterrupt / SystemExit propagate).
        failed_files.append((file, repr(exc)))
        continue
    i += 1

if failed_files:
    print(f"Skipped {len(failed_files)} file(s); first failure: {failed_files[0]}")
i

In [None]:
from utils.va_processing import add_audio_to_video
from glob import glob
from tqdm import tqdm
import os
import shutil

# For every generated .mp4 that has a same-named .wav next to it, write a copy
# with that audio track into `output_dir`.
input_dir = '/home/chengxin/chengxin/vagen/log/wan_sd_ttva_55_bi_new/logging/0'
output_dir = '/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/veo3/avsync_video'
for path in tqdm(glob(f'{input_dir}/*.mp4')):
    stem = path.split('/')[-1][:-4]  # file name without directory and ".mp4"
    v_path = f"{input_dir}/{stem}.mp4"
    a_path = f"{input_dir}/{stem}.wav"
    o_path = f"{output_dir}/{stem}.mp4"
    if not os.path.exists(a_path):
        continue  # no matching audio for this clip
    add_audio_to_video(video_path = v_path, audio_path = a_path, output_path = o_path)


In [3]:
import torch
import os
from tqdm import tqdm
from utils.fm_solvers import FlowUniPCMultistepScheduler
from utils.va_processing import snap_hw_to_multiple_of_32, save_video
from utils.model_loading import (
    init_fusion_score_model_ovi, 
    init_text_model, 
    init_mmaudio_vae, 
    init_wan_vae_2_2, 
    load_fusion_checkpoint
)
from peft import LoraConfig, get_peft_model


# Build the Ovi fusion model, load its checkpoint, and leave it frozen, in
# train mode, with gradient checkpointing switched on.
device = 5
target_dtype = torch.bfloat16
ckpt_dir = "/home/chengxin/chengxin/Ovi/ckpts"  # change this to your checkpoint path

model, video_config, audio_config = init_fusion_score_model_ovi(rank=device, meta_init=True)
if ckpt_dir is not None:
    checkpoint_path = os.path.join(ckpt_dir, "Ovi", "model.safetensors")
    load_fusion_checkpoint(model, checkpoint_path=checkpoint_path, from_meta=True)

# Freeze, switch to eval, then cast and move — same order as a single chained call.
model = model.requires_grad_(False)
model = model.eval()
model = model.to(dtype=target_dtype)
model = model.to(device=device)
model.set_rope_params()

# Back to train mode; every parameter is frozen again right after.
model.train()
model.gradient_checkpointing = True
model.requires_grad_(False)

# List every parameter name (an optional `"ffn" in name` filter is disabled).
for name, param in model.named_parameters():
    # if "ffn" in name:
    print(name)

Successfully loaded fusion checkpoint from /home/chengxin/chengxin/Ovi/ckpts/Ovi/model.safetensors
trainable params: 739,246,080 || all params: 12,399,999,188 || trainable%: 5.9617


In [3]:

# Create two LoRA adapters ("ref" and "learner") with identical settings, report
# the trainable parameter count, then save with each adapter active in turn.
lora_config = LoraConfig(
    r=128,
    lora_alpha=64,
    target_modules=[
        'k_fusion', 'v_fusion',
        'ffn.0', 'ffn.2',
        'self_attn.q', 'self_attn.k', 'self_attn.v', 'self_attn.o',
        'cross_attn.q', 'cross_attn.k', 'cross_attn.v', 'cross_attn.o',
    ],
    lora_dropout=0.1,
    bias="none",
)
model.gradient_checkpointing = True

# NOTE(review): the second get_peft_model call receives the model already
# wrapped by the first one. Confirm this nesting is intended rather than adding
# the second adapter to the existing wrapper.
for adapter_name in ("ref", "learner"):
    model = get_peft_model(model, lora_config, adapter_name=adapter_name)
model.print_trainable_parameters()

# Both saves target the same "log" directory; "learner" is left active afterwards.
for adapter_name in ("ref", "learner"):
    model.set_adapter(adapter_name)
    model.save_pretrained("log")
# lora_config = LoraConfig(
#     r=128,
#     lora_alpha=64,
#     target_modules=['k_fusion'],
#     lora_dropout=0.1,
#     bias="none",)
# model = get_peft_model(model, lora_config, adapter_name="learner")




trainable params: 739,246,080 || all params: 13,139,245,268 || trainable%: 5.6262


In [None]:
# import safetensors
# import safetensors.torch

# # Load the safetensors files
# learner1 = safetensors.torch.load_file("/home/chengxin/chengxin/vagen/log/learner/adapter_model.safetensors")
# ref1 = safetensors.torch.load_file("/home/chengxin/chengxin/vagen/log/ref/adapter_model.safetensors")
# learner2 = safetensors.torch.load_file("/home/chengxin/chengxin/vagen/log/ovi_fusion_debug/checkpoints/checkpoint_0/learner/adapter_model.safetensors")
# ref2 = safetensors.torch.load_file("/home/chengxin/chengxin/vagen/log/ovi_fusion_debug/checkpoints/checkpoint_0/ref/adapter_model.safetensors")

# print((learner1['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight'] != ref1['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight']).sum())
# print((learner1['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight'] != ref2['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight']).sum())
# print((ref1['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight'] != ref2['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight']).sum())
# print((learner1['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight'] != learner2['base_model.model.base_model.model.audio_model.blocks.0.cross_attn.k.lora_A.weight']).sum())


tensor(0)
tensor(0)
tensor(0)
tensor(393216)


: 