In [None]:
import shutil
from tqdm import tqdm
from glob import glob

from moviepy.editor import VideoFileClip, AudioFileClip
import os, subprocess

def replace_audio_in_video(video_path, audio_path, output_path):
    """
    Replace the audio track of a single video file using ffmpeg.

    The first video stream of ``video_path`` is copied without re-encoding,
    the first audio stream of ``audio_path`` is encoded as AAC, and the result
    is written to ``output_path`` (overwritten if it already exists). ffmpeg
    is invoked with ``-shortest``.

    Args:
        video_path: Path of the input video file (provides the video stream).
        audio_path: Path of the input audio file (provides the new audio).
        output_path: Path of the video file to write.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        Exception: Any other error raised while launching ffmpeg is logged
            and re-raised unchanged.
    """
    cmd = [
        'ffmpeg',
        '-y',                      # overwrite output_path without prompting
        '-i', video_path,          # input 0: video file
        '-i', audio_path,          # input 1: audio file
        '-c:v', 'copy',            # copy the video stream as-is
        '-c:a', 'aac',             # encode the audio as AAC
        '-map', '0:v:0',           # first video stream of input 0
        '-map', '1:a:0',           # first audio stream of input 1
        '-shortest',               # stop at the end of the shortest stream
        output_path
    ]

    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except Exception as e:
        # Message reads "processing failed: <output>, error: <exception>".
        print(f"处理失败: {output_path}, 错误: {str(e)}")
        # capture_output=True hides ffmpeg's own diagnostics; surface them here,
        # otherwise only "returned non-zero exit status N" is visible.
        stderr = getattr(e, 'stderr', None)
        if stderr:
            if isinstance(stderr, bytes):
                stderr = stderr.decode(errors='replace')
            print(stderr.strip())
        raise
    
# Run configuration: model name and test group select the destination folder,
# epoch selects which validation outputs are collected.
model = 'vasflow'
group = '10'
epoch = '0129'

# Source of generated audio; the commented lines are alternative runs.
# files = glob(f"./log/2025_05_13-*-vaflow_sda_dit_noise_text_mel_10l_cc_first10/val/video/epoch_{epoch}*/audio_*_00.wav")
files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_05_18-*-vaflow_sda_dit_noise_text_clip_mel_10l_cn_first10/val/video/epoch_{epoch}*/audio_*_00.wav")
# files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_06_27-19_*-vaflow_sda_dit_noise_text_mel_infer_vas_cfg2/predict/video/audio_*_00.wav")
# files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_06_23-20_*-vaflow_sda_dit_noise_text_mel_infer_va_cfg0/predict/video/audio_*_00.wav")
# files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_05_26-16*_dit_noise_text_clip_mel_infer/predict/video/audio_*_00.wav")

# Destination folder is the same for every file, so build it once.
target_dir = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/{model}/{group}"
if files:
    # shutil.copy does not create missing parent directories.
    os.makedirs(target_dir, exist_ok=True)

for audio_file in tqdm(files):
    # Matched file names are "audio_<name>_00.wav": strip the 6-character
    # prefix and the 7-character suffix to recover <name>.
    name = audio_file.split('/')[-1][6:-7]
    target_file = f"{target_dir}/{name}.wav"
    shutil.copy(audio_file, target_file)

len(files)

# Replace Audio

In [None]:

# for file in os.listdir('/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/vasflow/10')[5400:5500]:
#     name = file[:-4]
#     video_path = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/dataset/test/10/{name}.mp4"  # Path to the input MP4 video
#     audio_path = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/vasflow/10/{file}"    # Path to the new audio (can be MP3, WAV, etc.)
#     output_path = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/vasflow/video/{name}.mp4" # Path for the output video
    
#     replace_audio_in_video(video_path, audio_path, output_path)
    

In [None]:

# name = 'ruK8QzIWrSY_000080'
# for baseline in ['difffoley', 'seeing', 'Frieren', 'stablev2a_', 'tiva', 'test', 'vaura', 'specvqgan', 'im2wav', 'vab', 'v2a-mapper']:
#     video_path = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/dataset/test/10/{name}.mp4"  # Path to the input MP4 video
#     audio_path = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/{baseline}/10/{name}.wav"    # Path to the new audio (can be MP3, WAV, etc.)
#     output_path = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/demo/{baseline}_{name}.mp4" # Path for the output video
#     try:
#         replace_audio_in_video(video_path, audio_path, output_path)
#     except:
#         print(baseline, name)


# SPEECH

In [3]:
import shutil
from tqdm import tqdm
from glob import glob
import re

model = 'vasflow'
epoch = '0129'

# Source of generated speech. Exactly one glob is active; the commented lines
# are alternative runs kept for quick switching.
# files = glob("/home/chengxin/chengxin/vasflow/log/2025_05_07-13_*/predict/video/audio_*_00.wav")
# files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_05_18-*-vaflow_sda_dit_noise_text_clip_mel_10l_cn_first10/val/video/epoch_{epoch}*/speech_*_00.wav")
files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_07_04-*-vaflow_sda_dit_noise_text_mel_infer_chem_grid/predict/video/speech_*_00.wav")
# files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_07_04-19_*-vaflow_sda_dit_noise_text_mel_infer_chem/predict/video/speech_*_00.wav")
# files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_06_22-14_*-vaflow_sda_dit_noise_text_mel_infer_va/predict/video/speech_*_00.wav")

# files = glob(f"/home/chengxin/chengxin/vasflow/log/2025_05_26-14_*_dit_noise_text_mel_infer/predict/video/speech_*_00.wav")

# Bucket the generated files by the prefix of their sample name.
ljfiles = []
gridfiles = []
chemfiles = []
lrsfiles = []
for audio_file in tqdm(files):
    # Matched file names are "speech_<name>_00.wav": strip the 7-character
    # prefix and the 7-character suffix to recover <name>.
    name = audio_file.split('/')[-1][7:-7]
    if name.startswith('LJ00'):
        ljfiles.append(audio_file)
    # 's' followed by a two-digit number 00-35.
    # NOTE(review): a single-digit id such as "s1_..." would not match and
    # would fall through to lrsfiles — confirm the naming scheme.
    elif re.match(r'^s(?:0[0-9]|[12][0-9]|3[0-5])', name):
        gridfiles.append(audio_file)
    elif name.startswith('chem'):
        chemfiles.append(audio_file)
    else:
        # Everything that is not recognised above is collected here.
        lrsfiles.append(audio_file)
    # target_file = f"/home/chengxin/chengxin/Dataset_Sound/VGGSound/generated_audios/{model}/{group}/{name}.wav"
    # shutil.copy(audio_file, target_file)

len(ljfiles), len(gridfiles), len(chemfiles), len(lrsfiles)


100%|██████████| 3491/3491 [00:00<00:00, 575042.82it/s]


(0, 3291, 200, 0)

# 1. wav.scp

In [11]:
import os

# Dataset being evaluated; determines the metadata root and which bucket of
# generated files is exported.
visualtts_dataset = 'GRID'
meta_file_dir = f"/home/chengxin/chengxin/Dataset_Sound/{visualtts_dataset}"
gen_file_paths = gridfiles
output_scp_path = f"{meta_file_dir}/results/{model}/wer/wav.scp"

# open(..., "w") does not create missing parent directories.
os.makedirs(os.path.dirname(output_scp_path), exist_ok=True)

# One line per generated file: "<key> <path to generated wav>".
with open(output_scp_path, "w") as scp_file:
    for file_path in gen_file_paths:
        file_name = file_path.split("/")[-1]
        # Underscore-separated name: the first token is dropped, the last
        # token (minus its 4-character extension) is the sample index, and
        # everything in between is the file id.
        file_id = '_'.join(file_name.split("_")[1:-1])
        sample_id = int(file_name.split("_")[-1][:-4])

        name = f"visual_tts_{file_id}_sample{sample_id}"
        gen_path = file_path
        scp_file.write(f"{name} {gen_path}\n")

print(f"Successfully written {len(gen_file_paths)} entries to {output_scp_path}")

Successfully written 3291 entries to /home/chengxin/chengxin/Dataset_Sound/GRID/results/vasflow/wer/wav.scp


# 2. utt2spk

In [12]:
output_utt_path = f"{meta_file_dir}/results/{model}/wer/utt2spk"


# One line per generated file: "<file id> <path to the reference wav>".
with open(output_utt_path, "w") as utt_file:
    for file_path in gen_file_paths:
        file_name = file_path.split("/")[-1]
        # Drop the first and last underscore-separated tokens of the file name.
        file_id = '_'.join(file_name.split("_")[1:-1])
        # Speaker id is the file id without its last underscore-separated token.
        spk_id = '_'.join(file_id.split("_")[:-1])

        name = f"{file_id}"
        # Exactly one layout must be active; swap the comment to switch dataset.
        # gt_path = f"{meta_file_dir}/speakers/{file_id}.wav" # LJSpeech
        gt_path = f"{meta_file_dir}/speakers/{spk_id}/{file_id}.wav"  # GRID LRS CHEM
        utt_file.write(f"{name} {gt_path}\n")

print(f"Successfully written {len(gen_file_paths)} entries to {output_utt_path}")

Successfully written 3291 entries to /home/chengxin/chengxin/Dataset_Sound/GRID/results/vasflow/wer/utt2spk


# 3. Text

In [13]:
output_utt_path = f"{meta_file_dir}/results/{model}/wer/text"

# One line per generated file: "<file id> <reference transcript>".
with open(output_utt_path, "w") as txt_file:
    for file_path in gen_file_paths:
        file_name = file_path.rsplit("/", 1)[-1]
        name_tokens = file_name.split("_")
        # First token is dropped, last token carries the sample index and the
        # 4-character extension, the rest is the file id.
        file_id = '_'.join(name_tokens[1:-1])
        sample_id = int(name_tokens[-1][:-4])
        # Speaker id: everything before the last underscore of the file id.
        spk_id = file_id.rpartition("_")[0]

        name = file_id
        # gt_transcript_path = f"{meta_file_dir}/speakers/{file_id}.lab" # LJSpeech
        gt_transcript_path = f"{meta_file_dir}/speakers/{spk_id}/{file_id}.lab" # GRID LRS CHEM
        with open(gt_transcript_path, "r") as lab_file:
            gt_transcript = lab_file.read().strip()
        print(name, gt_transcript, file=txt_file)

print(f"Successfully written {len(gen_file_paths)} entries to {output_utt_path}")

print(f"Successfully written {len(gen_file_paths)} entries to {output_utt_path}")

Successfully written 3291 entries to /home/chengxin/chengxin/Dataset_Sound/GRID/results/vasflow/wer/text


# 4. key_file

In [14]:
output_utt_path = f"{meta_file_dir}/results/{model}/wer/key_file"

# One key per line, in the same order as gen_file_paths.
with open(output_utt_path, "w") as key_file:
    for file_path in gen_file_paths:
        file_name = file_path.rsplit("/", 1)[-1]
        name_tokens = file_name.split("_")
        file_id = '_'.join(name_tokens[1:-1])
        # Parsed but not written to the key.
        # NOTE(review): the wav.scp cell keys entries as
        # "visual_tts_<file_id>_sample<n>" — confirm the shorter key is intended.
        sample_id = int(name_tokens[-1][:-4])

        name = f"visual_tts_{file_id}"
        print(name, file=key_file)

print(f"Successfully written {len(gen_file_paths)} entries to {output_utt_path}")

Successfully written 3291 entries to /home/chengxin/chengxin/Dataset_Sound/GRID/results/vasflow/wer/key_file


In [None]:
import shutil
from tqdm import tqdm
from glob import glob
import os
import subprocess

import subprocess
from moviepy.editor import VideoFileClip, AudioFileClip




# Export the first 200 generated samples: copy each wav into save_dir and mux
# it onto the matching ground-truth video.
save_dir = f'/home/chengxin/chengxin/Dataset_Sound/{visualtts_dataset}/results/{model}/data'
os.makedirs(save_dir, exist_ok=True)
for file_path in tqdm(gen_file_paths[:200]):
    file_name = file_path.split("/")[-1]
    # Drop the first and last underscore-separated tokens of the file name;
    # the last token (minus its 4-character extension) is the sample index.
    file_id = '_'.join(file_name.split("_")[1:-1])
    sample_id = int(file_name.split("_")[-1][:-4])

    name = f"{file_id}_sample{sample_id}"
    shutil.copy(file_path, f"{save_dir}/{name}.wav")

    # Locate the ground-truth video; the directory layout differs per dataset.
    if visualtts_dataset == 'Chem':
        # Drops the first 8 characters of the id — presumably a fixed-length
        # prefix; TODO confirm against the Chem file naming.
        file_id = file_id[8:]
        gt_video_path = f"{meta_file_dir}/sentence_video_25fps/{file_id}.mp4"  # Chem
    elif visualtts_dataset == 'GRID':
        spk = file_id.split("_")[0]
        gt_video_path = f"{meta_file_dir}/videos_25fps_mp4/{spk}/{file_id}.mp4"  # GRID
    elif visualtts_dataset == 'LRS2':
        spk = "_".join(file_id.split("_")[:2])
        gt_video_path = f"{meta_file_dir}/video_25fps/{spk}/{file_id}.mp4"  # LRS2
    else:
        # Without this branch gt_video_path would be undefined, or silently
        # reuse a stale value left in the kernel by an earlier run.
        raise ValueError(f"Unsupported visualtts_dataset: {visualtts_dataset!r}")
    replace_audio_in_video(gt_video_path, f"{save_dir}/{name}.wav", f"{save_dir}/{name}.mp4")
    # print(f"{save_dir}/{name}.wav", f"{save_dir}/{name}.mp4")


100%|██████████| 200/200 [00:16<00:00, 11.95it/s]


In [None]:
# visualtts_dataset = 'GRID'
# meta_file_dir = f"/home/chengxin/chengxin/Dataset_Sound/{visualtts_dataset}"
# file_name = 's28_pris7n_sample0'
# for baseline in ['HPMDubbing_randomref', 'emodubber_randomref', 'gt_vocoder', 'style_randomref', 'dsu', 'vasflow']:
#     if visualtts_dataset == 'Chem':
#         file_id = '_'.join(file_name.split("_")[2:-1])
#         gt_video_path = f"{meta_file_dir}/sentence_video_25fps/{file_id}.mp4" 
#         wav_path =  f'/home/chengxin/chengxin/Dataset_Sound/Chem/results/{baseline}/data/{file_name}.wav'
#         target_path = f'/home/chengxin/chengxin/Dataset_Sound/Chem/results/demo/{baseline}_{file_name}.mp4'
#         replace_audio_in_video(gt_video_path, wav_path, target_path)
#     if visualtts_dataset == 'GRID':
#         file_id = '_'.join(file_name.split("_")[:-1])
#         spk = file_id.split("_")[0]
#         gt_video_path = f"{meta_file_dir}/videos_25fps_mp4/{spk}/{file_id}.mp4"  # Chem
#         wav_path =  f'/home/chengxin/chengxin/Dataset_Sound/GRID/results/{baseline}/data/{file_name}.wav'
#         target_path = f'/home/chengxin/chengxin/Dataset_Sound/GRID/results/demo/{baseline}_{file_name}.mp4'
#         replace_audio_in_video(gt_video_path, wav_path, target_path)