### Import libraries

In [1]:
"""Import libraries"""

import os
import librosa
import numpy as np
from PIL import Image, ImageOps
import soundfile as sf
from IPython.display import display
import librosa.display
import matplotlib.pyplot as plt
from diffusers import AudioLDMPipeline
from transformers import SpeechT5HifiGan, SpeechT5Processor
import torch
from waveglow_vocoder import WaveGlowVocoder


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_directory_if_not_exists(directory):
    """Create ``directory`` (including missing parents) if it is absent.

    Args:
        directory: Path of the folder to create. An empty string is accepted
            and treated as "nothing to create", because that is what
            ``os.path.dirname()`` returns for a bare file name.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    if not directory:
        # os.makedirs("") raises FileNotFoundError; there is nothing to create.
        return
    # exist_ok=True removes the check-then-create race of the exists()/makedirs() pair.
    os.makedirs(directory, exist_ok=True)
        

### Audio to mel

#### Preprocessing: convert audio to tiff

In [3]:
def wav2tiff(wav_path, save_path, n_fft=2048, hop_length=512, n_mels=80):
    """Convert an audio file into a mel spectrogram stored as a float TIFF.

    Args:
        wav_path: Path of the audio file to read with ``librosa.load``.
        save_path: Destination path of the TIFF image. Its parent folder is
            created when missing.
        n_fft: FFT window size passed to the mel spectrogram.
        hop_length: Hop length (in samples) between successive frames.
        n_mels: Number of mel bands, i.e. the height of the saved image.
    """
    out_dir = os.path.dirname(save_path)
    if out_dir:
        # A bare file name has no parent folder to create.
        create_directory_if_not_exists(out_dir)

    y, sr = librosa.load(wav_path)
    # The audio must be passed by keyword: recent librosa releases made the
    # arguments of melspectrogram keyword-only, so a positional `y` raises
    # a TypeError there.
    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    print(f"Mel Spectrogram Shape: {S.shape}")

    # Mode 'F' keeps the spectrogram as 32-bit floats instead of 8-bit pixels.
    im_tiff = Image.fromarray(S).convert('F')
    im_tiff.save(save_path)
    print(f"Mel saved to {save_path}")

# def wav2tiff(wav_path, save_path, n_fft=2048, hop_length=512, n_mels=128):
#     create_directory_if_not_exists(os.path.dirname(save_path))
    
#     # 載入音檔並轉換為 Tensor
#     y,sr = librosa.load(wav_path, sr=22050)
#     y_tensor = torch.from_numpy(y).to(device='cuda', dtype=torch.float32)

#     # 用 WaveGlowVocoder 轉換成 Mel Spectrogram
#     WV = WaveGlowVocoder()
#     mel = WV.wav2mel(y_tensor)

#     # 將 Mel Spectrogram 保存為 TIFF 圖片
#     mel = mel.squeeze().cpu().numpy()
#     image = Image.fromarray(mel)
#     image.save(save_path)
#     print(f"Mel Spectrogram saved to {save_path}")


In [None]:
"""轉換成tiff"""

def batch_wav2tiff(root_dir):
    """Convert every ``.wav`` file under a ``source`` folder into a ``.tiff``.

    Walks ``root_dir`` recursively. In every directory whose path contains
    the substring ``source``, each ``*.wav`` file is converted with
    ``wav2tiff`` and the image is written next to the audio file.

    Args:
        root_dir: Top-level folder to search.
    """
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        # Only folders whose path mentions 'source' hold the input audio.
        if 'source' not in dirpath:
            continue
        for filename in filenames:
            if not filename.endswith('.wav'):
                continue
            wav_path = os.path.join(dirpath, filename)
            # Swap only the trailing extension. str.replace() would also
            # rewrite a '.wav' that appears earlier in the file name.
            stem, _ext = os.path.splitext(filename)
            tiff_path = os.path.join(dirpath, stem + '.tiff')

            wav2tiff(wav_path, tiff_path)
            print(f"Converted: {wav_path} -> {tiff_path}")

# Root folder that is walked for `source` sub-folders containing .wav files.
root_dir = 'eval_data'
batch_wav2tiff(root_dir)
print('done')


In [5]:
"""同一個資料夾下的形狀要一樣(模型才有辦法跑)"""

def resize_tiffs_to_same_shape(root_dir):
    """Pad the TIFFs of each ``source`` folder to one common size, in place.

    Walks ``root_dir`` recursively. For every directory whose path contains
    the substring ``source`` and that holds ``.tiff`` files, the largest
    width and the largest height among those files become the target size.
    Smaller images are passed through ``ImageOps.pad`` (black fill) and
    written back over the original file. Folders whose images already share
    one size are skipped.

    Args:
        root_dir: Top-level folder to search.
    """
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        if 'source' not in dirpath:
            continue

        tiff_paths = [os.path.join(dirpath, f) for f in filenames if f.endswith('.tiff')]
        if not tiff_paths:
            continue

        # Step 1: read every image size exactly once. The `with` block makes
        # sure each file handle is closed again instead of being leaked.
        sizes = {}
        for tiff_path in tiff_paths:
            with Image.open(tiff_path) as img:
                sizes[tiff_path] = tuple(img.size)

        max_width = max(width for width, _ in sizes.values())
        max_height = max(height for _, height in sizes.values())
        target_size = (max_width, max_height)

        if all(size == target_size for size in sizes.values()):
            print(f"Skipping {dirpath}: All TIFFs are already the same size.")
            continue

        # Step 2: pad only the images that are smaller than the target.
        print(f"Resizing images in {dirpath} to Width={max_width}, Height={max_height}")
        for tiff_path in tiff_paths:
            if sizes[tiff_path] == target_size:
                continue
            with Image.open(tiff_path) as img:
                # NOTE(review): ImageOps.pad scales to fit and centres the
                # content by default - confirm centred padding is intended.
                resized_img = ImageOps.pad(img, target_size, color="black")
            # Save after the source handle is closed, since the same path is
            # being overwritten.
            resized_img.save(tiff_path)
            print(f"Resized: {tiff_path}")

# Root folder that is walked for `source` sub-folders containing .tiff files.
root_dir = 'eval_data'
resize_tiffs_to_same_shape(root_dir)


Skipping eval_data/churchBells_clockAlarm/source: All TIFFs are already the same size.
Skipping eval_data/piano_violin/source: All TIFFs are already the same size.
Skipping eval_data/guitar3_piano3/source: All TIFFs are already the same size.
Skipping eval_data/babyCrying_humanLaughing/source: All TIFFs are already the same size.
Skipping eval_data/cat_dog/source: All TIFFs are already the same size.
Skipping eval_data/organ_piano/source: All TIFFs are already the same size.
Skipping eval_data/woodDoorKnocking_clapping/source: All TIFFs are already the same size.
Skipping eval_data/kalimaba4_harp4/source: All TIFFs are already the same size.
Skipping eval_data/guitar_piano/source: All TIFFs are already the same size.


#### Run model to generate morph mels
- Run ./diff_morph.sh

#### Check results

In [6]:
"""檢查input跟output形狀"""

def print_image_shapes_in_results(root_dir):
    """Report the width and height of every ``.tiff`` image below ``root_dir``.

    Folders without any ``.tiff`` file produce no output. For each remaining
    folder a header line is printed, followed by one line per image.

    Args:
        root_dir: Top-level folder to search recursively.
    """
    for folder, _subdirs, names in os.walk(root_dir):
        tiff_names = [name for name in names if name.endswith('.tiff')]
        if tiff_names:
            print(f"Checking images in folder: {folder}")
            for name in tiff_names:
                # Only the size is needed, so the file is closed right away.
                with Image.open(os.path.join(folder, name)) as picture:
                    width, height = picture.size
                print(f"Shape of {name}: Width={width}, Height={height}")

# Inspect the model inputs; switch to the commented line to inspect the outputs.
root_dir = 'eval_data' # model inputs
# root_dir = 'results' # model outputs
print_image_shapes_in_results(root_dir)


Checking images in folder: eval_data/churchBells_clockAlarm/source
Shape of clockAlarm.tiff: Width=216, Height=80
Shape of churchBells.tiff: Width=216, Height=80
Checking images in folder: eval_data/piano_violin/source
Shape of piano.tiff: Width=216, Height=80
Shape of violin.tiff: Width=216, Height=80
Checking images in folder: eval_data/guitar3_piano3/source
Shape of piano3.tiff: Width=259, Height=80
Shape of guitar3.tiff: Width=259, Height=80
Checking images in folder: eval_data/babyCrying_humanLaughing/source
Shape of babyCrying.tiff: Width=216, Height=80
Shape of humanLaughing.tiff: Width=216, Height=80
Checking images in folder: eval_data/cat_dog/source
Shape of dog.tiff: Width=216, Height=80
Shape of cat.tiff: Width=216, Height=80
Checking images in folder: eval_data/organ_piano/source
Shape of piano.tiff: Width=216, Height=80
Shape of organ.tiff: Width=216, Height=80
Checking images in folder: eval_data/woodDoorKnocking_clapping/source
Shape of woodDoorKnocking.tiff: Width=216,

### Mel to audio

#### librosa.feature.inverse.mel_to_audio

In [120]:
def tiff2wav(tiff_path, save_path, sr=22050, hop_length=512, n_fft=2048):
    """Reconstruct audio from a mel spectrogram image and save it as a file.

    The image is read as a 2-D float32 array and inverted with
    ``librosa.feature.inverse.mel_to_audio``.

    Args:
        tiff_path: Path of the spectrogram image.
        save_path: Destination path of the audio file. Its parent folder is
            created when missing.
        sr: Sample rate used for the inversion and for the written file.
        hop_length: Hop length (in samples) used for the inversion.
        n_fft: FFT window size used for the inversion; it should match the
            value used to create the spectrogram. The default equals the
            value librosa uses when ``n_fft`` is omitted.
    """
    out_dir = os.path.dirname(save_path)
    if out_dir:
        # A bare file name has no parent folder to create.
        create_directory_if_not_exists(out_dir)

    # The context manager closes the file handle once the pixels are copied.
    with Image.open(tiff_path) as source_img:
        if source_img.mode == "F":
            # Float spectrograms are used as-is: converting them to "L"
            # would squeeze the values into 8-bit integers and discard
            # everything outside 0..255 as well as all fractional detail.
            mel_img = source_img
        else:
            # Other modes (e.g. RGB model outputs) are reduced to one
            # grey channel, as before.
            mel_img = source_img.convert("L")
        mel = np.array(mel_img, dtype=np.float32)

    wav = librosa.feature.inverse.mel_to_audio(
        mel, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    sf.write(save_path, wav, samplerate=sr)
    print(f"Audio saved to {save_path}")

# def tiff2wav(tiff_path, save_path, sr=22050, hop_length=512):
#     create_directory_if_not_exists(os.path.dirname(save_path))

#     # 使用 Pillow 讀取圖片
#     image = Image.open(tiff_path)
#     mel = np.array(image, dtype=np.float32)
#     mel = torch.tensor(mel).to(device='cuda')
#     print(mel.shape)

#     # 使用 mel2wav 方法轉回音訊波形
#     WV = WaveGlowVocoder()
#     wav = WV.mel2wav(mel)

#     # 可視化波形或儲存音訊
#     sf.write(save_path, wav.squeeze().cpu().numpy(), samplerate=22050)
#     print("Reconstructed audio saved successfully!")
    
    
# def tiff2wav(tiff_path, save_path):
#     create_directory_if_not_exists(os.path.dirname(save_path))

#     # 使用 Pillow 讀取圖片
#     image = Image.open(tiff_path)
#     mel = np.array(image, dtype=np.float32)
#     mel = torch.tensor(mel).to(device='cuda')
#     print(mel.shape)

#     # 調整到 (1, 80, time_frames) 格式
#     mel = torch.nn.functional.interpolate(
#         torch.tensor(mel).unsqueeze(0).unsqueeze(0),  # [1, 1, H, W]
#         size=(80, mel.shape[1]),  # 重新調整到 80 bins
#         mode="bilinear",
#         align_corners=False
#     ).squeeze(0)
#     mel = mel.to(device="cuda", dtype=torch.float32)

#     # 初始化 WaveGlowVocoder
#     WV = WaveGlowVocoder()
#     wav = WV.mel2wav(mel)
#     print(f"Reconstructed audio shape: {wav.shape}")

#     # 保存音訊
#     sf.write(save_path, wav.squeeze().cpu().numpy(), samplerate=22050)
#     print(f"Audio saved to {save_path}")


In [122]:
"""把結果轉換為音檔"""

def convert_tiffs_to_wav(root_dir, output_dir, conversion_func):
    """Convert every ``.tiff`` below ``root_dir`` into a ``.wav`` in ``output_dir``.

    The folder layout of ``root_dir`` is mirrored under ``output_dir``; each
    output file keeps the name of its image with the extension swapped.

    Args:
        root_dir: Folder that is searched recursively for ``.tiff`` files.
        output_dir: Folder that receives the mirrored ``.wav`` files.
        conversion_func: Callable invoked as
            ``conversion_func(tiff_path, wav_path)`` for every image.
    """
    suffix = '.tiff'
    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            if not name.endswith(suffix):
                continue
            source_path = os.path.join(folder, name)

            # Recreate the sub-folder of the image below the output root.
            target_folder = os.path.join(output_dir, os.path.relpath(folder, root_dir))
            os.makedirs(target_folder, exist_ok=True)

            # Drop the trailing `.tiff` and append `.wav`.
            target_path = os.path.join(target_folder, name[:-len(suffix)] + ".wav")

            conversion_func(source_path, target_path)
            print(f"Converted {source_path} to {target_path}")

# Folder searched for .tiff files, and folder that receives the .wav files.
root_dir = 'results'
output_dir = 'reconstruct'
convert_tiffs_to_wav(root_dir, output_dir, tiff2wav)


Audio saved to reconstruct/churchBells_clockAlarm/04.wav
Converted results/churchBells_clockAlarm/04.tiff to reconstruct/churchBells_clockAlarm/04.wav
Audio saved to reconstruct/churchBells_clockAlarm/01.wav
Converted results/churchBells_clockAlarm/01.tiff to reconstruct/churchBells_clockAlarm/01.wav
Audio saved to reconstruct/churchBells_clockAlarm/00.wav
Converted results/churchBells_clockAlarm/00.tiff to reconstruct/churchBells_clockAlarm/00.wav
Audio saved to reconstruct/churchBells_clockAlarm/02.wav
Converted results/churchBells_clockAlarm/02.tiff to reconstruct/churchBells_clockAlarm/02.wav
Audio saved to reconstruct/churchBells_clockAlarm/03.wav
Converted results/churchBells_clockAlarm/03.tiff to reconstruct/churchBells_clockAlarm/03.wav
Audio saved to reconstruct/piano_violin/04.wav
Converted results/piano_violin/04.tiff to reconstruct/piano_violin/04.wav
Audio saved to reconstruct/piano_violin/01.wav
Converted results/piano_violin/01.tiff to reconstruct/piano_violin/01.wav
Aud

#### WaveGlow
Vocoder source: <https://github.com/HudsonHuang/waveglow_vocoder>

In [22]:
from waveglow_vocoder import WaveGlowVocoder
import torch
import librosa

# Load the audio file at 22.05 kHz and move it to the GPU as a float32 tensor.
# NOTE(review): device='cuda' is hard-coded, so this cell needs a CUDA GPU.
y,sr = librosa.load('eval_data/cat_dog/source/cat.wav', sr=22050)
y_tensor = torch.from_numpy(y).to(device='cuda', dtype=torch.float32)
print(y_tensor.shape)

# Convert the waveform with WaveGlowVocoder.wav2mel (presumably into a mel
# spectrogram; the recorded run printed torch.Size([1, 80, 431])).
WV = WaveGlowVocoder()
mel = WV.wav2mel(y_tensor)
print(mel.shape)

# Save the mel spectrogram as a TIFF image.
# NOTE(review): this overwrites cat.tiff in the source folder.
save_path = 'eval_data/cat_dog/source/cat.tiff'
mel = mel.squeeze().cpu().numpy()  # drop size-1 dimensions and move back to the CPU
print(mel.shape)
image = Image.fromarray(mel)
image.save(save_path)
print(f"Mel Spectrogram saved to {save_path}")


torch.Size([110250])


Using cache found in /home/huyushin/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


torch.Size([1, 80, 431])
(80, 431)
Mel Spectrogram saved to eval_data/cat_dog/source/cat.tiff


In [23]:
# Read the TIFF with Pillow and turn it into a float32 array.
image = Image.open('eval_data/cat_dog/source/cat.tiff')
mel = np.array(image, dtype=np.float32)
print(mel.shape)
# Add a leading dimension and move the tensor to the GPU (requires CUDA).
mel = torch.tensor(mel).unsqueeze(0).to(device='cuda')
print(mel.shape)

# Convert the spectrogram back with WaveGlowVocoder.mel2wav (presumably to
# an audio waveform; the recorded run printed torch.Size([1, 110336])).
WV = WaveGlowVocoder()
wav = WV.mel2wav(mel)
print(wav.shape)

# Write the result to disk as a 22.05 kHz audio file.
# NOTE(review): the target folder must already exist; nothing here creates it.
sf.write('reconstruct/cat_dog/cat_reconstructed.wav', 
         wav.squeeze().cpu().numpy(), samplerate=22050)
print("Reconstructed audio saved successfully!")


(80, 431)
torch.Size([1, 80, 431])


Using cache found in /home/huyushin/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


torch.Size([1, 110336])
Reconstructed audio saved successfully!


In [9]:
# Read a TIFF image and convert it back into audio.
# NOTE(review): save_path is assigned but not used in this cell; the same
# path is repeated as a literal in Image.open below.
save_path = 'eval_data/babyCrying_humanLaughing/source/babyCrying.tiff'

# Read the TIFF with Pillow and turn it into a float32 array.
image = Image.open('eval_data/babyCrying_humanLaughing/source/babyCrying.tiff')
mel = np.array(image, dtype=np.float32)
print(mel.shape)
# Add a leading dimension and move the tensor to the GPU (requires CUDA).
mel = torch.tensor(mel).unsqueeze(0).to(device='cuda')
print(mel.shape)

# Convert the spectrogram back with WaveGlowVocoder.mel2wav (presumably to
# an audio waveform).
WV = WaveGlowVocoder()
wav = WV.mel2wav(mel)
print(wav.shape)

# Write the result to disk as a 22.05 kHz audio file.
# NOTE(review): the target folder must already exist; nothing here creates it.
sf.write('reconstruct/babyCrying_humanLaughing/babyCrying_reconstructed.wav', 
         wav.squeeze().cpu().numpy(), samplerate=22050)
print("Reconstructed audio saved successfully!")


(80, 431)
torch.Size([1, 80, 431])


Using cache found in /home/huyushin/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


torch.Size([1, 110336])
Reconstructed audio saved successfully!


In [16]:
# Read a TIFF image and convert it back into audio.
# NOTE(review): save_path is assigned but not used in this cell; the same
# path is repeated as a literal in Image.open below.
save_path = 'eval_data/babyCrying_humanLaughing/source/humanLaughing.tiff'

# Read the TIFF with Pillow and turn it into a float32 array.
image = Image.open('eval_data/babyCrying_humanLaughing/source/humanLaughing.tiff')
mel = np.array(image, dtype=np.float32)
print(mel.shape)
# Add a leading dimension and move the tensor to the GPU (requires CUDA).
mel = torch.tensor(mel).unsqueeze(0).to(device='cuda')
print(mel.shape)

# Convert the spectrogram back with WaveGlowVocoder.mel2wav (presumably to
# an audio waveform).
WV = WaveGlowVocoder()
wav = WV.mel2wav(mel)
print(wav.shape)

# Write the result to disk as a 22.05 kHz audio file.
# NOTE(review): the target folder must already exist; nothing here creates it.
sf.write('reconstruct/babyCrying_humanLaughing/humanLaughing_reconstructed.wav', 
         wav.squeeze().cpu().numpy(), samplerate=22050)
print("Reconstructed audio saved successfully!")


(80, 216)
torch.Size([1, 80, 216])


Using cache found in /home/huyushin/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


torch.Size([1, 55296])
Reconstructed audio saved successfully!


In [24]:
from PIL import Image
import numpy as np

# Print the array shape of each source TIFF and of the first result image,
# pair by pair, so that input and output dimensions can be compared.
for _tiff_path in (
    'eval_data/babyCrying_humanLaughing/source/babyCrying.tiff',
    'eval_data/babyCrying_humanLaughing/source/humanLaughing.tiff',
    'results/babyCrying_humanLaughing/00.tiff',
    'eval_data/cat_dog/source/cat.tiff',
    'eval_data/cat_dog/source/dog.tiff',
    'results/cat_dog/00.tiff',
):
    img = Image.open(_tiff_path)
    img_array = np.array(img)
    print(f"Image shape: {img_array.shape}")


Image shape: (80, 431)
Image shape: (80, 216)
Image shape: (128, 216, 3)
Image shape: (80, 431)
Image shape: (80, 216)
Image shape: (128, 216, 3)


### TIFF to PNG

In [6]:
import os
from PIL import Image

def convert_tiff_to_png(root_dir):
    """Save a ``.png`` copy next to every ``.tiff`` file below ``root_dir``.

    The extension is matched case-insensitively, so ``x.tiff`` and ``x.TIFF``
    both produce ``x.png``. The original files are left in place.

    Args:
        root_dir: Folder that is searched recursively.
    """
    suffix = '.tiff'
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.lower().endswith(suffix):
                continue
            tiff_path = os.path.join(dirpath, filename)
            # Cut the extension by length rather than with str.replace():
            # replace() is case-sensitive, so for `x.TIFF` it changed nothing
            # and the "PNG" was written over the source file itself.
            png_path = os.path.join(dirpath, filename[:-len(suffix)] + '.png')

            # Pillow picks the output format from the `.png` extension.
            with Image.open(tiff_path) as img:
                img.save(png_path)
                print(f"Converted: {tiff_path} -> {png_path}")

# Folder whose .tiff files are converted.
# NOTE(review): this is an absolute, machine-specific path; the other cells
# use paths relative to the notebook (e.g. 'results').
root_dir = '/home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing'
convert_tiff_to_png(root_dir)
print('done')


Converted: /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/04.tiff -> /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/04.png
Converted: /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/01.tiff -> /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/01.png
Converted: /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/00.tiff -> /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/00.png
Converted: /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/02.tiff -> /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/02.png
Converted: /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/03.tiff -> /home/huyushin/python_files/DiffMorpher-re griff還原/results/babyCrying_humanLaughing/03.png
done
