Merge pull request #40 from lmzjms/main
update text_to_audio
lmzjms committed Apr 30, 2023
2 parents 97a9a2f + f61a97c commit 148737e
Showing 119 changed files with 1,062 additions and 16 deletions.
12 changes: 5 additions & 7 deletions audio-chatgpt.py
@@ -4,8 +4,6 @@
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
-sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
import gradio as gr
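With the three Make-An-Audio variants consolidated under a single text_to_audio/Make_An_Audio tree, a quick layout check can catch a stale checkout before model initialization. A minimal sketch, assuming the asset paths referenced in the hunks below (not part of the commit):

import os

# Assets the consolidated layout is expected to provide, per the path
# changes in this file; adjust if your checkout differs.
EXPECTED_ASSETS = [
    'text_to_audio/Make_An_Audio/useful_ckpts/CLAP/CLAP_weights_2022.pth',
    'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt',
    'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt',
    'text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',
]

def check_layout(root='.'):
    # Print any missing Make_An_Audio assets; return True if all exist.
    missing = [p for p in EXPECTED_ASSETS
               if not os.path.exists(os.path.join(root, p))]
    for path in missing:
        print('missing:', path)
    return not missing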
@@ -186,7 +184,7 @@ def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_samples =

def select_best_audio(self, prompt, wav_list):
from wav_evaluation.models.CLAPWrapper import CLAPWrapper
-clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth', 'useful_ckpts/CLAP/config.yml',
+clap_model = CLAPWrapper('text_to_audio/Make_An_Audio/useful_ckpts/CLAP/CLAP_weights_2022.pth', 'text_to_audio/Make_An_Audio/useful_ckpts/CLAP/config.yml',
use_cuda=torch.cuda.is_available())
text_embeddings = clap_model.get_text_embeddings([prompt])
score_list = []
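For context, select_best_audio ranks candidate waveforms by CLAP similarity to the prompt; the diff cuts the method off after score_list. A rough sketch of how such a ranking typically proceeds, assuming CLAPWrapper also exposes get_audio_embeddings and compute_similarity (method names beyond get_text_embeddings are assumptions, not verified against this commit):

import numpy as np

def rank_by_clap(clap_model, prompt, wav_list):
    # Embed the prompt once, then score every candidate waveform against it.
    text_emb = clap_model.get_text_embeddings([prompt])
    scores = []
    for wav_path in wav_list:
        audio_emb = clap_model.get_audio_embeddings([wav_path])  # assumed API
        score = clap_model.compute_similarity(audio_emb, text_emb)  # assumed API
        scores.append(float(score))
    best = wav_list[int(np.argmax(scores))]
    return best, scores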
@@ -217,8 +215,8 @@ class I2A:
def __init__(self, device):
print("Initializing Make-An-Audio-Image to %s" % device)
self.device = device
-self.sampler = self._initialize_model('text_to_audio/Make_An_Audio_img/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio_img/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
-self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio_img/vocoder/logs/bigv16k53w',device=device)
+self.sampler = self._initialize_model('text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/ta54_epoch=000216.ckpt', device=device)
+self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)

def _initialize_model(self, config, ckpt, device):
config = OmegaConf.load(config)
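The diff truncates _initialize_model after the OmegaConf.load call. In Make-An-Audio-style latent-diffusion codebases the method usually continues along these lines; a sketch assuming the standard LDM helpers instantiate_from_config and DDIMSampler, not the literal continuation of this file:

import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config        # standard LDM helper
from ldm.models.diffusion.ddim import DDIMSampler   # standard LDM sampler

def initialize_model(config_path, ckpt_path, device):
    config = OmegaConf.load(config_path)
    model = instantiate_from_config(config.model)
    # Lightning-style checkpoints keep weights under 'state_dict'.
    state = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(state.get('state_dict', state), strict=False)
    model = model.to(device).eval()
    return DDIMSampler(model)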
@@ -421,8 +419,8 @@ class Inpaint:
def __init__(self, device):
print("Initializing Make-An-Audio-inpaint to %s" % device)
self.device = device
-self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio_inpaint/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio_inpaint/useful_ckpts/inpaint7_epoch00047.ckpt')
-self.vocoder = VocoderBigVGAN('./vocoder/logs/bigv16k53w',device=device)
+self.sampler = self._initialize_model_inpaint('text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml', 'text_to_audio/Make_An_Audio/useful_ckpts/inpaint7_epoch00047.ckpt')
+self.vocoder = VocoderBigVGAN('text_to_audio/Make_An_Audio/vocoder/logs/bigv16k53w',device=device)
self.cmap_transform = matplotlib.cm.viridis

def _initialize_model_inpaint(self, config, ckpt):
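The cmap_transform = matplotlib.cm.viridis attribute above is used to render mel spectrograms as images for the inpainting UI. A minimal sketch of that rendering step (illustrative only, not code from the commit):

import numpy as np
import matplotlib.cm

def mel_to_image(mel, cmap=matplotlib.cm.viridis):
    # Map a 2-D mel spectrogram to an RGB uint8 image for display.
    lo, hi = float(mel.min()), float(mel.max())
    norm = (mel - lo) / (hi - lo + 1e-8)   # scale to [0, 1]
    rgba = cmap(norm)                      # (H, W, 4) floats in [0, 1]
    return (rgba[..., :3] * 255).astype(np.uint8)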
8 changes: 2 additions & 6 deletions download.sh
@@ -8,14 +8,10 @@ wget -P checkpoints/0109_hifigan_bigpopcs_hop128/ -i https://huggingface.co/spac
wget -P checkpoints/0102_xiaoma_pe/ -i https://huggingface.co/spaces/Silentlin/DiffSinger/blob/main/checkpoints/0102_xiaoma_pe/config.yaml https://huggingface.co/spaces/Silentlin/DiffSinger/resolve/main/checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt
# Text to audio
cd text_to_audio
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img
-git clone https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint
wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/ta40multi_epoch=000085.ckpt
wget -P text_to_audio/Make_An_Audio/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio/resolve/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
-wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
-wget -P text_to_audio/Make_An_Audio_img/useful_ckpts/CLAP/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/blob/main/useful_ckpts/CLAP/CLAP_weights_2022.pth
-wget -P text_to_audio/Make_An_Audio_inpaint/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
+wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_img/resolve/main/useful_ckpts/ta54_epoch=000216.ckpt
+wget -P text_to_audio/Make_An_Audio/useful_ckpts/ -i https://huggingface.co/spaces/DiffusionSpeech/Make_An_Audio_inpaint/resolve/main/useful_ckpts/inpaint7_epoch00047.ckpt
# Text to speech
wget -P checkpoints/GenerSpeech/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/GenerSpeech/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt
wget -P checkpoints/trainset_hifigan/ -i https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/blob/main/checkpoints/trainset_hifigan/config.yaml https://huggingface.co/spaces/Rongjiehuang/GenerSpeech/resolve/main/checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt
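Note that wget's -i flag expects a file (or URL) containing a list of links, and the blob/ URLs above return HTML pages rather than raw files, so this script can be fragile. If it misbehaves, a sketch of an alternative using huggingface_hub (assuming a recent version with local_dir support; repo and file names are copied from the URLs above, not re-verified):

from huggingface_hub import hf_hub_download

# Checkpoints mirrored from the wget calls above. Filenames keep their
# repo-relative paths, so local_dir reproduces the expected layout.
CKPTS = [
    ('DiffusionSpeech/Make_An_Audio',         'useful_ckpts/ta40multi_epoch=000085.ckpt'),
    ('DiffusionSpeech/Make_An_Audio',         'useful_ckpts/CLAP/CLAP_weights_2022.pth'),
    ('DiffusionSpeech/Make_An_Audio_img',     'useful_ckpts/ta54_epoch=000216.ckpt'),
    ('DiffusionSpeech/Make_An_Audio_inpaint', 'useful_ckpts/inpaint7_epoch00047.ckpt'),
]

for repo_id, filename in CKPTS:
    hf_hub_download(repo_id=repo_id, filename=filename, repo_type='space',
                    local_dir='text_to_audio/Make_An_Audio')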
12 changes: 10 additions & 2 deletions requirements.txt
@@ -1,6 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/cu113
accelerate
addict==2.4.0
+aiofiles
albumentations==1.3.0
appdirs==1.4.4
basicsr==1.4.2
@@ -10,17 +11,23 @@ diffusers
einops==0.3.0
espnet
espnet_model_zoo
+ffmpeg-python
g2p-en==2.1.0
google==3.0.0
gradio
-h5py==2.8.0
+h5py
imageio==2.9.0
imageio-ffmpeg==0.4.2
invisible-watermark>=0.1.5
+jieba
kornia==0.6
langchain==0.0.101
librosa
+loguru
miditoolkit==0.1.7
+mmcv==1.5.0
+mmdet==2.23.0
+mmengine==0.7.2
moviepy==1.0.3
numpy==1.23.1
omegaconf==2.1.1
@@ -56,8 +63,9 @@ torchlibrosa
torchmetrics==0.6.0
torchvision==0.13.1
transformers==4.26.1
-typing-extensions==3.10.0.2
+typing-extensions==4.0.0
uuid==1.30
+webdataset==0.2.5
webrtcvad==2.0.10
yapf==0.32.0
git+https://github.com/openai/CLIP.git
77 changes: 77 additions & 0 deletions text_to_audio/Make_An_Audio/configs/img_to_audio/img2audio_args.yaml
@@ -0,0 +1,77 @@
model:
  base_learning_rate: 1.0e-05
  target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 32 # unused
    mel_dim: 10 # 80 // 2^3
    mel_length: 78 # 624 // 2^3
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_by_std: True
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [10000]
        cycle_lengths: [10000000000000]
        f_start: [1.e-6]
        f_max: [1.]
        f_min: [1.]

    unet_config:
      target: ldm.modules.diffusionmodules.custom_openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 256
        attention_resolutions:
        - 1
        - 2
        num_res_blocks: 2
        channel_mult: # num_down = len(ch_mult)-1
        - 1
        - 2
        num_head_channels: 32
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 1024
        use_context_project: false

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 848
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1
          num_res_blocks: 2
          attn_resolutions: [106, 212]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenGlobalNormOpenCLIPEmbedder
      params:
        freeze: True
        delvisual: False
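The mel_dim and mel_length comments encode the autoencoder's downsampling: ch_mult has four entries, so the KL autoencoder halves each spatial dimension len(ch_mult) - 1 = 3 times, shrinking an 80 x 624 mel spectrogram to a 10 x 78 latent with 4 channels. A small check of that arithmetic:

def latent_shape(mel_bins, mel_frames, ch_mult, z_channels):
    # Latent grid after len(ch_mult) - 1 factor-2 downsamplings.
    factor = 2 ** (len(ch_mult) - 1)
    return (z_channels, mel_bins // factor, mel_frames // factor)

# Values from the config above: 80-bin, 624-frame mel, ch_mult [1, 2, 2, 4].
assert latent_shape(80, 624, [1, 2, 2, 4], 4) == (4, 10, 78)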


68 changes: 68 additions & 0 deletions text_to_audio/Make_An_Audio/configs/inpaint/txt2audio_args.yaml
@@ -0,0 +1,68 @@
model:
  base_learning_rate: 1.0e-05
  target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
  params:
    linear_start: 0.0015
    linear_end: 0.0205
    log_every_t: 100
    timesteps: 1000
    loss_type: l1
    first_stage_key: image
    cond_stage_key: masked_image
    image_size: 32 # unused
    mel_dim: 10 # 80 // 2^3
    mel_length: 106 # 848 // 2^3
    channels: 4
    concat_mode: true
    monitor: val/loss
    use_ema: False

    scheduler_config:
      target: ldm.lr_scheduler.LambdaWarmUpCosineScheduler
      params:
        verbosity_interval: 0
        warm_up_steps: 1000
        max_decay_steps: 50000
        lr_start: 0.001
        lr_max: 0.1
        lr_min: 0.0001

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 9 # 4 + 1 + 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 1
        - 2
        num_res_blocks: 2
        channel_mult: # num_down = len(ch_mult)-1
        - 1
        - 2
        num_heads: 8
        resblock_updown: true

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ckpt_path: # /apdcephfs/share_1316500/nlphuang/results/Text_to_audio/ae15/2022-12-15T22-24-00_mixdata_kl_4_tile/epoch=000009-v2.ckpt
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 848
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ] # num_down = len(ch_mult)-1
          num_res_blocks: 2
          attn_resolutions: [106, 212]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config: __is_first_stage__
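Here in_channels: 9 reflects concat-mode inpainting: the 4-channel noisy latent is concatenated with a 1-channel downsampled mask and the 4-channel encoding of the masked spectrogram (the "4 + 1 + 4" in the comment). A sketch of how such a UNet input is typically assembled, with shapes taken from this config and illustrative variable names:

import torch

# Per the config: 4-channel latents on a 10 x 106 grid (an 80 x 848 mel
# downsampled by 2^3).
z_noisy  = torch.randn(1, 4, 10, 106)  # diffused latent at step t
mask     = torch.ones(1, 1, 10, 106)   # 1 = keep, 0 = regenerate
z_masked = torch.randn(1, 4, 10, 106)  # encoding of the masked mel

unet_input = torch.cat([z_noisy, mask, z_masked], dim=1)
assert unet_input.shape[1] == 9        # matches in_channels: 9 (4 + 1 + 4)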
