In [1]:
import os

In [4]:
import torch

def explore_ckpt_file(ckpt_path):
    try:
        # Load the checkpoint data
        checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))

        # List all keys in the checkpoint dictionary (usually contains 'model_state_dict' and more)
        print("Keys in the checkpoint dictionary:\n")
        for key in checkpoint.keys():
            print(key)

        # Access the model's state_dict (modify 'model_state_dict' if different key)
        if 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']

            # List all keys in the state_dict (these are the model's parameter names)
            print("\nKeys in the model's state_dict:\n")
            for key in state_dict.keys():
                print(key)

            # Access specific parameters (you can modify these to explore the data)
            print("\nExample: Accessing specific parameters:\n")
            parameter_name = "your_parameter_name_here"  # Change this to a specific parameter name
            if parameter_name in state_dict:
                parameter = state_dict[parameter_name]
                print(f"Parameter: {parameter_name}")
                print(f"Shape: {parameter.shape}")
                print(f"Data: {parameter}")
            else:
                print(f"Parameter {parameter_name} not found in the state_dict.")
        else:
            print("The 'model_state_dict' key not found in the checkpoint.")

    except Exception as e:
        print(f"Error: {str(e)}")

# if __name__ == "__main__":
    # Replace 'path/to/your/checkpoint.ckpt' with the actual path to your .ckpt file
# ckpt_file_path = '/workspace/nemo/vol/dataStore2/FastSpeech2/output/ckpt/LBM_EB/105000.pth.tar'
ckpt_file_path = '/workspace/nemo/vol/dataStore2/FastSpeech2/output/ckpt/LBM_EB_EMOVDB/12000.pth.tar'
explore_ckpt_file(ckpt_file_path)


Keys in the checkpoint dictionary:

model
optimizer
The 'model_state_dict' key not found in the checkpoint.


In [5]:
checkpoint = torch.load(ckpt_file_path, map_location=torch.device('cpu'))

In [6]:
checkpoint.keys()

dict_keys(['model', 'optimizer'])

In [7]:
checkpoint['model']['speaker_emb.weight'].size()

torch.Size([1906, 512])

In [8]:
checkpoint['model']['emotion_emb.weight'].size()

torch.Size([18, 512])

In [9]:
from model import FastSpeech2

In [10]:
import yaml
model_config = yaml.load(open('/workspace/nemo/vol/dataStore2/FastSpeech2/config/EMOVDB/model.yaml',
                              "r"), Loader=yaml.FullLoader)
preprocess_config = yaml.load(open('/workspace/nemo/vol/dataStore2/FastSpeech2/config/EMOVDB/preprocess.yaml',
                              "r"), Loader=yaml.FullLoader)

In [11]:
fp = FastSpeech2(model_config=model_config, preprocess_config=preprocess_config)

In [30]:
import numpy as np
import torch
from synthesize import preprocess_english
text = "This is a test sentence with a certain emotion. Also this will test if the model can speak larger sentences"
speakers = torch.tensor([1904]).to('cuda:3')
emotion = torch.tensor([14]).to('cuda:3')
ids = raw_texts = [text[:100]]

texts = torch.tensor([preprocess_english(text, preprocess_config)]).to('cuda:3')
text_lens = torch.tensor([len(texts[0])]).to('cuda:3')
batchs = [(ids, raw_texts, speakers, emotion, texts, text_lens, max(text_lens))]

In [31]:
fp.load_state_dict(checkpoint['model'])

<All keys matched successfully>

In [32]:
fp = fp.to('cuda:3')

In [33]:
predictions = fp(*(batchs[0][2:]))

In [34]:
from utils.model import vocoder_infer, get_vocoder
from scipy.io import wavfile
import os

%time mel_predictions = predictions[1].transpose(1, 2)

CPU times: user 41 µs, sys: 0 ns, total: 41 µs
Wall time: 45.3 µs


In [35]:
%time mel_predictions = predictions[1].transpose(1, 2)

CPU times: user 1.68 ms, sys: 0 ns, total: 1.68 ms
Wall time: 1.23 ms


In [36]:
# from nemo.collections.tts.models import HifiGanModel
# vocoder = HifiGanModel.load_from_checkpoint("/workspace/nemo/vol/FastSpeech2/hifigan_ft/HifiGan/2023-07-28_02-15-51/checkpoints/HifiGan--val_loss=0.3318-epoch=944.ckpt")
# vocoder = vocoder.eval().cuda()

In [27]:
from nemo.collections.tts.models import HifiGanModel
vocoder = HifiGanModel.from_pretrained("tts_en_hifigan")
vocoder = vocoder.eval()
vocoder = vocoder.to("cuda:3")

[NeMo I 2023-10-17 07:35:18 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.19.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2023-10-17 07:35:18 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.19.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2023-10-17 07:35:18 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-10-17 07:35:20 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2023-10-17 07:35:20 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2023-10-17 07:35:20 features:291] PADDING: 0


[NeMo W 2023-10-17 07:35:20 features:268] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2023-10-17 07:35:20 features:291] PADDING: 0
[NeMo I 2023-10-17 07:35:24 save_restore_connector:249] Model HifiGanModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.19.0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.


In [37]:
audio = vocoder.convert_spectrogram_to_audio(spec=mel_predictions)
audio.cpu()

tensor([[0.0030, 0.0044, 0.0051,  ..., 0.0067, 0.0066, 0.0010]],
       grad_fn=<ToCopyBackward0>)

In [38]:
import IPython.display as ipd
ipd.display(ipd.Audio(audio.cpu().detach(), rate=22050))

In [22]:
import torchaudio
torchaudio.save("josh_amused.wav", audio.cpu().detach(), 22050, format='wav')

In [None]:
# from utils.model import vocoder_infer, get_vocoder
# from scipy.io import wavfile
# import os

# mel_predictions = predictions[1].transpose(1, 2)
# vocoder = get_vocoder(model_config, 'cuda')
# lengths = predictions[9] * preprocess_config["preprocessing"]["stft"]["hop_length"]
# wav_predictions = vocoder_infer(
#     mel_predictions, vocoder, model_config, preprocess_config, lengths=lengths
# )

# sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
# outfilename = "josh_disgusted"
# for wav, basename in zip(wav_predictions, [outfilename]):
#     wavfile.write(os.path.join( "{}.wav".format(basename)), sampling_rate, wav)

In [34]:
with open("/workspace/nemo/vol/FastSpeech2/output/log/RAVDESS/train/log.txt", "r") as f:
    logs = f.read()

FileNotFoundError: [Errno 2] No such file or directory: '/workspace/nemo/vol/FastSpeech2/output/log/RAVDESS/train/log.txt'

In [None]:
logs_ = logs.split("\n")

In [None]:
item = logs_[0]
key_value_pairs = [item.split(':') for item in text.split(', ')[1:]]

# Convert the key-value pairs into a dictionary
result_dict = {key.strip(): float(value) for key, value in key_value_pairs}


In [None]:
logs_[1]

In [None]:
loss_dict = {}
for i, log_ in enumerate(logs_):
    try:
        vals = [i.split(":")[1] for i in log_.replace("Step", "Step:").split(",")]
        iter_dict = {}
        iter_dict['iter'] = vals[0]
        iter_dict['total_loss'] = vals[1]
        iter_dict['mel_loss'] = vals[2]
        iter_dict['postnet_loss'] = vals[3]
        iter_dict['pitch_loss'] = vals[4]
        iter_dict['energy_loss'] = vals[5]
        iter_dict['dur_loss'] = vals[6]
        loss_dict[(i+1)*100] = iter_dict
    except:
        continue
    

In [None]:
len(logs_)

In [128]:
loss_dict1 = {}
for i, log_ in enumerate(logs_):
    try:
        vals = [j.split(":")[1] for j in log_.replace("Step", "Step:").split(",")]
#         print(vals[0])
#         iter_dict = {}
#         iter_dict['iter'] = vals[0]
#         iter_dict['total_loss'] = vals[1]
#         iter_dict['mel_loss'] = vals[2]
#         iter_dict['postnet_loss'] = vals[3]
#         iter_dict['pitch_loss'] = vals[4]
#         iter_dict['energy_loss'] = vals[5]
#         iter_dict['dur_loss'] = vals[6]
        if int(vals[0].split('/')[0]) % 100000 == 0:
            loss_dict1[vals[0].split("/")[0]] = vals[1]
    except:
        continue

In [113]:
vals[]

[' 900000/900000',
 ' 0.9803',
 ' 0.4343',
 ' 0.4329',
 ' 0.0312',
 ' 0.0499',
 ' 0.0321']

In [114]:
int(vals[0].split('/')[0])

900000

In [115]:
vals[0]

' 900000/900000'

In [93]:
52%10

2

In [129]:
loss_dict1

{' 100000': ' 1.1884',
 ' 200000': ' 1.0851',
 ' 300000': ' 1.1187',
 ' 400000': ' 0.9975',
 ' 500000': ' 0.9179',
 ' 600000': ' 1.0297',
 ' 700000': ' 0.9462',
 ' 800000': ' 0.9281',
 ' 900000': ' 0.9803'}

In [85]:
min_value = min(loss_dict1.values())

In [86]:
min_value

' 0.8560'

In [47]:
logs_[0]

'Step 100/900000, Total Loss: 10.8035, Mel Loss: 3.7052, Mel PostNet Loss: 3.1430, Pitch Loss: 1.5909, Energy Loss: 1.4730, Duration Loss: 0.8915'

In [None]:
logs_dict = {"iterations": i, "total_loss": }

In [None]:
iterations, total_loss, mel_loss, postnet_loss, pitch_loss, energy_loss, dur_loss = []

In [21]:
#checkpoint['model'].keys()

In [3]:
# vocoder_dict = torch.load("/workspace/nemo/vol/TTS_voices/FP_checkpoints/HifiGan--val_loss=0.3257-epoch=9479.ckpt")

In [28]:
#vocoder_dict

In [2]:
# import json
# import hifigan
# with open("hifigan/config.json", "r") as f:
#             config = json.load(f)
# config = hifigan.AttrDict(config)
# vocoder = hifigan.Generator(config)

In [24]:
from nemo.collections.tts.models import HifiGanModel
vocoder = HifiGanModel.from_pretrained("tts_en_hifigan")
vocoder = vocoder.eval().cuda()

[NeMo W 2023-07-27 22:52:22 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-27 22:52:22 experimental:27] Module <class 'nemo.collections.tts.parts.utils.callbacks.LoggingCallback'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-27 22:52:22 experimental:27] Module <class 'nemo.collections.tts.models.fastpitch_ssl.FastPitchModel_SSL'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-27 22:52:22 experimental:27] Module <class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-07-27 22:52:22 experimental:27] Module <class 'nemo.collections.tts.models.radtts.RadTTSModel'> 

[NeMo I 2023-07-27 22:52:22 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.19.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2023-07-27 22:52:22 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.19.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2023-07-27 22:52:22 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-07-27 22:52:23 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2023-07-27 22:52:23 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2023-07-27 22:52:23 features:291] PADDING: 0


[NeMo W 2023-07-27 22:52:23 features:268] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2023-07-27 22:52:23 features:291] PADDING: 0
[NeMo I 2023-07-27 22:52:24 save_restore_connector:249] Model HifiGanModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.19.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
