In [8]:
import torchaudio
import IPython.display as ipd
from scipy.io.wavfile import read
import numpy as np
import torch

def load_wav_to_torch(full_path):
  sampling_rate, data = read(full_path)
  return torch.FloatTensor(data.astype(np.float32)), sampling_rate


audio, _ = load_wav_to_torch('/Users/zhaoyuanqi/Desktop/LJ_small/wavs/LJ001-0001.wav')

ipd.Audio(data=audio, autoplay=True, rate=16000)

**TEST FOR KMEANS**

In [10]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch.nn.functional as F
from sklearn.cluster import KMeans
import numpy as np



# 读取和处理音频数据
audio1, _ = sf.read('/Users/zhaoyuanqi/Desktop/LJ_small/wavs/LJ001-0001.wav')
audio1 = torch.FloatTensor(audio1.astype(np.float32))

audio2, _ = sf.read('/Users/zhaoyuanqi/Desktop/LJ_small/wavs/LJ001-0002.wav')
audio2 = torch.FloatTensor(audio2.astype(np.float32))

# pad the shorter waveform with zeros to match the size of the longer waveform
max_length = max(len(audio1), len(audio2))
audio1 = torch.nn.functional.pad(audio1, (0, max_length - len(audio1)), 'constant', 0)
audio2 = torch.nn.functional.pad(audio2, (0, max_length - len(audio2)), 'constant', 0)
audio = torch.stack([audio1, audio2])


# 加载预处理器和模型
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
model.eval()

input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values
print(input_values.shape)

input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values.squeeze(0)  


with torch.no_grad():
    outputs = model(input_values)
    hidden_representations = outputs.last_hidden_state
    print(hidden_representations.shape)

n_clusters = 128
batch_of_phonemes = []

for i in range(hidden_representations.shape[0]):
    # Perform K-means clustering on each row of the hidden representations 
    sub_hidden = hidden_representations[i]
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sub_hidden)
    # Get cluster indices for each frame
    cluster_indices = kmeans.labels_

    # Merge consecutive indices
    merged_indices = []
    current_index = cluster_indices[0]
    merged_indices.append(current_index)

    for index in cluster_indices[1:]:
        if index != current_index:
            merged_indices.append(index)
            current_index = index
    batch_of_phonemes.append(merged_indices)








DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/config.json HTTP/1.1" 200 0


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 2, 154481])
torch.Size([2, 482, 768])


**Check phoneme file**


In [4]:
def load_filepaths_and_text(filename, split="|"):
  with open(filename, encoding='utf-8') as f:
    filepaths_and_text = [line.strip().split(split) for line in f]
  return filepaths_and_text

In [5]:

d = load_filepaths_and_text('/Users/zhaoyuanqi/Desktop/LJ/metadata_new.txt.cleaned.txt')
d[0]

['/Users/zhaoyuanqi/Desktop/LJ/wavs/LJ001-0001.wav',
 'bˌʌt baɪ pɹˈɪntɚz ɪn stɹˈæsbɜːɡ, bˈæsəl, pˈæɹɪs, lˈuːbɛk, ænd ˈʌðɚ sˈɪɾiz.']

**Check for pre-training data loader**

In [6]:
from data_utils import TextAudioLoader, TextAudioCollate
from torch.utils.data import DataLoader
import utils

hps = utils.get_hparams_from_file("./configs/ljs_base.json")
hps.data.sampling_rate = 16000
dataset = TextAudioLoader(hps.data.training_files, hps.data, mode='pre-training')

collate_fn = TextAudioCollate(mode = 'pre-training')
train_loader = DataLoader(dataset, num_workers=4, shuffle=False, pin_memory=True,collate_fn=collate_fn, batch_size=2)

from models import psudo_phoneme
import torch
psudo = psudo_phoneme(128)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


for i in train_loader:
    _,_,wave,_ = i
    wave.to(device)
    res, len = psudo(wave)

    break

print(res[0])

16000

**Check for fine-tuning mode data loader**

In [7]:
dataset = TextAudioLoader(hps.data.training_files, hps.data, mode='fine-tuning')

collate_fn = TextAudioCollate(mode = 'fine-tuning')
train_loader = DataLoader(dataset, num_workers=4, shuffle=False, pin_memory=True,collate_fn=collate_fn, batch_size=2)


for x in train_loader:
    print(x)
    break

**Check pseudo text encoder**

In [14]:
from models import pseudo_text_encoder

encoder = pseudo_text_encoder(n_vocab=128, hidden_channels=192, n_layers=1, kernel_size=3, n_heads = 2, filter_channels=768, p_dropout = 0.1, out_channels=192)
encoder.to(device)
result = encoder(res, len)

print(result[0])

**Check Text Encoder**

In [15]:
from models import TextEncoder
txt_enc = TextEncoder(n_vocab=128, hidden_channels=192, out_channels=192, filter_channels=768, n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1)

txt_enc(res, len)

(tensor([[[-0.0721, -1.5695, -0.8030,  ..., -0.7254, -1.5935,  0.0773],
          [-1.4492, -1.1191, -0.7052,  ...,  0.0318,  0.0407, -1.3130],
          [-0.6142,  0.4361,  1.2679,  ..., -0.2700, -0.0508,  1.2739],
          ...,
          [ 0.5703,  1.6511,  1.8712,  ...,  0.4326,  1.6806,  2.0744],
          [ 0.9958,  0.2519,  2.4846,  ...,  2.0348,  0.1025,  2.7290],
          [ 1.2372,  1.5461,  1.5863,  ...,  0.6954,  1.2079,  0.8441]],
 
         [[-0.3243, -2.0804, -1.2786,  ...,  0.0000,  0.0000,  0.0000],
          [-0.6817, -0.5830, -0.6864,  ..., -0.0000, -0.0000, -0.0000],
          [ 0.5739,  0.0838,  0.7608,  ..., -0.0000, -0.0000, -0.0000],
          ...,
          [ 1.7903,  1.9035,  0.0259,  ...,  0.0000,  0.0000,  0.0000],
          [ 1.0276, -0.1600,  2.0729,  ...,  0.0000,  0.0000,  0.0000],
          [ 1.2259,  1.0648,  1.6115,  ...,  0.0000,  0.0000,  0.0000]]],
        grad_fn=<MulBackward0>),
 tensor([[[-0.3454, -1.0231, -0.1171,  ..., -0.7021, -1.0274, -1.096

In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
import commons
import utils
from data_utils import (
  TextAudioLoader,
  TextAudioCollate,
  DistributedBucketSampler
)
from models import (
  SynthesizerTrn,
  phoneme_SynthesizerTrn,
  MultiPeriodDiscriminator,
  psudo_phoneme,
)
hps = utils.get_hparams_from_file("./configs/ljs_base.json")
hps.data.sampling_rate = 16000

train_dataset = TextAudioLoader(hps.data.training_files, hps.data, 'pre-training')

collate_fn = TextAudioCollate(mode='pre-training')
train_loader = DataLoader(train_dataset, num_workers=1, shuffle=False, pin_memory=True,
      collate_fn=collate_fn, batch_size = 2)

for i in train_loader:
    print(i)
    break

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /facebook/wav2vec2-base-960h/resolve/main/config.json HTTP/1.1" 200 0


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: 'DUMMY1/LJ038-0104.wav'