In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import numpy as np
import torch
from denoising_diffusion_pytorch import Unet1D, GaussianDiffusion1D, Trainer1D, Dataset1D
import sys
sys.path.append('/home/dagarwal/Speech-Articulatory-Coding')
from sparc import load_model
import soundfile as sf
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import h5py
import concurrent.futures
from multiprocessing import cpu_count
from accelerate import Accelerator


# Root directory that get_features() walks recursively for ".npy" feature files.
articulatory_feature_directory = "/data/common/LibriTTS_R/articulatory_features"
# Pickled lookup table; load_and_transform_file() indexes it with the part of the
# file name before the first "_" and uses element [0] of the entry as the pitch
# reference for log-normalisation.
pitch_data_file = '/data/common/LibriTTS_R/pitch_stats.npy'

# The file stores a pickled Python object, hence allow_pickle=True and .item() to
# unwrap the 0-d object array. Pickle can execute code: only load trusted files.
pitch_stats_data = np.load(pitch_data_file, allow_pickle=True).item()

# Function to create chunks from arrays
def create_chunks(arrays, chunk_size):
    """Cut 2-D arrays into fixed-length chunks along axis 0.

    Every array is zero-padded at its end so that its row count becomes a
    multiple of ``chunk_size`` and is then split into consecutive pieces of
    exactly ``chunk_size`` rows.

    Args:
        arrays: Iterable of 2-D NumPy arrays shaped ``(n_rows, n_features)``.
        chunk_size: Number of rows in each chunk.

    Returns:
        A flat list with the chunks of all input arrays, in input order.
        Arrays with zero rows contribute no chunks.
    """
    all_arrays = []
    for array in arrays:
        n_rows = len(array)
        if n_rows == 0:
            # An empty array would ask np.array_split for 0 sections, which raises.
            continue
        # Rows needed to reach the next multiple of chunk_size (0 if already aligned).
        additional_padding = (chunk_size - n_rows % chunk_size) % chunk_size
        padded_array = np.pad(array, ((0, additional_padding), (0, 0)), mode='constant', constant_values=0)
        all_arrays.extend(np.array_split(padded_array, len(padded_array) // chunk_size))
    return all_arrays

# Parallel file loading and processing function
def load_and_transform_file(file_path, chunk_size):
    """Load one feature file and stack its streams column-wise.

    The file is expected to hold a pickled dict with the keys ``'ema'``,
    ``'pitch'`` and ``'loudness'``. Pitch is converted to a log ratio against
    element ``[0]`` of the ``pitch_stats_data`` entry selected by the file-name
    prefix (text before the first ``"_"``). The last element of the pitch and
    loudness streams is dropped before stacking.

    Args:
        file_path: Path of the ``.npy`` feature file.
        chunk_size: Accepted for call compatibility; not used here.

    Returns:
        A 2-D array ``[ema | pitch | loudness]`` joined along axis 1, or
        ``None`` when the three streams do not have the same number of rows.
    """
    stats_key = os.path.basename(file_path).split("_")[0]
    stats_entry = pitch_stats_data[stats_key]

    # NOTE: allow_pickle=True executes pickle data; only use trusted files.
    features = np.load(file_path, allow_pickle=True).item()
    ema = features['ema']
    log_ratio = np.log(features['pitch']) - np.log(stats_entry[0])
    pitch = log_ratio[:-1].reshape(-1, 1)
    loudness = features['loudness'][:-1].reshape(-1, 1)

    n_rows = ema.shape[0]
    if n_rows != pitch.shape[0] or n_rows != loudness.shape[0]:
        # Row counts disagree: signal "skip this file" to the caller.
        return None
    return np.concatenate([ema, pitch, loudness], axis=1)

def get_features(chunk_size, print_every=2000, max_files=2000):
    """Load feature files in parallel and return them as fixed-length chunks.

    Walks ``articulatory_feature_directory`` for ``.npy`` files, keeps the
    first ``max_files`` of them (in ``os.walk`` order), loads each one with
    ``load_and_transform_file`` in a process pool, concatenates the per-file
    arrays along axis 0 and cuts the result into chunks. Because the files are
    concatenated before chunking, a chunk may span a file boundary.

    Args:
        chunk_size: Number of frames per chunk. With ``1`` no padding or
            splitting is done and a trailing axis of length 1 is added.
        print_every: Print a progress line every this many processed files.
        max_files: Upper bound on the number of files to load; ``None``
            loads every file found.

    Returns:
        Array of shape ``(n_chunks, n_features, chunk_size)``.

    Raises:
        ValueError: If no file produced usable data.
    """
    file_paths = [os.path.join(root, file)
                  for root, _, files in os.walk(articulatory_feature_directory)
                  for file in files if file.endswith(".npy")][:max_files]

    total_files = len(file_paths)
    # cpu_count() / 5 truncates to 0 on machines with fewer than 5 cores, and
    # ProcessPoolExecutor rejects max_workers <= 0, so keep at least one worker.
    max_workers = max(1, cpu_count() // 5)

    results = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        loaded = executor.map(load_and_transform_file, file_paths, [chunk_size] * total_files)
        for processed_files, result in enumerate(loaded, start=1):
            # The loader returns None for files whose streams disagree in length.
            if result is not None:
                results.append(result)
            if processed_files % print_every == 0:
                print(f"Processed {processed_files} / {total_files} files")

    if not results:
        raise ValueError(
            f"No usable feature files found under {articulatory_feature_directory!r} "
            f"({total_files} candidate files inspected)"
        )

    # Concatenate all files, then chunk the combined sequence.
    combined_data = np.concatenate(results, axis=0)
    if chunk_size != 1:
        # (n_chunks, chunk_size, n_features) -> (n_chunks, n_features, chunk_size)
        chunked_data = np.transpose(np.array(create_chunks([combined_data], chunk_size)), (0, 2, 1))
    else:
        chunked_data = combined_data[:, :, np.newaxis]
    return chunked_data

# Build the chunked dataset and report its shape.

# Frames per chunk; passed to get_features() as chunk_size.
SEQUENCE_LENGTH = 128
# Not referenced in the visible part of this cell.
# NOTE(review): presumably the diffusion step count used for training — confirm.
TIMESTEPS = 1000

# get_features() returns an array with chunk_size as its last axis.
all_data_printing = get_features(SEQUENCE_LENGTH)
print(all_data_printing.shape)
        
def perform_synthesis(coder, articulatory_feature_file, speaker_embedding):
    """Resynthesize audio from a saved feature file with a given speaker embedding.

    Args:
        coder: Object exposing ``decode(**code)`` and a sample rate ``sr``.
        articulatory_feature_file: Path of a ``.npy`` file holding a pickled
            dict of features that is forwarded to ``coder.decode``.
        speaker_embedding: Value stored under ``'spk_emb'`` before decoding,
            replacing any embedding already present in the file.

    Returns:
        Whatever ``coder.decode`` returns for the assembled code dict.
    """
    # [()] unwraps the 0-d object array that np.save writes for a dict.
    # NOTE: allow_pickle=True executes pickle data; only use trusted files.
    code = np.load(articulatory_feature_file, allow_pickle=True)[()]
    code['spk_emb'] = speaker_embedding
    # Decode before displaying: the waveform has to exist first.
    wav = coder.decode(**code)
    # Show the resynthesized audio inline in the notebook.
    ipd.display(ipd.Audio(wav, rate=coder.sr))
    return wav
    

# Audio file from which the speaker embedding is extracted.
speaker_embedding_file = '/home/dagarwal/Speech-Articulatory-Coding/sample_audio/sample1.wav'
# Saved feature file that will be resynthesized.
articulatory_feature_file = '/data/common/LibriTTS_R/articulatory_features/100_121669_000001_000000.npy'
# Load the coder on CPU; note that CUDA_VISIBLE_DEVICES was set at the top of the cell.
coder = load_model("en", device="cpu")
# encode() returns a mapping; only its 'spk_emb' entry is kept.
speaker_embedding = coder.encode(speaker_embedding_file)['spk_emb']
# Decode the saved features with the embedding taken from the sample audio.
perform_synthesis(coder, articulatory_feature_file, speaker_embedding)


  from .autonotebook import tqdm as notebook_tqdm


Processed 2000 / 373339 files
Processed 4000 / 373339 files
Processed 6000 / 373339 files
Processed 8000 / 373339 files
Processed 10000 / 373339 files
Processed 12000 / 373339 files
Processed 14000 / 373339 files
Processed 16000 / 373339 files
Processed 18000 / 373339 files
Processed 20000 / 373339 files
Processed 22000 / 373339 files
Processed 24000 / 373339 files
Processed 26000 / 373339 files
Processed 28000 / 373339 files
Processed 30000 / 373339 files
Processed 32000 / 373339 files
Processed 34000 / 373339 files
Processed 36000 / 373339 files
Processed 38000 / 373339 files
Processed 40000 / 373339 files
Processed 42000 / 373339 files
Processed 44000 / 373339 files
Processed 46000 / 373339 files
Processed 48000 / 373339 files
Processed 50000 / 373339 files
Processed 52000 / 373339 files
Processed 54000 / 373339 files
Processed 56000 / 373339 files
Processed 58000 / 373339 files
Processed 60000 / 373339 files
Processed 62000 / 373339 files
Processed 64000 / 373339 files
Processed 66

In [6]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import numpy as np
import torch

# Ask PyTorch's caching allocator to release unused cached GPU memory, then print
# its memory report (the table in this cell's output).
torch.cuda.empty_cache()
print(torch.cuda.memory_summary())

from denoising_diffusion_pytorch import Unet1D, GaussianDiffusion1D
import sys
sys.path.append('/home/dagarwal/Speech-Articulatory-Coding')
from sparc import load_model
import IPython.display as ipd

# Paths
# Feature file that is noised and then denoised below.
sample_file = '/data/common/LibriTTS_R/articulatory_features/986_129388_000060_000005.npy'
# Checkpoint whose 'model' entry is loaded into the network below.
checkpoint_path = '/home/dagarwal/results/model-19.pt'
pitch_data_file = '/data/common/LibriTTS_R/pitch_stats.npy'
# Audio file from which the speaker embedding is extracted.
speaker_embedding_file = '/home/dagarwal/Speech-Articulatory-Coding/sample_audio/sample1.wav'
# Cache location: if this file exists, the denoising step below is skipped entirely.
denoised_sample_path = "/home/dagarwal/results/denoised_sample.npy"

# Load pitch stats (a pickled Python object, hence allow_pickle=True and .item()).
# Pickle can execute code: only load trusted files.
pitch_stats_data = np.load(pitch_data_file, allow_pickle=True).item()

# Function to preprocess a single file
def preprocess_sample(file_path):
    """Load one feature file and return it as a float tensor with a trailing axis.

    The file is expected to hold a pickled dict with the keys ``'ema'``,
    ``'pitch'`` and ``'loudness'``. Pitch is converted to a log ratio against
    element ``[0]`` of the ``pitch_stats_data`` entry selected by the file-name
    prefix (text before the first ``"_"``). The last element of the pitch and
    loudness streams is dropped before stacking.

    Args:
        file_path: Path of the ``.npy`` feature file.

    Returns:
        ``torch.FloatTensor`` of shape ``(n_rows, n_features, 1)`` where the
        feature axis is ``[ema | pitch | loudness]``.

    Raises:
        ValueError: If the three streams do not have the same number of rows.
    """
    stats_key = os.path.basename(file_path).split("_")[0]
    stats_entry = pitch_stats_data[stats_key]

    # NOTE: allow_pickle=True executes pickle data; only use trusted files.
    features = np.load(file_path, allow_pickle=True).item()
    ema = features['ema']
    log_ratio = np.log(features['pitch']) - np.log(stats_entry[0])
    pitch = log_ratio[:-1].reshape(-1, 1)
    loudness = features['loudness'][:-1].reshape(-1, 1)

    # Guard clause: refuse to stack streams of different lengths.
    if not (ema.shape[0] == pitch.shape[0] == loudness.shape[0]):
        raise ValueError("Shape mismatch in data components")

    stacked = np.concatenate([ema, pitch, loudness], axis=1)
    return torch.from_numpy(stacked[:, :, np.newaxis]).float()

if os.path.exists(denoised_sample_path):
    # Reuse the cached result. The cache is keyed only by path, so delete the file
    # to force a re-run after changing the sample, checkpoint or timestep.
    denoised_sample = np.load(denoised_sample_path)
    denoised_sample = torch.from_numpy(denoised_sample).float()
    print("Loaded saved denoised sample.")
else:
    # Number of frames cut from the middle of the utterance.
    window_length = 128

    new_sample = preprocess_sample(sample_file)
    if new_sample.shape[0] < window_length:
        # A negative start index would silently slice the wrong frames.
        raise ValueError(
            f"Sample has {new_sample.shape[0]} frames, need at least {window_length}"
        )
    start_index = (new_sample.shape[0] - window_length) // 2
    new_sample = new_sample[start_index:start_index + window_length]
    # (frames, features, 1) -> (1, features, frames), i.e. batch, channels, length.
    new_sample = new_sample.permute(2, 1, 0)
    print("Reshaped sample shape:", new_sample.shape)
    print(new_sample)

    # Initialize model and diffusion (same configuration as training)
    # NOTE(review): seq_length=1 does not match the 128-frame window sampled here;
    # confirm this is really the training configuration.
    model = Unet1D(
        dim=64,
        dim_mults=(1, 2),
        channels=14,
        seq_length=1
    )

    diffusion = GaussianDiffusion1D(
        model,
        seq_length=1,
        timesteps=1000,
        objective='pred_x0'
    )

    # map_location keeps the load independent of the device the checkpoint was saved from.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # NOTE(review): if this checkpoint was written by Trainer1D, 'model' may hold the
    # GaussianDiffusion1D state dict rather than the bare Unet1D one — confirm.
    model.load_state_dict(checkpoint['model'], strict=True)
    model.eval()

    timestep = 500
    with torch.no_grad():
        # Noise the real sample up to `timestep`, in the normalised space that the
        # diffusion wrapper trains in.
        noisy_sample = diffusion.q_sample(diffusion.normalize(new_sample), torch.tensor([timestep]))
        # Run the reverse process starting from the noised sample. The previous
        # p_sample_loop(noisy_sample.shape) call only used the shape and generated
        # an unrelated sample from pure noise.
        current = noisy_sample
        for step in reversed(range(timestep + 1)):
            current, _ = diffusion.p_sample(current, step)
        denoised_sample = diffusion.unnormalize(current)
    np.save(denoised_sample_path, denoised_sample.cpu().numpy())
    print("Saved denoised sample for future use.")

# Split the denoised tensor back into `ema`, `pitch` and `loudness`.
# Channel layout follows the concatenation order in preprocess_sample():
# channels 0-11 -> ema, channel 12 -> pitch, channel 13 -> loudness.
# squeeze() drops the size-1 batch axis; transpose() puts the channel axis last for ema.
ema_data = denoised_sample[:, :12, :].squeeze().cpu().numpy().transpose()
# NOTE(review): preprocess_sample() stores pitch as a log ratio against the pitch
# stats; no inverse transform is applied here before synthesis — confirm that is
# what coder.decode expects.
pitch_data = denoised_sample[:, 12, :].squeeze().cpu().numpy()
loudness_data = denoised_sample[:, 13, :].squeeze().cpu().numpy()
print(ema_data.shape)
print(pitch_data.shape)
print(loudness_data.shape)

# Load SPARC model and encode speaker embedding
coder = load_model("en", device="cpu")
# encode() returns a mapping; only its 'spk_emb' entry is kept.
speaker_embedding = coder.encode(speaker_embedding_file)['spk_emb']
print(speaker_embedding.shape)

# Synthesis function
def perform_synthesis(coder, combined_data, speaker_embedding):
    """Decode articulatory features into a waveform.

    Args:
        coder: Object exposing ``decode(**code)``.
        combined_data: Either ``None`` or a 2-D array-like shaped
            ``(n_frames, n_features)`` whose last two columns are pitch and
            loudness and whose remaining leading columns are EMA. With
            ``None`` the module-level ``ema_data``, ``pitch_data`` and
            ``loudness_data`` arrays are used (the previous behaviour).
        speaker_embedding: Value passed to the decoder as ``'spk_emb'``.

    Returns:
        Whatever ``coder.decode`` returns for the assembled code dict.

    Raises:
        ValueError: If ``combined_data`` is given but is not 2-D with at
            least three columns.
    """
    if combined_data is None:
        # Backward-compatible path: fall back to the arrays prepared by the notebook.
        ema, pitch, loudness = ema_data, pitch_data, loudness_data
    else:
        features = np.asarray(combined_data)
        if features.ndim != 2 or features.shape[1] < 3:
            raise ValueError(
                "combined_data must be 2-D with at least 3 columns (ema..., pitch, loudness)"
            )
        # Values are forwarded unchanged; any de-normalisation is the caller's job.
        ema, pitch, loudness = features[:, :-2], features[:, -2], features[:, -1]

    code_dict = {
        "ema": ema,
        "pitch": pitch,
        "loudness": loudness,
        "spk_emb": speaker_embedding
    }
    return coder.decode(**code_dict)

# Perform synthesis and display audio.
# None is passed for combined_data: perform_synthesis() then reads the module-level
# ema_data / pitch_data / loudness_data arrays.
synthesized_audio = perform_synthesis(coder, None, speaker_embedding)
# Dump the decoder inputs for inspection (NumPy abbreviates large arrays).
print(ema_data)
print(pitch_data)
ipd.display(ipd.Audio(synthesized_audio, rate=coder.sr))


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 279291 KiB | 448002 KiB | 448002 KiB | 168711 KiB |
|       from large pool | 168704 KiB | 271104 KiB | 271104 KiB | 102400 KiB |
|       from small pool | 110587 KiB | 176898 KiB | 176898 KiB |  66311 KiB |
|---------------------------------------------------------------------------|
| Active memory         | 279291 KiB | 448002 KiB | 448002 KiB | 168711 KiB |
|       from large pool | 168704 KiB | 271104 KiB | 271104 KiB | 102400 KiB |
|       from small pool | 110587 KiB | 176898 KiB | 176898 KiB |  66311 KiB |
|---------------------------------------------------------------

  ckpt = torch.load(ckpt)
  WeightNorm.apply(module, name, dim)


(64,)
[[0.61155516 0.9850846  0.5297663  ... 0.4397547  0.20832384 0.5067964 ]
 [0.59273267 0.81122375 0.5961538  ... 0.6084321  0.5538503  0.5371217 ]
 [0.7225746  0.80128694 0.54388595 ... 0.6032861  0.53616357 0.44690615]
 ...
 [0.65321124 0.8739524  0.6048771  ... 0.4623033  0.4998118  0.4685274 ]
 [0.7060735  0.94819117 0.53289866 ... 0.6472876  0.5681862  0.50239307]
 [0.8391834  0.5383979  0.5215678  ... 0.69047344 0.6780749  0.6041712 ]]
[0.6859262  0.42441842 0.5618085  0.5492871  0.5886193  0.5732848
 0.59784573 0.59822905 0.5922839  0.59124917 0.59281397 0.5939934
 0.5901617  0.5949032  0.5938426  0.5894033  0.5987935  0.5947823
 0.5998757  0.5889011  0.593148   0.596278   0.59048676 0.5925676
 0.59989655 0.5982045  0.5965593  0.5955974  0.5876717  0.60017633
 0.5893769  0.5874868  0.58780086 0.595529   0.5927401  0.597722
 0.5975187  0.5929472  0.59983206 0.5991295  0.59566116 0.5934782
 0.5937352  0.588536   0.5910251  0.59476805 0.5982326  0.5937934
 0.59060526 0.5956675 