<a href="https://colab.research.google.com/github/7exe/HRVC/blob/main/Hybrid_Retrieval_based_Voice_Conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies
!pip install torch torchaudio torchcrepe librosa soundfile numpy tqdm einops

# (Optional) clone your repo or create workspace
!mkdir -p v2v_project && cd v2v_project


Collecting torchcrepe
  Downloading torchcrepe-0.0.24-py3-none-any.whl.metadata (8.3 kB)
Collecting resampy (from torchcrepe)
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading torchcrepe-0.0.24-py3-none-any.whl (72.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.3/72.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m115.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: resampy, torchcrepe
Successfully installed resampy-0.4.3 torchcrepe-0.0.24


In [2]:
import torch, torch.nn as nn, torch.nn.functional as F
import torchaudio
import torchcrepe
import librosa, soundfile as sf
import numpy as np
from tqdm import tqdm

# Select the compute device: use the GPU when CUDA is available, else the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Using", device)


Using cuda


In [3]:
# === Core modules ===

class TimbreStyleEncoder(nn.Module):
    """Collapse a mel-spectrogram into one normalised embedding per utterance.

    Frames with 80 features are projected to ``model_dim``, passed through
    ``depth`` Transformer encoder layers, averaged over the frame axis and
    projected to ``dim`` features, which are then layer-normalised.

    Args:
        dim: Size of the returned embedding.
        model_dim: Width of the Transformer layers.
        depth: Number of Transformer encoder layers.
        nhead: Attention heads per layer.
    """

    def __init__(self, dim=128, model_dim=192, depth=2, nhead=4):
        super().__init__()
        self.inp = nn.Linear(80, model_dim)
        layers = []
        for _ in range(depth):
            layers.append(
                nn.TransformerEncoderLayer(
                    d_model=model_dim,
                    nhead=nhead,
                    dim_feedforward=4 * model_dim,
                    activation="gelu",
                    batch_first=True,
                )
            )
        self.blocks = nn.ModuleList(layers)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.proj = nn.Linear(model_dim, dim)

    def forward(self, mel):
        """Encode ``mel`` of shape (batch, frames, 80) into (batch, dim)."""
        hidden = self.inp(mel)
        for layer in self.blocks:
            hidden = layer(hidden)
        # The pooling layer averages the last axis, so move frames there first.
        pooled = self.pool(hidden.transpose(1, 2)).squeeze(-1)
        embedding = self.proj(pooled)
        # Parameter-free layer norm over the embedding axis.
        return F.layer_norm(embedding, [self.proj.out_features])


class ProsodyPrototype(nn.Module):
    """Learned accent/prosody prototype vector.

    Holds one trainable row of ``dim`` values and broadcasts it to every
    frame of every item in a batch.
    """

    def __init__(self, dim=128):
        super().__init__()
        # Single trainable row of shape (1, dim), drawn from a standard normal.
        initial = torch.randn(1, dim)
        self.proto = nn.Parameter(initial)

    def forward(self, B, T):
        """Return the prototype as a (B, T, dim) view; no data is copied."""
        row = self.proto.unsqueeze(0)  # (1, 1, dim)
        return row.expand(B, T, -1)


class UnitF0EnergyToMel(nn.Module):
    """Transformer that maps content units, pitch, energy and style to a mel-spectrogram.

    Each frame's input is the concatenation of the content units, one log-F0
    value, one energy value, the utterance-level timbre embedding and the
    frame-level prosody vector.

    Args:
        in_dim: Feature size of the content units only (without F0/energy).
        out_dim: Number of output features per frame.
        model_dim: Transformer width.
        depth: Number of Transformer encoder layers.
        nhead: Attention heads per layer.
        ff_mult: Feed-forward width as a multiple of ``model_dim``.
        timbre_dim: Size of the timbre embedding.
        prosody_dim: Size of the prosody vector.
        max_len: Longest sequence (in frames) the learned positional table covers.
    """

    def __init__(self, in_dim, out_dim=80, model_dim=384, depth=6, nhead=6, ff_mult=4,
                 timbre_dim=128, prosody_dim=128, max_len=2048):
        super().__init__()
        # "+ 2" accounts for the log-F0 and energy scalars that forward()
        # appends to every frame; without it the projection is two features short.
        self.inp = nn.Linear(in_dim + 2 + timbre_dim + prosody_dim, model_dim)
        # Learned positional embedding, one row per frame index.
        self.pe = nn.Parameter(torch.randn(1, max_len, model_dim))
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(model_dim, nhead, dim_feedforward=model_dim*ff_mult, batch_first=True, activation="gelu")
            for _ in range(depth)
        ])
        self.out = nn.Linear(model_dim, out_dim)

    def forward(self, units, logf0, energy, timbre, prosody_proto):
        """Predict mel frames.

        Args:
            units: (B, T, in_dim) content features.
            logf0: (B, T) log fundamental frequency.
            energy: (B, T) log energy.
            timbre: (B, timbre_dim) utterance-level embedding, repeated over T.
            prosody_proto: (B, T, prosody_dim) prosody vectors.

        Returns:
            Tensor of shape (B, T, out_dim).

        Raises:
            ValueError: If T exceeds the length of the positional table.
        """
        _, T, _ = units.shape
        max_len = self.pe.shape[1]
        if T > max_len:
            raise ValueError(
                f"sequence length {T} exceeds the positional embedding length {max_len}"
            )
        x = torch.cat(
            [
                units,
                logf0.unsqueeze(-1),
                energy.unsqueeze(-1),
                timbre.unsqueeze(1).expand(-1, T, -1),
                prosody_proto,
            ],
            dim=-1,
        )
        x = self.inp(x) + self.pe[:, :T]
        for blk in self.blocks:
            x = blk(x)
        return self.out(x)


In [4]:
def extract_mel(wav, sr=16000, n_mels=80):
    """Compute a dB-scaled mel-spectrogram of a mono waveform.

    Args:
        wav: 1-D waveform (NumPy array, sequence or tensor).
        sr: Sample rate of ``wav`` in Hz.
        n_mels: Number of mel bands.

    Returns:
        Tensor of shape (T, n_mels), one row per 256-sample hop.
    """
    # as_tensor handles arrays and tensors alike (no copy-construct warning) and
    # the explicit float32 keeps float64 audio from propagating a float64 dtype
    # into the float32 models that consume these features.
    audio = torch.as_tensor(wav, dtype=torch.float32).unsqueeze(0)
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr, n_fft=1024, hop_length=256, n_mels=n_mels
    )
    mel = to_mel(audio)
    # 20 * log10(max(mel, 1e-5)). The multiplier is kept at 20 so features stay
    # on the scale the rest of the notebook uses.
    # NOTE(review): the transform is built with its default `power`, so this may be
    # a power spectrogram, for which 10 is the usual multiplier - confirm before changing.
    mel = torchaudio.functional.amplitude_to_DB(mel, multiplier=20.0, amin=1e-5, db_multiplier=0.0)
    return mel.squeeze(0).T  # (T, n_mels)

def extract_f0_energy(wav, sr=16000):
    """Extract frame-level log-F0 and log-energy from a mono waveform.

    Both tracks use a 256-sample hop.

    Args:
        wav: 1-D waveform (NumPy array or array-like).
        sr: Sample rate of ``wav`` in Hz.

    Returns:
        Tuple ``(logf0, energy)`` of 1-D float32 CPU tensors.
    """
    wav_np = np.asarray(wav, dtype=np.float32)
    wav_t = torch.as_tensor(wav_np, device=device).unsqueeze(0)
    f0 = torchcrepe.predict(
        wav_t, sr, 256, fmin=50, fmax=550, model="full",
        batch_size=2048,  # process frames in chunks so long clips do not exhaust memory
        device=device, return_periodicity=False
    ).squeeze(0)
    logf0 = torch.log(f0 + 1e-6).cpu()
    rms = librosa.feature.rms(y=wav_np, frame_length=1024, hop_length=256)[0]
    # Clamp before the log so silent frames map to log(1e-6) instead of -inf.
    energy = torch.log(torch.clamp(torch.as_tensor(rms, dtype=torch.float32), min=1e-6))
    # energy is already a tensor; wrapping it in torch.tensor() again would only
    # emit a UserWarning and make an extra copy.
    return logf0, energy


In [None]:
# Simplified unsupervised loop (fill with your dataset loader)

# Build the three trainable components on the selected device.
timbre_enc = TimbreStyleEncoder().to(device)
prosody_proto = ProsodyPrototype().to(device)
generator = UnitF0EnergyToMel(in_dim=256).to(device)  # assuming HuBERT gives 256-dim units

# A single Adam optimiser over all parameters (generator, timbre encoder, prototype).
params = [p for module in (generator, timbre_enc, prosody_proto) for p in module.parameters()]
optim = torch.optim.Adam(params, lr=2e-4)

for epoch in range(1):  # increase as needed
    # TODO: replace with real dataset loop
    wav, sr = librosa.load(librosa.example("trumpet"), sr=16000)
    mel = extract_mel(wav).to(device)
    logf0, energy = extract_f0_energy(wav)

    # dummy HuBERT-like units (replace with pretrained HuBERT extraction)
    units = torch.randn(mel.shape[0], 256)[None].to(device)

    # The batched mel serves as both the timbre reference and the regression target.
    target = mel.unsqueeze(0)
    timbre = timbre_enc(target)
    pros = prosody_proto(B=1, T=units.shape[1])

    mel_hat = generator(units, logf0[None].to(device), energy[None].to(device), timbre, pros)

    loss = F.l1_loss(mel_hat, target)
    optim.zero_grad()
    loss.backward()
    optim.step()
    print("Loss:", loss.item())


In [None]:
def convert_voice(src_wav, ref_wav, sr=16000, out_path="converted.wav"):
    """Convert the voice in ``src_wav`` towards the timbre of ``ref_wav``.

    Uses the notebook-level ``timbre_enc``, ``prosody_proto`` and ``generator``
    modules, then vocodes the predicted mel-spectrogram with Griffin-Lim.

    Args:
        src_wav: Path of the source recording (provides pitch, energy, length).
        ref_wav: Path of the reference recording (provides timbre).
        sr: Sample rate used for loading and for the written file.
        out_path: Where the converted audio is written.

    Returns:
        ``out_path``.
    """
    # load
    src, _ = librosa.load(src_wav, sr=sr)
    ref, _ = librosa.load(ref_wav, sr=sr)

    # Inference must run without dropout and without building an autograd graph.
    # Each module's previous train/eval mode is restored afterwards.
    modules = (timbre_enc, prosody_proto, generator)
    previous_modes = [m.training for m in modules]
    for m in modules:
        m.eval()
    try:
        with torch.no_grad():
            src_mel = extract_mel(src)
            src_logf0, src_energy = extract_f0_energy(src)

            ref_mel = extract_mel(ref).to(device)
            timbre = timbre_enc(ref_mel.unsqueeze(0))

            units = torch.randn(src_mel.shape[0], 256).unsqueeze(0).to(device)  # replace with HuBERT encoding
            pros = prosody_proto(B=1, T=units.shape[1])

            mel_hat = generator(
                units,
                src_logf0.unsqueeze(0).to(device),
                src_energy.unsqueeze(0).to(device),
                timbre,
                pros,
            )
    finally:
        for m, was_training in zip(modules, previous_modes):
            m.train(was_training)

    # Griffin-Lim vocoder.
    # 1) Undo the dB scaling applied by extract_mel (20 * log10), not a natural log.
    mel_db = mel_hat[0].T.cpu()  # (n_mels, T)
    mel_lin = torch.pow(10.0, mel_db / 20.0)
    # 2) Griffin-Lim needs a linear-frequency spectrogram with n_fft // 2 + 1 bins,
    #    so map the mel bands back first. This runs outside no_grad because some
    #    torchaudio releases solve the inversion by gradient descent.
    n_fft = 1024
    to_linear = torchaudio.transforms.InverseMelScale(
        n_stft=n_fft // 2 + 1, n_mels=mel_lin.shape[0], sample_rate=sr
    )
    spec = to_linear(mel_lin)
    wav_out = torchaudio.transforms.GriffinLim(n_fft=n_fft, hop_length=256)(spec)
    sf.write(out_path, wav_out.detach().cpu().numpy(), sr)
    return out_path

# Example usage (replace with your own files)
# out_path = convert_voice("source.wav", "reference.wav")


In [None]:
# Use ProsodyStyleEncoder to compute average prosody vector from target-voice dataset
# and initialize prosody_proto.

def init_prototype_from_dir(proto, wav_dir, sr=16000, encoder=None):
    """Initialise ``proto.proto`` with the mean encoder output over a folder of WAVs.

    Args:
        proto: Module whose ``proto`` parameter is overwritten in place.
        wav_dir: Directory scanned (non-recursively) for ``.wav`` files.
        sr: Sample rate used when loading the audio.
        encoder: Optional callable ``encoder(logf0, energy)`` returning one vector
            per file. When omitted, ``ProsodyStyleEncoder()`` is instantiated.
            NOTE(review): no ``ProsodyStyleEncoder`` is defined in the cells
            visible here - define it first or pass ``encoder`` explicitly.

    Raises:
        ValueError: If the directory has no ``.wav`` files, or the averaged
            vector does not have as many elements as ``proto.proto``.
    """
    import os

    # Sorted for a deterministic order; extension match is case-insensitive.
    wav_files = sorted(f for f in os.listdir(wav_dir) if f.lower().endswith(".wav"))
    if not wav_files:
        raise ValueError(f"No .wav files found in {wav_dir!r}")

    if encoder is None:
        encoder = ProsodyStyleEncoder().to(device).eval()

    vecs = []
    with torch.no_grad():
        for f in wav_files:
            wav, _ = librosa.load(os.path.join(wav_dir, f), sr=sr)
            logf0, energy = extract_f0_energy(wav)
            vec = encoder(logf0.unsqueeze(0).to(device), energy.unsqueeze(0).to(device))
            # Flatten so (dim,) and (1, dim) encoder outputs are handled alike.
            vecs.append(vec.detach().cpu().reshape(-1))

    mean_vec = torch.stack(vecs).mean(0)
    if mean_vec.numel() != proto.proto.numel():
        raise ValueError(
            f"encoder output has {mean_vec.numel()} values but the prototype "
            f"holds {proto.proto.numel()}"
        )
    with torch.no_grad():
        proto.proto.copy_(mean_vec.reshape(proto.proto.shape))
    print("Prototype initialized from", len(vecs), "files")

# Example:
# init_prototype_from_dir(prosody_proto, "/path/to/target_voice_wavs")
