In [88]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="datasets/elise_text.csv")
display(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1195
    })
})

In [89]:
import base64
import json
import numpy as np
import os
from typing import AsyncGenerator, Generator
import asyncio
import azure.cognitiveservices.speech as speechsdk
import logging
import requests
from openai.helpers import LocalAudioPlayer

from dotenv import load_dotenv
load_dotenv()

class AzureTTSGenerator:
    def __init__(self, voice_id: str):
        self.voice_id = voice_id
        

        self.ssml_string = """
            <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' 
                xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'>
            <voice name='{voice_id}'>
                <prosody rate='+20%' volume='+30%'>
                {text}
                </prosody>
            </voice>
            </speak>
        """
        speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), endpoint="https://westus2.api.cognitive.microsoft.com/")
        speech_config.speech_synthesis_voice_name=self.voice_id
        self.speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    
    def get_audio_bytes(self, text: str) -> bytes:
        ssml = self.ssml_string.format(voice_id=self.voice_id, text=text)
        speech_synthesis_result = self.speech_synthesizer.speak_ssml_async(ssml).get()
        audio_data = speech_synthesis_result.audio_data
        # Fade in/out to reduce popping
        if audio_data:
            # Convert to numpy
            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
            fade_ms = 150
            sample_rate = 24000  # Azure TTS default is 24kHz, change if needed
            fade_samples = int(sample_rate * fade_ms / 1000)
            if len(audio_np) > 2 * fade_samples:
                # Fade in
                fade_in = np.linspace(0, 1, fade_samples)
                audio_np[:fade_samples] *= fade_in
                # Fade out
                fade_out = np.linspace(1, 0, fade_samples)
                audio_np[-fade_samples:] *= fade_out
            # Remove DC offset
            audio_np -= np.mean(audio_np)
            # Convert back to int16
            audio_np = np.clip(audio_np, -32768, 32767)
            audio_data = audio_np.astype(np.int16).tobytes()
        return audio_data
    
    async def async_get_generator(self, text: str) -> AsyncGenerator[bytes, None]:
        loop = asyncio.get_event_loop()
        ssml = self.ssml_string.format(voice_id=self.voice_id, text=text)
        future = self.speech_synthesizer.speak_ssml_async(ssml)
        speech_synthesis_result = await loop.run_in_executor(None, future.get)
        stream = speechsdk.AudioDataStream(speech_synthesis_result)
        audio_buffer = bytes(1280000)
        filled_size = stream.read_data(audio_buffer)
        while filled_size > 0:
            print(f"{filled_size} bytes received.")
            yield audio_buffer[:filled_size]
            filled_size = stream.read_data(audio_buffer)

        if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized for text [{}]".format(text))
        elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_synthesis_result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                if cancellation_details.error_details:
                    print("Error details: {}".format(cancellation_details.error_details))
                    print("Did you set the speech resource key and endpoint values?")

In [93]:
import torch
import torchaudio

def bytes_to_pcm_float32(audio_bytes: bytes) -> np.ndarray:
    pcm_int16 = np.frombuffer(audio_bytes, dtype=np.int16)
    pcm_float32 = pcm_int16.astype(np.float32) / 32768.0
    return pcm_float32.reshape(-1, 1)  # Ensure shape is (N, 1)

def vocaloidify_audio_bytes(audio_bytes: bytes, orig_sr: int = 16000, pitch_semitones: int = 6, target_sr: int = None) -> bytes:
    """
    Takes PCM 16-bit mono audio bytes, applies a bandpass filter to keep only mid frequencies (musical tone),
    then applies pitch shift and mixes harmonies for a simple Vocaloid-like effect.
    Returns PCM 16-bit mono audio bytes (optionally resampled).
    """
    # Convert bytes to float32 waveform
    waveform = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
    # # Add vibrato (sinusoidal modulation)
    # t = np.arange(waveform.shape[0]) / orig_sr
    # vibrato_freq = 6.0  # Hz
    # vibrato_depth = 0.04  # Fractional semitones
    # vibrato = np.sin(2 * np.pi * vibrato_freq * t) * vibrato_depth
    # waveform = waveform * (1 + vibrato)
    # Convert to torch tensor for pitch shifting
    waveform_tensor = torch.from_numpy(waveform).unsqueeze(0)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    waveform_tensor = waveform_tensor.to(device)
    # Main pitch shift (more extreme)
    shifted_main = torchaudio.functional.pitch_shift(waveform_tensor, orig_sr, n_steps=pitch_semitones)
    # More harmonies
    shifted_harmony1 = torchaudio.functional.pitch_shift(waveform_tensor, orig_sr, n_steps=4)
    shifted_harmony2 = torchaudio.functional.pitch_shift(waveform_tensor, orig_sr, n_steps=10)
    shifted_harmony3 = torchaudio.functional.pitch_shift(waveform_tensor, orig_sr, n_steps=-4)
    # Mix main and harmonies
    mix = 0.7*shifted_main + 0.2 * shifted_harmony1 + 0.1 * shifted_harmony2 + 0.05 * shifted_harmony3
    # Resample if needed
    if target_sr and target_sr != orig_sr:
        mix = torchaudio.transforms.Resample(orig_sr, target_sr).to(device)(mix)
        out_sr = target_sr
    else:
        out_sr = orig_sr
    # Apply fade-in and fade-out to avoid popping
    out_np = mix.squeeze().clamp(-1, 1).cpu().numpy()
    fade_duration = int(0.05 * out_sr)  # 50 ms fade
    if out_np.shape[0] > 2 * fade_duration:
        fade_in = np.linspace(0, 1, fade_duration)
        fade_out = np.linspace(1, 0, fade_duration)
        out_np[:fade_duration] *= fade_in
        out_np[-fade_duration:] *= fade_out
    # Convert back to int16 bytes
    out_wave = (out_np * 32767.0).astype(np.int16)
    return out_wave.tobytes(), out_sr

def resample_pcm_float32(waveform: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """
    Resample a PCM float32 numpy array from orig_sr to target_sr using torchaudio.
    waveform: shape (N, 1) or (N,)
    Returns resampled waveform with shape (M, 1)
    """
    if waveform.ndim == 2 and waveform.shape[1] == 1:
        waveform = waveform.T  # shape (1, N)
    elif waveform.ndim == 1:
        waveform = waveform[np.newaxis, :]  # shape (1, N)
    tensor_waveform = torch.from_numpy(waveform)
    resampler = torchaudio.transforms.Resample(orig_sr, target_sr)
    resampled = resampler(tensor_waveform)
    resampled_np = resampled.squeeze(0).cpu().numpy()
    return resampled_np.reshape(-1, 1)

In [91]:
import sys
from rvc_python.infer import RVCInference
rvc = RVCInference(device="cuda:0")
RVCInference.set_params({"f0method": "rvmpe"})
rvc.load_model("../ai_assistant/weights/miku_default/miku_default_rvc.pth", index_path="../ai_assistant/weights/miku_default/added_IVF4457_Flat_nprobe_1_miku_default_rvc_v2.index")
tts = AzureTTSGenerator(voice_id="en-CA-ClaraNeural")


2025-09-19 19:46:44 | INFO | rvc_python.modules.vc.modules | Loading: ../ai_assistant/weights/miku_default/miku_default_rvc.pth


gin_channels: 256 self.spk_embed_dim: 109
Model miku_default_rvc.pth loaded.


In [114]:
import sounddevice as sd
OUTPUT_SAMPLE_RATE = 24000

# rvc_bytes = rvc.infer_bytes(tts.get_audio_bytes("Hello there. My name is Hatsune Miku!"))
# vocal_bytes = vocaloidify_audio_bytes(rvc_bytes, pitch_semitones=3)[0]
# rvc_chunks = bytes_to_pcm_float32(vocal_bytes) # resample_pcm_float32(bytes_to_pcm_float32(rvc_bytes), orig_sr=24000, target_sr=24000)
# # write("output.wav", 35000, rvc_chunks)
# sd.play(rvc_chunks, samplerate=35000)
# sd.wait()

audios = []

for index, item in enumerate(dataset["train"]):
    if index >= 3:
        break
    print(item)
    text = item["text"]
    tts_bytes = rvc.infer_bytes(tts.get_audio_bytes(text))
    vocal_bytes = vocaloidify_audio_bytes(tts_bytes, pitch_semitones=5)[0]
    rvc_chunks = resample_pcm_float32(bytes_to_pcm_float32(vocal_bytes), orig_sr=35000, target_sr=OUTPUT_SAMPLE_RATE)
    rvc_chunks = rvc_chunks.flatten()
    sd.play(rvc_chunks, samplerate=OUTPUT_SAMPLE_RATE)
    sd.wait()

{'text': 'Hey, hey, hey, come here. I want to pet you.', 'audio': None}
{'text': "I know you're working, but you can take a break, can't you? Well, I want to pet you, so you should stop and come here.", 'audio': None}


KeyboardInterrupt: 

In [None]:
from datasets import Audio
from tqdm import tqdm
from scipy.io.wavfile import write
import random

# rvc_bytes = rvc.infer_bytes(tts.get_audio_bytes("Hello there. My name is Hatsune Miku!"))
# vocal_bytes = vocaloidify_audio_bytes(rvc_bytes, pitch_semitones=3)[0]
# rvc_chunks = bytes_to_pcm_float32(vocal_bytes) # resample_pcm_float32(bytes_to_pcm_float32(rvc_bytes), orig_sr=24000, target_sr=24000)
# # write("output.wav", 35000, rvc_chunks)
# sd.play(rvc_chunks, samplerate=35000)
# sd.wait()
finished_dataset = dataset.copy()
audios = []
save_ratio = 0.01 
for item in tqdm(finished_dataset["train"]):
    text = item["text"]
    tts_bytes = rvc.infer_bytes(tts.get_audio_bytes(text))
    vocal_bytes = vocaloidify_audio_bytes(tts_bytes, pitch_semitones=5)[0]
    rvc_chunks = resample_pcm_float32(bytes_to_pcm_float32(vocal_bytes), orig_sr=35000, target_sr=OUTPUT_SAMPLE_RATE)
    rvc_chunks = rvc_chunks.flatten()
    audios.append({"array": rvc_chunks, "sampling_rate": OUTPUT_SAMPLE_RATE})
    if np.random.rand() < save_ratio:
        write(f"output{random.randint(0, 10000)}.wav", OUTPUT_SAMPLE_RATE, rvc_chunks)

100%|██████████| 1195/1195 [1:07:10<00:00,  3.37s/it]


ValueError: The table can't have duplicated columns but columns ['audio'] are duplicated.

In [160]:
# Remove the 'audio' column from finished_dataset["train"]
if "audio" in finished_dataset["train"].column_names:
    finished_dataset["train"] = finished_dataset["train"].remove_columns("audio")

In [None]:
from huggingface_hub import login
import os
login(os.getenv("HF_TOKEN"))
finished_dataset['train'].push_to_hub("Bossologist/miku-finetune-ds-test", private=True)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.15s/ba]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.12s/ba]
Uploading the dataset shards: 100%|██████████| 2/2 [03:29<00:00, 104.87s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Bossologist/miku-finetune-ds-test/commit/a5a701fd73f31ee95cfc90fd16fe4509a8aa8613', commit_message='Upload dataset', commit_description='', oid='a5a701fd73f31ee95cfc90fd16fe4509a8aa8613', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Bossologist/miku-finetune-ds-test', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Bossologist/miku-finetune-ds-test'), pr_revision=None, pr_num=None)

In [None]:
# sd.play(finished_dataset['train'][0]['audio']['array'], blocking=True, samplerate=OUTPUT_SAMPLE_RATE)
finished_dataset['train'][0]['audio']

{'array': [0.00019731229986064136,
  -0.00014607718912884593,
  0.001307738944888115,
  0.0040465365163981915,
  0.00041140406392514706,
  0.0009560502367094159,
  -0.004072791431099176,
  0.002320936182513833,
  -1.427540246368153e-05,
  0.005845612846314907,
  0.006826967000961304,
  -0.0025298497639596462,
  0.0011976368259638548,
  -0.00029472706955857575,
  5.367521589505486e-05,
  0.00031753824441693723,
  0.001299248542636633,
  -8.186750164895784e-06,
  -0.0002804866817314178,
  5.9581703681033105e-05,
  -5.3927629778627306e-05,
  4.2459523683646694e-05,
  3.853702583000995e-05,
  4.118705874134321e-06,
  -1.8645681848283857e-05,
  -7.76008382672444e-05,
  7.483786248485558e-06,
  -7.018249743850902e-05,
  2.885426329157781e-05,
  -2.3094173229765147e-05,
  0.0001222457503899932,
  -9.397098438057583e-06,
  -3.2653959351591766e-05,
  -0.00010159468365600333,
  -4.805498247151263e-05,
  -5.912330834689783e-06,
  -1.2993408745387569e-05,
  4.3917534640058875e-05,
  -2.94661422231

In [155]:
import os
from tqdm import tqdm

os.makedirs("data", exist_ok=True)
for idx, audio in enumerate(tqdm(audios)):
    file_path = os.path.join("data", f"audio_{idx}.wav")
    write(file_path, OUTPUT_SAMPLE_RATE, audio["array"])

100%|██████████| 1195/1195 [00:00<00:00, 1395.85it/s]


In [157]:
audio_with_path = []
for idx, audio in enumerate(audios):
    file_path = os.path.join("data", f"audio_{idx}.wav")
    audio_with_path.append({"path": file_path, "sampling_rate": OUTPUT_SAMPLE_RATE})
audio_with_path

[{'path': 'data\\audio_0.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_1.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_2.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_3.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_4.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_5.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_6.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_7.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_8.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_9.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_10.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_11.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_12.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_13.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_14.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_15.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_16.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_17.wav', 'sampling

In [None]:
finished_dataset["train"] = finished_dataset["train"].add_column("audio", audio_with_path)

ValueError: The table can't have duplicated columns but columns ['audio'] are duplicated.

In [163]:
finished_dataset["train"]['audio']

[{'path': 'data\\audio_0.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_1.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_2.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_3.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_4.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_5.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_6.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_7.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_8.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_9.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_10.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_11.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_12.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_13.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_14.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_15.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_16.wav', 'sampling_rate': 24000},
 {'path': 'data\\audio_17.wav', 'sampling

In [164]:
from datasets import Dataset, Audio

audio_dataset = Dataset.from_dict({"audio": [audio['path'] for audio in audio_with_path]}).cast_column("audio", Audio())

In [169]:
audio_dataset = audio_dataset.add_column("text", finished_dataset["train"]["text"])

In [None]:
from huggingface_hub import login
import os
login(os.getenv("HF_TOKEN"))
audio_dataset.push_to_hub("Bossologist/miku-finetune-ds-test-fixed")

Map: 100%|██████████| 598/598 [00:00<00:00, 1881.36 examples/s]t/s]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 13.50ba/s]
Map: 100%|██████████| 597/597 [00:03<00:00, 197.27 examples/s]6, 136.88s/it]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 10.64ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [04:35<00:00, 137.63s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Bossologist/miku-finetune-ds-test-fixed/commit/f288876d69d53a128fafec362ef71f9ee6ed4cce', commit_message='Upload dataset', commit_description='', oid='f288876d69d53a128fafec362ef71f9ee6ed4cce', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Bossologist/miku-finetune-ds-test-fixed', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Bossologist/miku-finetune-ds-test-fixed'), pr_revision=None, pr_num=None)

In [None]:
model.push_to_hub_merged("Bossologist/model", tokenizer, save_method = "merged_4bit", token = "")