## Installation

### Clone the repository

In [None]:
# Fetch the GitHub access token from Colab's user secrets (stored under the
# name 'github') so it never has to be written into the notebook itself.
from google.colab import userdata
token = userdata.get('github')
# Clone the `colab_fix` branch, authenticating by embedding the token in the URL.
# NOTE(review): git presumably records this token-bearing URL as the `origin`
# remote inside metavoice-tts/.git/config — avoid sharing the checkout; TODO confirm.
!git clone -b colab_fix https://{token}@github.com/Dreablin/metavoice-tts.git
# Later cells use paths relative to the repository root (poetry_env.txt,
# assets/..., result.txt), so switch into the checkout.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the runtime would presumably clone/cd into a nested copy — verify before re-running.
%cd metavoice-tts

### Install dependencies

In [None]:
!sudo apt install pipx
!pipx install poetry
!pipx run poetry install && pipx run poetry run pip install torch==2.4.0 torchaudio==2.4.0
!pipx run poetry env list | sed 's/ (Activated)//' > poetry_env.txt
# NOTE: pip's dependency resolver will error & complain, ignore it!
# its due to a temporary dependency issue, `tts.synthesise` will still work as intended!

In [None]:
import sys, pathlib

# Name of the Poetry virtualenv, as written to poetry_env.txt by the install cell.
venv = pathlib.Path("poetry_env.txt").read_text().strip()

# Directory Poetry created for that virtualenv (location is hardcoded for a
# root-user runtime, as in the original path).
venv_root = pathlib.Path("/root/.cache/pypoetry/virtualenvs") / venv

# Build the `pythonX.Y` path component from the running kernel instead of
# hardcoding `python3.10`, so the path stays valid when the runtime's Python
# version changes.
python_dir = f"python{sys.version_info.major}.{sys.version_info.minor}"
site_packages = venv_root / "lib" / python_dir / "site-packages"
if not site_packages.is_dir():
    # The environment may have been created with a different minor version than
    # the kernel; fall back to whichever site-packages directory it contains.
    site_packages = next(venv_root.glob("lib/python3*/site-packages"), site_packages)

# Make the packages installed by Poetry importable from this kernel.
sys.path.append(str(site_packages))

## Inference

In [None]:
from IPython.display import Audio, display
from fam.llm.fast_inference import TTS
import os
import re
import subprocess
import sys
def _ensure_importable(module_name):
    """Import `module_name`; on ImportError, pip-install it with the kernel's interpreter."""
    try:
        __import__(module_name)
    except ImportError:
        pip_command = [sys.executable, "-m", "pip", "install", module_name]
        subprocess.check_call(pip_command)


# Packages imported right below; each one is installed on demand if missing.
required_packages = ['soundfile', 'numpy']
for package in required_packages:
    _ensure_importable(package)

import soundfile as sf
import numpy as np

# Create the TTS object once with its default arguments; the synthesis cell
# below calls `tts.synthesise` on it.
tts = TTS()

In [None]:
# Path to the input file (one utterance per line)
file_path = "result.txt"

# Lines longer than this many characters (speaker tags included) are not synthesised.
MAX_LINE_LENGTH = 140

# Speaker reference clip used for lines without a recognised speaker tag.
DEFAULT_SPK_REF_PATH = "assets/man.mp3"

# (pattern, reference clip) pairs, checked in order: <Man> wins over <Woman>
# when a line contains both.
# NOTE(review): the <Woman> entry points at the same clip as <Man>, exactly as
# the original code did. This looks like a copy-paste slip — replace it with a
# female reference clip once one is available in assets/.
SPEAKER_PATTERNS = [
    (re.compile(r'<Man>(.*?)</Man>'), "assets/man.mp3"),
    (re.compile(r'<Woman>(.*?)</Woman>'), "assets/man.mp3"),
]


def _parse_line(line):
    """Split one stripped input line into (spk_ref_path, text_to_speak).

    The text inside the first matching speaker tag pair is used when present;
    otherwise the whole line is spoken with the default reference clip.
    Matching on the regex itself (instead of substring checks followed by an
    unconditional `.group(1)`) avoids an AttributeError on lines where the
    closing tag appears before the opening tag.
    """
    spk_ref_path = DEFAULT_SPK_REF_PATH
    text_to_speak = line
    for pattern, ref_path in SPEAKER_PATTERNS:
        match = pattern.search(line)
        if match:
            spk_ref_path = ref_path
            text_to_speak = match.group(1).strip()
            break
    # Remove any surrounding double quotes from the extracted text
    return spk_ref_path, text_to_speak.strip('"')


# Values returned by `tts.synthesise`, in input order; the combining step
# reads each of them back as an audio file.
audio_paths = []

# Process each line of the file
with open(file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        # Strip whitespace and skip empty lines
        line = line.strip()
        if not line:
            continue

        # Skip over-long lines, but say so instead of dropping them silently
        if len(line) > MAX_LINE_LENGTH:
            print(f"Skipping line {line_number}: {len(line)} characters exceeds the limit of {MAX_LINE_LENGTH}")
            continue

        spk_ref_path, text_to_speak = _parse_line(line)

        # Nothing left to say once tags and quotes are removed (e.g. `<Man></Man>`)
        if not text_to_speak:
            print(f"Skipping line {line_number}: no text to speak")
            continue

        # Generate audio for the line using TTS
        wav_file = tts.synthesise(text=text_to_speak, spk_ref_path=spk_ref_path)

        # Store path of generated audio file
        audio_paths.append(wav_file)

# Combine all generated audio files into a single file 'result.wav'
output_wav = "result.wav"

# Sample arrays of every clip, in order, plus the sample rate they all share
audio_data = []
samplerate = None  # Taken from the first clip

for audio_file in audio_paths:
    data, sr = sf.read(audio_file)
    if samplerate is None:
        samplerate = sr  # Set samplerate from the first file
    elif sr != samplerate:
        # Concatenating clips recorded at different rates would silently
        # produce audio that plays at the wrong speed, so stop here instead.
        raise ValueError(
            f"Sample rate mismatch: {audio_file} is {sr} Hz, expected {samplerate} Hz"
        )
    audio_data.append(data)

if audio_data:
    # Concatenate all audio data
    combined_audio = np.concatenate(audio_data)

    # Write combined audio to the output wav file
    sf.write(output_wav, combined_audio, samplerate)

    print(f"All audio has been combined into {output_wav}")
else:
    # np.concatenate raises an opaque ValueError on an empty list; report the
    # real cause (no line was synthesised) instead.
    print(f"No audio was generated, so {output_wav} was not written")