In [None]:
# Show the GPU (if any) attached to this runtime.
!nvidia-smi

# Install the notebook's Python dependencies quietly (-q):
#   pyngrok          - client used to open the public tunnel to the local server
#   whisper          - speech-to-text model, installed from its GitHub repository
#   fastapi/uvicorn  - web framework and ASGI server for the HTTP endpoints
#   python-multipart - required for multipart file uploads (the /asr endpoint)
#   nest-asyncio     - applied right before uvicorn.run in the last cell
!pip install pyngrok -q
!pip install git+https://github.com/openai/whisper.git -q
!pip install fastapi[all] -q
!pip install uvicorn[standard] -q
!pip install python-multipart -q
!pip install nest-asyncio -q

In [None]:
import logging

# Root logger configuration for the whole notebook: DEBUG verbosity with
# timestamped, logger-qualified records. force=True replaces handlers that are
# already installed on the root logger, so re-running this cell takes effect.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)

## Download Required Libraries

In [None]:
import platform
import sys

# Colab form field: when True the "-gpu" ONNX Runtime archive is downloaded here
# and later cells pick the CUDA build of VOICEVOX Core and GPU acceleration.
USE_GPU = True #@param {"type": "boolean"}
# Machine architecture and OS identifier; both are spliced into the archive name below.
CPU_ARCH = platform.machine()
OS_TYPE = sys.platform

logging.debug(f'CPU Arch: {CPU_ARCH}')
logging.debug(f'OS: {OS_TYPE}')

# The archive name built below uses "x64" in place of "x86_64" on these platforms.
# NOTE(review): sys.platform reports "win32" on Windows, so the 'win' comparison
# presumably never matches; the wget/tar/cp steps below look Linux (Colab) only
# in any case - confirm before relying on this cell elsewhere.
if (OS_TYPE == 'win' or OS_TYPE == 'linux') and CPU_ARCH == 'x86_64':
  CPU_ARCH = 'x64'

# Pinned ONNX Runtime release; the name follows onnxruntime-<os>-<arch>[-gpu]-<version>.
ONNX_VERSION = "1.13.1"
ONNX_VAR = f"{CPU_ARCH}{'-gpu' if USE_GPU else ''}"
ONNX_NAME = f"onnxruntime-{OS_TYPE}-{ONNX_VAR}-{ONNX_VERSION}"
ONNX_URL = f'https://github.com/microsoft/onnxruntime/releases/download/v{ONNX_VERSION}/{ONNX_NAME}.tgz'

logging.debug(f'ONNX Filename = {ONNX_NAME}')
# Download the archive (-N: only when newer than a local copy) and unpack it.
!wget -N $ONNX_URL
!tar -xzf {ONNX_NAME}.tgz

# Copy the extracted libraries to /usr/local/lib and /content.
# Assumes the archive was unpacked under /content (Colab working directory) - TODO confirm.
# Presumably this lets voicevox_core locate the ONNX Runtime shared library - verify.
!cp /content/{ONNX_NAME}/lib/* /usr/local/lib
!cp /content/{ONNX_NAME}/lib/* /content

In [None]:
# Pinned VOICEVOX Core release. The wheel file name encodes the CUDA/CPU variant
# (from USE_GPU), the OS identifier (OS_TYPE) and the machine architecture; those
# names come from the ONNX Runtime download cell, which must have run first.
VOICEVOX_VERSION = '0.14.2'
VOICEVOX_FILE = f"voicevox_core-{VOICEVOX_VERSION}+{'cuda' if USE_GPU else 'cpu'}-cp38-abi3-{OS_TYPE}_{platform.machine()}.whl"

# Install the wheel directly from the project's GitHub release assets.
!pip install https://github.com/VOICEVOX/voicevox_core/releases/download/{VOICEVOX_VERSION}/{VOICEVOX_FILE}

In [None]:
# Base name of the Open JTalk dictionary archive.
OPEN_JTALK_NAME = "open_jtalk_dic_utf_8-1.11"

# Download the dictionary tarball from SourceForge (-N: only when newer than a
# local copy) and unpack it in the current working directory.
!wget -N https://onboardcloud.dl.sourceforge.net/project/open-jtalk/Dictionary/open_jtalk_dic-1.11/{OPEN_JTALK_NAME}.tar.gz
!tar -xzf {OPEN_JTALK_NAME}.tar.gz

# Dictionary directory passed to VoicevoxCore in the main cell.
# Assumes the tarball unpacks to a directory of the same name under /content - TODO confirm.
OPEN_JTALK_DIR = f"/content/{OPEN_JTALK_NAME}"

# Main Code

In [None]:
import whisper
from pyngrok import ngrok
from voicevox_core import AccelerationMode, AudioQuery, VoicevoxCore

# Colab form fields configuring the server.
# ngrok auth token; blank here - presumably meant to be filled in before running.
NGROK_AUTH_TOKEN = "" #@param {type:"string"}
# Files the /asr endpoint writes an upload to before passing the path to Whisper
# (one per task).
TRANSLATE_FILENAME = 'translate.wav' #@param {type:"string"}
TRANSCRIBE_FILENAME = 'transcribe.wav' #@param {type:"string"}
# Whisper checkpoint name passed to whisper.load_model.
WHISPER_MODEL = "small" #@param ["tiny", "base", "small", "medium", "large"]
# NOTE(review): CHUNK_SIZE is not referenced anywhere in the visible cells - confirm it is still needed.
CHUNK_SIZE = 4096 #@param {type:"integer"}
# Voice model loaded at startup and used as the default `speaker` of the TTS endpoints.
DEFAULT_VOICE_ID = 5 #@param {type:"integer"}

# Register the token with pyngrok before any tunnel is opened.
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Whisper: load the speech-recognition checkpoint named by WHISPER_MODEL.
logging.info(f'[WHISPER] loading up "{WHISPER_MODEL}" model..')
model = whisper.load_model(WHISPER_MODEL)
logging.info("[WHISPER] loaded")

# VoiceVox: create the synthesis core on the requested device and preload the
# default voice so the first TTS request does not pay the model-loading cost.
logging.info("[VOICEVOX] loading up voicevox core..")

if USE_GPU:
  voicevox_mode = "GPU"
else:
  voicevox_mode = "CPU"

core = VoicevoxCore(
    open_jtalk_dict_dir=OPEN_JTALK_DIR,
    acceleration_mode=voicevox_mode,
)
core.load_model(DEFAULT_VOICE_ID)

# Report the device the core actually ended up on, not the one requested.
active_device = 'gpu' if core.is_gpu_mode else 'cpu'
logging.info(f"[VOICEVOX] successfully loaded! running on {active_device}")

In [None]:
from io import BytesIO
import json
from fastapi import FastAPI, File, UploadFile, Request, Response, Body
from typing import Annotated, Union, Dict, Any
import asyncio
import uvicorn
import nest_asyncio
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import time
import numpy as np

# Single FastAPI application that every endpoint in this notebook is registered on.
app = FastAPI()

# Fully permissive CORS policy: any origin, method and header is accepted and
# credentials are allowed.
cors_settings = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **cors_settings)


@app.get('/')
async def main():
  """Root endpoint: reply with a fixed greeting payload."""
  greeting = {'message': 'Hello world'}
  return greeting

# TTS
@app.post('/audio_query')
def query(text: str, speaker: int = DEFAULT_VOICE_ID):
  """Build a VoiceVox audio query for `text` without synthesizing audio.

  Args:
    text: Text to generate the audio query from.
    speaker: Voice model id; its model is loaded first if it is not loaded yet.

  Returns:
    The audio query produced by the core, with stereo output switched on.

  Note:
    The /tts handler further down is also named `query` and rebinds this
    module-level name; this route is registered by the decorator regardless.
  """
  started_at = time.time()

  # Voice models are loaded lazily, the first time a speaker is requested.
  model_ready = core.is_model_loaded(speaker)
  if not model_ready:
    core.load_model(speaker)

  logging.debug("querying voicevox")

  result = core.audio_query(text, speaker)
  result.output_stereo = True

  elapsed = time.time() - started_at
  logging.debug(f"querying took: {elapsed}")

  return result

@app.post('/synthesis')
def synthesis(audio_query: Annotated[AudioQuery, Body()], speaker: int = DEFAULT_VOICE_ID):
  """Render an audio query sent in the request body and return it as WAV.

  Args:
    audio_query: Audio query parsed from the request body.
    speaker: Voice model id; its model is loaded first if it is not loaded yet.

  Returns:
    A Response carrying the synthesized bytes with media type audio/wav.
  """
  began = time.time()

  # Load the speaker's voice model on first use.
  needs_load = not core.is_model_loaded(speaker)
  if needs_load:
    core.load_model(speaker)

  logging.debug('synthesizing')

  wav_bytes = core.synthesis(audio_query, speaker)

  took = time.time() - began
  logging.debug(f"synthesizing took: {took}")

  return Response(media_type="audio/wav", content=wav_bytes)

@app.post('/tts')
def query(
    text: str,
    speaker: int = DEFAULT_VOICE_ID,

    speed_scale: float = 1.7,
    volume_scale: float = 2.0,
    intonation_scale: float = 1.5,
    pre_phoneme_length: float = 1.0,
    post_phoneme_length: float = 1.0
):
  """One-shot text-to-speech: build an audio query, tune it, synthesize WAV.

  Args:
    text: Text to speak.
    speaker: Voice model id; its model is loaded first if it is not loaded yet.
    speed_scale: Value assigned to the audio query's `speed_scale`.
    volume_scale: Value assigned to the audio query's `volume_scale`.
    intonation_scale: Value assigned to the audio query's `intonation_scale`.
    pre_phoneme_length: Value assigned to the audio query's `pre_phoneme_length`.
    post_phoneme_length: Value assigned to the audio query's `post_phoneme_length`.

  Returns:
    A Response carrying the synthesized bytes with media type audio/wav.

  Note:
    This handler shares the name `query` with the /audio_query handler above and
    rebinds the module-level name; both routes stay registered because the
    decorator registers each function when it is defined.
  """
  # `post_phoneme_length` is annotated like its siblings so FastAPI validates it
  # and answers 422 for a non-numeric value instead of failing in float() below.
  query_start = time.perf_counter()

  # Load the speaker's voice model on first use.
  if not core.is_model_loaded(speaker):
    core.load_model(speaker)

  logging.debug("querying voicevox")

  audio_query = core.audio_query(text, speaker)
  audio_query.output_stereo = True
  # float() keeps the fields floats even when the function is called directly
  # from Python with ints.
  audio_query.speed_scale = float(speed_scale)
  audio_query.volume_scale = float(volume_scale)
  audio_query.intonation_scale = float(intonation_scale)
  audio_query.pre_phoneme_length = float(pre_phoneme_length)
  audio_query.post_phoneme_length = float(post_phoneme_length)

  logging.debug(f"querying took: {time.perf_counter() - query_start}")

  logging.debug('synthesizing')

  # Separate timer: the synthesis figure must not include the query phase.
  synthesis_start = time.perf_counter()
  wav = core.synthesis(audio_query, speaker)

  logging.debug(f"synthesizing took: {time.perf_counter() - synthesis_start}")

  return Response(content=wav, media_type="audio/wav")

# STT
@app.post('/asr')
def asr(audio_file: UploadFile, task: str = 'transcribe', language: str = 'ja'):
  """Run Whisper on an uploaded audio file.

  Args:
    audio_file: Uploaded audio; it is written to disk and the path is passed to
      Whisper.
    task: 'transcribe' or 'translate'.
    language: Language passed to Whisper for the 'translate' task. It is not
      passed for 'transcribe', which keeps Whisper's default language handling.

  Returns:
    Whisper's result for the upload, or a JSON error response: 422 when the
    upload is empty, 400 when `task` is not recognised.
  """
  # Check the bytes themselves rather than `audio_file.size`: the size can be
  # None, and `None <= 0` would raise TypeError instead of producing a 422.
  audio_file.file.seek(0)
  payload = audio_file.file.read()
  if not payload:
    return JSONResponse(content={ 'message': 'Missing audio' }, status_code=422)

  # Each task has its own scratch file and its own Whisper options.
  if task == 'transcribe':
    filename, options = TRANSCRIBE_FILENAME, {}
  elif task == 'translate':
    filename, options = TRANSLATE_FILENAME, {'language': language, 'task': 'translate'}
  else:
    return JSONResponse(content = { 'message': 'Bad request' }, status_code=400)

  # NOTE(review): the scratch file name is fixed per task, so overlapping
  # requests for the same task would overwrite each other's upload.
  with open(filename, 'wb') as f:
    f.write(payload)

  return model.transcribe(filename, **options)

@app.post('/asr_stream')
async def asr_stream(req: Request, task: str = 'transcribe', language: str = 'ja'):
  """Run Whisper on raw audio samples sent as the request body.

  The body is interpreted as signed 16-bit samples and scaled to float32 in
  [-1, 1). Presumably the client sends 16 kHz mono PCM, which is what Whisper
  expects for in-memory audio - confirm against the caller.

  Args:
    req: Incoming request; its body is read chunk by chunk.
    task: 'transcribe' or 'translate'.
    language: Language passed to Whisper for the 'translate' task only.

  Returns:
    Whisper's result, or a JSON error response: 400 for an unknown `task` or a
    body that is not a whole number of 16-bit samples, 422 for an empty body.
  """
  # Reject an unknown task before spending time on the body.
  if task == 'transcribe':
    options = {}
  elif task == 'translate':
    options = {'language': language, 'task': 'translate'}
  else:
    return JSONResponse(content = { 'message': 'Bad request' }, status_code=400)

  audio_data = BytesIO()

  async for chunk in req.stream():
    audio_data.write(chunk)

  raw = audio_data.getvalue()

  # Same response as /asr for an upload with no content.
  if not raw:
    return JSONResponse(content={ 'message': 'Missing audio' }, status_code=422)

  # np.frombuffer raises ValueError on a dangling byte; answer 400 instead of
  # letting it surface as a server error.
  if len(raw) % 2:
    return JSONResponse(content = { 'message': 'Bad request' }, status_code=400)

  audio = np.frombuffer(raw, np.int16).astype(np.float32) / 32768.0

  # No whisper.pad_or_trim here: it cuts the signal to a single 30-second
  # window, while model.transcribe already handles audio of any length itself.
  # NOTE(review): transcription runs synchronously inside this coroutine and
  # blocks the event loop while it runs.
  return model.transcribe(audio, **options)

if __name__ == "__main__":
  # Open the ngrok tunnel first so the public URLs are printed before
  # uvicorn.run blocks the cell.
  http_tunnel = ngrok.connect(5000)
  public_url = http_tunnel.public_url
  print(f'Public URL -> {public_url}')
  print(f'Docs URL -> {public_url}/docs')

  # Patch asyncio so uvicorn can start from inside the notebook's already
  # running event loop, then serve on the tunnelled port.
  nest_asyncio.apply()
  uvicorn.run(app, port=5000)