speech.py
# Copyright (c) 2023 Eduard Rostkov <ea@rostkov.me>
# This script is licensed under the MIT License
# https://opensource.org/licenses/MIT
import os
from abc import ABC, abstractmethod

from speechkit import Session, SpeechSynthesis, ShortAudioRecognition
from google.cloud import speech
from google.cloud import texttospeech as tts


class SpeechKit(ABC):
    """Common interface for speech recognition and synthesis providers."""

    def __init__(self, language):
        self.language = language

    @staticmethod
    def create(provider, api_key, language, sample_rate):
        """Factory method: return the implementation for the given provider."""
        if provider == 'yandex':
            return YandexSpeechKit(api_key, language, sample_rate)
        elif provider == 'google':
            return GoogleSpeechAPI(api_key, language, sample_rate)
        else:
            raise ValueError(f'Unknown provider: {provider}')

    @abstractmethod
    def recognize(self, audio_bytes):
        """Transcribe raw audio bytes to text."""

    @abstractmethod
    def synthesize(self, text):
        """Synthesize text to raw audio bytes."""


class YandexSpeechKit(SpeechKit):
    """Speech recognition and synthesis backed by Yandex SpeechKit."""

    @property
    def voice(self):
        # Map the configured language to a Yandex voice name.
        return {
            'ru-RU': 'zahar',
            'en-US': 'john',
        }[self.language]

    def __init__(self, api_key, language, sample_rate):
        session = Session.from_api_key(api_key, x_client_request_id_header=True, x_data_logging_enabled=True)
        self.synthesizer = SpeechSynthesis(session)
        self.recognizer = ShortAudioRecognition(session)
        self.sample_rate = sample_rate
        super().__init__(language)

    def recognize(self, audio_bytes):
        return self.recognizer.recognize(
            audio_bytes,
            format='lpcm',
            sampleRateHertz=self.sample_rate,
            lang=self.language
        ).strip()

    def synthesize(self, text):
        # Synthesis output is fixed at 16 kHz LPCM, independent of self.sample_rate.
        return self.synthesizer.synthesize_stream(
            text=text.strip(),
            voice=self.voice,
            lang=self.language,
            format='lpcm',
            sampleRateHertz='16000'
        )


class GoogleSpeechAPI(SpeechKit):
    """Speech recognition and synthesis backed by the Google Cloud Speech APIs."""
    # Note: PyCharm does not resolve protobuf enums, so some attribute
    # references below may be flagged by the IDE even though they are valid.

    def __init__(self, api_key_location, language, sample_rate):
        # The Google clients read credentials from this environment variable.
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = api_key_location
        self.recognizer = speech.SpeechClient()
        self.recognition_config = speech.RecognitionConfig(
            # encoding=speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
            # sample_rate_hertz=sample_rate,
            language_code=language,
            enable_word_time_offsets=True
        )
        self.synthesize_config = tts.VoiceSelectionParams(
            language_code=language, name=language + '-Wavenet-B'
        )
        self.sample_rate = sample_rate
        super().__init__(language)

    def synthesize(self, text):
        text_input = tts.SynthesisInput(text=text)
        audio_config = tts.AudioConfig(
            audio_encoding=tts.AudioEncoding.LINEAR16,
            sample_rate_hertz=self.sample_rate
        )
        client = tts.TextToSpeechClient()
        response = client.synthesize_speech(
            input=text_input, voice=self.synthesize_config, audio_config=audio_config
        )
        return response.audio_content

    def recognize(self, audio_bytes):
        result = self.recognizer.recognize(
            config=self.recognition_config,
            audio=speech.RecognitionAudio(content=audio_bytes)
        )
        if result.results:
            return result.results[0].alternatives[0].transcript
        return ''
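

# Usage sketch of the SpeechKit.create factory. The provider choice, credential
# path, audio file names and sample rate below are hypothetical placeholders,
# not values defined elsewhere in this module.
if __name__ == '__main__':
    kit = SpeechKit.create(
        provider='google',                      # or 'yandex'
        api_key='google-credentials.json',      # Yandex expects a plain API key string
        language='en-US',
        sample_rate=16000,
    )

    # Transcribe a (hypothetical) 16 kHz recording, then synthesize a reply.
    with open('question.wav', 'rb') as f:
        print(kit.recognize(f.read()))

    with open('answer.wav', 'wb') as f:
        f.write(kit.synthesize('Hello, world!'))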