In [None]:
%pip install vosk
%pip install pyaudio
%pip install transformers
%pip install wave
%pip install googletrans==4.0.0-rc1
%pip install pyttsx3

In [1]:
from googletrans import Translator
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
import json
import wave
import sys
from vosk import Model, KaldiRecognizer
import pyaudio
import re
import pyttsx3

In [2]:
# Step 0: Records Audio Sample

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 3

with wave.open('human_input.wav', 'wb') as wf:
    p = pyaudio.PyAudio()
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)

    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True)

    print('Recording...')
    for _ in range(0, RATE // CHUNK * RECORD_SECONDS):
        wf.writeframes(stream.read(CHUNK))
    print('Done')

    stream.close()
    p.terminate()


Recording...
Done


In [3]:
# Play Audio Section

if len(sys.argv) < 2:
    print(f'Plays a wave file. Usage: {sys.argv[0]} filename.wav')
    sys.exit(-1)

with wave.open("human_input.wav", 'rb') as wf:
    # Instantiate PyAudio and initialize PortAudio system resources (1)
    p = pyaudio.PyAudio()

    # Open stream (2)
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # Play samples from the wave file (3)
    while len(data := wf.readframes(CHUNK)):  # Requires Python 3.8+ for :=
        stream.write(data)

    # Close stream (4)
    stream.close()

    # Release PortAudio system resources (5)
    p.terminate()

In [4]:
# Step 1: Chinese Speech Recognition with Vosk

# Path to the downloaded Vosk model
model_path = "vosk-model-small-cn-0.22"

# Path to the input WAV file
wav_file_path = "human_input.wav"

# Load the Vosk model
model = Model(model_path)

# Open the WAV file
with wave.open(wav_file_path, "rb") as wf:
    # Check if the audio file has the correct parameters
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() not in [8000, 16000, 32000, 44100, 48000]:
        # print("Audio file must be WAV format mono PCM.")
        raise Exception("Audio file must be WAV format mono PCM.")

    # Create a Kaldi recognizer with the model and the sample rate
    recognizer = KaldiRecognizer(model, wf.getframerate())

    # Read the audio data and transcribe it
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if recognizer.AcceptWaveform(data):
            result = recognizer.Result()
            # print(result)
            text = json.loads(result).get('text', '')
            # print(f"Recognized text: {text}")

    # Final result
    final_result = recognizer.FinalResult()
    final_result_test = json.loads(final_result).get('text', '')

    if final_result_test != "":
        chinese_input_text = json.loads(final_result).get('text', '')
        chinese_input_text = re.sub(r"\s+", "", chinese_input_text, flags=re.UNICODE)
    else:
        chinese_input_text = re.sub(r"\s+", "", text, flags=re.UNICODE)

    

    print(f"Final recognized text: {chinese_input_text}")

{
  "text" : "您好"
}
Recognized text: 您好
Final recognized text: 您好


In [5]:
# Step 2: Translate from Chinese to English 


# Initialize the translator
translator = Translator()

# Translate from Chinese to English
translated = translator.translate(chinese_input_text, src='zh-cn', dest='en')
print(translated.text)  



Hello


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the pre-trained model and tokenizer
model_name = "microsoft/DialoGPT-small"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to generate a response
def generate_response(input_text, max_length=50):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage
input_text = "Hello, how are you?"
output_text = generate_response(input_text)
output_text = output_text.strip(input_text)
print(f"Input: {input_text}")
print(f"Output: {output_text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input: Hello, how are you?
Output: Good morning everyone!


In [9]:
# Step 4:  Translate back to Chinese

# Translate from English to Chinese
translated_to_zh = translator.translate(output_text, src='en', dest='zh-cn')
print(translated_to_zh.text)  

大家，早安！


In [10]:
# Step 5: Translate reply back to Chinese 

# Initialize the TTS engine
engine = pyttsx3.init()

# Set properties before adding anything to speak
engine.setProperty('rate', 125)  # Speed percent (can go over 100)
engine.setProperty('volume', 1)  # Volume 0-1

# List available voices and set the voice to a Chinese one if available
voices = engine.getProperty('voices')
for voice in voices:
    # print(f"Voice: {voice.name}, ID: {voice.id}, Languages: {voice.languages}")
    if 'ZH' in voice.languages or 'ZH-CN' in voice.id:
        engine.setProperty('voice', voice.id)
        break

# Text to be spoken
text_to_speak = translated_to_zh.text

# Speak the text
engine.say(text_to_speak)
engine.runAndWait()

In [None]:
# Play Audio Section



with wave.open("chinese_output.mp3", 'rb') as wf:
    # Instantiate PyAudio and initialize PortAudio system resources (1)
    p = pyaudio.PyAudio()

    # Open stream (2)
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # Play samples from the wave file (3)
    while len(data := wf.readframes(CHUNK)):  # Requires Python 3.8+ for :=
        stream.write(data)

    # Close stream (4)
    stream.close()

    # Release PortAudio system resources (5)
    p.terminate()