In [25]:
%pip install vosk
%pip install pyaudio
%pip install transformers
%pip install wave
%pip install googletrans==4.0.0-rc1
%pip install pyttsx3


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.
Collecting pyttsx3
  Downloading pyttsx3-2.90-py3-none-any.whl.metadata (3.6 kB)
Collecting comtypes (from pyttsx3)
  Downloading comtypes-1.4.2-py3-none-any.whl.metadata (4.1 kB)
Collecting pypiwin32 (from pyttsx3)
  Downloading pypiwin32-223-py3-none-any.whl.metadata (236 bytes)
Downloading pyttsx3-2.90-py3-none-any.whl (39 kB)
Downloading comtypes-1.4.2-py3-none-any.whl (201 kB)
   ---------------------------------------- 0.0/201.2 kB ? eta -:--:--
   ------------------------------ --------- 153.6/201.2 kB 4.6 MB/s eta 0:00:01
   ---------------------------------------- 201.2/201.2 kB 4.2 MB/s eta 0:00:00
Downloading pypiwin32-223-py3-none-any.whl (1.7 kB)
Installing collected packages: comtypes, pypiwin32, pyttsx3
Successfully installed comtypes-1.4.2 pypiwin32-223 pyttsx3-2.90
Note: y

In [1]:
from googletrans import Translator
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import json
import wave
import sys
from vosk import Model, KaldiRecognizer
import pyaudio
import re
import pyttsx3

In [3]:
# Step 0: Records Audio Sample

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
RECORD_SECONDS = 3

with wave.open('human_input.wav', 'wb') as wf:
    p = pyaudio.PyAudio()
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)

    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True)

    print('Recording...')
    for _ in range(0, RATE // CHUNK * RECORD_SECONDS):
        wf.writeframes(stream.read(CHUNK))
    print('Done')

    stream.close()
    p.terminate()


Recording...
Done


In [8]:
# Play Audio Section

if len(sys.argv) < 2:
    print(f'Plays a wave file. Usage: {sys.argv[0]} filename.wav')
    sys.exit(-1)

with wave.open("human_input.wav", 'rb') as wf:
    # Instantiate PyAudio and initialize PortAudio system resources (1)
    p = pyaudio.PyAudio()

    # Open stream (2)
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # Play samples from the wave file (3)
    while len(data := wf.readframes(CHUNK)):  # Requires Python 3.8+ for :=
        stream.write(data)

    # Close stream (4)
    stream.close()

    # Release PortAudio system resources (5)
    p.terminate()

In [9]:
# Step 1: Chinese Speech Recognition with Vosk

# Path to the downloaded Vosk model
model_path = "vosk-model-small-cn-0.22"

# Path to the input WAV file
wav_file_path = "human_input.wav"

# Load the Vosk model
model = Model(model_path)

# Open the WAV file
with wave.open(wav_file_path, "rb") as wf:
    # Check if the audio file has the correct parameters
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() not in [8000, 16000, 32000, 44100, 48000]:
        # print("Audio file must be WAV format mono PCM.")
        raise Exception("Audio file must be WAV format mono PCM.")

    # Create a Kaldi recognizer with the model and the sample rate
    recognizer = KaldiRecognizer(model, wf.getframerate())

    # Read the audio data and transcribe it
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if recognizer.AcceptWaveform(data):
            result = recognizer.Result()
            text = json.loads(result).get('text', '')
            # print(f"Recognized text: {text}")

    # Final result
    final_result = recognizer.FinalResult()
    final_result_test = json.loads(final_result).get('text', '')

    if final_result_test != "":
        chinese_input_text = json.loads(final_result).get('text', '')
        chinese_input_text = re.sub(r"\s+", "", chinese_input_text, flags=re.UNICODE)
    else:
        chinese_input_text = re.sub(r"\s+", "", text, flags=re.UNICODE)

    

    print(f"Final recognized text: {chinese_input_text}")

Final recognized text: 我很小


In [10]:
# Step 2: Translate from Chinese to English 


# Initialize the translator
translator = Translator()

# Translate from Chinese to English
translated = translator.translate(chinese_input_text, src='zh-cn', dest='en')
print(translated.text)  



I am small


In [11]:
# Step 3: Generate reply using GPT-2

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Input text
input_text = translated.text

# Encode the input text
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate output with additional parameters
output = model.generate(
    input_ids,
    max_length=15,  # Maximum length of the generated sequence
    num_return_sequences=1,  # Number of sequences to return
    temperature=0.8,  # Lower temperature results in more focused output
    top_k=25,  # Limit to top-k tokens
    top_p=0.9,  # Limit to top-p cumulative probability
    repetition_penalty=1.3,  # Penalty for repetition
    do_sample=True  # Use sampling instead of greedy decoding
)

# Decode and print the output
gpt_reply_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(gpt_reply_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I am small, but if you can't stand to have your body exposed


In [12]:
# Step 4:  Translate back to Chinese

# Translate from English to Chinese
translated_to_zh = translator.translate(gpt_reply_text, src='en', dest='zh-cn')
print(translated_to_zh.text)  

我很小，但是如果你不能忍受你的身体


In [10]:

# Step 5: Translate reply back to Chinese (Assuming using a pre-trained model or local translation tool)

# Initialize the text-to-speech engine
engine = pyttsx3.init()

# Set the voice to a Chinese voice (you may need to adjust the voice ID)
# voices = engine.getProperty('voices')
# for voice in voices:
#     if 'chinese' in voice.name.lower():
#         engine.setProperty('voice', voice.id)
#         break

# Get the Chinese text input
chinese_text = input(translated_to_zh.text)

# Convert the text to speech and save it as an audio file
engine.save_to_file(chinese_text, 'chinese_output.mp3')
engine.runAndWait()
print("Audio file 'chinese_output.mp3' has been created.")

In [13]:
engine = pyttsx3.init()
engine.say(translated_to_zh.text)
engine.runAndWait()

In [25]:
# Step 5: Translate reply back to Chinese 

# Initialize the TTS engine
engine = pyttsx3.init()

# Set properties before adding anything to speak
engine.setProperty('rate', 150)  # Speed percent (can go over 100)
engine.setProperty('volume', 1)  # Volume 0-1

# List available voices and set the voice to a Chinese one if available
voices = engine.getProperty('voices')
for voice in voices:
    # print(f"Voice: {voice.name}, ID: {voice.id}, Languages: {voice.languages}")
    if 'ZH' in voice.languages or 'ZH-CN' in voice.id:
        engine.setProperty('voice', voice.id)
        break

# Text to be spoken
text_to_speak = chinese_text

# Speak the text
engine.say(text_to_speak)
engine.runAndWait()

NameError: name 'chinese_text' is not defined

In [6]:
# Play Audio Section



with wave.open("chinese_output.mp3", 'rb') as wf:
    # Instantiate PyAudio and initialize PortAudio system resources (1)
    p = pyaudio.PyAudio()

    # Open stream (2)
    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True)

    # Play samples from the wave file (3)
    while len(data := wf.readframes(CHUNK)):  # Requires Python 3.8+ for :=
        stream.write(data)

    # Close stream (4)
    stream.close()

    # Release PortAudio system resources (5)
    p.terminate()