## Record_audio

In [1]:
import pyaudio
import wave

def record_audio(filename="AudioFile\output.wav", duration=5, rate=16000, channels=1, chunk=1024, format=pyaudio.paInt16):
    audio = pyaudio.PyAudio()
    
    stream = audio.open(format=format, channels=channels,
                        rate=rate, input=True,
                        frames_per_buffer=chunk)
    
    print("Recording...")
    frames = []
    
    for _ in range(0, int(rate / chunk * duration)):
        data = stream.read(chunk)
        frames.append(data)
    
    print("Recording finished.")
    
    stream.stop_stream()
    stream.close()
    audio.terminate()
    
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)  # Chỉ 1 kênh (mono)
        wf.setsampwidth(audio.get_sample_size(format))
        wf.setframerate(rate)  # Tần số lấy mẫu 16kHz
        wf.writeframes(b''.join(frames))

In [2]:
record_audio()

Recording...
Recording finished.


# Using whisper-small

In [3]:
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load model and processor
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
whisper = WhisperForConditionalGeneration.from_pretrained(model_name)

# Ensure model runs on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {device}")
whisper.to(device)

  from .autonotebook import tqdm as notebook_tqdm


Running on cuda


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

## Load and transcribe

In [4]:
import torch
import torchaudio

# Load and preprocess audio
def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)
    return waveform.squeeze(0), 16000  # Convert to 1D tensor

# Transcribe function
def audio_to_text(audio_path):
    audio, sr = load_audio(audio_path)
    input_features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(device)

    # Generate transcription
    with torch.no_grad():
        predicted_ids = whisper.generate(input_features)
    
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]


In [None]:
audio_to_text("AudioFile\VN.wav")

## Translate using MarianMTModel

In [1]:
from transformers import MarianMTModel, MarianTokenizer

viToEnModel = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-vi-en")
viToEntokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-vi-en")
enToViModel = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-vi")
enToVitokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-vi")


def translate_vi_to_en(text):
    inputs = viToEntokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = viToEnModel.generate(**inputs)
    return viToEntokenizer.decode(translated[0], skip_special_tokens=True)


def translate_en_to_vi(text):
    inputs = enToVitokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = viToEnModel.generate(**inputs)
    return enToVitokenizer.decode(translated[0], skip_special_tokens=True)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
translate_vi_to_en("Không tìm được mô hình nào trên HuggingFace cho kết quả đọc tiếng Việt tốt cả, cứu tôi :v")

"Couldn't find any models on Huffleing Face for good reading, save me:v."

In [3]:
translate_en_to_vi("This article is about the concept of residence. For the structure, see House. For other uses, see Home (disambiguation). 'Homes' redirects here. For other uses, see Homes (disambiguation).")

"mặt sách Nhân Alex according is Át the concept vậy 1/2., né Home I most?disambiguation). near names Sus' ♫s cái."

# Engine for speech

In [6]:
import pyttsx3

def text_to_speech(text):
    engine = pyttsx3.init()  # Khởi tạo engine
    engine.say(text)  # Đọc văn bản
    engine.runAndWait()  # Chạy lệnh đọc

text_to_speech("C4AI Aya Vision is an open weights research release of multimodal models with advanced capabilities optimized for real-time inference on edge devices.")



## Combine

In [7]:
# Combine all functions
def transcribe_translate_speak(file_path):
    text = audio_to_text(file_path)
    translated_text = translate_vi_to_en(text)
    text_to_speech(translated_text)

In [None]:
transcribe_translate_speak("AudioFile\VN.wav")

## Vi TTS

In [None]:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("XTTS-v2\config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="XTTS-v2", eval=True)
model.cuda()

outputs = model.synthesize(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    config,
    speaker_wav="XTTS-v2\samples\en_sample.wav",
    gpt_cond_len=3,
    language="en",
)
