In [None]:
!pip install git+https://github.com/huggingface/transformers.git 

In [None]:
pip install git+https://github.com/huggingface/parler-tts.git

In [None]:
!pip install transformers torch accelerate bitsandbytes onnxruntime

In [None]:
!pip install --upgrade protobuf

In [None]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer

In [None]:
# Fetch the Silero voice-activity-detection model (ONNX variant) through torch.hub.
# force_reload=True asks torch.hub for a fresh download instead of a cached copy.
vad_model, vad_utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=True,
    onnx=True,
)

# vad_utils bundles five helpers; give each one its own module-level name.
(
    get_speech_timestamps,
    save_audio,
    read_audio,
    VADIterator,
    collect_chunks,
) = vad_utils

In [None]:
# Load the Whisper model for Speech-to-Text.
# The processor (feature extractor + tokenizer) and the weights are loaded from the
# SAME checkpoint so the two can never drift apart. The model size is unchanged
# ("tiny"); only the processor is aligned with it.
from transformers import BitsAndBytesConfig

WHISPER_CHECKPOINT = "openai/whisper-tiny"

whisper_processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
whisper_model = WhisperForConditionalGeneration.from_pretrained(
    WHISPER_CHECKPOINT,
    device_map="auto",
    # 8-bit weights via bitsandbytes, requested through a config object instead of
    # the bare `load_in_8bit` keyword.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

In [None]:
# Load the Phi-3.5 language model that generates the responses
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "microsoft/Phi-3.5-mini-instruct"  # instruction-tuned Phi-3.5 mini checkpoint

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model with 4-bit quantization
llm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,  # NOTE: allows code shipped in the model repo to run locally
    # 4-bit weights via bitsandbytes, requested through a config object instead of
    # the bare `load_in_4bit` keyword.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    max_memory={0: "15GB"},  # Adjust this based on your GPU's memory
)

In [None]:
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, set_seed
# Text-to-speech: tokenizer and model come from the same Parler-TTS checkpoint;
# the model is then moved onto the first CUDA device.
parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-expresso")
parler_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-expresso")
parler_model = parler_model.to("cuda:0")

In [None]:
def vad_and_split(audio_file):
    """Load a recording and keep only the parts where speech was detected.

    Args:
        audio_file: Path of the recording; it is handed to ``read_audio``.

    Returns:
        The result of ``collect_chunks`` for the speech regions that
        ``get_speech_timestamps`` found in the 16 kHz waveform.
    """
    sample_rate = 16000  # the same rate is given to the loader and to the VAD
    waveform = read_audio(audio_file, sampling_rate=sample_rate)
    voiced_regions = get_speech_timestamps(waveform, vad_model, sampling_rate=sample_rate)
    return collect_chunks(voiced_regions, waveform)


def transcribe(speech_chunk):
    """Convert a speech chunk to text.

    Args:
        speech_chunk: Waveform sampled at 16 kHz, given as a tensor or anything
            ``torch.as_tensor`` accepts (assumed to be a single mono signal).

    Returns:
        The first string returned by ``whisper_processor.batch_decode``.
    """
    # Hand the processor a full-precision float32 array. Only the model input
    # below is reduced to half precision, not the raw audio.
    waveform = torch.as_tensor(speech_chunk, dtype=torch.float32).detach().cpu().numpy()
    inputs = whisper_processor(waveform, return_tensors="pt", sampling_rate=16000)

    # Follow the device the model was placed on instead of assuming "cuda:0",
    # and cast with .to() rather than re-wrapping an existing tensor.
    input_features = inputs.input_features.to(whisper_model.device, dtype=torch.float16)

    # NOTE(review): Whisper's feature extractor works on fixed-length windows by
    # default; confirm whether recordings longer than one window need chunking.
    generated_ids = whisper_model.generate(input_features)
    transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)
    print(transcription)
    return transcription[0]


from IPython.display import Markdown


def generate_response(text):
    """Generate a short assistant reply to ``text`` with the language model.

    The user text is wrapped in a fixed system/user/assistant prompt, a
    continuation is sampled with ``llm_model.generate``, and only the newly
    generated tokens are decoded.

    Args:
        text: The user's query.

    Returns:
        The generated reply with surrounding whitespace stripped.
    """
    # NOTE(review): the prompt text is kept exactly as it was. The published
    # Phi-3.5 chat format appears to close the user turn with <|end|> as well;
    # confirm whether that marker should be added here.
    prefix = """<|system|> You provide only one-line answers: concise, precise, and accurate. If the query is long, first summarize, then respond.<|end|>
    <|user|>
    """
    postfix = """
<|assistant|> """

    full_prompt = prefix + text + postfix

    inputs = tokenizer(full_prompt, return_tensors="pt", max_length=4096, truncation=True).to(llm_model.device)
    prompt_length = inputs.input_ids.shape[-1]

    with torch.no_grad():
        outputs = llm_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=64,
            do_sample=True,
            top_p=0.9,
            temperature=0.1,
        )

    # Drop the prompt by TOKEN count. Slicing the decoded string by
    # len(full_prompt) is wrong because skip_special_tokens removes the chat
    # markers, so the decoded prompt is shorter than the prompt text and part
    # (or all) of the reply would be cut off.
    new_token_ids = outputs[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


def synthesize_speech(text, output_file="output.wav"):
    """Convert the generated text response to speech and save it to a file.

    Args:
        text: The text to be spoken.
        output_file: Destination path of the audio file.

    Returns:
        ``output_file``, the path the audio was actually written to.
    """
    import soundfile as sf

    # Voice/style description that conditions the TTS model.
    description = "Talia speaks happy tone with emphasis and high quality audio with zero errors."

    # Put the inputs on the device the model lives on instead of assuming "cuda:0".
    device = parler_model.device
    input_ids = parler_tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = parler_tokenizer(text, return_tensors="pt").input_ids.to(device)

    generation = parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()

    # Write to the requested path so the returned path really points at the audio.
    sf.write(output_file, audio_arr, parler_model.config.sampling_rate)
    return output_file


def process_audio_pipeline(audio_file):
    """Run one recording through the whole voice pipeline.

    Stages, in order: voice-activity detection, speech-to-text, response
    generation with the language model, and text-to-speech.

    Args:
        audio_file: Path of the input recording.

    Returns:
        Whatever ``synthesize_speech`` returns for the generated reply.
    """
    # Keep only the voiced part of the recording.
    voiced_audio = vad_and_split(audio_file)

    # Speech -> text; surrounding whitespace is removed before prompting.
    user_text = transcribe(voiced_audio).strip()

    # Text -> reply from the language model.
    reply_text = generate_response(user_text)

    # Reply -> audio file.
    return synthesize_speech(reply_text)

In [None]:
# Usage: run the full pipeline on one recording and report the elapsed time.
import time
audio_file_location = "/kaggle/input/lizmotorsinternshiptest/Recording.wav"
# perf_counter() is a monotonic clock meant for measuring intervals; time.time()
# can jump if the system clock is adjusted.
start = time.perf_counter()
output_audio = process_audio_pipeline(audio_file_location)
print(f"Output speech file saved to: {output_audio}, in time {time.perf_counter()-start}")