In [5]:

import torch
import requests
import inflect
from transformers import (
    AutoModelForSpeechSeq2Seq, 
    AutoProcessor, 
    pipeline, 
    SpeechT5Processor, 
    SpeechT5ForTextToSpeech, 
    SpeechT5HifiGan
)
from datasets import load_dataset
import soundfile as sf
import librosa


In [8]:
# Set device and data type for optimized performance
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load models once and reuse them for efficiency
def load_models():
    # Load the speech-to-text model (Whisper)
    model_id = "openai/whisper-tiny.en"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    
    # Create a pipeline for automatic speech recognition (ASR)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,  # Use mixed precision for faster processing
        device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
    )
    
    # Load text-to-speech (TTS) models
    processor_tts = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model_tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    
    return pipe, processor_tts, model_tts, vocoder

# Optimized speech-to-text processing with mixed precision
def process_sample(pipe, audio_path):
    with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):  # Mixed precision
        result = pipe(audio_path)  # Perform speech recognition
    return result['text']

# API Call to get city temperature (optimized)
def get_city_temperature(city):
    api_key = "e7777b4e947b1311b4d5fbe5a004be21"
    url = f"http://api.openweathermap.org/data/2.5/weather?q={city}&appid={api_key}&units=metric"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        temperature = data['main']['temp']
        humidity = data['main']['humidity']
        
        # Use inflect engine to convert numbers to words
        p = inflect.engine()
        txt_temp = p.number_to_words(temperature)
        humidity_temp = p.number_to_words(humidity)
        
        return f"{txt_temp} celsius and {humidity_temp} percent."
    else:
        return "Sorry, I couldn't retrieve the temperature for that city."

# Text-to-speech function
def generate_speech(text, processor_tts, model_tts, vocoder):
    # Prepare text inputs
    inputs = processor_tts(text=text, return_tensors="pt").to(device)
    
    # Load speaker embeddings only once to avoid reloading
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
    
    # Generate speech
    with torch.no_grad():  # Disable gradients for faster inference
        speech = model_tts.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    
    # Save speech to file
    sf.write("speech.wav", speech.cpu().numpy(), samplerate=16000)






# Main function to handle speech-to-text, get data, and text-to-speech
def main():

    

    pipe, processor_tts, model_tts, vocoder = load_models()
    
    # Process audio input for speech-to-text
    sample = "recorded_audio.wav"
    jsn_ot = process_sample(pipe, sample)
    print(f"Transcribed Text: {jsn_ot}")
    
    # Extract city name and get temperature data
    jsn_ot_api = "what's the temperature in mumbai"
    words = jsn_ot_api.split()
    city = words[-1]  # Assuming last word is the city
    city_temperature_humid = get_city_temperature(city)
    
    # Prepare the final text for TTS
    final_text = f"{city_temperature_humid}"
    print(f"Final Text for TTS: {final_text}")
    
    # Generate speech from text
    generate_speech(final_text, processor_tts, model_tts, vocoder)
    print("Speech generated and saved as 'speech.wav'")

    # int16_speech_array = generate_speech(final_text, processor_tts, model_tts, vocoder)
    # print(f"Generated signed 16-bit integer array for ESP32.{int16_speech_array}",)



# Execute the main function
# if __name__ == "__main__":
main()


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):  # Mixed precision


Transcribed Text:  The weather in Surat
Final Text for TTS: twenty-nine point nine nine celsius and sixty-six percent.


TypeError: cannot pickle '_hashlib.HMAC' object