In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the package is importable from this notebook afterwards.
%pip install -q streamlit

In [None]:
# Installs the localtunnel npm package into the current working directory.
# NOTE(review): no cell visible in this notebook invokes localtunnel (the launch
# cell tunnels through pyngrok) — confirm whether this install is still needed.
!npm install localtunnel

In [None]:
# Runtime dependencies of app.py plus pyngrok, which the launch cell at the end
# of the notebook imports (`from pyngrok import ngrok`).
# %pip installs into the environment of the running kernel.
%pip install transformers torch torchaudio accelerate bitsandbytes streamlit gtts onnxruntime pyngrok

In [None]:
# pydub is used by app.py to measure the duration of uploaded audio.
# %pip installs into the environment of the running kernel.
%pip install pydub

In [None]:
%%writefile app.py
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
import numpy as np
from scipy.io import wavfile
import streamlit as st
import os
import tempfile
from pydub import AudioSegment

# VAD Optimization
@st.cache_resource
def load_vad_model():
    """Load the Silero voice-activity-detection model (ONNX variant) via torch.hub.

    Decorated with ``st.cache_resource`` so the loaded objects are reused
    instead of being rebuilt each time the script body executes.

    Returns:
        tuple: ``(model, utils)`` exactly as returned by ``torch.hub.load``.
        ``utils[0]`` is used elsewhere in this file as ``get_speech_timestamps``.
    """
    # force_reload=False reuses the copy already present in the local
    # torch.hub cache. Forcing a reload re-downloads the repository on every
    # cold start and makes start-up fail whenever the hub is unreachable.
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=False,
                                  onnx=True)
    return model, utils

# Whisper Optimization
@st.cache_resource
def load_whisper_model():
    """Load the ``openai/whisper-tiny`` processor and its 8-bit quantised model.

    Returns:
        tuple: ``(processor, model)`` — the ``WhisperProcessor`` and the
        ``WhisperForConditionalGeneration`` instance, in that order.
    """
    checkpoint = "openai/whisper-tiny"
    # Placement is delegated to device_map="auto"; weights are loaded in 8-bit.
    quantised_load = {"device_map": "auto", "load_in_8bit": True}

    processor = WhisperProcessor.from_pretrained(checkpoint)
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint, **quantised_load)
    return processor, model

# LLM Optimization (Phi-2 as example)
@st.cache_resource
def load_llm_model():
    """Load the ``microsoft/phi-2`` tokenizer and its 4-bit quantised causal LM.

    Returns:
        tuple: ``(tokenizer, model)`` in that order.
    """
    model_name = "microsoft/phi-2"
    # Options for the model only; the tokenizer just needs trust_remote_code.
    model_options = dict(
        device_map="auto",
        trust_remote_code=True,
        load_in_4bit=True,
        max_memory={0: "15GB"},  # cap for device 0
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, **model_options)
    return tokenizer, model

# TTS Optimization (using gTTS instead of Parler TTS for simplicity and speed)
from gtts import gTTS
import io

def synthesize_speech(text, output_file="output.mp3"):
    """Render ``text`` to English speech with gTTS and store it as an MP3 file.

    Args:
        text: The text to speak.
        output_file: Destination path for the MP3 data.

    Returns:
        The ``output_file`` path that was written.
    """
    audio_buffer = io.BytesIO()
    gTTS(text=text, lang='en', slow=False).write_to_fp(audio_buffer)

    # Synthesis happens fully in memory first; the destination file is only
    # opened once the audio bytes are available.
    with open(output_file, 'wb') as destination:
        destination.write(audio_buffer.getvalue())
    return output_file

# Load models
# Module-level handles read by vad_and_split, transcribe and generate_response.
# Each loader is decorated with st.cache_resource (see the definitions above in
# this file), so these calls return the cached objects after the first load.
vad_model, vad_utils = load_vad_model()
whisper_processor, whisper_model = load_whisper_model()
llm_tokenizer, llm_model = load_llm_model()

def vad_and_split(audio_file, threshold=0.5):
    """Load an audio file, normalise it to mono 16 kHz and run voice-activity detection.

    Args:
        audio_file: Path (or file-like object) accepted by ``torchaudio.load``.
        threshold: Speech-probability threshold forwarded to the VAD helper.

    Returns:
        tuple: ``(waveform, sample_rate, speech_timestamps)`` where ``waveform``
        is the mono, resampled tensor with a leading channel dimension of 1,
        ``sample_rate`` is the rate of that returned waveform, and
        ``speech_timestamps`` is whatever ``get_speech_timestamps`` returned
        for it.
    """
    # Both downstream consumers work at 16 kHz: the VAD helper is told the
    # sampling rate explicitly below, and the Whisper feature extractor only
    # accepts 16 kHz input. Uploaded files are commonly 44.1/48 kHz.
    target_rate = 16000

    waveform, sample_rate = torchaudio.load(audio_file)
    if waveform.shape[0] > 1:  # Convert stereo (or multi-channel) to mono
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=target_rate
        )
        sample_rate = target_rate

    get_speech_timestamps = vad_utils[0]
    speech_timestamps = get_speech_timestamps(
        waveform[0], vad_model, threshold=threshold, sampling_rate=sample_rate
    )
    return waveform, sample_rate, speech_timestamps

def transcribe(waveform, sample_rate):
    """Transcribe a waveform to text with the module-level Whisper model.

    Args:
        waveform: Audio tensor whose first row (``waveform[0]``) holds the
            samples to transcribe.
        sample_rate: Sampling rate of ``waveform`` in Hz.

    Returns:
        str: The first decoded transcription, special tokens removed.
    """
    # The Whisper feature extractor rejects anything other than 16 kHz, so
    # bring the audio to that rate here if the caller has not already done so.
    whisper_rate = 16000
    if sample_rate != whisper_rate:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=whisper_rate
        )
        sample_rate = whisper_rate

    # NOTE(review): the Whisper feature extractor works on 30-second windows by
    # default, so longer clips are presumably truncated here — confirm whether
    # chunked transcription is required for the 5-minute uploads the app allows.
    input_features = whisper_processor(
        waveform.numpy()[0], sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # Match the model's dtype as well as its device: the model is loaded with
    # load_in_8bit=True elsewhere in this file, and float32 features would not
    # match reduced-precision weights.
    input_features = input_features.to(device=whisper_model.device, dtype=whisper_model.dtype)

    with torch.no_grad():
        predicted_ids = whisper_model.generate(input_features)
    transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

def generate_response(text):
    """Generate a short assistant reply to ``text`` with the module-level LLM.

    The user text is wrapped in a fixed system prompt, sampled with
    ``top_p=0.9`` / ``temperature=0.7`` for at most 128 new tokens, and only
    the newly generated tokens are decoded.

    Args:
        text: The user query (here: the transcription of the uploaded audio).

    Returns:
        str: The generated reply with surrounding whitespace stripped.
    """
    prefix = """System: You are an expert at one-line answers. You never answer or explain any more than asked by the user.
    Your answers are always crisp, to the point, and extremely accurate.
    If the user query is very long to answer, then summarize it and then answer the user.
    User Query: """
    postfix = "\nAssistant: "

    full_prompt = prefix + text + postfix

    inputs = llm_tokenizer(full_prompt, return_tensors="pt", max_length=4096, truncation=True).to(llm_model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    # Fall back to EOS as the padding id when the tokenizer defines none, so
    # generate() receives an explicit value.
    pad_token_id = llm_tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = llm_tokenizer.eos_token_id

    with torch.no_grad():
        outputs = llm_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=128,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=pad_token_id,
        )

    # Decode only what was generated after the prompt. Slicing the decoded
    # text by len(full_prompt) is unreliable because decode() is not guaranteed
    # to reproduce the prompt character-for-character (and the prompt may have
    # been truncated by the tokenizer).
    generated_ids = outputs[0][prompt_token_count:]
    return llm_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

def process_audio_pipeline(audio_file):
    """Run the full speech-to-speech pipeline on one audio file.

    Steps: voice-activity detection, concatenation of the detected speech
    segments, transcription, LLM response generation and speech synthesis.

    Args:
        audio_file: Path of the audio file to process.

    Returns:
        tuple: ``(output_file, transcription, response)`` — the path of the
        synthesized audio, the raw transcription, and the generated reply.
    """
    waveform, sample_rate, speech_timestamps = vad_and_split(audio_file)

    # Each timestamp is a mapping with 'start' and 'end' sample offsets.
    # Iterating it as a pair would unpack the dict *keys* and make the slice
    # below fail, so index the entries explicitly.
    speech_segments = [
        waveform[:, stamp['start']:stamp['end']] for stamp in speech_timestamps
    ]

    if speech_segments:
        full_speech = torch.cat(speech_segments, dim=1)
    else:
        # No speech detected, use the entire audio
        full_speech = waveform

    transcription = transcribe(full_speech, sample_rate)
    response = generate_response(transcription.strip())
    output_file = synthesize_speech(response)

    return output_file, transcription, response

def is_valid_audio(file):
    """Check that an uploaded file decodes as audio lasting 1 second to 5 minutes.

    Args:
        file: Object exposing ``getvalue()`` (the raw bytes) and, optionally,
            a ``name`` attribute whose extension is used for the temporary copy.

    Returns:
        bool: ``True`` when the audio decodes and its duration is within
        ``[1000, 300000]`` milliseconds; ``False`` otherwise, including when
        the data cannot be decoded at all.
    """
    tmp_file_path = None
    try:
        # Keep the upload's own extension so the temporary copy is not
        # mislabelled; fall back to '.wav' when no name is available.
        suffix = os.path.splitext(str(getattr(file, "name", "") or ""))[1] or '.wav'
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(file.getvalue())

        # from_file detects the container itself; from_wav rejects the MP3
        # uploads that the file uploader in this app accepts.
        audio = AudioSegment.from_file(tmp_file_path)

        # Check if the audio duration is between 1 second and 5 minutes
        duration_ms = len(audio)
        return 1000 <= duration_ms <= 300000
    except Exception:
        # Deliberate best-effort: anything that cannot be decoded is "invalid".
        return False
    finally:
        # Always remove the temporary copy, including when decoding failed.
        if tmp_file_path is not None and os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)


# Streamlit app
# Page flow: upload -> validate -> preview -> (on button press) run the
# pipeline and show the transcription, the generated reply and its audio.
st.title("Optimized Speech-to-Speech Pipeline")

uploaded_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    if is_valid_audio(uploaded_file):
        # Use the upload's real MIME type and extension instead of assuming
        # WAV, since the uploader also accepts MP3 files.
        upload_mime = uploaded_file.type or 'audio/wav'
        upload_suffix = os.path.splitext(uploaded_file.name)[1].lower() or '.wav'

        st.audio(uploaded_file, format=upload_mime)

        if st.button("Process Audio"):
            with st.spinner("Processing..."):
                # Save the uploaded file temporarily
                with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    temp_path = tmp_file.name

                try:
                    output_file, transcription, response = process_audio_pipeline(temp_path)

                    st.success("Processing completed!")

                    st.subheader("Transcription")
                    st.write(transcription)

                    st.subheader("Generated Response")
                    st.write(response)

                    st.subheader("Generated Audio")
                    st.audio(output_file, format='audio/mp3')
                except Exception as e:
                    # Surface the failure in the page rather than crashing the script run.
                    st.error(f"An error occurred during processing: {e}")
                finally:
                    # Clean up the temporary file
                    os.unlink(temp_path)
    else:
        st.error("The uploaded file is not a valid audio file or its duration is not between 1 second and 5 minutes. Please upload a valid WAV or MP3 file.")
else:
    st.warning("Please upload an audio file.")

st.sidebar.title("About")
st.sidebar.info("This app demonstrates an optimized speech-to-speech pipeline using VAD, Whisper, Phi-2, and gTTS models.")

In [None]:
import os
import subprocess
from getpass import getpass

# Never store the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# Any token that was ever committed in this cell should be revoked in the
# ngrok dashboard and replaced.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")

# Argument-list form: the token is passed straight to ngrok, never through a shell.
subprocess.run(["ngrok", "config", "add-authtoken", ngrok_authtoken], check=True)

In [None]:
import os
import socket
import subprocess
import time
from pyngrok import ngrok

STREAMLIT_PORT = 8501       # port Streamlit serves on and ngrok forwards to
STARTUP_TIMEOUT_S = 60      # how long to wait for Streamlit to accept connections
KEEP_ALIVE_S = 60 * 60      # how long this cell keeps the notebook busy (1 hour)

# Start Streamlit in the background. The port is passed explicitly so it always
# matches the tunnel below; headless mode avoids interactive first-run prompts.
streamlit_process = subprocess.Popen([
    "streamlit", "run", "app.py",
    "--server.port", str(STREAMLIT_PORT),
    "--server.headless", "true",
])

# Wait for Streamlit to start up: poll the port instead of sleeping a fixed
# time, and fail fast if the process dies during start-up.
startup_deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if streamlit_process.poll() is not None:
        raise RuntimeError(
            f"Streamlit exited during start-up with code {streamlit_process.returncode}"
        )
    try:
        with socket.create_connection(("127.0.0.1", STREAMLIT_PORT), timeout=1):
            break
    except OSError:
        if time.monotonic() > startup_deadline:
            streamlit_process.terminate()
            raise TimeoutError(
                f"Streamlit did not open port {STREAMLIT_PORT} within {STARTUP_TIMEOUT_S}s"
            )
        time.sleep(0.5)

# Set up ngrok. Only override the stored authtoken when one is supplied via the
# environment; a placeholder string here would replace a valid configured token.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if ngrok_authtoken:
    ngrok.set_auth_token(ngrok_authtoken)
public_url = ngrok.connect(STREAMLIT_PORT).public_url

print(f"Streamlit app is running on: {public_url}")

# Keep the notebook running for up to an hour, returning early if Streamlit exits.
try:
    streamlit_process.wait(timeout=KEEP_ALIVE_S)
except subprocess.TimeoutExpired:
    pass