In [None]:
!pip install openai-whisper
!pip install edge-tts
!pip install --upgrade transformers
!pip install webrtcvad
!pip install pydub
!pip install aiortc
!pip install websockets

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-m

In [None]:
import asyncio
import concurrent.futures
import os
import re
import wave

import edge_tts
import pydub
import torch
import transformers
import webrtcvad
import whisper

# Re-encode an MP3 file as a 16 kHz, single-channel WAV file
def mp3_to_wav(mp3_file, wav_file="input.wav"):
    """Convert an MP3 file to a 16 kHz mono WAV file.

    Args:
        mp3_file: Path of the MP3 file to read.
        wav_file: Path the WAV file is written to.

    Returns:
        ``wav_file``, so the result can be handed straight to the next step.
    """
    resampled = (
        pydub.AudioSegment.from_mp3(mp3_file)
        .set_frame_rate(16000)  # 16 kHz sample rate
        .set_channels(1)        # mono
    )
    resampled.export(wav_file, format="wav")
    return wav_file

# Function to perform VAD on the audio and save the output
def apply_vad(audio_file, output_file="vad_output.wav", aggressiveness=3):
    """Keep only the frames of a WAV file that WebRTC VAD classifies as speech.

    The input is cut into 30 ms frames. Each frame the detector reports as
    speech is appended to ``output_file``; every other frame is dropped.

    Args:
        audio_file: Path of a 16 kHz, mono, 16-bit PCM WAV file.
        output_file: Path of the WAV file that receives the speech frames.
        aggressiveness: Value passed to ``webrtcvad.Vad``.

    Returns:
        ``output_file``.

    Raises:
        AssertionError: If the input is not 16 kHz, mono, 16-bit audio.
    """
    vad = webrtcvad.Vad(aggressiveness)

    with wave.open(audio_file, 'rb') as wf:
        sample_rate = wf.getframerate()
        channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        assert sample_rate == 16000, "VAD requires 16kHz audio"
        assert channels == 1, "VAD requires mono audio"
        # The detector works on 16-bit samples. Without this check any other
        # width yields wrongly sized frames whose errors are merely printed
        # below, leaving an empty output file with no clear cause.
        assert sample_width == 2, "VAD requires 16-bit PCM audio"

        pcm = wf.readframes(wf.getnframes())

    frame_duration = 30  # ms
    frame_size = int(sample_rate * frame_duration / 1000 * sample_width)  # bytes
    # Ceiling division: the trailing partial frame is processed too, so count it.
    num_frames = -(-len(pcm) // frame_size)
    print(f"Processing {num_frames} frames...")

    # Create an output wave file with the same parameters
    with wave.open(output_file, 'wb') as out_wf:
        out_wf.setnchannels(1)
        out_wf.setsampwidth(sample_width)
        out_wf.setframerate(sample_rate)

        for start in range(0, len(pcm), frame_size):
            chunk = pcm[start:start + frame_size]
            # The detector needs a full-length frame, so the last short chunk
            # is zero-padded for the decision only; the bytes written out are
            # the original ones, so no audio is invented.
            padded = chunk.ljust(frame_size, b"\x00")
            try:
                if vad.is_speech(padded, sample_rate):
                    out_wf.writeframes(chunk)
            except Exception as e:
                # Best effort: a frame that cannot be processed is reported
                # and skipped rather than aborting the whole file.
                print(f"Error processing frame: {e}")

    return output_file

# Speech-to-text step: run a Whisper model over an audio file
def transcribe_audio(audio_file, model_name="base"):
    """Transcribe an audio file with Whisper and return the text.

    Args:
        audio_file: Path of the audio file handed to ``model.transcribe``.
        model_name: Model name given to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    # Note: the model is loaded afresh on every call.
    transcription = whisper.load_model(model_name).transcribe(audio_file)
    return transcription["text"]

# Function to load a causal language model and its tokenizer
def load_llama_model(model_id, token, device="auto"):
    """Load a causal language model and its tokenizer from the Hugging Face Hub.

    Args:
        model_id: Hub identifier of the checkpoint.
        token: Hugging Face access token. An empty string or ``None`` means
            "no explicit token".
        device: Forwarded to ``from_pretrained`` as ``device_map``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is requested in
        ``torch.bfloat16``.
    """
    # An empty string is not a usable credential; passing None lets the
    # library fall back to its own default authentication.
    auth_token = token or None

    # `token=` replaces the deprecated `use_auth_token=` keyword.
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        token=auth_token,
        device_map=device
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id,
        token=auth_token
    )
    return model, tokenizer

# Function to generate text using the model with a restriction on output length
def generate_text(prompt, model, tokenizer, max_length=200, num_return_sequences=1, max_sentences=2):
    """Generate text for ``prompt`` and trim it to a maximum number of sentences.

    Args:
        prompt: Text handed to the text-generation pipeline.
        model: Model object given to ``transformers.pipeline``.
        tokenizer: Tokenizer given to ``transformers.pipeline``; its
            ``eos_token_id`` is used as the padding token id.
        max_length: Forwarded to the pipeline call.
        num_return_sequences: Forwarded to the pipeline call. Only the first
            returned sequence is used.
        max_sentences: Maximum number of sentences kept from the generated text.

    Returns:
        The ``generated_text`` of the first sequence, cut off after
        ``max_sentences`` sentences.
    """
    pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto"
    )
    response = pipeline(
        prompt,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        truncation=True,
        pad_token_id=tokenizer.eos_token_id
    )[0]['generated_text']

    return _limit_sentences(response, max_sentences)


def _limit_sentences(text, max_sentences):
    """Return ``text`` cut off right after its first ``max_sentences`` sentences.

    A sentence ends at ``.``, ``!`` or ``?`` followed by whitespace (including
    a newline) or by the end of the text. Splitting only on ``". "`` let
    questions, exclamations and newline-separated sentences through uncounted.
    Text with no more than ``max_sentences`` sentence endings is returned
    unchanged.
    """
    if max_sentences < 1:
        return ""
    sentence_ends = re.finditer(r'[.!?]+(?=\s|$)', text)
    for count, ending in enumerate(sentence_ends, start=1):
        if count == max_sentences:
            # Slice instead of re-joining so the original spacing is preserved.
            return text[:ending.end()]
    return text

# Text-to-speech step: synthesise speech with Edge-TTS using adjustable voice settings
async def text_to_speech(text, output_file, voice="en-US-JennyNeural", rate="+0%", pitch="+50Hz"):
    """Synthesise ``text`` with Edge-TTS and save the audio to ``output_file``.

    Args:
        text: Text to speak.
        output_file: Path handed to ``Communicate.save``.
        voice: Edge-TTS voice name.
        rate: Speaking-rate adjustment string, e.g. ``"+10%"``.
        pitch: Pitch adjustment string, e.g. ``"+50Hz"``.
    """
    voice_settings = {"voice": voice, "rate": rate, "pitch": pitch}
    synthesizer = edge_tts.Communicate(text=text, **voice_settings)
    await synthesizer.save(output_file)

# Interactive menu: choose the speaking voice (female / male)
def select_voice():
    """Prompt on stdin for a voice and return its Edge-TTS name.

    Returns:
        The voice name for the number typed by the user, or
        ``"en-US-JennyNeural"`` when the entry is not one of the listed numbers.
    """
    default_voice = "en-US-JennyNeural"  # female voice
    voices = {"1": default_voice, "2": "en-US-GuyNeural"}  # "2" is the male voice

    menu = ["Select Voice:"]
    menu.extend(f"{number}: {name}" for number, name in voices.items())
    print("\n".join(menu))

    typed = input("Enter the number corresponding to your choice: ")
    return voices[typed] if typed in voices else default_voice

# Interactive menu: choose the speaking rate
def select_voice_rate():
    """Prompt on stdin for a speaking-rate adjustment and return it.

    Returns:
        The rate string (e.g. ``"+10%"``) for the number typed by the user, or
        ``"+0%"`` when the entry is not one of the listed numbers.
    """
    normal_rate = "+0%"
    # Menu order: normal, then faster (slight -> significant), then slower.
    rates = (
        normal_rate, "+10%", "+20%", "+30%", "+50%", "+70%", "+100%",
        "-10%", "-20%", "-30%", "-50%", "-70%", "-100%",
    )
    numbered = {str(position): rate for position, rate in enumerate(rates, start=1)}

    print("Select Voice Rate:")
    for number in numbered:
        print(number + ": " + numbered[number])

    typed = input("Enter the number corresponding to your choice: ")
    return numbered[typed] if typed in numbered else normal_rate

# Interactive menu: choose the voice pitch
def select_voice_pitch():
    """Prompt on stdin for a pitch adjustment and return it.

    Returns:
        The pitch string (e.g. ``"+50Hz"``) for the number typed by the user,
        or ``"+0Hz"`` when the entry is not one of the listed numbers.
    """
    normal_pitch = "+0Hz"
    # Menu order: normal, then higher (slight -> significant), then lower.
    pitches = (
        normal_pitch, "+50Hz", "+100Hz", "+200Hz", "+300Hz", "+400Hz", "+500Hz",
        "-50Hz", "-100Hz", "-200Hz", "-300Hz", "-400Hz", "-500Hz",
    )
    numbered = {str(position): pitch for position, pitch in enumerate(pitches, start=1)}

    print("Select Voice Pitch:")
    for number in numbered:
        print(number + ": " + numbered[number])

    typed = input("Enter the number corresponding to your choice: ")
    return numbered[typed] if typed in numbered else normal_pitch

# Main function to run the entire pipeline
def main(mp3_file, model_id="microsoft/phi-2", token='', output_file="output.mp3"):
    """Run the whole voice pipeline: MP3 in, spoken model reply out.

    Stages, in order: MP3 -> WAV conversion, voice-activity filtering,
    transcription, language-model response generation, interactive choice of
    voice / rate / pitch, and text-to-speech. The intermediate WAV files are
    removed afterwards, including when a stage fails.

    Args:
        mp3_file: Path of the input MP3 recording.
        model_id: Hugging Face Hub identifier of the language model.
        token: Hugging Face access token handed to the model loader.
        output_file: Path the synthesised speech is saved to.
    """
    temp_files = []
    try:
        # Convert MP3 to WAV
        print("Converting MP3 to WAV...")
        audio_file = mp3_to_wav(mp3_file)
        temp_files.append(audio_file)

        # Step 1: Apply VAD to the input audio
        print("Applying VAD...")
        vad_output_file = apply_vad(audio_file)
        temp_files.append(vad_output_file)

        # Step 2: Transcribe audio to text
        print("Transcribing audio...")
        transcript = transcribe_audio(vad_output_file)
        print("Transcript:", transcript)

        # Step 3: Load the  model and tokenizer
        print("Loading  model...")
        model, tokenizer = load_llama_model(model_id, token)

        # Step 4: Generate a response based on the transcribed text
        print("Generating response...")
        generated_text = generate_text(transcript, model, tokenizer)
        print("Generated Text:", generated_text)

        # Step 5: Select voice type, rate, and pitch
        voice = select_voice()
        rate = select_voice_rate()
        pitch = select_voice_pitch()

        # Step 6: Convert the generated text to speech
        print("Converting text to speech...")
        # Wait for synthesis to finish. A bare asyncio.create_task() only
        # schedules the coroutine (and raises when no event loop is running),
        # so "Process complete!" could be printed before the file existed and
        # any TTS error was silently lost.
        _run_coroutine_blocking(
            text_to_speech(generated_text, output_file, voice=voice, rate=rate, pitch=pitch)
        )
    finally:
        # Clean up the temporary files, even if a stage above raised
        for temp_path in reversed(temp_files):
            if os.path.exists(temp_path):
                os.remove(temp_path)

    print(f"Process complete! Output saved to {output_file}")


def _run_coroutine_blocking(coro):
    """Run ``coro`` to completion from synchronous code and return its result."""
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread (plain script): asyncio.run can own one.
        return asyncio.run(coro)
    # An event loop is already running in this thread (e.g. inside a notebook
    # kernel), where asyncio.run() is not allowed. Run the coroutine on a
    # worker thread with its own loop and block until it has finished.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()

# Parameters
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is stored in the notebook. An unset variable gives an
# empty string. Any token that has ever been saved in this file should be
# treated as leaked and revoked.
YOUR_TOKEN = os.environ.get("HF_TOKEN", "")
mp3_file = "test2.mp3"
output_file = "output.mp3"

# Run the pipeline
if __name__ == "__main__":
    main(mp3_file=mp3_file, token=YOUR_TOKEN, output_file=output_file)


Converting MP3 to WAV...
Applying VAD...
Processing 88 frames...
Transcribing audio...


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 69.9MiB/s]


Transcript:  Tell me about Lion.
Loading LLaMA model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Generating response...
Generated Text:  Tell me about Lion.

Teacher: Lion is a character in the story who is a friend of the main character, Kite. He is a very brave and strong man who helps Kite and his friends on their journey.

Student: What about the other characters?

Teacher: There are many other characters in the story, such as the King of the Land of the Dead, the King of the Land of the Living, and the King of the Land of the Living's son.
Select Voice:
1: en-US-JennyNeural
2: en-US-GuyNeural
Enter the number corresponding to your choice: 2
Enter the number corresponding to your choice: 1
Select Voice Rate:
1: +0%
2: +10%
3: +20%
4: +30%
5: +50%
6: +70%
7: +100%
8: -10%
9: -20%
10: -30%
11: -50%
12: -70%
13: -100%
Select Voice Pitch:
1: +0Hz
2: +50Hz
3: +100Hz
4: +200Hz
5: +300Hz
6: +400Hz
7: +500Hz
8: -50Hz
9: -100Hz
10: -200Hz
11: -300Hz
12: -400Hz
13: -500Hz
Enter the number corresponding to your choice: 1
Converting text to speech...
Process complete! Output saved to outp