In [None]:
!pip install gradio
!pip install librosa
!pip install transformers
!pip install torch
import gradio as gr
import librosa
import torch
import logging
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer

class QwenChatGenerator:
    """Chat-style text generator backed by a Hugging Face causal language model.

    The tokenizer and model are loaded once in the constructor;
    ``generate_text`` renders a list of chat messages through the tokenizer's
    chat template and samples a reply from the model.
    """

    def __init__(self, model_id: str = "Qwen/Qwen2.5-0.5B-Instruct"):
        """Load the tokenizer and model for ``model_id``.

        Args:
            model_id: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.logger.info("Model %s loaded successfully", model_id)

    def generate_text(
        self,
        messages: list[dict[str, str]],
        max_length: int = 500,
        temperature: float = 0.7,
        top_p: float = 0.9
    ) -> str:
        """Sample an assistant reply for a chat conversation.

        Args:
            messages: Chat turns as ``{"role": ..., "content": ...}`` dicts,
                passed to the tokenizer's chat template.
            max_length: Upper bound on the number of newly generated tokens
                (forwarded as ``max_new_tokens``).
            temperature: Sampling temperature.
            top_p: Nucleus-sampling probability mass.

        Returns:
            The generated reply only, with surrounding whitespace stripped.
        """
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        # Some tokenizers define no pad token; fall back to EOS so that
        # generate() always receives a usable padding id.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        with torch.no_grad():
            # Forward everything the tokenizer produced (not just input_ids)
            # so the attention mask reaches the model as well.
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                do_sample=True,
                top_p=top_p,
                temperature=temperature,
                pad_token_id=pad_token_id,
            )

        # The returned sequence starts with the prompt tokens. Decode only what
        # follows them: searching the decoded text for the last user message
        # would leak role labels into the reply and would break whenever the
        # reply itself quotes the user's text.
        reply_tokens = outputs[0][prompt_length:]
        return self.tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

def transcribe_audio_old(audio_path: str, model_size: str = "large"):
    """Transcribe an audio file with Whisper using beam search.

    The file is loaded at its native sampling rate and resampled to 16000 Hz
    when the two differ. The ``openai/whisper-{model_size}`` processor and
    model are loaded on every call.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_size: Suffix of the Whisper checkpoint name.

    Returns:
        The first decoded transcription string.
    """
    print(f"Transcribing with Whisper {model_size}...")

    # sr=None keeps the file's own sampling rate so it can be reported below.
    waveform, native_sr = librosa.load(audio_path, sr=None)

    if native_sr == 16000:
        sampling_rate = native_sr
    else:
        print(f"Resampling from {native_sr} Hz to 16000 Hz")
        waveform = librosa.resample(waveform, orig_sr=native_sr, target_sr=16000)
        sampling_rate = 16000

    duration_seconds = len(waveform) / sampling_rate
    print(f"Audio length: {duration_seconds:.2f} seconds")

    checkpoint = f"openai/whisper-{model_size}"
    processor = WhisperProcessor.from_pretrained(checkpoint)
    model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

    # NOTE(review): the Whisper feature extractor presumably pads/truncates to
    # a fixed 30-second window, so long recordings may not be fully covered --
    # confirm before relying on this for long audio.
    encoded = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")

    generation_options = {
        "max_length": model.config.max_length,
        "num_beams": 5,
        "length_penalty": 1.0,
    }
    with torch.no_grad():
        predicted_ids = model.generate(encoded.input_features, **generation_options)

    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcription = decoded[0]
    print(f"Transcription length: {len(transcription)} characters")
    return transcription

# Loaded (processor, model) pairs keyed by checkpoint name, so that repeated
# transcriptions reuse the weights instead of reloading them on every call.
_WHISPER_CACHE = {}

# Whisper's feature extractor pads/truncates its input to one window of this
# many seconds, so longer audio has to be fed window by window.
_WHISPER_WINDOW_SECONDS = 30


def _load_whisper(model_name: str):
    """Return the cached ``(processor, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _WHISPER_CACHE:
        _WHISPER_CACHE[model_name] = (
            WhisperProcessor.from_pretrained(model_name),
            WhisperForConditionalGeneration.from_pretrained(model_name),
        )
    return _WHISPER_CACHE[model_name]


def transcribe_audio(audio_path: str, sampling_rate: int = 16000, model_size: str = "large"):
    """Transcribe an audio file with Whisper, covering recordings of any length.

    The audio is loaded at ``sampling_rate`` and transcribed in consecutive
    30-second windows whose texts are concatenated. Audio that fits in a single
    window yields the same text as one plain ``generate`` call.

    Args:
        audio_path: Path of the audio file to transcribe.
        sampling_rate: Rate the audio is resampled to when loading, and the
            rate reported to the Whisper processor.
        model_size: Suffix of the ``openai/whisper-*`` checkpoint to use.

    Returns:
        The transcription, or an empty string when the file has no samples.
    """
    print(f"Transcribing with Whisper {model_size}...")

    processor, model = _load_whisper(f"openai/whisper-{model_size}")

    # Use the rate librosa actually loaded at, so the window size stays
    # correct even if the caller passed sampling_rate=None.
    audio_array, loaded_sr = librosa.load(audio_path, sr=sampling_rate)
    window = int(_WHISPER_WINDOW_SECONDS * loaded_sr)

    pieces = []
    for start in range(0, len(audio_array), window):
        input_features = processor(
            audio_array[start:start + window],
            sampling_rate=sampling_rate,
            return_tensors="pt",
        ).input_features

        with torch.no_grad():
            predicted_ids = model.generate(input_features)

        pieces.append(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

    # Joined without a separator so a single window reproduces the decoder's
    # text unchanged.
    return "".join(pieces)

def speaking_coach_app():
    """Build the Gradio interface for the speaking coach.

    A ``QwenChatGenerator`` is created once and shared by every request. Each
    request transcribes the submitted audio, wraps the transcript in a short
    coaching conversation and returns the model's feedback.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    qwen = QwenChatGenerator()

    def process_audio(audio, text):
        """Transcribe ``audio`` and return coaching feedback for the topic ``text``."""
        # Gradio passes None when the form is submitted without a recording;
        # report that to the user instead of failing inside the audio loader.
        if not audio:
            raise gr.Error("Please record or upload an audio file first.")

        transcribed_audio = transcribe_audio(audio)
        # An empty transcript would only make the model invent feedback.
        if not transcribed_audio or not transcribed_audio.strip():
            raise gr.Error("No speech could be transcribed from the audio.")

        few_shot_prompting = [
            {"role": "system", "content": "You are a SPEAKING COACH. You will be provided with a transcript of how the user spoke. Give constructive feedback on how to improve and what can go better, punish unprofessional language and grammatical errors. Make the user a better speaker by giving SPECIFIC INSTRUCTIONS on what can be better and the final suggested version"},
            {"role": "user", "content": f"I want to give a presentation on {text}"},
            {"role": "assistant", "content": "Sure, go ahead and give me the transcript of your presentation."},
            {"role": "user", "content": transcribed_audio}
        ]

        print(f"Few Shot Prompting Chat Template: {few_shot_prompting}")
        feedback = qwen.generate_text(few_shot_prompting)

        print(f"MODEL RESPONSE: {feedback}")
        return feedback

    iface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record or Upload Audio"),
            gr.Textbox(placeholder="Topic of presentation", label="Topic")
        ],
        outputs=gr.Textbox(label="Speaking Coach Feedback"),
        title="🎙️ Speaking Coach AI",
        description="Get AI-powered feedback on your presentation skills by recording or uploading an audio file.",
        theme="default"
    )

    return iface

def main():
    """Build the speaking-coach interface and launch it with default settings."""
    speaking_coach_app().launch()


if __name__ == "__main__":
    main()

Collecting gradio
  Downloading gradio-5.23.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://52475e65a41930e7bd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
a