In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio httpx==0.27.2

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from huggingface_hub import login
from google.colab import userdata
import torch
import gradio as gr
import re
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    BitsAndBytesConfig
)

In [None]:
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'
# and authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably needed because the Llama checkpoint loaded later is gated — confirm.
hf_token = userdata.get('HF_TOKEN')
# add_to_git_credential=True also asks huggingface_hub to save the token in the git credential store.
login(hf_token, add_to_git_credential=True)

In [None]:
class CustomTextStreamer:
    """Accumulate decoded model output into a single text buffer.

    Each call to :meth:`put` decodes the supplied token ids with ``tokenizer``
    and appends the text to ``buffer``. The first chunk that contains a
    markdown heading marker (``'# '``) is trimmed so that it starts at that
    marker; chunks received before or after it are appended unchanged.

    Args:
        tokenizer: Object exposing ``decode(token_ids, skip_special_tokens=...)``.
        skip_prompt: Stored on the instance for API parity with other
            streamers; ``put`` does not consult it.
        skip_special_tokens: Forwarded to ``tokenizer.decode``.
    """

    def __init__(self, tokenizer, skip_prompt=True, skip_special_tokens=True):
        self.tokenizer = tokenizer
        self.skip_prompt = skip_prompt
        self.skip_special_tokens = skip_special_tokens
        self.buffer = ""
        self.started = False

    def put(self, value):
        """Decode ``value`` and append the resulting text to the buffer.

        Args:
            value: Token ids as a ``torch.Tensor`` (0-d, 1-d or batched 2-d),
                a flat list of ids, or a list of id lists. Empty input is
                ignored.
        """
        # Handle both tensor and list inputs
        if isinstance(value, torch.Tensor):
            value = value.tolist()

        # A 0-d tensor converts to a bare int; treat it as a one-token sequence.
        if isinstance(value, int):
            value = [value]

        # Nothing to decode (empty list / empty tensor): indexing value[0]
        # below would raise IndexError, so bail out early.
        if not value:
            return

        # Make sure value is a list of lists (batched tokens)
        if not isinstance(value[0], list):
            value = [value]

        for token_ids in value:
            text = self.tokenizer.decode(token_ids, skip_special_tokens=self.skip_special_tokens)
            if not self.started and '# ' in text:
                # First heading seen: drop whatever precedes it in this chunk.
                self.started = True
                text = text[text.index('# '):]
            self.buffer += text

    def end(self):
        """Return everything accumulated so far."""
        return self.buffer

In [None]:
def transcribe_audio(audio_path):
    """Transcribe an audio file with the Whisper large-v3 model.

    Args:
        audio_path: Path to the audio file, or a dict carrying that path under
            the ``'path'`` key.

    Returns:
        The transcribed text (the ``'text'`` entry of the ASR pipeline result).

    Raises:
        ValueError: If no usable path was supplied. Raised before any model is
            loaded so that a missing upload fails fast.
    """
    import gc  # local import: only needed for the cleanup below

    # Handle different input formats, and validate *before* the expensive
    # model load.
    if isinstance(audio_path, dict):
        actual_path = audio_path.get('path')
    else:
        actual_path = audio_path
    if not actual_path:
        raise ValueError("No audio file provided")

    torch.cuda.empty_cache()
    AUDIO_MODEL = "openai/whisper-large-v3"
    use_cuda = torch.cuda.is_available()
    dtype = torch.float16 if use_cuda else torch.float32

    speech_model = None
    pipe = None
    try:
        # Load model and processor
        speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            AUDIO_MODEL,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            device_map="auto",
        )
        processor = AutoProcessor.from_pretrained(AUDIO_MODEL)

        # Create ASR pipeline. No `model_kwargs` here: they only apply when the
        # pipeline loads the model itself from a name, and are ignored for an
        # already instantiated model.
        pipe = pipeline(
            "automatic-speech-recognition",
            model=speech_model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=dtype,
        )

        # Transcribe the audio
        result = pipe(actual_path, return_timestamps=True)
        return result['text']
    finally:
        # Release the speech model so its memory is available to whatever
        # model is loaded next.
        del pipe, speech_model
        gc.collect()
        torch.cuda.empty_cache()


In [None]:
def clean_notes(raw_notes):
    """Clean up generated notes so they follow the expected markdown layout.

    Steps, in order:
      1. Drop everything before the first ``'# Topic'`` (if present).
      2. Remove ``<|...|>`` special tokens, chat role labels at the start of a
         line (``system:``, ``user:``, ``assistant:``) and ``Today Date:`` lines.
      3. Strip surrounding whitespace and collapse the whitespace after
         markdown heading markers to a single space.

    Args:
        raw_notes: Text produced by the language model; ``None`` or an empty
            string yields ``""``.

    Returns:
        The cleaned notes as a string.
    """
    if not raw_notes:
        return ""

    # Keep everything starting from "# Topic"
    topic_at = raw_notes.find('# Topic')
    cleaned = raw_notes[topic_at:] if topic_at != -1 else raw_notes

    # Remove any special tokens and formatting
    cleaned = re.sub(r'<\|.*?\|>', '', cleaned)
    # Role labels are only stripped at the start of a line; an unanchored
    # pattern would also eat ordinary prose such as "the file system: ...".
    cleaned = re.sub(
        r'^[ \t]*(system|user|assistant):[ \t]*',
        '',
        cleaned,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    cleaned = re.sub(r'Today Date:.*?\n', '', cleaned)

    # Normalize markdown headers. Only spaces/tabs are collapsed: `\s` would
    # also match newlines and glue an empty heading onto the following line.
    cleaned = cleaned.strip()
    cleaned = re.sub(r'^(#{1,6})[ \t]+', r'\1 ', cleaned, flags=re.MULTILINE)

    return cleaned

In [None]:
def generate_notes(transcript):
    """Generate markdown lecture notes from a transcript with Llama 3.1 8B Instruct.

    The model is loaded 4-bit quantized, prompted through the tokenizer's chat
    template, sampled for up to 2000 new tokens, and released again before the
    function returns. The raw completion is passed through ``clean_notes`` and
    a bare ``# Topic`` heading is given a name.

    Args:
        transcript: The lecture transcript text.

    Returns:
        The cleaned markdown notes.

    Raises:
        Exception: Any error raised while loading the model or generating is
            propagated to the caller.
    """
    import gc  # local import: only needed for the cleanup below

    LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    system_prompt = (
        "Generate ONLY the lecture notes in markdown format with EXACTLY these sections:\n"
        "# Topic\n"
        "## Instructor\n"
        "## Summary\n"
        "## Key Concepts (bullet points)\n"
        "## Key Points (bullet points)\n"
        "## Takeaways (bullet points)\n\n"
        "Do NOT include:\n"
        "- Any introductory text\n"
        "- The transcript\n"
        "- Any explanation about the format\n"
        "- Any text outside the specified sections\n"
        "The first line of your response MUST be '# Topic: [actual topic name]'"
    )

    user_prompt = f"Please generate comprehensive notes from this lecture transcript:\n\n{transcript}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Configure model loading with quantization
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(LLAMA)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = None
    try:
        model = AutoModelForCausalLM.from_pretrained(
            LLAMA,
            device_map='auto',
            quantization_config=quant_config,
            torch_dtype=torch.float16,
        )

        # Prepare input.
        # - add_generation_prompt=True appends the assistant header so the
        #   model answers instead of first having to emit that header itself.
        # - return_dict=True also yields the attention mask, which cannot be
        #   inferred when the pad token equals the eos token.
        # - model.device follows wherever device_map='auto' placed the model.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # Generate text. No SDP-kernel context manager is used: enabling flash
        # attention while the other backends stay enabled matches PyTorch's
        # default backend selection.
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=2000,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Decode only the newly generated tokens (everything after the prompt).
        generated_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
    finally:
        # Release the language model so repeated requests do not pile up memory.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    # Clean up notes (this also trims anything before '# Topic') and ensure
    # the topic has a name.
    notes = clean_notes(generated_text)
    return _ensure_topic_name(notes, transcript)


def _ensure_topic_name(notes, transcript):
    """Give a bare leading '# Topic' heading a name derived from the transcript."""
    # Matches "# Topic" or "# Topic:" alone on the first line.
    bare_heading = re.compile(r'\A# Topic:?[ \t]*(?=\n|\Z)')
    if not bare_heading.match(notes):
        return notes

    topic_candidates = ["RAG", "Retrieval Augmented Generation", "LLM", "NLP", "Large Language Models"]
    topic = "AI Lecture"  # generic fallback when no candidate is mentioned
    for candidate in topic_candidates:
        # Whole-word match: a plain substring test would find "rag" inside
        # words such as "storage" or "paragraph".
        if re.search(rf'\b{re.escape(candidate)}\b', transcript or "", flags=re.IGNORECASE):
            topic = candidate
            break

    # Only the leading heading is rewritten; the rest of the notes is untouched.
    return bare_heading.sub(lambda _match: f"# Topic: {topic}", notes, count=1)

In [None]:
def process_audio(audio_path, progress=gr.Progress()):
    """Run the full pipeline for one upload: transcribe, then generate notes.

    Args:
        audio_path: Value delivered by the audio input component; falsy when
            nothing was uploaded.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        The generated markdown notes, or a string starting with ``"Error:"``
        describing what went wrong. Exceptions are never propagated, so the UI
        always receives something to display.
    """
    import traceback

    # Fail fast: without this guard a click with no upload would first load
    # the speech model and only then blow up.
    if not audio_path:
        return "Error: No audio file provided. Please upload a recording first."

    try:
        progress(0.2, desc="Transcribing audio...")
        transcript = transcribe_audio(audio_path)
        # A whitespace-only transcript carries no content to summarise.
        if not transcript or not transcript.strip():
            return "Error: Could not transcribe audio"

        progress(0.6, desc="Generating notes...")
        notes = generate_notes(transcript)
        progress(1.0, desc="Done!")
        return notes
    except Exception as e:
        # The result is rendered as Markdown; fence the traceback so names
        # like __call__ are shown verbatim instead of being read as markup.
        return f"Error: {str(e)}\n\n```\n{traceback.format_exc()}\n```"

In [None]:
def create_ui():
    """Build the Gradio Blocks interface for the lecture-notes generator.

    Layout: an audio upload plus a "Generate Notes" button in the left column
    and a Markdown panel for the result in the right column. Clicking the
    button calls ``process_audio`` with the uploaded file and renders its
    return value.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # Stylesheet handed to gr.Blocks via its `css` argument.
    # NOTE(review): the stylesheet used to declare `background: #f000000;` for
    # .notes-output. That is a 7-digit hex value, i.e. not a valid colour, so
    # browsers discarded it; it is omitted here, which keeps the rendering
    # unchanged. Add a valid colour if a panel background is wanted.
    custom_css = """
    .audio-input { min-height: 100px; }
    .notes-output {
        font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
        border-radius: 8px;
        padding: 15px;
        margin-top: 10px;
    }
    .notes-output h1 {
        font-size: 1.8em;
        margin-top: 0.5em;
        margin-bottom: 0.5em;
    }
    .notes-output h2 {
        font-size: 1.4em;
        margin-top: 1em;
        margin-bottom: 0.5em;
    }
    .notes-output ul {
        margin-left: 1.5em;
    }
    """

    with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
        gr.Markdown("""
        # 📝 AI Lecture Notes Generator
        Upload a lecture audio file to generate clean, structured notes.
        """)

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    type="filepath",
                    label="Upload Lecture Audio",
                    elem_classes=["audio-input"]
                )
                btn = gr.Button("Generate Notes", variant="primary")

            with gr.Column():
                # Use Markdown component instead of Textbox for proper rendering
                notes_output = gr.Markdown(
                    label="Generated Notes",
                    elem_classes=["notes-output"]
                )

        btn.click(
            fn=process_audio,
            inputs=audio_input,
            outputs=notes_output,
            api_name="generate_notes"
        )

    return demo

In [None]:
if __name__ == "__main__":
    # Build the Blocks app and start the Gradio server.
    demo = create_ui()
    # With debug=True the cell keeps running in Colab so errors and logs stay
    # visible; interrupt the cell to shut the server down.
    demo.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://5c1680fe0f0f9c046b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

  self.gen = func(*args, **kwds)
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Device set to use cuda:0


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  self.gen = func(*args, **kwds)
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/protocols/http/h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.11/dist-packages/starlette/applications.py", line 112, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.11/dist-packages/starlette/middleware/errors.py", line 187, in __call__
    raise exc
  File "/usr/local/lib/python3.11/dist-packages

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://5c1680fe0f0f9c046b.gradio.live
