In [None]:
!pip install -q --upgrade bitsandbytes accelerate

In [None]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gradio as gr
from transformers import pipeline
import tempfile, numpy as np, soundfile as sf
from transformers import TextIteratorStreamer
import threading

In [None]:
# Constants
# Transcription model name; no other visible cell references it
# (the ASR pipeline cell loads "openai/whisper-medium.en" instead).
transcription_model = "gpt-4o-mini-transcribe"
# Hugging Face model id passed to AutoTokenizer / AutoModelForCausalLM below.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Alternative model id, currently disabled:
#LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
# Log in to the Hugging Face Hub with the token read from Colab's userdata store.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# --- Device + dtype config ---
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    # GPU: half precision for both the LLM tensors and the ASR model.
    TORCH_DEVICE = "cuda"
    TORCH_DTYPE = torch.float16
    ASR_DTYPE = torch.float16
else:
    # CPU: full precision; no explicit dtype for ASR.
    TORCH_DEVICE = "cpu"
    TORCH_DTYPE = torch.float32
    ASR_DTYPE = None

In [None]:
# --- ASR pipeline device + dtype config ---
# Detect whether a CUDA GPU is available.
USE_CUDA = torch.cuda.is_available()
# HF pipeline(device=...): GPU index 0 when CUDA is available, -1 for CPU.
PIPE_DEVICE = 0 if USE_CUDA else -1
# pipeline(dtype=...): half precision on GPU, None (library default) on CPU.
PIPE_dtype = torch.float16 if USE_CUDA else None
# Must read USE_CUDA (defined in this cell): the lowercase `use_cuda` is only
# created in a later cell, so referencing it here fails on a fresh kernel.
TORCH_DTYPE = torch.float16 if USE_CUDA else torch.float32

# Open-source transcription (audio -> text) via a Hugging Face pipeline.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium.en",
    dtype=PIPE_dtype,
    device=PIPE_DEVICE,
    return_timestamps=True,
)

In [None]:
def transcription_fn(audio_file, progress=gr.Progress()):
    """Run the module-level ASR ``pipe`` on ``audio_file`` and return its text.

    Args:
        audio_file: Audio input passed straight to ``pipe``.
        progress: Accepted for signature compatibility; not used in the body.

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.
    """
    return pipe(audio_file)["text"]

In [None]:
# Torch device string used when moving input tensors: GPU if CUDA is present, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Quantization
# 4-bit NF4 quantization with double quantization is configured only when a
# CUDA GPU is present; without one, no quantization config is used.
use_cuda = torch.cuda.is_available()
if use_cuda:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
else:
    quant_config = None

In [None]:
# Tokenizer: the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Console streamer bound to the tokenizer.
streamer = TextStreamer(tokenizer)

# Llama model: automatic device placement + float16 on GPU,
# plain float32 load moved explicitly to CPU otherwise.
if use_cuda:
    model = AutoModelForCausalLM.from_pretrained(
        LLAMA,
        device_map="auto",
        quantization_config=quant_config,
        torch_dtype=torch.float16,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        LLAMA,
        device_map=None,
        quantization_config=quant_config,
        torch_dtype=torch.float32,
    )
    model.to("cpu")

In [None]:
# def message_LLAMA(audio_file, progress=gr.Progress()):

#   transcription =transcription_fn(audio_file)
#   progress(0.6, desc="Generating meeting minutes from transcript...")
#   system_message = """
#   You produce minutes of meetings from transcripts, with summary, key discussion points,
#   takeaways and action items with owners, in markdown format without code blocks.
#   """

#   user_prompt = f"""
#   Below is an extract transcript of a Denver council meeting.
#   Please write minutes in markdown without code blocks, including:
#   - a summary with attendees, location and date
#   - discussion points
#   - takeaways
#   - action items with owners

#   Transcription:
#   {transcription}
#   """

#   messages = [
#       {"role": "system", "content": system_message},
#       {"role": "user", "content": user_prompt}
#     ]

#   inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
#   outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)
#   response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#   return response # this is important for Gradio,
#                   # if you want to test the function without gradio, you can delete it

In [None]:
def message_LLAMA(audio_file, progress=gr.Progress()):
    """Transcribe an audio file and generate meeting minutes with the Llama model.

    Args:
        audio_file: Audio input forwarded unchanged to ``transcription_fn``
            (the Gradio interface supplies a file path).
        progress: Gradio progress tracker used to report the two stages.

    Returns:
        str: The generated minutes only. The prompt tokens are stripped before
        decoding, so the system/user prompt is not echoed back.
    """
    progress(0.3, desc="Creating transcript from audio...")
    transcription = transcription_fn(audio_file)

    progress(0.6, desc="Generating meeting minutes from transcript...")
    system_message = (
        "You produce minutes of meetings from transcripts, with summary, key discussion points, "
        "takeaways and action items with owners, in markdown format without code blocks."
    )
    user_prompt = f"""
    Below is an extract transcript of a Denver council meeting.
    Please write minutes in markdown without code blocks, including:
    - a summary with attendees, location and date
    - discussion points
    - takeaways
    - action items with owners

    Transcription:
    {transcription}
    """

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # answers instead of continuing the user turn. return_dict=True also yields
    # an attention mask, which generate() needs because pad_token == eos_token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's actual device rather than a global

    # No streamer here: the full answer is returned in one piece.
    outputs = model.generate(**inputs, max_new_tokens=2000)

    # generate() returns prompt + completion; keep only the newly generated tokens.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response  # Gradio displays the returned string

In [None]:
# Mount Google Drive to get access to the sample MP3 file.
drive.mount("/content/drive")
audio_filename = "/content/drive/MyDrive/denver_extract.mp3"

# Fail fast if the file is missing, without leaving an open file handle behind:
# the transcription pipeline is given the path, not a file object.
if not os.path.isfile(audio_filename):
    raise FileNotFoundError(f"Audio file not found: {audio_filename}")

In [None]:
# Test the function without Gradio.
# The minutes are Markdown, so render them instead of showing the raw escaped string.
display(Markdown(message_LLAMA(audio_filename)))

# Gradio without Streaming

In [None]:
# Build the (non-streaming) Gradio interface from named components.

audio_input = gr.Audio(type="filepath", label="Upload MP3 File", format="mp3")
# Alternative output that renders the Markdown:
#   gr.Markdown(label="Meeting Minutes", min_height=60)
minutes_output = gr.Textbox(label="Meeting Minutes (Markdown)", lines=12)

view = gr.Interface(
    fn=message_LLAMA,
    inputs=audio_input,
    outputs=minutes_output,
    title="Meeting Minutes Generator",
    description="Upload an MP3 recording of your meeting to get AI-generated meeting minutes. This process may take a few minutes.",
    flagging_mode="never",
)

view.launch()

# Streaming into Gradio

In [None]:
from transformers import TextIteratorStreamer
import threading

def message_LLAMA_stream(audio_file, progress=gr.Progress()):
    """Transcribe an audio file and stream meeting minutes as they are generated.

    Args:
        audio_file: Audio input forwarded unchanged to ``transcription_fn``
            (the Gradio interface supplies a file path).
        progress: Gradio progress tracker used to report the two stages.

    Yields:
        str: The minutes generated so far (cumulative text), so Gradio can
        refresh the output component on every step.
    """
    progress(0.3, desc="Creating transcript from audio...")
    transcription = transcription_fn(audio_file)

    progress(0.6, desc="Generating meeting minutes from transcript...")
    system_message = (
        "You produce minutes of meetings from transcripts, with summary, key discussion points, "
        "takeaways and action items with owners, in markdown format without code blocks."
    )
    user_prompt = f"""
    Below is an extract transcript of a Denver council meeting.
    Please write minutes in markdown without code blocks, including:
    - a summary with attendees, location and date
    - discussion points
    - takeaways
    - action items with owners

    Transcription:
    {transcription}
    """

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # answers instead of continuing the user turn. return_dict=True also yields
    # an attention mask, which generate() needs because pad_token == eos_token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's actual device

    # Create a fresh iterator streamer per request; skip_prompt drops the echoed prompt.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Run generation in a background thread so this generator can consume the streamer.
    gen_kwargs = {**inputs, "max_new_tokens": 2000, "streamer": streamer}
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio updates the textbox live

    # The streamer is exhausted, so generation is done; reap the worker thread.
    thread.join()


In [None]:
# Streaming variant of the interface: same layout, wired to the generator function.
stream_audio_input = gr.Audio(type="filepath", label="Upload MP3 File")
stream_minutes_output = gr.Textbox(label="Meeting Minutes (Markdown)", lines=12)

view = gr.Interface(
    fn=message_LLAMA_stream,  # generator function -> live updates
    inputs=stream_audio_input,
    outputs=stream_minutes_output,
    title="Meeting Minutes Generator (Streaming)",
    description="Upload an MP3 or record audio. Transcription + live generation.",
    flagging_mode="never",
)
