In [1]:
!pip install -q --upgrade bitsandbytes accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -U -q bitsandbytes

In [3]:
import os
import requests
from openai import OpenAI
from google.colab import userdata
from google.colab import drive
import gc

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
from huggingface_hub import login
from IPython.display import Markdown, display, update_display

In [5]:
import gradio as gr

In [6]:
hf_token = userdata.get('HF_TOKEN_1')
login(hf_token, add_to_git_credential = True)

In [7]:
openai_api_key = userdata.get('HF_OPENAI')
openai = OpenAI(api_key = openai_api_key)

In [8]:
drive.mount("/content/drive")

Mounted at /content/drive


In [9]:
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
AUDIO_MODEL = 'whisper-1'

In [10]:
PHI = "microsoft/Phi-4-mini-instruct"
GEMMA = "google/gemma-3-270m-it"
QWEN = "Qwen/Qwen3-4B-Instruct-2507"
DEEPSEEK = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [11]:
def transcription(audio_file):
  audio_file = open(audio_file,'rb')
  transcription = openai.audio.transcriptions.create(model =AUDIO_MODEL, file = audio_file, response_format = 'text' )
  return transcription

In [12]:
system_prompt = """
You are an expert assistant that generates clear, concise, and well-structured
Minutes of Meeting (MOM) documents from raw meeting transcripts.

Your output must be in clean Markdown format (without code blocks), and should include:
- **Meeting Summary:** A brief overview of what the meeting was about.
- **Key Discussion Points:** Important topics or decisions discussed.
- **Takeaways:** Major insights, agreements, or learnings.
- **Action Items:** Clear tasks with responsible owners (e.g., "John will prepare the project plan by Monday").

Guidelines:
- Write in professional, easy-to-read language.
- Avoid filler words or redundant phrases.
- Do not include timestamps or transcription noise (like “um”, “yeah”, “okay”).
- Keep the tone formal and factual, but concise.
- Focus on clarity, structure, and readability.
"""

In [13]:
def user_prompt_for(audio_file):
    user_prompt = f"""
Please write well-structured **Minutes of Meeting (MOM)** in Markdown format (without code blocks), including:

- **Summary:** Include attendees, location, and date if mentioned.
- **Key Discussion Points:** List the main topics or debates.
- **Takeaways:** Important insights or decisions.
- **Action Items:** Clearly list tasks with owners and deadlines.

Transcription:
{transcription(audio_file)}
"""
    return user_prompt


In [14]:
def messages_for(audio_file):

  messages = [
    {'role': 'system', 'content': system_prompt},
    {'role': 'user', 'content': user_prompt_for(audio_file)}
    ]

  return messages

In [15]:
quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_quant_type = 'nf4'
)

In [16]:
def generate(model_name,audio_file):
  messages = messages_for(audio_file)
  tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
  tokenizer.pad_token = tokenizer.eos_token
  inputs = tokenizer.apply_chat_template(messages,return_tensors = 'pt',add_generation_prompt = True).to('cuda')
  model = AutoModelForCausalLM.from_pretrained(model_name, device_map = 'auto',quantization_config = quant_config)
  #streamer = TextStreamer(tokenizer)
  outputs = model.generate(inputs, max_new_tokens = 5000)
  result = tokenizer.decode(outputs[0])

  del model, inputs, tokenizer, outputs
  gc.collect()
  torch.cuda.empty_cache()

  return result

In [18]:
def messages_for_gemma(audio_file):
  messages= [{'role':'user','content':user_prompt_for(audio_file)}]
  return messages

In [19]:
def generate_gemma(model_name,audio_file):
  messages = messages_for_gemma(audio_file)
  tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
  tokenizer.pad_token = tokenizer.eos_token
  inputs = tokenizer.apply_chat_template(messages,return_tensors = 'pt',add_generation_prompt = True).to('cuda')
  model = AutoModelForCausalLM.from_pretrained(model_name, device_map = 'auto',quantization_config = quant_config)
  #streamer = TextStreamer(tokenizer)
  outputs = model.generate(inputs, max_new_tokens = 5000)
  result = tokenizer.decode(outputs[0])

  del model, inputs, tokenizer, outputs
  gc.collect()
  torch.cuda.empty_cache()

  return result

In [25]:
def model_selection(audio_file, model_name):
  if model_name == 'GEMMA':
    return generate_gemma(GEMMA, audio_file)
  elif model_name == 'LLAMA':
    return generate(LLAMA, audio_file)
  elif model_name == 'PHI':
    return generate(PHI, audio_file)
  elif model_name == 'QWEN':
    return generate(QWEN, audio_file)
  elif model_name == 'DEEPSEEK':
    return generate(DEEPSEEK, audio_file)

In [None]:
gr.Interface(
    fn = model_selection,
    inputs = [gr.Textbox(label = 'Enter The Audio Link'),gr.Dropdown(["LLAMA", "PHI", "GEMMA", "QWEN", "DEEPSEEK"],label = 'Choose Your Model')],
    outputs= [gr.Markdown(label = 'MOM')],
    allow_flagging = 'never',
).launch(debug=True)



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f9f02d29a498783337.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
