In [1]:
# Refresh the apt package index, then install the ffmpeg system package.
# Standard output is redirected to /dev/null to keep the cell output short;
# anything written to stderr (such as apt warnings) is still displayed.
# NOTE(review): ffmpeg is presumably needed for audio decoding by the Python
# audio libraries installed later — confirm which ones actually call it.
!apt-get update -y >/dev/null
!apt-get install -y ffmpeg >/dev/null

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [2]:
# ffmpeg is requested again so this cell also works when run on its own.
# apt-get is used instead of apt to match the first cell; apt's command-line
# interface is intended for interactive use.
!apt-get install -y -qq ffmpeg
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells that follow.
%pip install -q faster-whisper pyAudioAnalysis hmmlearn torch torchvision torchaudio transformers sentencepiece

ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m122.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.2 MB/s[0

In [3]:
# gradio is the only package in this list that the previous install cell does
# not already request; the others are repeated so this cell works on its own.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q faster-whisper transformers gradio sentencepiece

In [4]:
import os, json, textwrap
# Create the working directories used by later cells: "output" receives the
# per-meeting result files and "repo" receives requirements.txt / README.md.
# exist_ok=True makes re-running this cell a no-op.
for _work_dir in ("output", "repo"):
    os.makedirs(_work_dir, exist_ok=True)

In [6]:
# Make the cell self-contained: do not rely on an earlier cell having created
# the target directory.
os.makedirs("repo", exist_ok=True)

# Dependency list shipped with the exported repo. gradio and pydub are listed
# because the notebook imports both of them (the UI cell and the model cell).
requirements = """faster-whisper
pyAudioAnalysis
hmmlearn
torch
torchaudio
transformers
sentencepiece
gradio
pydub
"""
with open("repo/requirements.txt", "w", encoding="utf-8") as f:
    f.write(requirements)


# README shipped with the exported repo. The model names below must match the
# checkpoints loaded by the model-loading cell (SUM_MODEL / FLAN_MODEL).
readme = textwrap.dedent("""\
# Meeting Transcription + Action Item Extractor

**What it does**
- Transcribe meeting audio using open-source Whisper (faster-whisper).
- Summarize transcripts and extract decisions & action items using open-source LLMs (Flan-T5 & DistilBART).
- Optional Gradio UI to upload audio and view results.

**How to run**
1. Open in Google Colab.
2. Run all cells in order, from top to bottom.
3. Use the Gradio UI (appears when you run the final cell) to upload audio (wav/mp3/m4a/ogg).

**Models used**
- ASR: `faster-whisper` (open-source Whisper)
- Summarization: `sshleifer/distilbart-cnn-12-6`
- Action/Decision extraction: `google/flan-t5-base`

All are free/open-source models on Hugging Face.
""")
# utf-8 is set explicitly so the files do not depend on the platform's default
# text encoding.
with open("repo/README.md", "w", encoding="utf-8") as f:
    f.write(readme)


print("Setup cell complete — requirements.txt and README.md created in /repo.")
print("Files written: repo/requirements.txt  repo/README.md")

Setup cell complete — requirements.txt and README.md created in /repo.
Files written: repo/requirements.txt  repo/README.md


In [7]:
from google.colab import userdata
from huggingface_hub import login
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'
# and authenticate this session with the Hub.
# NOTE(review): the secret must exist and be shared with this notebook —
# confirm how userdata.get behaves when it is missing.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

In [8]:

import os
from faster_whisper import WhisperModel
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import math
import json
from datetime import datetime

from pydub import AudioSegment

# Run everything on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)


# Speech-to-text model (faster-whisper). The weights are fetched on first use,
# as the download progress bars in this cell's output show.
ASR_MODEL_SIZE = "small"   # options: tiny, base, small, medium, large
asr_model = WhisperModel(ASR_MODEL_SIZE, device=DEVICE)


# Placeholder for a speaker-diarization model; nothing in the cells visible
# here assigns or reads it.
diarizer_model = None

# Abstractive summarizer used by summarize_transcript().
# The pipeline takes a device index: 0 selects the first GPU, -1 selects the CPU.
SUM_MODEL = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=SUM_MODEL, device=0 if DEVICE=="cuda" else -1)

# Instruction-tuned seq2seq model used by extract_actions_and_decisions().
# It is loaded as a raw tokenizer + model (not a pipeline) and moved to DEVICE.
FLAN_MODEL = "google/flan-t5-base"
flan_tokenizer = AutoTokenizer.from_pretrained(FLAN_MODEL)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_MODEL).to(DEVICE)


def chunk_text(text, max_chars=3000):
    """Split ``text`` into consecutive slices of at most ``max_chars`` characters.

    The split is purely positional: slices may end in the middle of a word or
    sentence. Concatenating the returned slices reproduces ``text`` exactly.

    Args:
        text: The string to split.
        max_chars: Maximum length of each slice; must be a positive integer.

    Returns:
        A list of slices in order. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is less than 1. Without this check a
            non-positive size would never advance through the text.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    return [text[start:start + max_chars] for start in range(0, len(text), max_chars)]


def transcribe_audio(audio_path, language=None, word_timestamps=False):
    """Transcribe an audio file with the module-level ``asr_model``.

    Args:
        audio_path: Path of the audio file, passed straight to
            ``asr_model.transcribe``.
        language: Optional language code (e.g. ``"en"``). ``None`` is forwarded
            as-is, leaving the choice to the ASR model.
        word_timestamps: When true, word-level timestamps are requested from
            the ASR model and each returned segment gets an extra ``"words"``
            list.

    Returns:
        A dict with:
          - ``"transcript"``: all segment texts joined with single spaces.
          - ``"segments"``: list of ``{"start", "end", "text"}`` dicts (plus
            ``"words"`` when ``word_timestamps`` is true).
          - ``"info"``: the second value returned by ``asr_model.transcribe``.
    """
    segments, info = asr_model.transcribe(
        audio_path,
        beam_size=5,
        language=language,
        # Previously accepted by this function but never forwarded.
        word_timestamps=word_timestamps,
    )

    segments_list = []
    for seg in segments:
        entry = {
            "start": float(seg.start),
            "end": float(seg.end),
            "text": seg.text.strip(),
        }
        if word_timestamps:
            # seg.words may be None when no word-level data is available.
            entry["words"] = [
                {"start": float(w.start), "end": float(w.end), "word": w.word}
                for w in (seg.words or [])
            ]
        segments_list.append(entry)

    transcript = " ".join(s["text"] for s in segments_list).strip()
    return {"transcript": transcript, "segments": segments_list, "info": info}

def summarize_transcript(transcript, max_length=180, min_length=40):
    """Summarize a transcript with the module-level ``summarizer`` pipeline.

    Short transcripts are summarized in one call. Longer ones are split with
    ``chunk_text``, each piece is summarized, and the partial summaries are
    summarized once more into a single result.

    Args:
        transcript: Plain transcript text.
        max_length: ``max_length`` forwarded to every summarizer call.
        min_length: ``min_length`` forwarded to every summarizer call.

    Returns:
        The summary text, or ``""`` when the transcript is empty or whitespace
        (the summarizer is not called in that case).
    """
    text = (transcript or "").strip()
    if not text:
        return ""

    # The summarization model has a bounded input length (1024 tokens for
    # BART-family checkpoints — verify for SUM_MODEL). 3000 characters is a
    # rough heuristic that keeps typical English text below that limit.
    chunk_chars = 3000

    def _summarize(piece):
        # truncation=True is the backstop: an over-long input is clipped to
        # the model limit instead of overflowing it.
        result = summarizer(
            piece,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return result[0]['summary_text'].strip()

    if len(text) <= chunk_chars:
        return _summarize(text)

    partials = [_summarize(piece) for piece in chunk_text(text, max_chars=chunk_chars)]
    return _summarize(" ".join(partials))


def _parse_extraction_output(decoded):
    """Turn the model's raw text into a dict with list-valued ``decisions`` and ``action_items``."""
    import re

    # Try the whole output first, then the outermost {...} span if there is one.
    candidates = [decoded]
    brace_span = re.search(r'\{.*\}', decoded, re.DOTALL)
    if brace_span:
        candidates.append(brace_span.group())

    data = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Valid JSON that is not an object (a list, string or number) cannot
        # carry the expected keys, so it is treated like a parse failure.
        if isinstance(parsed, dict):
            data = parsed
            break

    if data is None:
        data = {"decisions": [], "action_items": [], "raw": decoded}

    # Callers iterate over both keys, so make sure each one holds a list.
    for key in ("decisions", "action_items"):
        value = data.get(key)
        if value is None:
            data[key] = []
        elif not isinstance(value, list):
            data[key] = [value]
    return data


def extract_actions_and_decisions(transcript, max_input_chars=3000):
    """Ask the FLAN model for decisions and action items found in a transcript.

    Only the first ``max_input_chars`` characters of ``transcript`` are put in
    the prompt; anything after that is not analysed. The prompt is further
    truncated to 1024 tokens by the tokenizer.

    Args:
        transcript: Plain transcript text.
        max_input_chars: Maximum number of transcript characters to include.

    Returns:
        A dict that always has list-valued ``"decisions"`` and
        ``"action_items"`` keys. When the model output cannot be parsed as a
        JSON object, both lists are empty and the unparsed text is returned
        under ``"raw"``.
    """
    print("[LOG] Extracting actions/decisions...")

    txt = transcript[:max_input_chars]

    prompt = (
        "You are an expert meeting summarizer.\n"
        "From the following transcript, extract:\n"
        "1. Key DECISIONS made (short bullet sentences)\n"
        "2. ACTION ITEMS: tasks to be done, with optional owner and due date.\n"
        "Output in **valid JSON** only, structured as:\n"
        "{\n"
        '  "decisions": ["..."],\n'
        '  "action_items": [ {"task": "...", "owner": "", "due_date": ""} ]\n'
        "}\n"
        "Transcript:\n" + txt
    )

    inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
    # Greedy decoding is requested explicitly. temperature only applies when
    # sampling, so passing temperature=0.0 here had no effect on the output.
    outputs = flan_model.generate(**inputs, max_new_tokens=512, do_sample=False)
    decoded = flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # NOTE(review): T5 SentencePiece vocabularies reportedly lack '{' and '}' —
    # confirm this model can emit JSON braces at all; if it cannot, every call
    # ends up in the "raw" fallback.

    data = _parse_extraction_output(decoded)

    print("[LOG] Extraction complete:", json.dumps(data, indent=2)[:500])
    return data


def process_meeting(audio_path, language=None):
    """Run the full pipeline on one audio file and save the results to ``output/``.

    Steps: transcribe the audio, summarize the transcript, extract decisions
    and action items, then write three files named
    ``<audio base name>_<UTC timestamp>_{transcript.txt,summary.txt,actions.json}``.

    Args:
        audio_path: Path of the audio file to process.
        language: Optional language code forwarded to ``transcribe_audio``.

    Returns:
        A dict with ``"transcript"``, ``"summary"``, ``"extracted"`` and
        ``"files"`` (the paths of the three files written).
    """
    from datetime import timezone  # only ``datetime`` is imported at file level

    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # formatted string is unchanged.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    prefix = f"{base_name}_{timestamp}"

    asr = transcribe_audio(audio_path, language=language)
    transcript = asr["transcript"]

    if transcript:
        summary = summarize_transcript(transcript)
        extracted = extract_actions_and_decisions(transcript)
    else:
        # Nothing was recognised (e.g. silent audio): skip both models rather
        # than feeding them an empty string.
        summary = ""
        extracted = {"decisions": [], "action_items": []}

    # Do not rely on an earlier cell having created the directory.
    os.makedirs("output", exist_ok=True)
    txt_path = f"output/{prefix}_transcript.txt"
    sum_path = f"output/{prefix}_summary.txt"
    json_path = f"output/{prefix}_actions.json"
    # utf-8 is set explicitly: transcripts can contain non-ASCII text and the
    # platform default encoding is not guaranteed to handle it.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(transcript)
    with open(sum_path, "w", encoding="utf-8") as f:
        f.write(summary)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(extracted, f, indent=2)

    return {
        "transcript": transcript,
        "summary": summary,
        "extracted": extracted,
        "files": {"transcript": txt_path, "summary": sum_path, "actions_json": json_path}
    }

# Printed once everything above in this cell has run, i.e. after the three
# models are constructed and the helper functions are defined.
print("Core models loaded (ASR + summarizer + flan). Ready.")


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Device: cuda


vocabulary.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/484M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Core models loaded (ASR + summarizer + flan). Ready.


In [9]:

import gradio as gr
import shutil
import json
import os

def handle_upload(file_obj, language):
    """Gradio callback: process an uploaded audio file and format the results.

    Args:
        file_obj: Value of the audio input; ``None`` when nothing was uploaded,
            otherwise passed to ``process_meeting`` as the audio path.
        language: Text from the language box. Empty, whitespace-only or
            ``None`` means "no language specified".

    Returns:
        A 4-tuple ``(transcript, summary, extracted_json, markdown)``. Any
        exception is caught; the third element then holds a JSON object with
        an ``"error"`` key and the fourth a Markdown block with the traceback.
    """
    try:
        if file_obj is None:
            return "No file uploaded.", "", "{}", "### No data"

        print("[LOG] File received:", file_obj)
        # Tolerate None as well as blank strings from the textbox.
        lang_code = (language or "").strip() or None
        result = process_meeting(file_obj, language=lang_code)

        transcript = result["transcript"]
        summary = result["summary"]
        extracted = result["extracted"]
        extracted_str = json.dumps(extracted, indent=2, ensure_ascii=False)

        lines = ["### Extracted Decisions & Action Items", ""]
        decs = extracted.get("decisions", [])
        ais = extracted.get("action_items", [])

        if decs:
            lines.append("**Decisions:**")
            lines.extend(f"- {d}" for d in decs)
        if ais:
            lines.append("")
            lines.append("**Action Items:**")
            for a in ais:
                if isinstance(a, dict):
                    item = f"- {a.get('task','')}"
                    if a.get('owner'):
                        item += f" — owner: {a['owner']}"
                    if a.get('due_date'):
                        item += f" — due: {a['due_date']}"
                    lines.append(item)
                else:
                    # Action items that are not dicts (e.g. plain strings)
                    # are shown as-is instead of being dropped.
                    lines.append(f"- {a}")
        md = "\n".join(lines) + "\n"
        return transcript, summary, extracted_str, md

    except Exception as e:
        import traceback
        tb = traceback.format_exc()
        # json.dumps keeps the payload valid JSON even when the message
        # contains quotes or newlines.
        error_json = json.dumps({"error": str(e)}, ensure_ascii=False)
        return "", "", error_json, f"### Error Occurred\n```\n{tb}\n```"



def package_repo():
    """Zip the contents of the ``repo`` directory and return the archive's file name.

    Any archive left over from a previous call is deleted first. The archive
    is written to the current working directory as ``repo_package.zip``.
    """
    archive_base = "repo_package"
    archive_name = archive_base + ".zip"

    # Start from a clean slate so a stale archive is never served.
    already_there = os.path.exists(archive_name)
    if already_there:
        os.remove(archive_name)

    # make_archive appends the ".zip" extension to the base name itself.
    shutil.make_archive(archive_base, "zip", "repo")
    return archive_name



# Build the Gradio UI. Components appear in the order they are declared, so the
# statement order below is also the on-screen layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Meeting Transcription + Action Item Extractor (Colab)")
    # Inputs: the audio upload and an optional language hint, side by side.
    with gr.Row():
        # type="filepath": the handler receives the uploaded file as a path.
        audio_in = gr.Audio(type="filepath", label="Upload meeting audio (wav/mp3/m4a/ogg)")
        lang = gr.Textbox(value="", label="Language (optional, e.g. 'en')")
    run_btn = gr.Button("🚀 Transcribe & Summarize")

    # Outputs: one tab for each of the four values returned by handle_upload.
    with gr.Tabs():
        with gr.Tab("Transcript"):
            out_transcript = gr.Textbox(label="Full transcript", lines=12)
        with gr.Tab("Summary"):
            out_summary = gr.Textbox(label="Concise summary", lines=6)
        with gr.Tab("Decisions & Actions (JSON)"):
            out_json = gr.Textbox(label="Extracted JSON", lines=12)
        with gr.Tab("Decisions & Actions (Pretty)"):
            out_pretty = gr.Markdown()

    # The order of `outputs` must match the order of handle_upload's return tuple.
    run_btn.click(
        fn=handle_upload,
        inputs=[audio_in, lang],
        outputs=[out_transcript, out_summary, out_json, out_pretty]
    )

    # Secondary action: zip the generated repo/ directory and offer it for download.
    gr.Markdown("---")
    gr.Markdown("### 💾 Download your project files")
    download_btn = gr.Button("📦 Download repo (requirements + README)")
    download_output = gr.File(label="Download ZIP")
    download_btn.click(fn=package_repo, inputs=None, outputs=download_output)




# share=False: no public link is created (see the launch message in the output).
demo.launch(share=False)





Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

