Sources:
- https://www.youtube.com/watch?v=fc7cAP5zrOY (Llama Summarization)
- https://www.youtube.com/watch?v=MVW746z8y_I (Transcription)  
Credit: Dwarkesh Patel (https://x.com/dwarkesh_sp/status/1579672641887408129)



# Setup

In [1]:
# Install and import packages
!pip install -q git+https://github.com/openai/whisper.git > /dev/null
!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null
!pip install -q pydub

import whisper
import datetime

import sys
import subprocess

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
from pydub import AudioSegment

import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [2]:
# Upload audio file (mp3)
from google.colab import files
uploaded = files.upload()
# `path` is the first item yielded by iterating `uploaded`; presumably the
# uploaded file's name. If several files are uploaded, only the first is used.
path = next(iter(uploaded))

Saving 00_nbp.mp3 to 00_nbp.mp3


In [3]:
# Parameters
num_speakers = 4 #@param {type:"integer"}
language = 'English' #@param ['any', 'English']
model_size = 'medium' #@param ['tiny', 'base', 'small', 'medium', 'large']
transcribe = True #@param {type:"boolean"}
summarize = True #@param {type:"boolean"}

# Checkpoint name: the chosen size, plus an '.en' suffix when the language is
# English and the size is anything other than 'large'.
model_name = model_size + ('.en' if language == 'English' and model_size != 'large' else '')

# Transcription

In [4]:
# Load transcription model
# Load `model_name` rather than `model_size`: the parameters cell appends the
# '.en' suffix to `model_name` for English audio, and loading `model_size`
# silently ignored that choice.
model = whisper.load_model(model_name)

100%|█████████████████████████████████████| 1.42G/1.42G [00:25<00:00, 59.8MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [5]:
# Transcribe audio file into segments
# `result` is kept as a whole because the summarization section reads
# result['text']; `segments` feeds the speaker-diarization cells below.
result = model.transcribe(path)
segments = result["segments"]



In [7]:
# Length of the recording in seconds (len() of the decoded audio is divided by 1000)
duration = len(AudioSegment.from_mp3(path)) / 1000.0

# Reader used by segment_embedding() to crop excerpts out of the file
audio = Audio()

# Speaker-embedding model; uses CUDA when available, otherwise the CPU
# (Runtime > Change runtime type > GPU T4)
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

def segment_embedding(segment):
  """Return the speaker embedding for one transcription segment.

  `segment` must provide "start" and "end" entries. The excerpt between them
  is cropped from the file at the global `path` with the global `audio`
  reader and passed to the global `embedding_model`.
  """
  clip_start = segment["start"]
  # Whisper overshoots the end timestamp in the last segment, so cap it at
  # the length of the recording.
  clip_end = min(duration, segment["end"])
  excerpt = Segment(clip_start, clip_end)
  waveform, _sample_rate = audio.crop(path, excerpt)
  # waveform[None] adds a leading axis before the model call
  return embedding_model(waveform[None])

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


In [8]:
# Embed the transcription segments
# One row of width 192 per segment; the width has to match what
# segment_embedding() returns.
embeddings = np.zeros((len(segments), 192))
for row, whisper_segment in enumerate(segments):
  embeddings[row] = segment_embedding(whisper_segment)

# Replace NaN/inf entries with finite numbers before clustering
embeddings = np.nan_to_num(embeddings)



In [9]:
# Create clusters for the number of speakers to diarize the transcription
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
# Tag every segment with a 1-based speaker name derived from its cluster label
for diarized_segment, cluster_label in zip(segments, labels):
  diarized_segment["speaker"] = 'SPEAKER ' + str(cluster_label + 1)

In [10]:
# Write protocol file with speaker diarization and timestamps
def time(secs):
  """Return `secs`, rounded to whole seconds, as a datetime.timedelta."""
  whole_seconds = round(secs)
  return datetime.timedelta(seconds=whole_seconds)

# The context manager closes (and flushes) transcript.txt even if the loop
# raises part-way through, e.g. on a segment that is missing a key.
with open("transcript.txt", "w") as f:
  for (i, segment) in enumerate(segments):
    # Emit a "SPEAKER n h:mm:ss" header line whenever the speaker changes
    if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
      f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
    # Strip leading whitespace instead of blindly dropping the first character,
    # so a segment that does not start with a space keeps its first letter.
    f.write(segment["text"].lstrip() + ' ')

# Summarization

In [12]:
# Only perform summarization if wanted
# NOTE(review): sys.exit() raises SystemExit inside the kernel; presumably this
# is meant to stop a "Run all" before the summarization cells — confirm that
# your runtime actually halts here.
if not summarize:
    sys.exit()

**Important:**  
Before you can use a Llama 3 model, you have to request access on Hugging Face, e.g. here for Llama 3.2: https://huggingface.co/meta-llama/Llama-3.2-3B. You then need to authenticate with a Hugging Face access token, which you can generate once you are logged in.

In [13]:
!huggingface-cli login # You are asked for HF token here. Can also enter using flag --token "yourtoken"


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `flosener` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-aut

In [14]:
# Install and import packages
!pip install -q transformers einops accelerate langchain bitsandbytes
#!pip install sentencepiece # probably deprecated with Llama3 (not sure)
#!pip install llama-stack # update existing: -U
#!llama model list # show all available models: --show-all

"""
Model IDs:
Llama-3.2-1B-Instruct
Llama-3.2-1B-Instruct-QLORA_INT4_EO8
Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8
Llama-3.2-3B-Instruct
Llama-3.2-3B-Instruct-QLORA_INT4_EO8
Llama-3.2-3B-Instruct-SpinQuant_INT4_EO8
Llama-3.1-8B-Instruct
Llama-3.1-70B-Instruct
Llama-3.1-405B-Instruct
Llama-3.1-405B-Instruct-MP16
Llama-3.1-405B-Instruct-FP8
"""

#!llama model download --source meta --model-id  Llama-3.2-3B-Instruct
!pip install langchain-community
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from transformers import AutoTokenizer
import transformers
import torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-core<0.4.0,>=0.3.17 (from langchain-community)
  Downloading langchain_core-0.3.17-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datacl

Adjusting temperature, top_k, and top_p changes how 'creative' the model's output is. The length of the response is bounded by max_length (set to 1500 in the next cell) – increasing it also drastically increases inference time.

In [16]:
# Load model, tokenizer, llm pipeline
# The model id gets its own name so that the Whisper `model` object from the
# transcription section is not overwritten by a string (which would break
# re-running the transcription cells).
llm_model_id = "meta-llama/Llama-3.2-3B-Instruct" # get access here: https://huggingface.co/meta-llama/Llama-3.2-3B
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)

# Sampling settings (temperature, top_k, top_p) are given to the transformers
# pipeline itself so that they are forwarded to generation.
# Note: max_length counts prompt tokens plus generated tokens.
pipeline = transformers.pipeline(
    "text-generation", #task
    model=llm_model_id,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=1500,
    do_sample=True,
    temperature=0.7,
    top_k=10,
    top_p=0.9,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

# temperature used to be passed here as model_kwargs={'temperature': 0.7};
# that mapping does not appear to be applied when a ready-made pipeline is
# supplied, so the setting now lives in the pipeline definition above.
llm = HuggingFacePipeline(pipeline=pipeline)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0.7})


In [18]:
# Prepare the prompt for bullet point summarization

# The transcript is substituted for {text} between the triple backticks.
template = """
              Write a detailed summary of the main topics discussed by the attendees in the meeting.
              Include at least 10 bullet points covering all significant points.
              ```{text}```
              DETAILED BULLET POINT SUMMARY:
           """

prompt = PromptTemplate(input_variables=["text"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Create summary
# The input is the full transcription text with every newline removed.
protocol = "".join(result['text'].split("\n"))
summary = llm_chain.run(protocol)
print(summary)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



              Write a detailed summary of the main topics discussed by the attendees in the meeting.
              Include at least 10 bullet points covering all significant points.
              DETAILED BULLET POINT SUMMARY:
            • The meeting was recorded, and attendees were informed that they would be recorded during the meeting.
            • There was no need to obtain permission to be recorded, as the meeting was held on university premises.
            • The recording would not raise any privacy concerns, as the meeting was held on university premises.
            • The recording would be used for future meetings to ensure consistency and accuracy.
            • A shared notes document was created for attendees to draft protocols and take notes during the meeting.
            • The meeting would be concluded by the end of the meeting, with the recording automatically stopping.
            • Attendees were asked to review the recording and provide feedback.
            •

In [20]:
# Load diarized protocol
with open('transcript.txt', 'r') as transcript_in:
    original = transcript_in.read()

# Rewrite transcript.txt with the summary first, a blank line, then the
# diarized protocol that was just read.
with open('transcript.txt', 'w') as transcript_out:
    transcript_out.write("\n\n".join((summary, original)))