# Audio Transcription with Speaker Diarization

This notebook combines Whisper for transcription and Pyannote for speaker diarization to create a complete transcript with speaker identification.

In [None]:
# Required imports
import whisper
from pyannote.audio import Pipeline
import torch
from tqdm import tqdm
import os 
from dotenv import load_dotenv

## Configuration
Set up your Hugging Face authentication token. It is read from the `HF_TOKEN` environment variable, which can be defined in a local `.env` file.

In [None]:
# Populate the process environment from a local .env file, if one is present
load_dotenv()

# The Hugging Face token is mandatory: stop right away when it is absent or empty
HF_TOKEN = os.environ.get('HF_TOKEN')
if HF_TOKEN is None or HF_TOKEN == "":
    raise ValueError("HF_TOKEN not found in environment variables. Please set it up first.")

## Model Initialization
Load both Whisper and Pyannote models with progress tracking

In [None]:
%%time
# Load the Whisper transcription model and the Pyannote diarization pipeline,
# advancing a two-step progress bar after each one.
print("Loading models...")
with tqdm(total=2, desc="Loading models") as pbar:
    whisper_model = whisper.load_model("large-v3")
    pbar.update(1)
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
                                       use_auth_token=HF_TOKEN)
    # pyannote.audio 3.x prints a hint and returns None (instead of raising)
    # when the gated model cannot be downloaded, e.g. because the token is
    # invalid or the model's user conditions were not accepted. Fail here with
    # a clear message rather than later with an AttributeError on `pipeline.to`.
    if pipeline is None:
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization-3.1'. Check that HF_TOKEN "
            "is valid and that the model's user conditions have been accepted on "
            "Hugging Face."
        )
    pbar.update(1)
print("Models loaded successfully!")

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # Report the name of the GPU at index 0
    device_name = torch.cuda.get_device_name(0)
    print(f"Pipeline running on: {device} - {device_name}")
else:
    print(f"Pipeline running on: {device}")

# Move the diarization pipeline onto the selected device
pipeline = pipeline.to(torch.device(device))

## Process Audio File
Load the audio file and run speaker diarization on it; transcription happens in the next section.

In [None]:
%%time
# Location of the recording to analyse
AUDIO_FILE = "Audio/audio.wav"

# Decode the recording with Whisper's audio loader
audio = whisper.load_audio(AUDIO_FILE)

# Run the diarization pipeline directly on the file path
print("Processing speaker diarization...")
diarization = pipeline(AUDIO_FILE)

# Materialise every (turn, track, speaker) triple so later cells can re-iterate
diarization_list = [*diarization.itertracks(yield_label=True)]

# Show one line per speaker turn
print("\nSpeaker segments:")
for turn, _, speaker in diarization_list:
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

## Transcribe Audio
Use Whisper to transcribe the audio content

In [None]:
%%time
# Ensure the Transcription directory exists
os.makedirs("Transcription", exist_ok=True)

# Transcribe the waveform that was already decoded into `audio` by
# whisper.load_audio(AUDIO_FILE). Passing the file path instead would make
# Whisper decode the same file a second time.
print("Transcribing audio...")
result = whisper_model.transcribe(audio)

# Save raw transcription (the full text, without timestamps or speakers)
output_path = os.path.join("Transcription", "transcription.txt")
with open(output_path, "w", encoding='utf-8') as f:
    f.write(result['text'])

print(f"Raw transcription saved to '{output_path}'")

## Combine Transcription with Speaker Information
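Merge the transcription with speaker identification: each transcribed segment is labelled with the speaker whose diarization turn overlaps it the most.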

In [None]:
def _dominant_speaker(segment_start, segment_end, turns, fallback="UNKNOWN"):
    """Return the speaker of the turn that overlaps a segment the most.

    Args:
        segment_start: Start time of the transcribed segment, in seconds.
        segment_end: End time of the transcribed segment, in seconds.
        turns: Iterable of ``(turn, track, speaker)`` triples, where ``turn``
            exposes ``start`` and ``end`` attributes.
        fallback: Label returned when no turn overlaps the segment at all.

    Returns:
        The speaker label of the turn with the largest positive overlap (the
        first such turn wins on ties), or ``fallback`` when nothing overlaps.
    """
    best_overlap = 0
    best_speaker = fallback
    for turn, _, speaker in turns:
        # A negative value means the two intervals are disjoint; it can never
        # beat the initial best of 0, so no explicit clamping is needed.
        overlap = min(segment_end, turn.end) - max(segment_start, turn.start)
        if overlap > best_overlap:
            best_overlap = overlap
            best_speaker = speaker
    return best_speaker


print("Combining transcription with speaker information...")

output_path = os.path.join("Transcription", "transcription_with_speakers.txt")
with open(output_path, 'w', encoding='utf-8') as f:
    for segment in result['segments']:
        segment_start = segment['start']
        segment_end = segment['end']
        segment_text = segment['text']

        # Segments that fall entirely outside every diarization turn are
        # labelled "UNKNOWN" instead of the literal string "None".
        current_speaker = _dominant_speaker(segment_start, segment_end, diarization_list)

        # Format and write the line; it is echoed to the notebook output as well
        start_time = f"{segment_start:.1f}s"
        end_time = f"{segment_end:.1f}s"
        line = f"[{start_time} -> {end_time}] {current_speaker}: {segment_text.strip()}\n"
        print(line.strip())
        f.write(line)

print(f"\nTranscription with speaker identification saved to '{output_path}'")