In [1]:
import os

import torch
import whisper
from dotenv import load_dotenv
from pyannote.audio import Pipeline
from tqdm import tqdm


In [3]:
# Read the local .env file so its entries become visible as environment variables.
load_dotenv()

# The Hugging Face access token comes from the environment rather than being
# hardcoded in the notebook; stop right away when it is missing or empty.
HF_TOKEN = os.environ.get('HF_TOKEN')
if HF_TOKEN is None or HF_TOKEN == "":
    raise ValueError("HF_TOKEN not found in environment variables. Please set it up first.")

In [None]:

# Load the Whisper speech-to-text model and the pyannote speaker-diarization
# pipeline, advancing a two-step progress bar after each one finishes.
print("Loading models...")
with tqdm(total=2, desc="Loading models") as pbar:
    whisper_model = whisper.load_model("large-v3-turbo")
    pbar.update(1)
    # The token variable is spelled HF_TOKEN (all caps); the previous
    # `HF_token` spelling raised NameError before the pipeline could load.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HF_TOKEN,
    )
    pbar.update(1)

# NOTE(review): pyannote.audio 3.x appears to return None (instead of raising)
# when the gated model cannot be downloaded, e.g. an invalid token or
# unaccepted user conditions. Fail here with a clear message rather than with
# "'NoneType' object is not callable" in the diarization cell.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that HF_TOKEN "
        "is valid and that the model's user conditions have been accepted."
    )


In [None]:

# Run the speaker-diarization pipeline on the audio file. Later cells iterate
# the result with `itertracks(yield_label=True)` to get (turn, _, speaker) tuples.
diarization = pipeline("cutout_1.wav")


In [None]:
# List every diarized turn: its start and end time plus the speaker label.
for track in diarization.itertracks(yield_label=True):
    turn, _, speaker = track
    print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")

In [19]:
# Load the audio file through whisper.load_audio and keep the result in `audio`.
# NOTE(review): no later cell visible here reads `audio` — confirm whether this
# cell is still needed.
audio = whisper.load_audio("cutout_1.wav")  

In [None]:
# Transcribe the audio file. The path has to be passed as a string: the bare
# `cutout_1.wav` expression was evaluated as attribute access on an undefined
# name and raised NameError. Later cells read result['text'] and
# result['segments'].
result = whisper_model.transcribe("cutout_1.wav")



In [None]:
# Show the raw transcription result. Later cells read its 'text' and
# 'segments' entries, so this prints the whole structure, not just the transcript.
print(result)

In [15]:
# Save the plain transcript. The encoding is pinned to UTF-8 so non-ASCII
# characters are written the same way on every platform instead of depending
# on the locale's default codec (which can raise UnicodeEncodeError); this also
# matches the speaker-labelled export written further down.
with open("transcription.txt", "w", encoding="utf-8") as f:
    f.write(result['text'])

In [None]:
# Materialise the diarization tracks into a list once, so the merge step can
# scan them again for every transcribed segment.
print("\nProcessing final results...")
diarization_list = [track for track in diarization.itertracks(yield_label=True)]

In [None]:
# Label every transcribed segment in result['segments'] with the diarized
# speaker whose turn overlaps it the longest, then print each line and save it
# to a UTF-8 text file.

# Label written when no diarization turn overlaps a segment at all. Previously
# such segments were written with the literal speaker name "None".
UNKNOWN_SPEAKER = "UNKNOWN"


def _dominant_speaker(segment_start, segment_end, tracks, default=UNKNOWN_SPEAKER):
    """Return the label of the track that overlaps a time span the longest.

    Args:
        segment_start: Start of the span.
        segment_end: End of the span.
        tracks: Iterable of ``(turn, _, label)`` tuples in which ``turn``
            exposes ``start`` and ``end`` on the same time scale as the span.
        default: Label returned when no track overlaps the span.

    Returns:
        The label with the largest strictly positive overlap; when several
        tracks tie, the first one encountered wins. ``default`` is returned
        when nothing overlaps.
    """
    best_label, best_overlap = default, 0
    for turn, _, label in tracks:
        # A negative value means the two spans are disjoint; it can never beat
        # best_overlap, which starts at 0, so no clamping is required.
        overlap = min(segment_end, turn.end) - max(segment_start, turn.start)
        if overlap > best_overlap:
            best_label, best_overlap = label, overlap
    return best_label


with open('transcription_with_speakers.txt', 'w', encoding='utf-8') as f:
    for segment in result['segments']:
        segment_start = segment['start']
        segment_end = segment['end']
        current_speaker = _dominant_speaker(segment_start, segment_end, diarization_list)

        # One line per segment: "[start -> end] speaker: text".
        line = (
            f"[{segment_start:.1f}s -> {segment_end:.1f}s] "
            f"{current_speaker}: {segment['text'].strip()}\n"
        )
        print(line.strip())
        f.write(line)