In [1]:
import whisperx
import gc 
import torch
import os

In [2]:
# Show whether PyTorch can use CUDA here; useful when choosing `device` ("cpu" vs "cuda") in the params cell.
torch.cuda.is_available()

False

# Params

In [88]:
# --- Inference settings ---
device = "cpu"  # either "cpu" or "cuda"
compute_type = "int8"  # "int8" when GPU memory is tight (may cost some accuracy); otherwise "float16"
batch_size = 16  # lower this value if GPU memory runs out

# --- Input selection: sub-folder under the data directory, and the recording's file name ---
folder, audio = "OBE1", "Id 13.m4a"

# Run

In [87]:
# Define paths for data and audio.
# NOTE: `audio` must still be the file-name string set in the params cell. The transcription
# cell later rebinds `audio` to the loaded audio data, so re-run the params cell before
# re-running this one.
data_path = os.path.abspath(os.path.join("..", "data"))  # Absolute path to the data directory
audio_path = os.path.join(data_path, folder, audio)  # Full path to the audio file

output_dir = os.path.join("outputs", folder)  # Use the same structure as the input for output
os.makedirs(output_dir, exist_ok=True)
# Identifier used to name the exported files: "<folder>/<file name>" without the extension,
# with spaces replaced by underscores (so it still contains the folder as a path prefix).
audio_id = os.path.splitext(os.path.join(folder, audio))[0].replace(" ", "_")

# Hugging Face token for Pyannote (for diarization).
# SECURITY: never commit an access token to a notebook. The token is read from the
# HF_TOKEN environment variable instead; any token that was previously hardcoded here
# should be considered leaked and revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    # Do not abort: transcription and alignment do not use the token. Diarization
    # will presumably fail to authenticate without it (TODO confirm for cached logins).
    print("Warning: HF_TOKEN environment variable is not set; the diarization step needs a Hugging Face token.")

In [15]:
# 1. Transcribe with original whisper (batched)
#
# Build the "large-v2" ASR model on the configured device, fixed to English.
# Optional: to save the model to a local path, additionally pass
# download_root="./models/" to whisperx.load_model.
model = whisperx.load_model(
    "large-v2",
    device,
    compute_type=compute_type,
    language="en",
)

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\david\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.


In [49]:
# Load the recording and run batched transcription with the ASR model.
# NOTE: this rebinds `audio` from the file-name string (params cell) to the loaded audio
# data; the alignment and diarization cells rely on this rebinding.
audio = whisperx.load_audio(audio_path)
transcript = model.transcribe(audio, batch_size=batch_size)
print(transcript["segments"]) # before alignment

# delete model if low on GPU resources
# The reference has to be dropped *before* collecting / flushing the CUDA cache,
# otherwise the model is still alive when those calls run and nothing is reclaimed.
# After this cell `model` no longer exists: reload it before transcribing again.
del model
gc.collect()
torch.cuda.empty_cache()

[{'text': " But yeah, from my experience, it wasn't just like calmness and serenity. There's also a bit of instability and uncertainty. I thought some of the questions were very interesting, like some of the words you used, like torment, like not knowing what's going to happen, uncertainty, I could relate to some of those things as well. You mean the last question there? The one I just completed, some of the words that were used were interesting, but I could relate to them a lot.", 'start': 0.009, 'end': 23.797}, {'text': " I guess it's like a fear of your own bodily sensations and your thoughts and like a fear of being overwhelmed by them and that it's too much and that it's something you won't be able to handle.", 'start': 25.725, 'end': 45.367}, {'text': " Seeing as this is something I don't practice a lot, you know, so like really tapping into like my bodily sensations and in a non-judgmental way and not being scared of them and really embracing them is like quite unfamiliar to me.

In [50]:
# 2. Align whisper output

# Load the alignment model for the language recorded in the transcript, then align the
# transcribed segments against the audio (character-level alignments are not requested).
model_a, metadata = whisperx.load_align_model(language_code=transcript["language"], device=device)
transcript_aligned = whisperx.align(transcript["segments"], model_a, metadata, audio, device, return_char_alignments=False)
print(transcript_aligned["segments"]) # after alignment

# delete model if low on GPU resources
# Drop the reference first so the garbage collector and the CUDA cache flush can
# actually release the alignment model; the original order ran them while it was alive.
del model_a
gc.collect()
torch.cuda.empty_cache()

[{'start': 0.269, 'end': 3.59, 'text': " But yeah, from my experience, it wasn't just like calmness and serenity.", 'words': [{'word': 'But', 'start': 0.269, 'end': 0.389, 'score': 0.403}, {'word': 'yeah,', 'start': 0.489, 'end': 0.769, 'score': 0.492}, {'word': 'from', 'start': 0.809, 'end': 1.049, 'score': 0.344}, {'word': 'my', 'start': 1.069, 'end': 1.209, 'score': 0.954}, {'word': 'experience,', 'start': 1.229, 'end': 1.59, 'score': 0.339}, {'word': 'it', 'start': 1.65, 'end': 1.73, 'score': 0.3}, {'word': "wasn't", 'start': 1.77, 'end': 2.03, 'score': 0.326}, {'word': 'just', 'start': 2.13, 'end': 2.39, 'score': 0.559}, {'word': 'like', 'start': 2.41, 'end': 2.55, 'score': 0.279}, {'word': 'calmness', 'start': 2.61, 'end': 3.05, 'score': 0.49}, {'word': 'and', 'start': 3.11, 'end': 3.23, 'score': 0.635}, {'word': 'serenity.', 'start': 3.25, 'end': 3.59, 'score': 0.283}]}, {'start': 4.531, 'end': 7.692, 'text': "There's also a bit of instability and uncertainty.", 'words': [{'word

In [51]:
# 3. Assign speaker labels

# Create the diarization pipeline on the configured device, authenticating with HF_TOKEN.
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=HF_TOKEN,
    device=device,
)

# Run diarization with at most 5 speakers. If the speaker count is known, other bounds
# can be passed as well, e.g. min_speakers=2 or num_speakers=3.
diarize_segments = diarize_model(audio, max_speakers=5)

In [52]:
# Combine the diarization output with the aligned transcript.
result = whisperx.assign_word_speakers(diarize_segments, transcript_aligned)

# Print the diarization table first, then the transcript segments.
print(diarize_segments)
print(result["segments"])  # segments are now assigned speaker IDs

                               segment label     speaker       start  \
0    [ 00:00:00.008 -->  00:00:23.522]     A  SPEAKER_02    0.008489   
1    [ 00:00:07.920 -->  00:00:08.242]     B  SPEAKER_01    7.920204   
2    [ 00:00:25.679 -->  00:00:33.998]     C  SPEAKER_01   25.679117   
3    [ 00:00:32.928 -->  00:00:33.013]     D  SPEAKER_00   32.928693   
4    [ 00:00:33.013 -->  00:00:33.064]     E  SPEAKER_02   33.013582   
..                                 ...   ...         ...         ...   
105  [ 00:06:59.719 -->  00:07:09.210]    DB  SPEAKER_00  419.719864   
106  [ 00:07:05.322 -->  00:07:05.916]    DC  SPEAKER_02  425.322581   
107  [ 00:07:10.144 -->  00:07:16.001]    DD  SPEAKER_02  430.144312   
108  [ 00:07:16.001 -->  00:07:19.991]    DE  SPEAKER_01  436.001698   
109  [ 00:07:18.599 -->  00:07:18.955]    DF  SPEAKER_02  438.599321   

            end  intersection       union  
0     23.522920   -412.240080  435.894511  
1      8.242784   -427.520216  427.982796  
2  

In [89]:
# Export one "Speaker / Text" record per transcript segment.
# Build the path with os.path.join rather than a "results\{...}" literal: "\{" is an
# invalid escape sequence and the hardcoded backslash only works on Windows.
str_path = os.path.join("results", f"{audio_id}_str.txt")
# audio_id carries the input folder as a path prefix, so the nested target directory
# has to exist before the file can be opened.
os.makedirs(os.path.dirname(str_path), exist_ok=True)

# Explicit UTF-8 so the output does not depend on the platform default encoding.
with open(str_path, "w", encoding="utf-8") as file:
    # Iterate through each segment in result["segments"]
    for segment in result["segments"]:
        # Extract text and speaker information
        text = segment["text"]
        # A segment may lack a "speaker" key when no diarization turn could be matched
        # to it (NOTE(review): confirm against the installed whisperx version).
        speaker = segment.get("speaker", "UNKNOWN")

        # Write the text and speaker information to the file
        file.write(f"Speaker: {speaker}\nText: {text}\n\n")

In [91]:
# Export the transcript with consecutive segments of the same speaker merged into one turn.
# Build the path with os.path.join rather than a "results\{...}" literal: "\{" is an
# invalid escape sequence and the hardcoded backslash only works on Windows.
merged_path = os.path.join("results", f"{audio_id}_merged.txt")
# audio_id carries the input folder as a path prefix, so the nested target directory
# has to exist before the file can be opened.
os.makedirs(os.path.dirname(merged_path), exist_ok=True)

# Explicit UTF-8 so the output does not depend on the platform default encoding.
with open(merged_path, "w", encoding="utf-8") as file:
    # Initialize variables to keep track of the current speaker and their dialogue
    current_speaker = None
    current_dialogue = ""

    # Iterate through each segment in result["segments"]
    for segment in result["segments"]:
        # A segment may lack a "speaker" key when no diarization turn could be matched
        # to it (NOTE(review): confirm against the installed whisperx version).
        speaker = segment.get("speaker", "UNKNOWN")
        text = segment["text"]

        # If the current segment's speaker is the same as the previous one, append the text
        if speaker == current_speaker:
            current_dialogue += " " + text
        else:
            # If the current segment's speaker is different, write the previous speaker's dialogue to the file
            if current_speaker is not None:
                file.write(f"{current_speaker}: {current_dialogue}\n\n")

            # Update the current speaker and dialogue
            current_speaker = speaker
            current_dialogue = text

    # Write the last speaker's dialogue to the file
    if current_speaker is not None:
        file.write(f"{current_speaker}: {current_dialogue}\n\n")