In [36]:
import gc
import os

import whisperx

In [37]:
# Run configuration shared by the cells below.
audio_file = "./data/OBE1/Id 16.m4a"  # recording to process
device = "cpu"  # device passed to the whisperx calls
batch_size = 16  # reduce if low on GPU memory
# "float16" is the alternative precision; "int8" needs less memory but may
# reduce accuracy.
compute_type = "int8"

In [38]:
# Step 1: load the whisper model used for (batched) transcription.
# The checkpoint name is fixed; device and precision come from the config cell.
model = whisperx.load_model(
    "large-v2",
    device,
    compute_type=compute_type,
)

No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\david\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.


In [39]:
# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

In [40]:
# Load the recording, then transcribe it in batches with the model loaded above.
audio = whisperx.load_audio(audio_file)
transcript = model.transcribe(
    audio,
    batch_size=batch_size,
)

# Segments as produced by the transcription step, i.e. before alignment.
print(transcript["segments"])

Detected language: en (0.74) in first 30s of audio...
[{'text': ' uh, your body and the virtual visualization of your body. Do you have a feeling that you were separated with the body? Not at 100%, but somewhat, yes. Somewhat, you feel like, OK, your body, the body in front of you is yourself, and then you somehow feel separated. Yeah. How was the feeling? Did you feel, like, any, like, discomfort? Or you feel, like, more pleasant?', 'start': 0.111, 'end': 28.08}, {'text': " It's more, let's say, weirdness. Weirdness. It was not discophonic at all. And it was not pleasant that much. So it was like weird to see that. It's like kind of... Weird means a little bit confused or is it more like excited? More confused.", 'start': 28.08, 'end': 52.671}, {'text': ' Okay, and during the first two parts, could you follow this time better than the last time in the first section? Yes. So you could follow the body scanning part with the eyes closed, how did you feel? Could you do everything he asked

In [41]:
# delete model if low on GPU resources (drop the reference first, then collect and clear the CUDA cache)
# import torch; del model; gc.collect(); torch.cuda.empty_cache()

In [43]:
# Step 2: align the whisper output.
# The alignment model is chosen from the language recorded in the transcript.
model_a, metadata = whisperx.load_align_model(
    language_code=transcript["language"],
    device=device,
)
transcript_aligned = whisperx.align(
    transcript["segments"],
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False,
)

# Segments after alignment (each now carries a per-word "words" list).
print(transcript_aligned["segments"])

[{'start': 0.691, 'end': 5.073, 'text': ' uh, your body and the virtual visualization of your body.', 'words': [{'word': 'uh,', 'start': 0.691, 'end': 0.871, 'score': 0.684}, {'word': 'your', 'start': 1.031, 'end': 1.211, 'score': 0.896}, {'word': 'body', 'start': 1.291, 'end': 1.832, 'score': 0.945}, {'word': 'and', 'start': 2.132, 'end': 2.292, 'score': 0.855}, {'word': 'the', 'start': 2.392, 'end': 2.532, 'score': 0.846}, {'word': 'virtual', 'start': 2.672, 'end': 3.412, 'score': 0.704}, {'word': 'visualization', 'start': 3.572, 'end': 4.412, 'score': 0.659}, {'word': 'of', 'start': 4.452, 'end': 4.492, 'score': 0.598}, {'word': 'your', 'start': 4.592, 'end': 4.692, 'score': 0.201}, {'word': 'body.', 'start': 4.712, 'end': 5.073, 'score': 0.715}]}, {'start': 5.673, 'end': 8.374, 'text': 'Do you have a feeling that you were separated with the body?', 'words': [{'word': 'Do', 'start': 5.673, 'end': 5.713, 'score': 0.005}, {'word': 'you', 'start': 5.733, 'end': 5.813, 'score': 0.883}, 

In [44]:
# delete alignment model if low on GPU resources (drop the reference first, then collect and clear the CUDA cache)
# import torch; del model_a; gc.collect(); torch.cuda.empty_cache()

In [45]:
# Hugging Face access token handed to the diarization pipeline.
# It is read from the HF_TOKEN environment variable so the secret is never
# stored in (or committed with) the notebook; the value is None when the
# variable is not set.
# NOTE(review): the token that was previously hard-coded in this cell has been
# exposed and should be revoked and replaced.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [65]:
# Step 3: speaker diarization (labels are attached to the transcript in the
# next cell).
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=HF_TOKEN,
    device=device,
)

# Only an upper bound on the number of speakers is given here. If the counts
# are known, min_speakers / num_speakers can be supplied as well
# (e.g. min_speakers=2, num_speakers=3).
diarize_segments = diarize_model(
    audio,
    max_speakers=5,
)

In [66]:
# Combine the diarization result with the aligned transcript.
result = whisperx.assign_word_speakers(
    diarize_segments,
    transcript_aligned,
)

# NOTE(review): keep this print after the call above - the recorded output
# shows "intersection"/"union" columns, presumably added by that call.
print(diarize_segments)
# Segments are now assigned speaker IDs.
print(result["segments"])

# Rich display of the same segments as the cell's output value.
result["segments"]

                              segment label     speaker       start  \
0   [ 00:00:00.008 -->  00:00:04.966]     A  SPEAKER_01    0.008489   
1   [ 00:00:04.558 -->  00:00:05.067]     B  SPEAKER_00    4.558574   
2   [ 00:00:05.118 -->  00:00:08.463]     C  SPEAKER_00    5.118846   
3   [ 00:00:08.684 -->  00:00:13.030]     D  SPEAKER_01    8.684211   
4   [ 00:00:13.234 -->  00:00:13.251]     E  SPEAKER_01   13.234295   
..                                ...   ...         ...         ...   
56  [ 00:02:24.609 -->  00:02:25.135]    BE  SPEAKER_01  144.609508   
57  [ 00:02:25.220 -->  00:02:25.390]    BF  SPEAKER_01  145.220713   
58  [ 00:02:25.967 -->  00:02:26.460]    BG  SPEAKER_01  145.967742   
59  [ 00:02:27.088 -->  00:02:27.156]    BH  SPEAKER_01  147.088285   
60  [ 00:02:32.283 -->  00:02:32.775]    BI  SPEAKER_01  152.283531   

           end  intersection       union  
0     4.966044   -152.225956  157.263511  
1     5.067912   -152.124088  152.713426  
2     8.463497   -

[{'start': 0.691,
  'end': 5.073,
  'text': ' uh, your body and the virtual visualization of your body.',
  'words': [{'word': 'uh,',
    'start': 0.691,
    'end': 0.871,
    'score': 0.684,
    'speaker': 'SPEAKER_01'},
   {'word': 'your',
    'start': 1.031,
    'end': 1.211,
    'score': 0.896,
    'speaker': 'SPEAKER_01'},
   {'word': 'body',
    'start': 1.291,
    'end': 1.832,
    'score': 0.945,
    'speaker': 'SPEAKER_01'},
   {'word': 'and',
    'start': 2.132,
    'end': 2.292,
    'score': 0.855,
    'speaker': 'SPEAKER_01'},
   {'word': 'the',
    'start': 2.392,
    'end': 2.532,
    'score': 0.846,
    'speaker': 'SPEAKER_01'},
   {'word': 'virtual',
    'start': 2.672,
    'end': 3.412,
    'score': 0.704,
    'speaker': 'SPEAKER_01'},
   {'word': 'visualization',
    'start': 3.572,
    'end': 4.412,
    'score': 0.659,
    'speaker': 'SPEAKER_01'},
   {'word': 'of',
    'start': 4.452,
    'end': 4.492,
    'score': 0.598,
    'speaker': 'SPEAKER_01'},
   {'word': 'y

In [67]:
# Write one "Speaker / Text" record per segment to segments_output.txt.
# The encoding is set explicitly: without it open() uses the platform's locale
# encoding, which can fail on characters outside that code page.
with open("segments_output.txt", "w", encoding="utf-8") as file:
    for segment in result["segments"]:
        # NOTE(review): a segment is not guaranteed to carry a "speaker" key
        # (presumably when no diarization turn overlaps it) - use a placeholder
        # instead of raising KeyError halfway through the file.
        speaker = segment.get("speaker", "UNKNOWN")
        # Segment text can start with a stray space; strip it for clean output.
        text = segment["text"].strip()

        file.write(f"Speaker: {speaker}\nText: {text}\n\n")

In [68]:
# Write the transcript to merged_segments_output.txt with consecutive segments
# of the same speaker merged into a single "<speaker>: <text>" paragraph.
# The encoding is set explicitly: without it open() uses the platform's locale
# encoding, which can fail on characters outside that code page.
with open("merged_segments_output.txt", "w", encoding="utf-8") as file:
    current_speaker = None
    current_parts = []  # texts collected for the speaker turn being built

    for segment in result["segments"]:
        # NOTE(review): a segment is not guaranteed to carry a "speaker" key
        # (presumably when no diarization turn overlaps it) - use a placeholder
        # instead of raising KeyError halfway through the file.
        speaker = segment.get("speaker", "UNKNOWN")
        # Strip so that joining with a single space never yields double spaces.
        text = segment["text"].strip()

        # Same speaker as the running turn: keep accumulating.
        if current_parts and speaker == current_speaker:
            current_parts.append(text)
            continue

        # Speaker changed: flush the finished turn, then start a new one.
        if current_parts:
            dialogue = " ".join(current_parts)
            file.write(f"{current_speaker}: {dialogue}\n\n")
        current_speaker = speaker
        current_parts = [text]

    # Flush the last turn (nothing is written when there were no segments).
    if current_parts:
        dialogue = " ".join(current_parts)
        file.write(f"{current_speaker}: {dialogue}\n\n")