In [1]:
import dotenv, json
from pathlib import Path
from stad import WhisperPipeline, DiarizationPipeline, assign_speaker_to_transcript

# Read variables from a local .env file into the environment; per the
# original author's note this is where HF_TOKEN is expected to come from.
dotenv.load_dotenv()

# Recording to process, resolved against the current working directory.
PODCAST_AUDIO_PATH = Path.cwd().joinpath(".data", "audio.wav")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Transcribe the recording. With return_timestamps="word" the displayed
# result holds one row per word, with `text`, `start` and `end` columns.
# NOTE(review): units of start/end look like seconds — confirm against stad.
with WhisperPipeline.create(
    batch_size=16,
    attn_implementation="sdpa",
    return_timestamps="word",
) as whisper:
    transcript_df = whisper(language="english", audio_path=PODCAST_AUDIO_PATH)

# Bare last expression so the notebook renders the frame.
transcript_df

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


Unnamed: 0,text,start,end
0,What's,0.04,0.30
1,up,0.30,0.44
2,everybody,0.44,0.70
3,welcome,0.70,1.00
4,to,1.00,1.22
...,...,...,...
25428,We'll,7396.76,7397.04
25429,see,7397.04,7397.16
25430,you,7397.16,7397.30
25431,there.,7397.30,7397.50


In [3]:
# Run speaker diarization over the same recording used for transcription.
# The pipeline is used as a context manager and called with the audio path;
# the displayed result has one row per segment with `label`, `speaker`,
# `start` and `end` columns.
# NOTE(review): in the recorded output some segments overlap in time
# (e.g. row B lies inside row A) — downstream speaker assignment has to
# cope with that; confirm how stad resolves overlaps.
with DiarizationPipeline() as diarization:
    speakers_df = diarization(audio_path=PODCAST_AUDIO_PATH)

# Bare last expression so the notebook renders the frame.
speakers_df

torchvision is not available - cannot save figures


Unnamed: 0,label,speaker,start,end
0,A,SPEAKER_03,0.132219,46.319094
1,B,SPEAKER_02,24.870969,25.124094
2,C,SPEAKER_03,47.567844,58.705344
3,D,SPEAKER_04,56.376594,57.540969
4,E,SPEAKER_03,59.430969,64.206594
...,...,...,...,...
3208,DSK,SPEAKER_03,7388.597844,7389.272844
3209,DSL,SPEAKER_03,7390.049094,7391.669094
3210,DSM,SPEAKER_00,7391.669094,7391.702844
3211,DSN,SPEAKER_00,7395.145344,7397.541594


In [4]:
# Attach a speaker to every transcribed word. The call's return value is
# discarded and inplace=True is passed, so transcript_df itself is what is
# displayed afterwards, now with an extra `speaker` column.
# NOTE(review): because the frame is modified in place, the frame displayed by
# the transcription cell no longer matches the live object; behaviour
# on re-running this cell is not visible from here — confirm with stad.
# NOTE(review): in the recorded output the last four words alternate between
# SPEAKER_03 and SPEAKER_00, although the last displayed diarization segment
# spanning that time range is SPEAKER_00 — worth checking the assignment logic.
assign_speaker_to_transcript(transcript_df=transcript_df, speakers_df=speakers_df, inplace=True)

transcript_df

Unnamed: 0,text,start,end,speaker
0,What's,0.04,0.30,SPEAKER_03
1,up,0.30,0.44,SPEAKER_03
2,everybody,0.44,0.70,SPEAKER_03
3,welcome,0.70,1.00,SPEAKER_03
4,to,1.00,1.22,SPEAKER_03
...,...,...,...,...
25428,We'll,7396.76,7397.04,SPEAKER_03
25429,see,7397.04,7397.16,SPEAKER_00
25430,you,7397.16,7397.30,SPEAKER_03
25431,there.,7397.30,7397.50,SPEAKER_00


In [5]:
# Persist the speaker-annotated transcript next to the audio file: same
# path with the suffix swapped to .json, written as a list of row records
# with 4-space indentation.
transcript_df.to_json(PODCAST_AUDIO_PATH.with_suffix(".json"), indent=4, orient="records")