The Python environment for this notebook is `audio-ai`.

In [None]:
from pathlib import Path
from pyannote.audio import Pipeline
import os
import torch

# Allow TF32 in CUDA matmul and cuDNN operations.
# NOTE(review): presumably enabled to speed up GPU inference at slightly
# reduced numeric precision — confirm this is acceptable for diarization.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

import torch

import pandas as pd

from load_pipeline import load_pipeline_from_pretrained

from to_wav import convert_to_wav

from transcribe_segments import transcribe_speaker_segments

# Build the diarization pipeline from the YAML config at PATH_TO_CONFIG.
PATH_TO_CONFIG = "models/pyannote_diarization_config.yaml"
pipeline = load_pipeline_from_pretrained(PATH_TO_CONFIG)

if pipeline is not None:
    # Prefer the GPU, but fall back to the CPU so the notebook still runs on
    # machines without CUDA instead of failing on the device move.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline.to(device)
else:
    print("Failed to load the pipeline. Please check your Hugging Face token and model access.")

The audio conversion step supports a wide range of audio file formats, including MP3, WAV, AAC, FLAC, OGG, and more. You can specify the format when importing audio and export audio to different formats.

In [None]:
# Manual inputs for this run: the folder holding the recording, the file's
# base name (without extension), and the extension/format of the source file.
audio_path = "./test_audio/"
audio_name = "UTokyo_29Sep"
input_format = "m4a"

In [None]:
# Full path of the source recording, e.g. "<audio_path><audio_name>.<ext>".
input_path = f"{audio_path}{audio_name}.{input_format}"

# Create the per-recording folder that will hold the text outputs.
summary_output_dir = Path(f"outputs/txts/{audio_name}")
summary_output_dir.mkdir(parents=True, exist_ok=True)

# Convert the recording to WAV.
# NOTE(review): output_path=None presumably makes the helper write the .wav
# next to the input, since later cells read audio_path + audio_name + ".wav"
# — confirm against convert_to_wav.
convert_to_wav(input_path, input_format, output_path=None)

In [None]:
# Run the pretrained diarization pipeline on the converted WAV file.
diarization = pipeline(f"{audio_path}{audio_name}.wav")

# One record per speaker turn yielded by the pipeline.
rows = [
    {
        "speaker": speaker,
        "duration": turn.end - turn.start,
        "start": turn.start,
        "end": turn.end,
    }
    for turn, _, speaker in diarization.itertracks(yield_label=True)
]

speaker_durations = pd.DataFrame(rows, columns=["speaker", "duration", "start", "end"])

# Uncomment the following lines to print how long each turn lasted:
# for index, row in speaker_durations.iterrows():
#     print(f"Speaker {row['speaker']} spoke for {row['duration']:.1f} seconds.")

# Keep only turns that last at least 3 seconds and renumber the rows from 0.
long_enough = speaker_durations["duration"] >= 3
speaker_durations = speaker_durations[long_enough].reset_index(drop=True)

# The fresh 0..n-1 index doubles as a unique id for each segment.
speaker_durations["seg_unique_id"] = speaker_durations.index

# Bar chart of how many retained segments each speaker has.
speaker_durations["speaker"].value_counts().plot(kind="bar", title="Speaker Count")

Use the OpenAI Whisper model to transcribe the audio to text:

In [None]:
# Transcribe the retained segments; the helper's return value replaces the
# segment table.
# NOTE(review): later cells read a 'text' column, so the helper presumably
# adds it — confirm against transcribe_speaker_segments.
speaker_durations = transcribe_speaker_segments(
    speaker_durations=speaker_durations,
    audio_wav_path=f"{audio_path}{audio_name}.wav",
    output_dir=f"./outputs/segments/{audio_name}/",
    output_csv_path=f"./outputs/csv/{audio_name}/speaking_durations.csv",
    model_name="small.en",
)

---

This is another part that you need to fill in manually:

In [None]:
# Fill in by hand: map each diarization label to the speaker's real name.
speakers_dic = {
    "SPEAKER_00": "real_name_1",
    "SPEAKER_01": "real_name_2",
    "SPEAKER_02": "real_name_3",
}

# Swap the labels for the real names; labels missing from the mapping are
# left as they are.
speaker_durations["speaker"] = speaker_durations["speaker"].replace(to_replace=speakers_dic)

In [None]:
# Show the folder the text outputs are written to (displayed as the cell result).
summary_output_dir

In [None]:
# Write the transcription to text files under summary_output_dir.
# Files are opened as UTF-8 explicitly so transcripts or speaker names with
# non-ASCII characters are written the same way on every platform instead of
# depending on the locale's default encoding.

# 1) Full transcript: one line per segment with its id, speaker and timestamps.
transcript_path = summary_output_dir / f"{audio_name}_transcribed.txt"
with open(transcript_path, "w", encoding="utf-8") as f:
    for _, row in speaker_durations.iterrows():
        f.write(f"Segment ID: {row['seg_unique_id']}, Speaker: {row['speaker']}, Start: {row['start']}, End: {row['end']}, Text: {row['text']}\n")

# 2) One file per speaker holding only that speaker's text, one segment per line.
for speaker in speaker_durations['speaker'].unique():
    speaker_segments = speaker_durations[speaker_durations['speaker'] == speaker]
    speaker_path = summary_output_dir / f"{audio_name}_speaker_{speaker}.txt"
    with open(speaker_path, "w", encoding="utf-8") as f:
        f.writelines(f"{text}\n" for text in speaker_segments['text'])

Go to [cleantxt.ipynb](./cleantxt.ipynb) to further clean the generated txt.