<a href="https://colab.research.google.com/github/EmreOzdemiroglu/transcription_diarization/blob/main/Transcription_Diarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Switch the runtime to GPU first: Runtime → Change runtime type, then select a GPU hardware accelerator.
- The transcript is saved to the Colab file browser; click the folder icon in the left-hand menu to find it.


In [1]:
!pip install -q git+https://github.com/openai/whisper.git > /dev/null
!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null
!pip install torch numpy scikit-learn wave pydub
!apt-get install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [2]:
from google.colab import files

# Open the Colab upload widget; the result is presumably a mapping keyed by
# the saved file name (verify against the google.colab documentation).
uploaded = files.upload()

# Fail with a readable message instead of a bare StopIteration when the
# upload dialog was cancelled or returned nothing.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose an audio file.")

# Only the first uploaded entry is processed by the rest of the notebook.
path = next(iter(uploaded))

Saving videoplayback.mp3 to videoplayback (4).mp3


In [3]:
import subprocess
import whisper
import datetime
import torch
import pyannote.audio
from pydub import AudioSegment
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [4]:
def convert_to_wav(file_path):
    """Return the path of a WAV version of ``file_path``, converting if needed.

    Args:
        file_path (str): Path to the input audio file.

    Returns:
        str: ``file_path`` itself when it already has a ``.wav`` extension
        (compared case-insensitively), otherwise ``'audio.wav'`` in the
        current working directory, freshly written by ffmpeg.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status,
            so a failed conversion is not mistaken for a usable ``audio.wav``.
        FileNotFoundError: If the ``ffmpeg`` executable is not available.
    """
    # Require the dot and ignore case so 'CLIP.WAV' is accepted and a name
    # that merely ends in the letters "wav" is not.
    if file_path.lower().endswith('.wav'):
        return file_path

    # '-y' is placed before the inputs/outputs so it is not a trailing option;
    # it lets ffmpeg overwrite an 'audio.wav' left over from a previous run.
    subprocess.run(['ffmpeg', '-y', '-i', file_path, 'audio.wav'], check=True)
    return 'audio.wav'
def match_audio_properties(path, target_sample_rate, target_channels):
    """Resample and re-channel the audio file at ``path``, overwriting it in place.

    Args:
        path (str): Path to the audio file; the file is replaced by the
            converted audio.
        target_sample_rate (int): Frame rate passed to ``set_frame_rate``.
        target_channels (int): Channel count passed to ``set_channels``.

    Returns:
        None. The converted audio is written back to ``path`` and a
        confirmation message is printed.
    """
    # from_file does not force a decoder, so uploads that are not MP3 can be
    # read as well (the original always decoded as MP3).
    audio = AudioSegment.from_file(path)
    audio = audio.set_frame_rate(target_sample_rate)
    audio = audio.set_channels(target_channels)

    # Keep WAV files as WAV: writing MP3 data into a '.wav' path would later be
    # treated as a real WAV file and fail to open. Every other extension keeps
    # the original MP3 export.
    export_format = "wav" if path.lower().endswith(".wav") else "mp3"
    audio.export(path, format=export_format)

    # User-facing message (Turkish): "New audio file {path} was created."
    print(f"Yeni ses dosyası {path} oluşturuldu.")


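# Call the function to adjust the audio file and save the new file.
# (The block below is a disabled draft kept as a string literal; it is not executed as a function.)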
"""def convert_to_mono(file_path):
    sound = AudioSegment.from_file(input_path)
    sound = sound.set_channels(1)
    sound.export(input_path, format="wav")"""

'def convert_to_mono(file_path):\n    sound = AudioSegment.from_file(input_path)\n    sound = sound.set_channels(1)\n    sound.export(input_path, format="wav")'

In [5]:

def load_embedding_model():
    """Load the pretrained speaker-embedding model.

    Returns:
        PretrainedSpeakerEmbedding: The "speechbrain/spkrec-ecapa-voxceleb"
        model, placed on the GPU when CUDA is available and on the CPU
        otherwise.
    """
    # Fall back to CPU so the notebook still runs (slowly) when the runtime
    # was not switched to GPU, instead of failing on the device request.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return PretrainedSpeakerEmbedding(
        "speechbrain/spkrec-ecapa-voxceleb",
        device=device
    )

def get_file_duration(file_path):
    """Compute the length of a WAV file in seconds.

    Args:
        file_path (str): Path to a WAV file readable by the ``wave`` module.

    Returns:
        float: Number of frames divided by the frame rate.
    """
    # The reader object is its own context manager and is closed on exit.
    with wave.open(file_path, 'rb') as wav_reader:
        frame_count = wav_reader.getnframes()
        frame_rate = float(wav_reader.getframerate())
    return frame_count / frame_rate

def segment_embedding(audio_path, segment, duration, embedding_model):
    """Compute the speaker embedding of one transcript segment.

    Args:
        audio_path (str): Path to the audio file the segment belongs to.
        segment (dict): Mapping with "start" and "end" entries.
        duration (float): Total audio duration; the segment end is capped at
            this value.
        embedding_model: Callable applied to the cropped waveform.

    Returns:
        Whatever ``embedding_model`` returns for the cropped waveform with an
        extra leading axis added.
    """
    # Cap the end so the crop never reaches past the end of the file.
    clip = Segment(segment["start"], min(duration, segment["end"]))
    cropped = Audio().crop(audio_path, clip)
    waveform = cropped[0]
    # Indexing with None prepends an axis before the model is called.
    batched_waveform = waveform[None]
    return embedding_model(batched_waveform)

def get_embeddings(audio_path, segments, duration, embedding_model):
    """Build the matrix of per-segment speaker embeddings.

    Args:
        audio_path (str): Path to the audio file.
        segments: Sized iterable of segment mappings.
        duration (float): Total audio duration, forwarded to
            ``segment_embedding``.
        embedding_model: Model forwarded to ``segment_embedding``.

    Returns:
        numpy.ndarray: Array of shape ``(len(segments), 192)`` with one row
        per segment; NaNs are replaced via ``np.nan_to_num``.
    """
    segment_count = len(segments)
    # 192 is the row width allocated for each embedding.
    embedding_matrix = np.zeros(shape=(segment_count, 192))
    for row, current_segment in enumerate(segments):
        row_vector = segment_embedding(audio_path, current_segment, duration, embedding_model)
        embedding_matrix[row] = row_vector
    cleaned = np.nan_to_num(embedding_matrix)
    return cleaned


In [6]:

def write_transcript(segments, output_path="transcript.txt"):
    """Write a speaker-labelled transcript to a text file.

    A header line "<speaker> <h:mm:ss>" is written whenever the speaker
    changes (and before the first segment), followed by the text of the
    consecutive segments of that speaker separated by single spaces.

    Args:
        segments (list[dict]): Segments with "speaker", "start" (seconds) and
            "text" entries.
        output_path (str): Destination file; defaults to "transcript.txt",
            the path the original implementation always used.

    Returns:
        None.
    """
    # UTF-8 is set explicitly so non-ASCII transcripts do not depend on the
    # platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        previous_speaker = None
        for index, segment in enumerate(segments):
            speaker = segment["speaker"]
            if index == 0 or previous_speaker != speaker:
                timestamp = datetime.timedelta(seconds=round(segment["start"]))
                f.write("\n" + speaker + ' ' + str(timestamp) + '\n')
            text = segment["text"]
            # The original dropped the first character unconditionally,
            # presumably to remove a leading space; only drop it when it
            # really is a space so no actual text is lost.
            if text.startswith(' '):
                text = text[1:]
            f.write(text + ' ')
            previous_speaker = speaker

def transcribe_audio(path: str, num_speakers: int = 2, model_name: str = 'large',
                     sample_rate: int = 8000, channels: int = 1) -> str:
    """
    Transcribe an audio file, label each segment with a speaker, and write
    the result to a text file.

    Side effects: the file at ``path`` is overwritten by
    ``match_audio_properties``, and non-WAV input is converted to
    ``audio.wav`` by ``convert_to_wav``. The transcript name is derived from
    the WAV path, so converted input yields ``audio_transcription.txt``.

    Args:
    - path (str): Path to the audio file.
    - num_speakers (int): Number of speaker clusters to form (default 2).
    - model_name (str): Whisper model name passed to ``whisper.load_model``.
    - sample_rate (int): Frame rate the input is resampled to before
      processing.
      NOTE(review): 8000 is kept as the original default — confirm it suits
      the transcription and embedding models.
    - channels (int): Channel count the input is converted to.

    Returns:
    - str: Path to the transcript file.
    """
    # Normalise the sample rate / channel count of the input (in place)
    match_audio_properties(path, sample_rate, channels)

    # Convert the audio file to WAV format if required
    path = convert_to_wav(path)

    # Load the Whisper model and transcribe the audio
    model = whisper.load_model(model_name)
    result = model.transcribe(path)
    segments = result["segments"]

    if len(segments) >= 2:
        # Embed every segment, then cluster the embeddings into speakers
        duration = get_file_duration(path)
        embedding_model = load_embedding_model()
        embeddings = get_embeddings(path, segments, duration, embedding_model)

        # Never ask for more clusters than there are segments
        n_clusters = min(num_speakers, len(segments))
        labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_
    else:
        # Clustering needs at least two samples; with zero or one segment
        # everything belongs to the first speaker.
        labels = [0] * len(segments)

    for segment, label in zip(segments, labels):
        segment["speaker"] = 'SPEAKER ' + str(label + 1)

    # Write the transcript to a file (UTF-8 so non-ASCII text is preserved)
    transcript_file = path.rsplit('.', 1)[0] + '_transcription.txt'
    with open(transcript_file, "w", encoding="utf-8") as f:
        for (i, segment) in enumerate(segments):
            # Emit a "<speaker> <h:mm:ss>" header whenever the speaker changes
            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
                f.write("\n" + segment["speaker"] + ' ' + str(datetime.timedelta(seconds=round(segment["start"]))) + '\n')
            text = segment["text"]
            # Drop only a real leading space; the original removed the first
            # character unconditionally, which could cut off actual text.
            if text.startswith(' '):
                text = text[1:]
            f.write(text + ' ')

    return transcript_file
# Run the full pipeline on the uploaded file; the returned transcript path is
# displayed as the cell output.
transcribe_audio(path)

Yeni ses dosyası videoplayback (4).mp3 oluşturuldu.


'audio_transcription.txt'