In [None]:
!pip install vosk pydub openai-whisper

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=0dd39d096e14899f34e5f2f65a1f6c31813a534bb6e1104ed8cd11c10ec650b5
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d126cba375b15898010b6cc28578d8afdde5869
Successfully built openai-whisper
Installing collected packages: openai-whisper
Successfully installe

In [1]:
import json
from vosk import Model, KaldiRecognizer
import wave
import os
from pydub import AudioSegment
from collections import defaultdict
import tempfile


In [None]:

def convert_mp3_to_wav(mp3_path, sample_rate=16000):
    """
    Convert an MP3 file into a temporary WAV file for Vosk.

    The audio is downmixed to mono, resampled to ``sample_rate`` and stored
    with a 2-byte (16-bit) sample width before being exported.

    Args:
        mp3_path: Path to the source MP3 file.
        sample_rate: Target frame rate in Hz.

    Returns:
        Path of the temporary WAV file, or ``None`` when conversion failed
        (the temporary file is removed in that case). On success the caller
        owns the file and is responsible for deleting it.
    """
    print(f"Конвертируем {mp3_path} в WAV...")

    # Reserve a unique .wav path; leaving the `with` closes the handle so the
    # exporter can reopen the file by name. delete=False keeps it on disk.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as placeholder:
        wav_path = placeholder.name

    try:
        prepared = (
            AudioSegment.from_mp3(mp3_path)
            .set_channels(1)               # mono
            .set_frame_rate(sample_rate)   # 16 kHz by default
            .set_sample_width(2)           # 16-bit PCM
        )
        prepared.export(wav_path, format="wav")
        print("Конвертация завершена")
        return wav_path

    except Exception as err:
        # Best effort: report the problem, drop the placeholder file and
        # signal failure through the return value.
        print(f"Ошибка конвертации: {err}")
        os.unlink(wav_path)
        return None

def transcribe_with_vosk(audio_path, model_path="vosk-model-ru-0.42"):
    """
    Transcribe a WAV file with Vosk and return the raw recognizer results.

    If ``model_path`` does not exist, the ``vosk-model-ru-0.42`` archive is
    downloaded and unpacked into the current working directory before the
    model is loaded.

    Args:
        audio_path: Path to the WAV file to recognize.
        model_path: Directory holding the Vosk model.

    Returns:
        A list of dicts parsed from the recognizer's JSON output: one entry
        for each chunk the recognizer reported as complete, followed by the
        final result. Word-level output is requested via ``SetWords(True)``.
    """

    # Download the model when it is missing.
    # NOTE(review): the download always fetches vosk-model-ru-0.42, whatever
    # `model_path` is; a custom path that does not exist will therefore still
    # fail in Model() below.
    if not os.path.exists(model_path):
        print("Модель не найдена. Скачиваем...")
        import urllib.request
        import zipfile

        model_url = "https://alphacephei.com/vosk/models/vosk-model-ru-0.42.zip"
        zip_path = "vosk-model-ru-0.42.zip"

        urllib.request.urlretrieve(model_url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")
        os.remove(zip_path)
        print("Модель скачана и распакована")

    model = Model(model_path)

    results = []

    # The context manager guarantees the wave file is closed even if the
    # recognizer raises part-way through the stream.
    with wave.open(audio_path, 'rb') as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Предупреждение: аудио не в идеальном формате, но попробуем обработать...")

        # Create the recognizer at the file's own frame rate.
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)

        print("Идет распознавание...")

        # Feed the audio in 4000-frame chunks until the file is exhausted.
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result()))

        # Collect whatever the recognizer still holds after the last chunk.
        results.append(json.loads(rec.FinalResult()))

    return results

def simple_speaker_segmentation(transcription_results, num_speakers=4, pause_threshold=1.5):
    """
    Assign speaker labels to recognized words using pauses between words.

    This is a heuristic, not real diarization: every time the gap between
    the end of the previous word and the start of the next one exceeds
    ``pause_threshold`` seconds, the label advances to the next speaker,
    wrapping around after ``num_speakers``.

    Args:
        transcription_results: Iterable of result dicts. Entries without a
            ``'result'`` key are skipped; the others must hold a list of
            word dicts with ``'start'``, ``'end'`` and ``'word'`` keys.
        num_speakers: Number of speaker labels to cycle through (>= 1).
        pause_threshold: Minimum silence, in seconds, that triggers a
            speaker change.

    Returns:
        A list of dicts with ``'start'``, ``'end'``, ``'text'`` and
        ``'speaker'`` (``"SPEAKER_00"``, ``"SPEAKER_01"``, ...) per word.

    Raises:
        ValueError: If ``num_speakers`` is smaller than 1.
    """
    if num_speakers < 1:
        raise ValueError("num_speakers must be at least 1")

    segments = []
    current_speaker = 0
    # None until the first word is seen, so that silence at the very start of
    # the recording is not mistaken for a pause between two speakers.
    last_end_time = None

    for result in transcription_results:
        if 'result' not in result:
            continue

        for word_info in result['result']:
            start = word_info['start']
            end = word_info['end']
            word = word_info['word']

            # A long enough pause after the previous word switches speaker.
            if last_end_time is not None and start - last_end_time > pause_threshold:
                current_speaker = (current_speaker + 1) % num_speakers

            segments.append({
                'start': start,
                'end': end,
                'text': word,
                'speaker': f"SPEAKER_{current_speaker:02d}"
            })
            last_end_time = end

    return segments

def group_segments_by_speaker(segments, time_threshold=2.0):
    """
    Merge consecutive segments of the same speaker into longer phrases.

    A segment is appended to the previous group when it has the same
    ``'speaker'`` and starts less than ``time_threshold`` seconds after that
    group ends; otherwise it opens a new group.

    Args:
        segments: Iterable of dicts with ``'speaker'``, ``'text'``,
            ``'start'`` and ``'end'`` keys, in chronological order.
        time_threshold: Maximum gap, in seconds, bridged inside one group.

    Returns:
        A list of new dicts with the same four keys; texts of merged
        segments are joined with single spaces. The input dicts are not
        modified.
    """
    merged = []

    for piece in segments:
        previous = merged[-1] if merged else None
        continues_previous = (
            previous is not None
            and previous['speaker'] == piece['speaker']
            and piece['start'] - previous['end'] < time_threshold
        )

        if continues_previous:
            # Same speaker, small gap: extend the open group.
            previous['text'] += " " + piece['text']
            previous['end'] = piece['end']
        else:
            # First segment, speaker change or long gap: start a new group.
            merged.append({
                'speaker': piece['speaker'],
                'text': piece['text'],
                'start': piece['start'],
                'end': piece['end'],
            })

    return merged

def format_time(seconds):
    """
    Format a duration in seconds as ``MM:SS.mmm``.

    Minutes are not wrapped into hours, so durations of an hour or more are
    shown with a minute count of 60 or above.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted string, e.g. ``"01:15.500"`` for ``75.5``.
    """
    minutes, remainder = divmod(seconds, 60)
    minutes = int(minutes)
    # The format spec rounds to milliseconds, so a remainder such as 59.9996
    # would be printed as "60.000"; carry it into the minutes instead.
    if round(remainder, 3) >= 60:
        minutes += 1
        remainder = 0.0
    return f"{minutes:02d}:{remainder:06.3f}"

def save_results(segments, output_file="transcription.txt"):
    """
    Write grouped segments to a UTF-8 text file.

    Each segment is written as a ``[speaker] start-end`` header line, the
    segment text, a line of 50 dashes and an empty line.

    Args:
        segments: Iterable of dicts with ``'speaker'``, ``'start'``,
            ``'end'`` and ``'text'`` keys.
        output_file: Destination path; an existing file is overwritten.
    """
    divider = "-" * 50

    with open(output_file, 'w', encoding='utf-8') as out:
        for entry in segments:
            time_span = f"{format_time(entry['start'])}-{format_time(entry['end'])}"
            out.write(
                f"[{entry['speaker']}] {time_span}\n"
                f"{entry['text']}\n"
                f"{divider}\n\n"
            )

    print(f"Результаты сохранены в {output_file}")

# Основная функция
def transcribe_mp3_with_speakers(mp3_path, num_speakers=4):
    """
    Transcribe an MP3 file and split the text between speakers.

    Pipeline: convert the MP3 to a temporary WAV, run Vosk recognition,
    label words with speakers by pauses, then merge the words into phrases.
    The temporary WAV file is removed whether or not recognition succeeds.

    Args:
        mp3_path: Path to the MP3 file.
        num_speakers: Number of speaker labels to cycle through.

    Returns:
        The list of grouped segments, or ``None`` if the MP3 could not be
        converted.
    """
    print("Начинаем обработку MP3 файла...")

    temp_wav = convert_mp3_to_wav(mp3_path)
    if not temp_wav:
        print("Ошибка конвертации!")
        return None

    try:
        labelled_words = simple_speaker_segmentation(
            transcribe_with_vosk(temp_wav),
            num_speakers,
        )
        return group_segments_by_speaker(labelled_words)
    finally:
        # Clean up the intermediate WAV on both success and failure.
        if os.path.exists(temp_wav):
            os.unlink(temp_wav)
            print("Временный файл удален")

if __name__ == "__main__":
    # Input recording; expected next to the notebook.
    mp3_file = "32140-312ds213-91094-2134.mp3"

    if os.path.exists(mp3_file):
        # Run the full transcription pipeline.
        transcription = transcribe_mp3_with_speakers(mp3_file, num_speakers=4)

        if transcription:
            # Report header, then one "[speaker] text" / time-range pair per
            # phrase, each followed by an empty line.
            print("\n" + "=" * 60, "РЕЗУЛЬТАТЫ ТРАНСКРИБАЦИИ:", "=" * 60, sep="\n")

            for segment in transcription:
                print(
                    f"[{segment['speaker']}] {segment['text']}",
                    f"Время: {format_time(segment['start'])} - {format_time(segment['end'])}\n",
                    sep="\n",
                )

            # Persist the same phrases to a text file.
            save_results(transcription, "transcription_result.txt")

            print("Обработка завершена успешно!")
    else:
        print(f"Файл {mp3_file} не найден!")
        print("Убедитесь, что файл существует в той же папке")

Начинаем обработку MP3 файла...
Конвертируем 32140-312ds213-91094-2134.mp3 в WAV...
Конвертация завершена


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 1 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 2 orphan components.
LOG (VoskAPI:Collapse():nnet-utils.cc:1488) Added 1 components, removed 2
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from vosk-model-ru-0.42/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from vosk-model-ru-0.42/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:297) Loading words from vosk-model-ru-0.42/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:308) Loading winfo vosk-model-ru-0.42/graph/phones/word_boundary.int
LO