# Кейс: ASR (Automatic speech recognition – автоматическое распознавание речи)

## Устанавливаем необходимые библиотеки и инструменты

In [1]:
!pip install openai-whisper flask ffmpeg-python
!apt-get install ffmpeg
!pip install pyngrok

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [2]:
# Download the ngrok Linux (amd64) archive and unpack it into the current working directory.
# NOTE(review): later cells call `ngrok` via PATH and via pyngrok — confirm that this
# locally extracted binary is actually the one being used, otherwise this cell is redundant.
!wget -q -O ngrok.zip https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -o ngrok.zip

Archive:  ngrok.zip
  inflating: ngrok                   


In [3]:
!ngrok config add-authtoken 2qAfdhGGhliui7cmzQLZRwCzxRj_69brrS9x4NzVqhGovD72K

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [4]:
!pip install SpeechRecognition



In [5]:
!pip install pyannote.audio



In [6]:
!pip install websockets



## Загружаем модели и создаем сайт на Flask

In [14]:
import os
import threading
import time
import uuid

import librosa
import soundfile as sf
import torch
import whisper
from flask import Flask, request, jsonify, render_template, Response, send_file
from pyannote.audio import Pipeline

app = Flask(__name__)

# Model loading.
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Whisper checkpoint used for transcription.
whisper_model = whisper.load_model("tiny", device=device)

# The Hugging Face access token is a secret: read it from the environment instead of
# hard-coding it. NOTE: the token previously embedded here is exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "that has been granted access to pyannote/speaker-diarization."
    )

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token=hf_token)
if pipeline is None:
    # Fail with a clear message rather than an AttributeError on `.to(device)` below.
    raise RuntimeError("Could not load pyannote/speaker-diarization@2.1; check the token and model access.")
pipeline = pipeline.to(device)


def preprocess_audio(input_path, output_path):
    """Load the audio at `input_path` resampled to 16 kHz and write it as WAV to `output_path`."""
    target_rate = 16000
    samples, _ = librosa.load(input_path, sr=target_rate)
    sf.write(output_path, samples, target_rate, format='WAV')


def perform_diarization(filepath):
    """Run the diarization pipeline on `filepath`.

    Returns a list of strings of the form "<speaker>: <start>s - <end>s", one per
    track yielded by the pipeline result; each entry is also printed as it is produced.
    """
    print(f"Performing diarization on {filepath}...")
    annotation = pipeline(filepath)
    speaker_segments = []
    for segment, _track, label in annotation.itertracks(yield_label=True):
        span = f"{segment.start:.1f}s - {segment.end:.1f}s"
        print(f"Speaker {label}: {span}")
        speaker_segments.append(f"{label}: {span}")
    return speaker_segments


def generate_srt(transcription):
    """Build SRT subtitle text from the "segments" list of a transcription result.

    Every segment contributes a 1-based index line, a "start --> end" timing line
    rendered with format_time, the segment text, and a blank separator line.
    """
    blocks = []
    for number, seg in enumerate(transcription["segments"], start=1):
        timing = f"{format_time(seg['start'])} --> {format_time(seg['end'])}"
        blocks.append(f"{number}\n{timing}\n{seg['text']}\n\n")
    return "".join(blocks)


def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, as int or float.

    Returns:
        The timestamp string, rounded to the nearest millisecond. Negative
        inputs are clamped to ``00:00:00,000``.
    """
    # Convert to whole milliseconds first: truncating the fractional part separately
    # loses a millisecond to float error (1.001 -> ",000") and cannot carry into seconds.
    total_ms = max(0, int(round(seconds * 1000)))
    total_s, millis = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02}:{m:02}:{s:02},{millis:03}"


def _sse_event(payload):
    """Encode `payload` as one Server-Sent Event: a `data:` line per text line, then a blank line."""
    lines = str(payload).splitlines() or [""]
    return "".join(f"data: {line}\n" for line in lines) + "\n"


def stream_transcription(filepath):
    """Transcribe `filepath` and yield its segments as Server-Sent Events.

    The whole file is transcribed first; each segment is then yielded as an
    SRT-style entry (index, timing line, text) and the generator sleeps for the
    segment's duration before yielding the next one.

    Segments and errors are both framed as SSE `data:` lines so that the stream
    is consistent with the `text/event-stream` mimetype it is served with.
    Failures are reported as an event instead of being raised.
    """
    try:
        # Transcription runs up front; only the delivery is paced.
        result = whisper_model.transcribe(filepath, task="transcribe", language="ru", fp16=False)

        if "segments" in result:
            for idx, segment in enumerate(result["segments"]):
                start = segment['start']
                end = segment['end']
                text = segment['text']
                yield _sse_event(f"{idx + 1}\n{format_time(start)} --> {format_time(end)}\n{text}")
                # Pace delivery by segment length; guard against end < start, which would make sleep() raise.
                time.sleep(max(0.0, end - start))
        else:
            yield _sse_event("Ошибка: сегменты не найдены в результате.")
    except Exception as e:
        yield _sse_event(f"Ошибка: {e}")


@app.route('/')
def index():
    """Serve the single-page UI: an upload form, an audio player and an output panel.

    The page script submits the form to /upload with fetch() and renders the
    reply in the output panel: JSON replies are shown as text, any other
    successful reply is read incrementally as a stream (an SSE `data: ` prefix
    is stripped when present) while the audio is fetched from /audio.
    """
    # The page is a regular (non-raw) Python string, so a newline inside JavaScript
    # must be written as "\\n"; a bare "\n" would put a literal line break inside the
    # JS string literal and stop the whole script from parsing.
    return """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>ASR App</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f4f4f9;
            }
            h1 {
                text-align: center;
            }
            form {
                max-width: 400px;
                margin: 0 auto;
                padding: 20px;
                background: white;
                border: 1px solid #ddd;
                border-radius: 8px;
            }
            label, select, input, button {
                display: block;
                width: 100%;
                margin-bottom: 10px;
            }
            button {
                background-color: #007BFF;
                color: white;
                border: none;
                padding: 10px;
                border-radius: 5px;
                cursor: pointer;
            }
            button:hover {
                background-color: #0056b3;
            }
            audio {
                display: block;
                width: 100%;
                max-width: 600px;
                margin: 20px auto 0;
            }
            #subtitles {
                margin-top: 20px;
                max-width: 600px;
                margin: 20px auto;
                padding: 20px;
                background: #fff;
                border: 1px solid #ddd;
                border-radius: 8px;
                white-space: pre-wrap;
                font-family: monospace;
                overflow-y: auto;
                height: 200px;
            }
        </style>
    </head>
    <body>
        <h1>ASR-приложение</h1>
        <form id="upload-form" action="/upload" method="post" enctype="multipart/form-data">
            <label for="method">Выберите метод обработки:</label>
            <select id="method" name="method">
                <option value="transcribe">Транскрипция</option>
                <option value="diarization">Диаризация</option>
                <option value="subtitles">Субтитры</option>
            </select>
            <input type="file" name="audio_file" accept="audio/*,video/*">
            <button type="submit">Загрузить</button>
        </form>
        <audio id="audio" controls></audio>
        <div id="subtitles"></div>
        <script>
            const form = document.getElementById('upload-form');
            const subtitlesDiv = document.getElementById('subtitles');
            const audioElement = document.getElementById('audio');
            const NEWLINE = "\\n";

            function appendOutput(text) {
                subtitlesDiv.textContent += text + NEWLINE;
                subtitlesDiv.scrollTop = subtitlesDiv.scrollHeight;
            }

            function loadAudio() {
                fetch('/audio')
                    .then(response => {
                        if (!response.ok) {
                            throw new Error('HTTP ' + response.status);
                        }
                        return response.blob();
                    })
                    .then(blob => {
                        audioElement.src = URL.createObjectURL(blob);
                        return audioElement.play();
                    })
                    .catch(error => console.error('Ошибка загрузки аудио:', error));
            }

            async function showStream(response) {
                const reader = response.body.getReader();
                const decoder = new TextDecoder();
                let pending = '';
                for (;;) {
                    const { value, done } = await reader.read();
                    if (done) {
                        break;
                    }
                    pending += decoder.decode(value, { stream: true });
                    const lines = pending.split(NEWLINE);
                    pending = lines.pop();
                    lines.forEach(line => appendOutput(line.replace(/^data: ?/, '')));
                }
                if (pending) {
                    appendOutput(pending.replace(/^data: ?/, ''));
                }
            }

            form.addEventListener('submit', async event => {
                event.preventDefault();
                subtitlesDiv.textContent = '';
                try {
                    const response = await fetch(form.action, { method: 'POST', body: new FormData(form) });
                    const contentType = response.headers.get('Content-Type') || '';
                    if (contentType.includes('application/json')) {
                        const payload = await response.json();
                        if (payload.error) {
                            appendOutput('Ошибка: ' + payload.error);
                        } else if (payload.diarization) {
                            appendOutput(payload.diarization.join(NEWLINE));
                        } else {
                            appendOutput(payload.text || '');
                        }
                        return;
                    }
                    if (!response.ok) {
                        appendOutput('Ошибка: HTTP ' + response.status);
                        return;
                    }
                    loadAudio();
                    await showStream(response);
                } catch (error) {
                    appendOutput('Ошибка: ' + error.message);
                }
            });
        </script>
    </body>
    </html>
    """

@app.route('/upload', methods=['POST'])
def upload():
    """Accept an uploaded audio/video file and process it with the selected method.

    Form fields:
        audio_file: the uploaded file (required).
        method: "transcribe" (default), "diarization" or "subtitles".

    Returns:
        * transcribe  -> JSON {"text": ...}
        * diarization -> JSON {"diarization": [...]}
        * subtitles   -> a streamed `text/event-stream` response
        * JSON {"error": ...} with status 400 for bad requests, 500 on processing failures.
    """
    if 'audio_file' not in request.files:
        return jsonify({"error": "Файл не найден"}), 400

    file = request.files['audio_file']
    if file.filename == '':
        return jsonify({"error": "Файл не выбран"}), 400

    # Validate the method before touching the disk so a bad request leaves no file behind.
    method = request.form.get("method", "transcribe")
    if method not in ("transcribe", "diarization", "subtitles"):
        return jsonify({"error": "Неверный метод"}), 400

    os.makedirs("uploads", exist_ok=True)

    # Never build a path from the client-supplied filename (path traversal, collisions):
    # use a random name and keep only a sanitized extension for the raw upload.
    token = uuid.uuid4().hex
    ext = os.path.splitext(file.filename)[1].lower()
    if not ext[1:].isalnum():
        ext = ""
    raw_path = os.path.join("uploads", f"{token}_raw{ext}")
    filepath = os.path.join("uploads", f"{token}.wav")

    keep_wav = False
    try:
        file.save(raw_path)
        preprocess_audio(raw_path, filepath)

        if method == "transcribe":
            result = whisper_model.transcribe(filepath)
            return jsonify({"text": result["text"]})

        if method == "diarization":
            return jsonify({"diarization": perform_diarization(filepath)})

        # "subtitles": the generator runs after this function returns, so the WAV must
        # stay on disk; stream_transcription reports its own errors inside the stream.
        keep_wav = True
        return Response(stream_transcription(filepath), mimetype='text/event-stream')
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # The raw upload is never needed after preprocessing; the WAV is removed
        # unless it is still required by the streaming response.
        stale = [raw_path] if keep_wav else [raw_path, filepath]
        for path in stale:
            if os.path.exists(path):
                os.remove(path)

@app.route('/audio')
def audio():
    """Serve the most recently modified file from the `uploads` directory as audio/wav.

    Returns a JSON error with status 404 when the directory does not exist or
    contains no files, instead of failing with an unhandled exception.
    """
    upload_dir = "uploads"
    if not os.path.isdir(upload_dir):
        return jsonify({"error": "Файл не найден"}), 404

    candidates = [os.path.join(upload_dir, name) for name in os.listdir(upload_dir)]
    candidates = [path for path in candidates if os.path.isfile(path)]
    if not candidates:
        return jsonify({"error": "Файл не найден"}), 404

    # os.listdir order is arbitrary, so pick the newest file explicitly.
    newest = max(candidates, key=os.path.getmtime)
    # Pass an absolute path so it is resolved against the same directory os.listdir used.
    return send_file(os.path.abspath(newest), mimetype="audio/wav")

  checkpoint = torch.load(fp, map_location=device)
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.


  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


## Запускаем сайт

In [None]:
from pyngrok import ngrok

# Single source of truth for the tunnel target and the Flask port.
PORT = 5000

# Open a public ngrok tunnel to the local Flask port.
public_url = ngrok.connect(PORT)
print(f"Ваше приложение доступно по адресу: {public_url}\n")

try:
    # Start Flask; this call blocks until the server is interrupted.
    app.run(port=PORT)
finally:
    # Shut ngrok down so re-running the cell does not accumulate open tunnels.
    ngrok.kill()

Ваше приложение доступно по адресу: NgrokTunnel: "https://1246-34-124-196-126.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [13/Dec/2024 18:41:26] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Dec/2024 18:41:26] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [13/Dec/2024 18:42:49] "POST /upload HTTP/1.1" 200 -
