<a href="https://colab.research.google.com/github/Avvonna/DL_sound_project/blob/main/demo_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title 1\. Setup Repository and Libraries (~ 4 mins)

import os
import subprocess
import sys

# Name of the GitHub repository; also used below as the local checkout directory.
REPO_NAME = "DL_sound_project"
# HTTPS clone URL built from the repository name.
GIT_URL = f"https://github.com/Avvonna/{REPO_NAME}.git"

def run_command(command, message):
    """
    Run an external command quietly, printing only short progress lines.

    Args:
        command (str | list[str]): Command to execute. A string is run through
            the shell (the original behaviour); a list of arguments is
            executed directly, without a shell.
        message (str): Label printed before the run and with the final status.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured output is printed before re-raising.
    """
    print(f"{message}...")
    try:
        subprocess.run(
            command,
            check=True,
            # Only string commands need a shell; argument lists run as-is.
            shell=isinstance(command, str),
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"{message} - FAILED!")
        print("ERROR LOG:\n", e.stderr)
        # Fall back to stdout when the command reported nothing on stderr.
        if e.stdout and not (e.stderr or "").strip():
            print("OUTPUT LOG:\n", e.stdout)
        raise
    print(f"{message} - DONE")

# Repo: make sure the working directory is the project checkout.
inside_repo = os.path.basename(os.getcwd()) == REPO_NAME
if inside_repo:
    print(f"Already inside {REPO_NAME}. Skipping clone.")
    run_command("git pull", "Pulling latest changes")
else:
    repo_missing = not os.path.exists(REPO_NAME)
    if repo_missing:
        run_command(f"git clone {GIT_URL}", f"Cloning {REPO_NAME}")
    os.chdir(REPO_NAME)

# Dependencies: installed once; the ".setup_complete" marker file records that.
if os.path.exists(".setup_complete"):
    print("Dependencies already installed.")
elif not os.path.exists("requirements.txt"):
    print("requirements.txt not found!")
else:
    with open("requirements.txt", "r") as source_file:
        requirement_lines = source_file.readlines()

    # Skip every line that mentions torch or numpy
    # (original note: the versions in Colab are older).
    kept_lines = [
        line
        for line in requirement_lines
        if not any(lib in line.lower() for lib in ["torch", "numpy"])
    ]

    with open("requirements_colab.txt", "w") as filtered_file:
        filtered_file.writelines(kept_lines)

    # Install the remaining libraries.
    run_command("pip install -q -r requirements_colab.txt", "Installing project dependencies")

    # Sync the torch and torchaudio versions.
    run_command("pip install --upgrade torchaudio", "Syncing Torch & TorchAudio")

    with open(".setup_complete", "w") as marker_file:
        marker_file.write("done")
    print("Dependencies installed.")

print(f"\nCurrent Directory: {os.getcwd()}")
print("Setup complete!")

In [None]:
# @title 2\. Download Pre-trained Weights

import gdown

# Google Drive file id of the pre-trained checkpoint.
FILE_ID = "12k5HUXaGLweeOGXSZqiqrhUG9CMjKujr"
# Directory and file the checkpoint is stored in.
OUTPUT_DIR = "saved"
MODEL_PATH = os.path.join(OUTPUT_DIR, "best_model.pth")

os.makedirs(OUTPUT_DIR, exist_ok=True)

if os.path.exists(MODEL_PATH):
    print(f"Weights found at {MODEL_PATH}")
else:
    print(f"Downloading weights to {MODEL_PATH}...")
    url = f'https://drive.google.com/uc?id={FILE_ID}'
    downloaded = gdown.download(url, MODEL_PATH, quiet=True)
    # Only report success when the checkpoint really is on disk; otherwise the
    # later cells would fail with a much less obvious error.
    if downloaded is None or not os.path.exists(MODEL_PATH):
        raise RuntimeError(
            f"Failed to download weights from {url}; "
            "check that the file is shared publicly and try again."
        )
    print("Weights successfully downloaded")

In [None]:
# @title 3\. Helper Functions (Visualization & Inference)

import shutil
import soundfile as sf
import IPython.display as ipd
from pathlib import Path

def play_audio(file_path):
    """
    Read an audio file and render an inline player for it in the notebook.

    Any failure (unreadable file, display error, ...) is reported with a
    printed message instead of being raised.

    Args:
        file_path (str): Path of the audio file to play.
    """
    try:
        samples, rate = sf.read(file_path)
        # Multi-dimensional data is transposed before being handed to IPython.
        samples = samples.T if samples.ndim > 1 else samples
        print(f"Audio: {os.path.basename(file_path)} | SR: {rate}")
        player = ipd.Audio(samples, rate=rate)
        ipd.display(player)
    except Exception as err:
        print(f"Could not play {file_path}: {err}")

def show_inference_results(audio_dir, prediction_dir, limit=None):
    """
    Display each audio file in audio_dir together with its predicted text.

    Walks the audio files in audio_dir, looks up the matching ``<stem>.txt``
    in prediction_dir and shows the result (audio player + upper-cased text).

    Args:
        audio_dir (str): Path to the folder with audio files.
        prediction_dir (str): Path to the folder with predictions (.txt).
        limit (int): Maximum number of examples to show (useful for large
            datasets). A falsy value means no limit.
    """
    # Keep only audio files, in sorted order.
    audio_suffixes = ('.wav', '.mp3', '.flac')
    audio_names = [
        name
        for name in sorted(os.listdir(audio_dir))
        if name.lower().endswith(audio_suffixes)
    ]

    if not audio_names:
        print("No audio files found to display.")
        return

    # Apply the limit when one is given.
    total = len(audio_names)
    if limit and total > limit:
        print(f"Found {total} files. Showing first {limit} examples:\n")
        audio_names = audio_names[:limit]
    else:
        print(f"Showing results for {total} files:\n")

    separator = "-" * 40
    for name in audio_names:
        stem = Path(name).stem
        transcript_path = os.path.join(prediction_dir, f"{stem}.txt")

        print(separator)

        # 1. Play the audio
        play_audio(os.path.join(audio_dir, name))

        # 2. Print the text
        if not os.path.exists(transcript_path):
            print(f"\nPrediction file not found for ID: {stem}")
            continue
        with open(transcript_path, "r") as handle:
            transcript = handle.read().strip()
        print(f"\nText:\n{transcript.upper()}")

def run_prediction(audio_path, model_path="saved/best_model.pth"):
    """
    Transcribe one audio file by running inference.py from the command line.

    The file is copied into a fresh ``temp_inference_data/audio`` directory
    under the current working directory, ``inference.py`` is launched with
    that directory as ``inferencer.data_dir``, and the prediction is read from
    ``data/saved/temp_predictions/inference/<stem>.txt``.

    Args:
        audio_path (str): Path of the audio file to transcribe.
        model_path (str): Checkpoint passed as ``inferencer.from_pretrained``.

    Returns:
        str: The stripped prediction text; ``"ERROR"`` if the subprocess exits
        with a non-zero status; ``"ERROR: File not found"`` if the expected
        prediction file was not produced.
    """
    import sys  # local import so the function does not depend on another cell

    base_dir = os.path.abspath(".")
    temp_dir = os.path.join(base_dir, "temp_inference_data")
    audio_sub = os.path.join(temp_dir, "audio")
    save_folder_name = "temp_predictions"

    # Clean and recreate the input folders
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(audio_sub, exist_ok=True)

    # Clean the output folder so stale predictions are never picked up
    full_save_path = os.path.join(base_dir, "data", "saved", save_folder_name)
    if os.path.exists(full_save_path):
        shutil.rmtree(full_save_path)

    # Copy the audio
    path_obj = Path(audio_path)
    audio_name = path_obj.name
    shutil.copy(audio_path, os.path.join(audio_sub, audio_name))

    # Run the command with the interpreter that is executing this code, so the
    # subprocess uses the same environment rather than whatever "python"
    # happens to resolve to on PATH.
    command = [
        sys.executable, "inference.py",
        f"inferencer.save_path={save_folder_name}",
        f"inferencer.from_pretrained={model_path}",
        f"inferencer.data_dir={temp_dir}",
        "inferencer.device=auto",
        "+writer.run_name=demo"
    ]

    print(f"Running inference on {audio_name}...")
    result = subprocess.run(command, capture_output=True, text=True)

    if result.returncode != 0:
        print("Inference Error:")
        print(result.stderr)
        return "ERROR"

    file_id = path_obj.stem
    expected_file = os.path.join(full_save_path, "inference", f"{file_id}.txt")

    if os.path.exists(expected_file):
        with open(expected_file, "r") as f:
            return f.read().strip()
    else:
        print(f"File not found: {expected_file}")
        return "ERROR: File not found"

# Printed once the helper functions of this cell have been defined.
print("Inference functions ready.")

Код ниже позволяет загружать собственные аудио-файлы и прогонять на них инференс.

Аудиофайлы для инференса должны быть в форматах `flac`, `mp3` или `wav`

Файлы сохраняются во временную директорию, далее используется структура `CustomDirDataset`

In [None]:
# @title 4A. Run on uploaded audiofiles

import ipywidgets as widgets
from google.colab import files, output
import shutil
import subprocess
from pathlib import Path

import sys  # interpreter used to launch inference.py

# Upload the files
print("Upload audio files (flac/wav/mp3):")
uploaded = files.upload()
output.clear()

if not uploaded:
    print("No files uploaded.")
else:
    print(f"Uploaded {len(uploaded)} files")

    # Path setup
    base_dir = os.path.abspath(".")
    input_root = os.path.join(base_dir, "temp_data")
    audio_dir = os.path.join(input_root, "audio")

    # Folder the results are saved to
    save_folder_name = "temp_data_predictions"
    full_save_path = os.path.join(base_dir, "data", "saved", save_folder_name)
    predictions_txt_dir = os.path.join(full_save_path, "inference")

    # Clean and recreate the directories
    if os.path.exists(input_root):
        shutil.rmtree(input_root)
    os.makedirs(audio_dir, exist_ok=True)

    if os.path.exists(full_save_path):
        shutil.rmtree(full_save_path)

    # Move the files into the dataset layout
    uploaded_filenames = list(uploaded.keys())
    for filename in uploaded_filenames:
        src_path = os.path.join(base_dir, filename)
        dst_path = os.path.join(audio_dir, filename)

        if os.path.exists(src_path):
            shutil.move(src_path, dst_path)

    # Inference
    print("Running inference...")

    # Launch with the running interpreter so the subprocess shares its
    # environment instead of whatever "python" resolves to on PATH.
    command = [
        sys.executable, "inference.py",
        f"inferencer.save_path={save_folder_name}",
        "inferencer.from_pretrained=saved/best_model.pth",
        f"inferencer.data_dir={input_root}"
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode != 0:
            print("Inference Error:")
            print(result.stderr)
        else:
            print("Inference finished.\n")

            show_inference_results(audio_dir, predictions_txt_dir, limit=None)

    except Exception as e:
        print(f"Critical Error during execution: {e}")

    finally:
        # Remove uploaded originals still left in the project root (copies
        # already live in the temporary directory). Every uploaded name is
        # checked, not only the last value of the loop variable above.
        for uploaded_name in uploaded_filenames:
            leftover_path = os.path.join(base_dir, uploaded_name)
            if os.path.exists(leftover_path):
                os.remove(leftover_path)

Код ниже позволяет скачивать архив или папку с GDrive и прогонять инференс на ней.

Структура папки (или подпапки) должна иметь следующий формат:

```python
NameOfTheDirectoryWithUtterances
├── audio
│   ├── UtteranceID1.wav    # may be flac or mp3
│   ├── UtteranceID2.wav
│   .
│   .
│   .
│   └── UtteranceIDn.wav
└── transcriptions          # ground truth, may not exist
    ├── UtteranceID1.txt
    ├── UtteranceID2.txt
    .
    .
    .
    └── UtteranceIDn.txt
```

Аудиофайлы для инференса должны быть в форматах `flac`, `mp3` или `wav`

Вытащил первые попавшиеся файлы из `librispeech\dev-clean\84\121123` и перегнал некоторые из формата `flac` в `wav`, `mp3` и `m4a` (для проверки):

Вот ссылки на папки для быстрого доступа (могут быть в бане*):
- https://drive.google.com/drive/folders/1K-fpqyYsVxvleKqhpCQPfzV1_xC3RyEk?usp=drive_link (с транскрипциями - для расчета метрик)
- https://drive.google.com/drive/folders/1B2ZbI0jbzfjUOlqqWceib4R8pajMqE4u?usp=drive_link (без транскрипций - для их получения)

Вот ссылки на архивы с аналогичным содержанием:
- https://drive.google.com/file/d/1B5JUcSsPk1l_xe9c99iClJIUSzkXJh8h/view?usp=sharing (с транскрипциями)
- https://drive.google.com/file/d/1_9-mLQCYn9MCR3wm1RNxCA0bvt3rP6Ok/view?usp=sharing (без них)

**\(\*\) Частые скачивания с GDrive приводят к фризу диска**

In [None]:
# @title 4B. Run on GDrive Dataset

import zipfile
import gdown

import shutil
import subprocess
import sys

# @markdown Paste a Google Drive link (to a .zip file or a folder):
gdrive_url = ""  # @param {type:"string"}

# Path setup
base_dir = os.path.abspath(".")
dataset_root = os.path.join(base_dir, "temp_custom_dataset")
save_folder_name = "custom_drive_predictions"
full_save_path = os.path.join(base_dir, "data", "saved", save_folder_name)

# Clean up after previous runs
if os.path.exists(dataset_root):
    shutil.rmtree(dataset_root)
if os.path.exists(full_save_path):
    shutil.rmtree(full_save_path)

os.makedirs(dataset_root, exist_ok=True)

if not gdrive_url:
    print("Please paste a Google Drive link above. (zip or folder)")
else:
    try:
        # Download (folder or zip)
        is_drive_link = "drive.google.com" in gdrive_url
        if is_drive_link and "/folders/" in gdrive_url:
            print("Detected Google Drive FOLDER.")
            gdown.download_folder(url=gdrive_url, output=dataset_root, quiet=True)
        else:
            zip_path = os.path.join(dataset_root, "dataset.zip")
            if is_drive_link:
                print("Detected Google Drive FILE (assuming .zip).")
                gdown.download(url=gdrive_url, output=zip_path, quiet=True, fuzzy=True)
            else:
                # Direct link
                print("Direct link detected.")
                subprocess.run(
                    ["wget", gdrive_url, "-q", "-O", zip_path],
                    check=True
                )
            # Both file branches end with the same unpack-and-delete step.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(dataset_root)
            os.remove(zip_path)

        # Locate the data: the first directory that contains an "audio" folder.
        # The third tuple element is ignored so the global name `files`
        # (google.colab.files in the upload cell) is not overwritten.
        data_dir = None
        for root, dirs, _ in os.walk(dataset_root):
            if "audio" in dirs:
                data_dir = root
                break

        if not data_dir:
            raise FileNotFoundError("Could not find 'audio' folder.")

        print(f"Dataset found at: {data_dir}")

        # Inference, launched with the running interpreter so the subprocess
        # shares its environment.
        print("\nRunning Inference...")
        cmd_inf = [
            sys.executable, "inference.py",
            f"inferencer.save_path={save_folder_name}",
            "inferencer.from_pretrained=saved/best_model.pth",
            f"inferencer.data_dir={data_dir}"
        ]

        result = subprocess.run(cmd_inf, capture_output=True, text=True)

        if result.returncode != 0:
            print("Inference Error:")
            print(result.stderr)
        else:
            print("Inference finished.")

            # Paths
            gt_dir = os.path.join(data_dir, "transcriptions")
            preds_txt_dir = os.path.join(full_save_path, "inference")
            audio_source_dir = os.path.join(data_dir, "audio")

            # Metrics are computed only when ground-truth transcriptions exist
            if os.path.exists(gt_dir) and os.path.isdir(gt_dir):
                print("\nGround truth found. Calculating WER/CER...")
                if os.path.exists(preds_txt_dir):
                    cmd_metrics = [
                        sys.executable, "calc_metrics.py",
                        "--pred_dir", preds_txt_dir,
                        "--gt_dir", gt_dir
                    ]
                    metrics_res = subprocess.run(cmd_metrics, capture_output=True, text=True)
                    print(metrics_res.stdout)
                    if metrics_res.stderr:
                        print("Metrics Errors:", metrics_res.stderr)
                else:
                    print("Error: Predictions not found.")

            # Visualization of the first few examples
            if os.path.exists(preds_txt_dir):
                show_inference_results(audio_source_dir, preds_txt_dir, limit=5)
            else:
                print("Predictions directory was not created.")

    except Exception as e:
        print(f"\nFAILED: {e}")


Код ниже позволяет записать аудио прямо внутри Colab и прогнать на нем инференс (надо говорить на английском)

*\* JS код писала нейросеть, но мне просто хотелось это реализовать, мне кажется прикольно :\)*

In [18]:
# @title 4C. Record Audio

from IPython.display import HTML, Audio, display
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np


# JS: defines recordAudio() in the cell's output frame. The returned promise
# resolves with the recording as a base64 data URL and rejects (instead of
# never settling) when the microphone cannot be opened or the recording cannot
# be read back.
AUDIO_HTML = """
<script>
var recordAudio = async () => {
  // Build the UI
  var div = document.createElement('div');
  var btn = document.createElement('button');

  btn.innerText = '🔴 Start Recording';
  btn.style.background = 'red';
  btn.style.color = 'white';
  btn.style.padding = '12px';
  btn.style.border = 'none';
  btn.style.borderRadius = '5px';
  btn.style.cursor = 'pointer';
  btn.style.fontSize = '16px';

  div.appendChild(btn);
  document.body.appendChild(div);

  // Initialisation. A failure here (e.g. microphone permission denied)
  // rejects the returned promise so the Python caller gets an error.
  let stream;
  let recorder;
  try {
    stream = await navigator.mediaDevices.getUserMedia({audio: true});
    recorder = new MediaRecorder(stream);
  } catch (err) {
    if (stream) {
      stream.getTracks().forEach(track => track.stop());
    }
    div.remove();
    throw err;
  }
  const chunks = [];

  return new Promise((resolve, reject) => {
    // Collect the data
    recorder.ondataavailable = e => chunks.push(e.data);

    // When the recording has stopped
    recorder.onstop = () => {
      const blob = new Blob(chunks); // The browser records webm/ogg
      const reader = new FileReader();
      reader.onload = () => {
        div.remove(); // Remove the button
        resolve(reader.result); // Hand the data back to Python
      };
      reader.onerror = () => {
        div.remove();
        reject(reader.error);
      };
      reader.readAsDataURL(blob);
    };

    // Button logic
    btn.onclick = () => {
      if (recorder.state === "inactive") {
        recorder.start();
        btn.innerText = "⬛ Stop Recording";
        btn.style.background = "black";
      } else {
        recorder.stop();
        stream.getTracks().forEach(track => track.stop()); // Turn the microphone off
        btn.innerText = "Processing...";
        btn.style.background = "gray";
        btn.disabled = true;
      }
    };
  });
};
</script>
"""

def record_audio_robust(filename='recording.wav'):
    """
    Record audio in the browser and save it as a 16 kHz mono file via ffmpeg.

    Args:
        filename (str): Destination path of the converted recording.

    Returns:
        str: ``filename``. If ffmpeg exits with an error, its message is
        printed and the destination file may be missing.
    """
    # Inject the JS function
    display(HTML(AUDIO_HTML))
    # Call the function and wait for the Promise
    data = eval_js("recordAudio({})")

    # Decode the base64 payload of the data URL
    binary = b64decode(data.split(',')[1])

    # Temporary file for the raw browser recording (usually webm). Only the
    # extension is swapped, and the temp path is never allowed to equal the
    # destination (otherwise ffmpeg would read and write the same file and the
    # cleanup below would delete the result).
    stem, _ext = os.path.splitext(filename)
    temp_file = stem + '.webm'
    if temp_file == filename:
        temp_file = stem + '.tmp.webm'

    try:
        with open(temp_file, 'wb') as f:
            f.write(binary)

        # Convert to 16 kHz mono with ffmpeg
        # -y: overwrite, -ar 16000: sample rate, -ac 1: mono
        result = subprocess.run(
            ['ffmpeg', '-y', '-i', temp_file, '-ar', '16000', '-ac', '1', filename],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            errors='replace',
        )
        if result.returncode != 0:
            print("ffmpeg conversion failed:")
            print(result.stderr)
    finally:
        # Always remove the intermediate recording, even if something failed.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return filename

import shutil
import sys

# Path setup
base_dir = os.path.abspath(".")
input_root = os.path.join(base_dir, "temp_mic_data")
audio_dir = os.path.join(input_root, "audio")
save_folder_name = "mic_predictions"
full_save_path = os.path.join(base_dir, "data", "saved", save_folder_name)
filename = "my_voice.wav"

# Start from clean input and output directories
if os.path.exists(input_root):
    shutil.rmtree(input_root)
if os.path.exists(full_save_path):
    shutil.rmtree(full_save_path)

os.makedirs(audio_dir, exist_ok=True)

# Recording and inference
try:
    print("Click the button below to record (Click again to stop)")
    file_path = os.path.join(audio_dir, filename)

    record_audio_robust(file_path)

    if os.path.exists(file_path):
        print(f"Audio saved: {file_path}")

        print("\nRunning inference...")

        # Launch with the running interpreter so the subprocess shares its
        # environment instead of whatever "python" resolves to on PATH.
        command = [
            sys.executable, "inference.py",
            f"inferencer.save_path={save_folder_name}",
            "inferencer.from_pretrained=saved/best_model.pth",
            f"inferencer.data_dir={input_root}",
            "inferencer.device=auto"
        ]

        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode != 0:
            print("Inference Error:")
            print(result.stderr)
        else:
            print("Inference finished.\n")

            predictions_txt_dir = os.path.join(full_save_path, "inference")
            if os.path.exists(predictions_txt_dir):
                show_inference_results(audio_dir, predictions_txt_dir)
            else:
                print("Predictions directory not found.")
    else:
        print("Error: File was not saved.")

except Exception as e:
    print(f"Error: {e}")

Click the button below to record (Click again to stop)


Audio saved: /content/DL_sound_project/temp_mic_data/audio/my_voice.wav

Running inference...
Inference finished.

Showing results for 1 files:

----------------------------------------
Audio: my_voice.wav | SR: 16000



Text:
ARM I WAS EEME ITSTUNNED WY THE WAY HE WALKED
