# Подготовка данных

In [2]:
import os
import shutil
import sys
import tarfile
import urllib.request
from pathlib import Path
import numpy as np
import librosa
from tqdm import tqdm
import json

In [3]:
PROJECT_ROOT = Path(".").parent
DATA_DIR = PROJECT_ROOT / "data" / "aishell3"
PROCESSED_DIR = PROJECT_ROOT / "processed"
MODELS_DIR = PROJECT_ROOT / "models"

DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(exist_ok=True)
MODELS_DIR.mkdir(exist_ok=True)

In [4]:
AISHELL3_URL = "http://www.openslr.org/resources/93/data_aishell3.tgz"

In [5]:
def download_aishell3():
    """Скачивает AISHELL-3, если ещё не скачан"""
    tar_path = DATA_DIR / "data_aishell3.tgz"
    if tar_path.exists():
        print("AISHELL-3 архив уже скачан.")
        return tar_path

    print("Скачивание AISHELL-3...")
    def reporthook(block_num, block_size, total_size):
        if total_size > 0:
            percent = min(100, (block_num * block_size * 100) // total_size)
            print(f"\rПрогресс: {percent}%", end="")
    urllib.request.urlretrieve(AISHELL3_URL, tar_path, reporthook)
    print("\nЗагрузка завершена.")
    return tar_path

In [6]:
def extract_aishell3(tar_path: Path):
    """Распаковывает AISHELL-3"""
    if (DATA_DIR / "train").exists() and (DATA_DIR / "test").exists():
        print("AISHELL-3 уже распакован.")
        return

    print("Распаковка...")
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(DATA_DIR.parent)
    # AISHELL-3 распаковывается в data_aishell3/train и data_aishell3/test
    # Переместим содержимое в data/aishell3/
    src_dir = DATA_DIR.parent / "data_aishell3"
    if src_dir.exists():
        for item in src_dir.iterdir():
            shutil.move(str(item), str(DATA_DIR))
        shutil.rmtree(src_dir)
    print("Распаковка завершена.")

In [7]:
def load_aishell3_labels_from_content():
    """
    Загружает слоги с тонами из content.txt.
    Формат: <filename> <иероглифы> <пиньинь_с_тонами>
    Пример: SSB00050001.wav	广 guang3 州 zhou1 ...
    """
    content_path = DATA_DIR / "train" / "content.txt"
    if not content_path.exists():
        raise FileNotFoundError(f"Не найден {content_path}. Убедитесь, что AISHELL-3 распакован полностью.")

    labels = {}
    with open(content_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 3:
                continue
            wav_filename = parts[0]  # Например, SSB00050001.wav
            # Убираем .wav
            wav_id = wav_filename.replace(".wav", "")
            # Пиньинь с тонами начинаются после иероглифов — обычно с 2-го элемента
            # Но лучше найти индекс, где начинается пиньинь
            pinyin_start_idx = 1  # Первый элемент — имя файла, второй — первый иероглиф
            # Пропускаем иероглифы — они идут до первого слова, содержащего цифру тона
            for i in range(1, len(parts)):
                if parts[i][-1].isdigit() and parts[i][:-1].isalpha():
                    pinyin_start_idx = i
                    break

            syllables = parts[pinyin_start_idx:]
            labels[wav_id] = syllables

    return labels

In [37]:
import random


def extract_f0_for_syllables():
    """
    Главная функция препроцессинга.
    Извлекает F0 для каждого слога, используя грубую сегментацию по времени.
    """
    labels = load_aishell3_labels_from_content()
    wav_base_dir = DATA_DIR / "train" / "wav"

    all_f0 = []
    all_tones = []

    print("Извлечение F0 для слогов...")
    random.seed(42)  # для воспроизводимости
    sampled_items = random.sample(list(labels.items()), k=min(300, len(labels)))
    
    for wav_id, syllables in tqdm(sampled_items):
        # Определяем путь к файлу: train/wav/SSB0005/SSB00050001.wav
        speaker_id = wav_id[:7]  # первые 7 символов: SSB0005
        wav_path = wav_base_dir / speaker_id / f"{wav_id}.wav"
        
        if not wav_path.exists():
            print(f"Не найден файл: {wav_path}")
            continue

        # Загружаем аудио
        y, sr = librosa.load(wav_path, sr=16000)
        duration = len(y) / sr
        n_syllables = len(syllables)

        # Делим аудио на n_syllables равных частей
        for i, syl in enumerate(syllables):
            start = i * duration / n_syllables
            end = (i + 1) * duration / n_syllables

            # Извлекаем F0 с помощью Pyin
            y_seg = y[int(start * sr):int(end * sr)]
            if len(y_seg) == 0:
                continue

            f0, _, _ = librosa.pyin(
                y_seg,
                fmin=librosa.note_to_hz('C2'),   # ~65 Гц
                fmax=librosa.note_to_hz('C7'),   # ~2093 Гц
                sr=sr,
                frame_length=512,
                win_length=512 // 2,
                hop_length=128
            )

            # Оставляем только валидные значения
            f0 = f0[~np.isnan(f0)]
            if len(f0) < 5:  # слишком мало точек
                continue

            # Нормализуем контур до 50 точек
            x_old = np.linspace(0, 1, len(f0))
            x_new = np.linspace(0, 1, 50)
            f0_interp = np.interp(x_new, x_old, f0)

            # Преобразуем тон из 'ni3' → 3
            if syl[-1].isdigit():
                tone = int(syl[-1])
            else:
                tone = 5  # нейтральный

            all_f0.append(f0_interp)
            all_tones.append(tone)

    # Сохраняем
    np.save(PROCESSED_DIR / "f0_contours.npy", np.array(all_f0))
    np.save(PROCESSED_DIR / "tones.npy", np.array(all_tones))
    print(f"Обработано {len(all_f0)} слогов.")


In [38]:
#tar_path = download_aishell3()
#extract_aishell3(tar_path)

DATA_DIR = PROJECT_ROOT / "data"
extract_f0_for_syllables()

Извлечение F0 для слогов...


  extract_f0_for_syllables()
100%|██████████| 300/300 [25:06<00:00,  5.02s/it]

Обработано 5044 слогов.





# Обучение 1D CNN

In [39]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from pathlib import Path
from tqdm import tqdm

In [40]:
PROJECT_ROOT = Path(".").parent
PROCESSED_DIR = PROJECT_ROOT / "processed"
MODELS_DIR = PROJECT_ROOT / "models"

In [41]:
class ToneCNN(nn.Module):
    def __init__(self, input_len=50, num_classes=5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=5, padding=2)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * (input_len // 4), 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # (B, L) -> (B, 1, L)
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.flatten(x)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


In [42]:
# Загрузка данных
X = np.load(PROCESSED_DIR / "f0_contours.npy")
y = np.load(PROCESSED_DIR / "tones.npy") - 1  # тоны 1-5 → индексы 0-4

# Нормализация: Z-score по каждому контуру
X = (X - X.mean(axis=1, keepdims=True)) / (X.std(axis=1, keepdims=True) + 1e-8)

# Разделение
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [43]:
# Веса классов
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Датасеты
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                torch.tensor(y_train, dtype=torch.long))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                            torch.tensor(y_val, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

In [44]:
# Модель
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.__version__)
model = ToneCNN().to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

2.8.0+cpu


In [45]:
# Параметры ранней остановки
patience = 20
best_val_acc = 0.0
epochs_no_improve = 0
early_stop = False

# Обучение
for epoch in range(150):
    if early_stop:
        print("Ранняя остановка сработала.")
        break

    model.train()
    train_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Валидация
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            pred = out.argmax(dim=1)
            total += y.size(0)
            correct += (pred == y).sum().item()
    val_acc = correct / total
    avg_train_loss = train_loss / len(train_loader)
    print(f"Train Loss: {avg_train_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Ранняя остановка
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        epochs_no_improve = 0
        # Опционально: сохранять лучшую модель
        torch.save(model.state_dict(), MODELS_DIR / "tone_cnn_best.pth")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            early_stop = True

# Сохранение финальной модели
torch.save(model.state_dict(), MODELS_DIR / "tone_cnn.pth")
print("Модель сохранена")

Epoch 1: 100%|██████████| 32/32 [00:00<00:00, 67.37it/s]


Train Loss: 1.6124, Val Acc: 0.1754


Epoch 2: 100%|██████████| 32/32 [00:00<00:00, 104.58it/s]


Train Loss: 1.6075, Val Acc: 0.2002


Epoch 3: 100%|██████████| 32/32 [00:00<00:00, 106.31it/s]


Train Loss: 1.6048, Val Acc: 0.2418


Epoch 4: 100%|██████████| 32/32 [00:00<00:00, 106.67it/s]


Train Loss: 1.6025, Val Acc: 0.1665


Epoch 5: 100%|██████████| 32/32 [00:00<00:00, 108.47it/s]


Train Loss: 1.5993, Val Acc: 0.1784


Epoch 6: 100%|██████████| 32/32 [00:00<00:00, 105.26it/s]


Train Loss: 1.5956, Val Acc: 0.1744


Epoch 7: 100%|██████████| 32/32 [00:00<00:00, 91.95it/s]


Train Loss: 1.5891, Val Acc: 0.1368


Epoch 8: 100%|██████████| 32/32 [00:00<00:00, 102.24it/s]


Train Loss: 1.5839, Val Acc: 0.1615


Epoch 9: 100%|██████████| 32/32 [00:00<00:00, 106.67it/s]


Train Loss: 1.5756, Val Acc: 0.1288


Epoch 10: 100%|██████████| 32/32 [00:00<00:00, 87.91it/s]


Train Loss: 1.5718, Val Acc: 0.1606


Epoch 11: 100%|██████████| 32/32 [00:00<00:00, 100.63it/s]


Train Loss: 1.5687, Val Acc: 0.1883


Epoch 12: 100%|██████████| 32/32 [00:00<00:00, 105.96it/s]


Train Loss: 1.5574, Val Acc: 0.1506


Epoch 13: 100%|██████████| 32/32 [00:00<00:00, 108.10it/s]


Train Loss: 1.5532, Val Acc: 0.1982


Epoch 14: 100%|██████████| 32/32 [00:00<00:00, 109.96it/s]


Train Loss: 1.5464, Val Acc: 0.1843


Epoch 15: 100%|██████████| 32/32 [00:00<00:00, 106.67it/s]


Train Loss: 1.5429, Val Acc: 0.1526


Epoch 16: 100%|██████████| 32/32 [00:00<00:00, 109.58it/s]


Train Loss: 1.5324, Val Acc: 0.1685


Epoch 17: 100%|██████████| 32/32 [00:00<00:00, 105.96it/s]


Train Loss: 1.5162, Val Acc: 0.1437


Epoch 18: 100%|██████████| 32/32 [00:00<00:00, 106.31it/s]


Train Loss: 1.5065, Val Acc: 0.1724


Epoch 19: 100%|██████████| 32/32 [00:00<00:00, 108.47it/s]


Train Loss: 1.4983, Val Acc: 0.1487


Epoch 20: 100%|██████████| 32/32 [00:00<00:00, 107.38it/s]


Train Loss: 1.4870, Val Acc: 0.1576


Epoch 21: 100%|██████████| 32/32 [00:00<00:00, 89.63it/s]


Train Loss: 1.4839, Val Acc: 0.1625


Epoch 22: 100%|██████████| 32/32 [00:00<00:00, 107.74it/s]


Train Loss: 1.4691, Val Acc: 0.1734


Epoch 23: 100%|██████████| 32/32 [00:00<00:00, 102.56it/s]

Train Loss: 1.4577, Val Acc: 0.1615
Ранняя остановка сработала.
Модель сохранена





#  Тестирование и сравнение

In [46]:
import numpy as np
import torch
from sklearn.metrics import classification_report, confusion_matrix
from pathlib import Path

In [47]:
# Загрузка модели и данных
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ToneCNN()
model.load_state_dict(torch.load(MODELS_DIR / "tone_cnn.pth", map_location=device))
model.eval()


ToneCNN(
  (conv1): Conv1d(1, 32, kernel_size=(5,), stride=(1,), padding=(2,))
  (relu): ReLU()
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=768, out_features=128, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=128, out_features=5, bias=True)
)

In [48]:
X = np.load(PROCESSED_DIR / "f0_contours.npy")
y_true = np.load(PROCESSED_DIR / "tones.npy") - 1  # 0-4

In [49]:
# Нормализация
X = (X - X.mean(axis=1, keepdims=True)) / (X.std(axis=1, keepdims=True) + 1e-8)
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)

In [50]:
# Предсказание
with torch.no_grad():
    outputs = model(X_tensor)
    y_pred = outputs.argmax(dim=1).cpu().numpy()

In [52]:
target_names = ["Tone 1", "Tone 2", "Tone 3", "Tone 4", "Tone 5"]
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      Tone 1       0.29      0.39      0.33       539
      Tone 2       0.21      0.50      0.29       583
      Tone 3       0.19      0.45      0.27       396
      Tone 4       0.24      0.45      0.31       881
      Tone 5       0.75      0.10      0.18      2645

    accuracy                           0.27      5044
   macro avg       0.34      0.38      0.28      5044
weighted avg       0.51      0.27      0.24      5044

