In [1]:
import json, random
from pathlib import Path

ROOT = Path(".")
DATA_DIR = ROOT / ".."
TRAIN_DIR = DATA_DIR / "train_opus"
TEST_DIR  = DATA_DIR / "test_opus"
train_audio_dir = TRAIN_DIR / "audio"
test_audio_dir = TEST_DIR / "audio"
word_bounds_path = TRAIN_DIR / "word_bounds.json"
print("Train audio:", train_audio_dir, "exists:", train_audio_dir.exists())
print("Test audio:", test_audio_dir, "exists:", test_audio_dir.exists())
print("word_bounds.json:", word_bounds_path, "exists:", word_bounds_path.exists())

Train audio: ../train_opus/audio exists: True
Test audio: ../test_opus/audio exists: True
word_bounds.json: ../train_opus/word_bounds.json exists: True


In [2]:
with open(word_bounds_path, "r", encoding="utf-8") as f:
    bounds = json.load(f)

train_files = sorted(train_audio_dir.glob("*.opus"))
test_files  = sorted(test_audio_dir.glob("*.opus"))

train_ids = [p.stem for p in train_files]
test_ids  = [p.stem for p in test_files]

labels = {tid: (1 if tid in bounds else 0) for tid in train_ids}

num_pos = sum(labels.values())
num_neg = len(labels) - num_pos
print("Train size:", len(train_ids), "pos:", num_pos, "neg:", num_neg)
print("Test size:", len(test_ids))


Train size: 90000 pos: 45000 neg: 45000
Test size: 27000


In [3]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SR = 16000
WIN_SEC = 2.5
WIN_SAMPLES = int(SR * WIN_SEC)

N_MELS = 64
N_FFT = 400
HOP = 160

mel_tf = torchaudio.transforms.MelSpectrogram(
    sample_rate=SR, n_fft=N_FFT, hop_length=HOP, n_mels=N_MELS, power=2.0
)
db_tf = torchaudio.transforms.AmplitudeToDB(stype="power")

def load_audio(path: Path, target_sr=SR):
    wav, sr = torchaudio.load(str(path))
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)
    return wav.squeeze(0)  # [T]

def pad_or_crop(wav: torch.Tensor, length: int, start: int = None):
    T = wav.numel()
    if T >= length:
        if start is None:
            start = random.randint(0, T - length)
        return wav[start:start+length]
    # pad
    pad_len = length - T
    return torch.nn.functional.pad(wav, (0, pad_len))

def wav_to_logmel(wav_1d: torch.Tensor):
    # wav_1d: [T]
    x = wav_1d.unsqueeze(0)  # [1, T]
    m = mel_tf(x)            # [1, n_mels, frames]
    m = db_tf(m)
    m = (m - m.mean()) / (m.std() + 1e-5)
    return m  # [1, n_mels, frames]

class PhraseDataset(Dataset):
    def __init__(self, audio_dir: Path, ids, bounds_dict, labels_dict,
                 win_samples=WIN_SAMPLES, sr=SR, train_mode=True):
        self.audio_dir = audio_dir
        self.ids = ids
        self.bounds = bounds_dict
        self.labels = labels_dict
        self.win = win_samples
        self.sr = sr
        self.train_mode = train_mode

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        tid = self.ids[idx]
        path = self.audio_dir / f"{tid}.opus"
        y = self.labels[tid]
        wav = load_audio(path, self.sr)
        T = wav.numel()

        if self.train_mode:
            if y == 1:
                st, en = self.bounds[tid]
                st_s = int(max(0, st * self.sr))
                en_s = int(min(T, en * self.sr))
                center = (st_s + en_s) // 2

                jitter = int(0.2 * self.win)
                start = center - self.win // 2 + random.randint(-jitter, jitter)
                start = max(0, min(start, max(0, T - self.win)))
                chunk = pad_or_crop(wav, self.win, start=start)
            else:
                chunk = pad_or_crop(wav, self.win, start=None)
        else:
            start = 0 if T <= self.win else (T - self.win)//2
            chunk = pad_or_crop(wav, self.win, start=start)

        feat = wav_to_logmel(chunk)
        return feat, torch.tensor(y, dtype=torch.float32), tid


In [4]:
all_ids = train_ids
y_all = [labels[i] for i in all_ids]

train_ids_split, val_ids_split = train_test_split(
    all_ids, test_size=0.15, random_state=SEED, stratify=y_all
)

train_ds = PhraseDataset(train_audio_dir, train_ids_split, bounds, labels, train_mode=True)
val_ds   = PhraseDataset(train_audio_dir, val_ids_split, bounds, labels, train_mode=False)

BATCH = 64
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH, shuffle=False)

print("train:", len(train_ds), "val:", len(val_ds))


train: 76500 val: 13500


In [5]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        )
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.head = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x: [B, 1, n_mels, frames]
        x = self.conv(x)
        x = self.pool(x).squeeze(-1).squeeze(-1)
        logits = self.head(x).squeeze(-1)
        return logits

model = SimpleCNN().to(DEVICE)
print(model)


SimpleCNN(
  (conv): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
  )
  (pool): AdaptiveAvgPool2d(output_size=(1, 1))
  (head): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [6]:
print("Parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

Parameters: 27745


In [7]:
def compute_score(y_true, y_pred_bin):
    y_true = torch.tensor(y_true).int()
    y_pred = torch.tensor(y_pred_bin).int()

    pos = (y_true == 1)
    neg = (y_true == 0)

    NUM_POS = pos.sum().item()
    NUM_NEG = neg.sum().item()

    FN = ((y_true == 1) & (y_pred == 0)).sum().item()
    FP = ((y_true == 0) & (y_pred == 1)).sum().item()

    FRR = FN / NUM_POS if NUM_POS > 0 else 0.0
    FAR = FP / NUM_NEG if NUM_NEG > 0 else 0.0

    a = 1.0 - FRR
    b = 1.0 - FAR
    hm = 0.0 if (a + b) == 0 else 2 * a * b / (a + b)
    return hm, FRR, FAR, FN, FP, NUM_POS, NUM_NEG


In [8]:
@torch.no_grad()
def eval_model(model, loader):
    model.eval()
    probs = []
    ys = []
    for x, y, _ in loader:
        x = x.to(DEVICE, non_blocking=True)
        logits = model(x)
        p = torch.sigmoid(logits).cpu()
        probs.extend(p.tolist())
        ys.extend(y.tolist())
    return ys, probs

def find_best_threshold(y_true, probs):
    best = (-1, 0.5, None)
    for t in [i/100 for i in range(5, 96)]:
        pred = [1 if p >= t else 0 for p in probs]
        score, FRR, FAR, *_ = compute_score(y_true, pred)
        if score > best[0]:
            best = (score, t, (FRR, FAR))
    return best  # (score, thr, (FRR,FAR))

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

EPOCHS = 15

best_val = -1
best_state = None
best_thr = 0.5

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    for x, y, _ in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        x = x.to(DEVICE, non_blocking=True)
        y = y.to(DEVICE, non_blocking=True)

        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * x.size(0)

    avg_loss = total_loss / len(train_loader.dataset)

    y_true, probs = eval_model(model, val_loader)
    val_score, thr, (FRR, FAR) = find_best_threshold(y_true, probs)

    print(f"Epoch {epoch}: train_loss={avg_loss:.4f} | val_score={val_score:.4f} thr={thr:.2f} FRR={FRR:.3f} FAR={FAR:.3f}")

    if val_score > best_val:
        best_val = val_score
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        best_thr = thr

print("Best val score:", best_val, "best_thr:", best_thr)
model.load_state_dict(best_state)
model.to(DEVICE)

Epoch 1/15: 100%|██████████| 1196/1196 [28:02<00:00,  1.41s/it]


Epoch 1: train_loss=0.6209 | val_score=0.6864 thr=0.61 FRR=0.310 FAR=0.317


Epoch 2/15: 100%|██████████| 1196/1196 [29:42<00:00,  1.49s/it]


Epoch 2: train_loss=0.5604 | val_score=0.6933 thr=0.22 FRR=0.323 FAR=0.289


Epoch 3/15: 100%|██████████| 1196/1196 [29:05<00:00,  1.46s/it]


Epoch 3: train_loss=0.5268 | val_score=0.7206 thr=0.89 FRR=0.283 FAR=0.276


Epoch 4/15: 100%|██████████| 1196/1196 [28:57<00:00,  1.45s/it]


Epoch 4: train_loss=0.5037 | val_score=0.7262 thr=0.77 FRR=0.267 FAR=0.280


Epoch 5/15: 100%|██████████| 1196/1196 [27:14<00:00,  1.37s/it]


Epoch 5: train_loss=0.4873 | val_score=0.7531 thr=0.23 FRR=0.229 FAR=0.264


Epoch 6/15: 100%|██████████| 1196/1196 [26:14<00:00,  1.32s/it]


Epoch 6: train_loss=0.4757 | val_score=0.7487 thr=0.05 FRR=0.281 FAR=0.219


Epoch 7/15: 100%|██████████| 1196/1196 [26:42<00:00,  1.34s/it]


Epoch 7: train_loss=0.4642 | val_score=0.7573 thr=0.37 FRR=0.221 FAR=0.263


Epoch 8/15: 100%|██████████| 1196/1196 [26:52<00:00,  1.35s/it]


Epoch 8: train_loss=0.4568 | val_score=0.7663 thr=0.90 FRR=0.199 FAR=0.265


Epoch 9/15: 100%|██████████| 1196/1196 [27:06<00:00,  1.36s/it]


Epoch 9: train_loss=0.4497 | val_score=0.7561 thr=0.82 FRR=0.225 FAR=0.262


Epoch 10/15: 100%|██████████| 1196/1196 [27:06<00:00,  1.36s/it]


Epoch 10: train_loss=0.4425 | val_score=0.7755 thr=0.28 FRR=0.201 FAR=0.247


Epoch 11/15: 100%|██████████| 1196/1196 [27:30<00:00,  1.38s/it]


Epoch 11: train_loss=0.4389 | val_score=0.7786 thr=0.35 FRR=0.188 FAR=0.252


Epoch 12/15: 100%|██████████| 1196/1196 [27:45<00:00,  1.39s/it]


Epoch 12: train_loss=0.4336 | val_score=0.7843 thr=0.55 FRR=0.185 FAR=0.244


Epoch 13/15: 100%|██████████| 1196/1196 [27:51<00:00,  1.40s/it]


Epoch 13: train_loss=0.4296 | val_score=0.7843 thr=0.24 FRR=0.201 FAR=0.230


Epoch 14/15: 100%|██████████| 1196/1196 [27:45<00:00,  1.39s/it]


Epoch 14: train_loss=0.4261 | val_score=0.7805 thr=0.83 FRR=0.201 FAR=0.237


Epoch 15/15: 100%|██████████| 1196/1196 [27:47<00:00,  1.39s/it]


Epoch 15: train_loss=0.4205 | val_score=0.7841 thr=0.63 FRR=0.188 FAR=0.242
Best val score: 0.7843347879601656 best_thr: 0.24


SimpleCNN(
  (conv): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
  )
  (pool): AdaptiveAvgPool2d(output_size=(1, 1))
  (head): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [9]:
@torch.no_grad()
def predict_file_prob(model, path: Path, win_samples=WIN_SAMPLES, hop_ratio=0.5, batch_windows=64):
    model.eval()
    wav = load_audio(path, SR)
    T = wav.numel()
    if T <= win_samples:
        feat = wav_to_logmel(pad_or_crop(wav, win_samples, start=0)).unsqueeze(0).to(DEVICE)
        p = torch.sigmoid(model(feat)).item()
        return p

    hop = int(win_samples * hop_ratio)
    starts = list(range(0, max(1, T - win_samples + 1), hop))
    if starts[-1] != T - win_samples:
        starts.append(T - win_samples)

    probs = []
    batch = []
    for s in starts:
        chunk = wav[s:s+win_samples]
        feat = wav_to_logmel(chunk)
        batch.append(feat)
        if len(batch) == batch_windows:
            x = torch.stack(batch, dim=0).to(DEVICE)
            p = torch.sigmoid(model(x)).detach().cpu()
            probs.extend(p.tolist())
            batch = []
    if batch:
        x = torch.stack(batch, dim=0).to(DEVICE)
        p = torch.sigmoid(model(x)).detach().cpu()
        probs.extend(p.tolist())

    return float(max(probs))


test_probs = []
for p in tqdm(test_files, desc="Predict test"):
    test_probs.append(predict_file_prob(model, p))

test_pred = [1 if pr >= best_thr else 0 for pr in test_probs]
sum(test_pred), len(test_pred)


Predict test: 100%|██████████| 27000/27000 [11:07<00:00, 40.43it/s]


(19666, 27000)

In [10]:
import pandas as pd

sub = pd.DataFrame({"id": test_ids, "label": test_pred})
sub_path = "submition.csv"
sub.to_csv(sub_path, index=False)
