In [1]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


# Полная версия

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader
from jiwer import cer

from tqdm import tqdm
import logging


logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%H:%M:%S'
)

DATA_DIR = '/kaggle/input/asr-numbers-recognition-in-russian' 
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_train['transcription'] = df_train['transcription'].astype(str)
df_dev   = pd.read_csv(os.path.join(DATA_DIR, 'dev.csv'))
df_dev ['transcription'] = df_dev ['transcription'].astype(str)

def text_to_indices(text, char2idx):
    return [char2idx[c] for c in text]

def build_vocab(transcripts):
    texts = transcripts.astype(str)
    chars = sorted(set(''.join(texts)))
    char2idx = {c: i+1 for i, c in enumerate(chars)}  # 0 — CTC blank
    idx2char = {i: c for c, i in char2idx.items()}
    return char2idx, idx2char

char2idx, idx2char = build_vocab(df_train['transcription'])
vocab_size = len(char2idx) + 1  # include CTC blank

resampler = torchaudio.transforms.Resample(orig_freq=24000, new_freq=16000)
mel_spec  = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)

class ASRDataset(Dataset):
    def __init__(self, df, data_base, char2idx):
        self.df = df
        self.base = data_base
        self.char2idx = char2idx
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        wav, sr = torchaudio.load(os.path.join(self.base, row['filename']))
        if sr != 16000:
            wav = resampler(wav)
        spec = mel_spec(wav).squeeze(0).transpose(0,1)  # T x n_mels
        target = torch.tensor(text_to_indices(row['transcription'], self.char2idx), dtype=torch.long)
        return spec, target

def collate_fn(batch):
    specs, targets = zip(*batch)
    spec_lens = [s.size(0) for s in specs]
    max_spec  = max(spec_lens)
    padded_specs = torch.zeros(len(specs), max_spec, specs[0].size(1))
    for i, s in enumerate(specs):
        padded_specs[i, :s.size(0)] = s
    tgt_lens = [t.size(0) for t in targets]
    targets_cat = torch.cat(targets)
    return padded_specs.transpose(1,2), torch.tensor(spec_lens), targets_cat, torch.tensor(tgt_lens)

train_ds = ASRDataset(df_train, DATA_DIR, char2idx)
dev_ds   = ASRDataset(df_dev,   DATA_DIR, char2idx)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=collate_fn)
dev_loader   = DataLoader(dev_ds,   batch_size=16, shuffle=False, collate_fn=collate_fn)

class ASRModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2,1)),
            nn.Conv2d(32,64,3, padding=1), nn.ReLU(), nn.MaxPool2d((2,1)),
            nn.Conv2d(64,64,3, padding=1), nn.ReLU(), nn.MaxPool2d((2,1)),
        )
        self.rnn = nn.LSTM(
            input_size=64*16,
            hidden_size=192,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        self.classifier = nn.Linear(192*2, vocab_size)

    def forward(self, x):
        x = x.unsqueeze(1)  # (B,1,n_mels,T)
        x = self.cnn(x)     # (B,64,16,T′)
        B, C, H, W = x.size()
        x = x.permute(0,3,1,2).reshape(B, W, C*H)  # (B,T′,1024)
        x, _ = self.rnn(x)                       # (B,T′,384)
        x = self.classifier(x)                   # (B,T′,vocab)
        return x.log_softmax(2)

model = ASRModel(vocab_size)
num_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {num_params/1e6:.2f}M")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
ctc_loss  = nn.CTCLoss(blank=0, zero_infinity=True)



Total parameters: 2.82M


In [3]:
import random
import copy
import torchaudio
# import torchaudio.sox_effects as sox_fx
import torchaudio.functional as F
from torchaudio.transforms import TimeMasking, FrequencyMasking
from tqdm import tqdm


time_mask = TimeMasking(time_mask_param=50)
freq_mask = FrequencyMasking(freq_mask_param=20)

def speed_perturb(wav, sr):
    # factor = random.choice([0.9, 1.0, 1.1])
    # effects = [['speed', str(factor)], ['rate', str(sr)]]
    # wav, _ = sox_fx.apply_effects_tensor(wav, sr, effects)
    # return wav
    factor = random.choice([0.9, 1.0, 1.1])
    new_sr = int(sr * factor)
    # down/up-sample to simulate speed change
    wav = F.resample(wav, orig_freq=sr, new_freq=new_sr)
    return F.resample(wav, orig_freq=new_sr, new_freq=sr)

# Augmented Dataset
class ASRAugDataset(ASRDataset):
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        wav, sr = torchaudio.load(os.path.join(self.base, row['filename']))
        if sr != 16000:
            wav = resampler(wav)
        # speed perturb
        if random.random() < 0.5:
            wav = speed_perturb(wav, 16000)
        # gaussian noise
        if random.random() < 0.5:
            wav = wav + 0.005 * torch.randn_like(wav)
        # random gain
        if random.random() < 0.5:
            gain = random.uniform(-6, +6)
            wav = torchaudio.functional.gain(wav, gain)
        # mel-spectrogram
        spec = mel_spec(wav).squeeze(0).transpose(0,1)
        # SpecAugment
        spec = time_mask(freq_mask(spec))
        target = torch.tensor(text_to_indices(row['transcription'], self.char2idx), dtype=torch.long)
        return spec, target

# recreate loader
train_aug_ds = ASRAugDataset(df_train, DATA_DIR, char2idx)
train_loader = DataLoader(train_aug_ds, batch_size=16, shuffle=True, collate_fn=collate_fn)

# scheduler and best-tracking
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
best_cer = float('inf')
best_state = None

def greedy_decode(log_probs):
    inds = log_probs.argmax(-1).cpu().numpy()
    seq, prev = [], 0
    for idx in inds:
        if idx and idx != prev:
            seq.append(idx2char[idx])
        prev = idx
    return ''.join(seq)

# run training+eval
num_epochs = 40
for epoch in range(1, num_epochs+1):
    model.train()
    total_loss = 0
    pbar = tqdm(train_loader, desc=f"Train E{epoch}/{num_epochs}", unit='batch')
    for specs, spec_lens, targets, tgt_lens in pbar:
        specs, targets = specs.to(device), targets.to(device)
        spec_lens, tgt_lens = spec_lens.to(device), tgt_lens.to(device)

        optimizer.zero_grad()
        lp = model(specs).permute(1,0,2)
        loss = ctc_loss(lp, targets, spec_lens, tgt_lens)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix(loss=f"{batch_loss:.4f}", lr=f"{lr:.2e}")
    avg_train = total_loss / len(train_loader)
    scheduler.step(avg_train)

    # eval
    model.eval()
    all_refs, all_hyps = [], []
    with torch.no_grad():
        for specs, spec_lens, targets, tgt_lens in tqdm(dev_loader, desc=" Eval", leave=False):
            specs = specs.to(device)
            lp = model(specs)
            flat = targets.tolist()
            lens = tgt_lens.tolist()
            off = 0
            for i, L in enumerate(lens):
                hy = greedy_decode(lp[i])
                ref_idxs = flat[off:off+L]; off += L
                rf = ''.join(idx2char[x] for x in ref_idxs)
                all_hyps.append(hy); all_refs.append(rf)
    cer_score = cer(all_refs, all_hyps)

    print(f"-> Epoch {epoch}: train loss {avg_train:.4f}, dev CER {cer_score:.4f}")
    if cer_score < best_cer:
        best_cer, best_state = cer_score, copy.deepcopy(model.state_dict())
        print(f"   *** New best Dev CER {best_cer:.4f} ***")

# load best model
model.load_state_dict(best_state)
print(f"\nLoaded best model with Dev CER={best_cer:.4f}")

Train E1/40: 100%|██████████| 785/785 [07:12<00:00,  1.82batch/s, loss=2.4713, lr=1.00e-03]
                                                        

-> Epoch 1: train loss 2.8925, dev CER 1.0000
   *** New best Dev CER 1.0000 ***


Train E2/40: 100%|██████████| 785/785 [04:34<00:00,  2.86batch/s, loss=1.2983, lr=1.00e-03]
                                                        

-> Epoch 2: train loss 2.0244, dev CER 0.7803
   *** New best Dev CER 0.7803 ***


Train E3/40: 100%|██████████| 785/785 [04:27<00:00,  2.94batch/s, loss=0.3520, lr=1.00e-03]
                                                        

-> Epoch 3: train loss 0.8445, dev CER 0.3463
   *** New best Dev CER 0.3463 ***


Train E4/40: 100%|██████████| 785/785 [04:26<00:00,  2.94batch/s, loss=0.2249, lr=1.00e-03]
                                                        

-> Epoch 4: train loss 0.4640, dev CER 0.2859
   *** New best Dev CER 0.2859 ***


Train E5/40: 100%|██████████| 785/785 [04:27<00:00,  2.93batch/s, loss=0.1247, lr=1.00e-03]
                                                        

-> Epoch 5: train loss 0.3449, dev CER 0.2686
   *** New best Dev CER 0.2686 ***


Train E6/40: 100%|██████████| 785/785 [04:36<00:00,  2.84batch/s, loss=0.0630, lr=1.00e-03]
                                                        

-> Epoch 6: train loss 0.2737, dev CER 0.2698


Train E7/40: 100%|██████████| 785/785 [04:27<00:00,  2.93batch/s, loss=0.2348, lr=1.00e-03]
                                                        

-> Epoch 7: train loss 0.2082, dev CER 0.2695


Train E8/40: 100%|██████████| 785/785 [04:29<00:00,  2.91batch/s, loss=0.1587, lr=1.00e-03]
                                                        

-> Epoch 8: train loss 0.1617, dev CER 0.2088
   *** New best Dev CER 0.2088 ***


Train E9/40: 100%|██████████| 785/785 [04:28<00:00,  2.93batch/s, loss=0.0715, lr=1.00e-03]
                                                        

-> Epoch 9: train loss 0.1353, dev CER 0.2287


Train E10/40: 100%|██████████| 785/785 [04:28<00:00,  2.92batch/s, loss=0.1110, lr=1.00e-03]
                                                        

-> Epoch 10: train loss 0.1358, dev CER 0.2444


Train E11/40: 100%|██████████| 785/785 [04:29<00:00,  2.91batch/s, loss=0.0647, lr=1.00e-03]
                                                        

-> Epoch 11: train loss 0.1116, dev CER 0.2357


Train E12/40: 100%|██████████| 785/785 [04:28<00:00,  2.92batch/s, loss=0.0790, lr=1.00e-03]
                                                        

-> Epoch 12: train loss 0.1003, dev CER 0.2312


Train E13/40: 100%|██████████| 785/785 [04:29<00:00,  2.91batch/s, loss=0.1682, lr=1.00e-03]
                                                        

-> Epoch 13: train loss 0.1018, dev CER 0.2339


Train E14/40: 100%|██████████| 785/785 [04:26<00:00,  2.95batch/s, loss=0.0660, lr=1.00e-03]
                                                        

-> Epoch 14: train loss 0.0859, dev CER 0.1933
   *** New best Dev CER 0.1933 ***


Train E15/40: 100%|██████████| 785/785 [04:32<00:00,  2.88batch/s, loss=0.0343, lr=1.00e-03]
                                                        

-> Epoch 15: train loss 0.0873, dev CER 0.2223


Train E16/40: 100%|██████████| 785/785 [04:28<00:00,  2.92batch/s, loss=0.0057, lr=1.00e-03]
                                                        

-> Epoch 16: train loss 0.0789, dev CER 0.2117


Train E17/40: 100%|██████████| 785/785 [04:29<00:00,  2.92batch/s, loss=0.0583, lr=1.00e-03]
                                                        

-> Epoch 17: train loss 0.0765, dev CER 0.2226


Train E18/40: 100%|██████████| 785/785 [04:31<00:00,  2.90batch/s, loss=0.0169, lr=1.00e-03]
                                                        

-> Epoch 18: train loss 0.0685, dev CER 0.1996


Train E19/40: 100%|██████████| 785/785 [04:32<00:00,  2.88batch/s, loss=0.1352, lr=1.00e-03]
                                                        

-> Epoch 19: train loss 0.0683, dev CER 0.2189


Train E20/40: 100%|██████████| 785/785 [04:30<00:00,  2.90batch/s, loss=0.0598, lr=1.00e-03]
                                                        

-> Epoch 20: train loss 0.0634, dev CER 0.2010


Train E21/40: 100%|██████████| 785/785 [04:29<00:00,  2.91batch/s, loss=0.0036, lr=1.00e-03]
                                                        

-> Epoch 21: train loss 0.0677, dev CER 0.1941


Train E22/40: 100%|██████████| 785/785 [04:37<00:00,  2.83batch/s, loss=0.0182, lr=1.00e-03]
                                                        

-> Epoch 22: train loss 0.0607, dev CER 0.1837
   *** New best Dev CER 0.1837 ***


Train E23/40: 100%|██████████| 785/785 [04:37<00:00,  2.82batch/s, loss=0.0141, lr=1.00e-03]
                                                        

-> Epoch 23: train loss 0.0558, dev CER 0.2082


Train E24/40: 100%|██████████| 785/785 [04:31<00:00,  2.89batch/s, loss=0.0065, lr=1.00e-03]
                                                        

-> Epoch 24: train loss 0.0550, dev CER 0.2129


Train E25/40: 100%|██████████| 785/785 [04:33<00:00,  2.87batch/s, loss=0.0016, lr=1.00e-03]
                                                        

-> Epoch 25: train loss 0.0577, dev CER 0.2042


Train E26/40: 100%|██████████| 785/785 [04:29<00:00,  2.91batch/s, loss=0.0032, lr=1.00e-03]
                                                        

-> Epoch 26: train loss 0.0648, dev CER 0.1868


Train E27/40: 100%|██████████| 785/785 [04:30<00:00,  2.91batch/s, loss=0.0029, lr=1.00e-03]
                                                        

-> Epoch 27: train loss 0.0478, dev CER 0.1846


Train E28/40: 100%|██████████| 785/785 [04:32<00:00,  2.88batch/s, loss=0.0123, lr=1.00e-03]
                                                        

-> Epoch 28: train loss 0.0589, dev CER 0.2036


Train E29/40: 100%|██████████| 785/785 [04:30<00:00,  2.90batch/s, loss=0.0024, lr=1.00e-03]
                                                        

-> Epoch 29: train loss 0.0483, dev CER 0.2028


Train E30/40: 100%|██████████| 785/785 [04:29<00:00,  2.91batch/s, loss=0.0655, lr=1.00e-03]
                                                        

-> Epoch 30: train loss 0.0456, dev CER 0.1891


Train E31/40: 100%|██████████| 785/785 [04:30<00:00,  2.90batch/s, loss=0.0369, lr=1.00e-03]
                                                        

-> Epoch 31: train loss 0.0499, dev CER 0.1877


Train E32/40: 100%|██████████| 785/785 [04:30<00:00,  2.90batch/s, loss=0.0100, lr=1.00e-03]
                                                        

-> Epoch 32: train loss 0.0486, dev CER 0.1777
   *** New best Dev CER 0.1777 ***


Train E33/40: 100%|██████████| 785/785 [04:32<00:00,  2.88batch/s, loss=0.0076, lr=1.00e-03]
                                                        

-> Epoch 33: train loss 0.0532, dev CER 0.2034


Train E34/40: 100%|██████████| 785/785 [04:32<00:00,  2.89batch/s, loss=0.0933, lr=5.00e-04]
                                                        

-> Epoch 34: train loss 0.0375, dev CER 0.2119


Train E35/40: 100%|██████████| 785/785 [04:30<00:00,  2.90batch/s, loss=0.0011, lr=5.00e-04]
                                                        

-> Epoch 35: train loss 0.0302, dev CER 0.1822


Train E36/40: 100%|██████████| 785/785 [04:30<00:00,  2.90batch/s, loss=0.0171, lr=5.00e-04]
                                                        

-> Epoch 36: train loss 0.0335, dev CER 0.1970


Train E37/40: 100%|██████████| 785/785 [04:33<00:00,  2.87batch/s, loss=0.0359, lr=5.00e-04]
                                                        

-> Epoch 37: train loss 0.0313, dev CER 0.1813


Train E38/40: 100%|██████████| 785/785 [04:32<00:00,  2.89batch/s, loss=0.0470, lr=5.00e-04]
                                                        

-> Epoch 38: train loss 0.0287, dev CER 0.1838


Train E39/40: 100%|██████████| 785/785 [04:32<00:00,  2.89batch/s, loss=0.0063, lr=5.00e-04]
                                                        

-> Epoch 39: train loss 0.0254, dev CER 0.1827


Train E40/40: 100%|██████████| 785/785 [04:31<00:00,  2.89batch/s, loss=0.0043, lr=5.00e-04]
                                                        

-> Epoch 40: train loss 0.0274, dev CER 0.1909

Loaded best model with Dev CER=0.1777




In [4]:
torch.save(model.state_dict(), 'model_best.pth')

In [5]:
torch.save(best_state, 'best_model.pth')

model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)

  model.load_state_dict(torch.load('best_model.pth', map_location=device))


ASRModel(
  (cnn): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(1024, 192, num_layers=2, batch_first=True, bidirectional=True)
  (classifier): Linear(in_features=384, out_features=11, bias=True)
)

In [6]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)  # мелкий LR
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

num_more_epochs = 5
for epoch in range(1, num_more_epochs+1):
    model.train()
    total_loss = 0
    for specs, spec_lens, targets, tgt_lens in tqdm(train_loader, desc=f"Fine-tune E{epoch}/{num_more_epochs}"):
        specs, targets = specs.to(device), targets.to(device)
        spec_lens, tgt_lens = spec_lens.to(device), tgt_lens.to(device)

        optimizer.zero_grad()
        lp = model(specs).permute(1,0,2)
        loss = ctc_loss(lp, targets, spec_lens, tgt_lens)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg = total_loss / len(train_loader)
    scheduler.step(avg)

    model.eval()
    all_refs, all_hyps = [], []
    with torch.no_grad():
        for specs, spec_lens, targets, tgt_lens in dev_loader:
            specs = specs.to(device)
            lp = model(specs)
            off = 0
            flat = targets.tolist()
            for i,L in enumerate(tgt_lens.tolist()):
                hy = greedy_decode(lp[i])
                rf = ''.join(idx2char[x] for x in flat[off:off+L])
                off += L
                all_hyps.append(hy); all_refs.append(rf)
    cer_score = cer(all_refs, all_hyps)
    print(f"Fine-tune Epoch {epoch}: train loss {avg:.4f}, dev CER {cer_score:.4f}")

Fine-tune E1/5: 100%|██████████| 785/785 [04:31<00:00,  2.89it/s]


Fine-tune Epoch 1: train loss 0.0393, dev CER 0.1775


Fine-tune E2/5: 100%|██████████| 785/785 [04:30<00:00,  2.91it/s]


Fine-tune Epoch 2: train loss 0.0394, dev CER 0.1772


Fine-tune E3/5: 100%|██████████| 785/785 [04:30<00:00,  2.90it/s]


Fine-tune Epoch 3: train loss 0.0348, dev CER 0.1765


Fine-tune E4/5: 100%|██████████| 785/785 [04:27<00:00,  2.94it/s]


Fine-tune Epoch 4: train loss 0.0351, dev CER 0.1762


Fine-tune E5/5: 100%|██████████| 785/785 [04:28<00:00,  2.93it/s]


Fine-tune Epoch 5: train loss 0.0324, dev CER 0.1770


In [7]:
import os
import pandas as pd
import torch
import torchaudio
from tqdm import tqdm


BASE     = '/kaggle/input/asr-numbers-recognition-in-russian'
TEST_CSV = os.path.join(BASE, 'test.csv')
OUT_CSV  = 'predictions.csv'

df = pd.read_csv(TEST_CSV) 

model.eval()
preds = []
for fn in tqdm(df['filename'], desc='Inference'):
    wav, sr = torchaudio.load(os.path.join(BASE, fn))
    if sr != 16000:
        wav = resampler(wav)
    spec = mel_spec(wav).squeeze(0).unsqueeze(0).to(device)
    with torch.no_grad():
        logp = model(spec)
    preds.append(greedy_decode(logp[0]))

out = pd.DataFrame({
    'filename': df['filename'],
    'transcription': preds
})

out['transcription'] = out['transcription'].fillna(0)

n_null = out['transcription'].isna().sum()
print(f"Nulls in transcription: {n_null}")

out['transcription'] = pd.to_numeric(out['transcription'], errors='coerce').fillna(0).astype(int)
out['transcription'] = out['transcription'].apply(lambda x: int(x) if x >= 0 else 0)

out.to_csv(OUT_CSV, index=False)
print(f"Saved → {OUT_CSV}")

Inference: 100%|██████████| 2582/2582 [01:47<00:00, 24.07it/s]


Nulls in transcription: 0
Saved → predictions.csv


In [8]:
torch.save(model.state_dict(), 'model_best_2.pth')