# Попытаемся улучшить CER с 16.624

In [None]:
!pip install jiwer

!pip install pyctcdecode
# !pip install https://github.com/kpu/kenlm/archive/master.zip

In [None]:
import os, random, copy
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.functional as F
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import Resample, MelSpectrogram, FrequencyMasking, TimeMasking
from jiwer import cer
from tqdm import tqdm
from torch.optim.lr_scheduler import OneCycleLR
from pyctcdecode import build_ctcdecoder


# Input data, the checkpoint this stage starts from, and where the result goes.
DATA_DIR = '/kaggle/input/asr-numbers-recognition-in-russian'
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
DEV_CSV = os.path.join(DATA_DIR, 'dev.csv')
CHECKPOINT = '/kaggle/input/16_624model/pytorch/default/1/model_best_2.pth'
SAVE_PATH = 'model_finetuned.pth'
LM_PATH = None  # passed as the second argument to build_ctcdecoder below


# Load both manifests and force the transcription column to str.
df_tr = pd.read_csv(TRAIN_CSV)
df_tr['transcription'] = df_tr['transcription'].astype(str)
df_dev = pd.read_csv(DEV_CSV)
df_dev['transcription'] = df_dev['transcription'].astype(str)

def build_vocab(texts):
    """Build character <-> index lookup tables from an iterable of strings.

    Characters are numbered from 1 in sorted order; index 0 is never
    assigned so that it stays free for the CTC blank.

    Returns:
        (c2i, i2c): char -> index and index -> char dictionaries.
    """
    alphabet = sorted(set(''.join(texts)))
    c2i = dict(zip(alphabet, range(1, len(alphabet) + 1)))
    i2c = dict(enumerate(alphabet, start=1))
    return c2i, i2c

def text_to_indices(t, c2i):
    """Encode the string ``t`` as a list of vocabulary indices.

    Characters that are missing from ``c2i`` are skipped. They used to be
    encoded as 0, but 0 is the CTC blank: a blank inside a target sequence is
    not a valid CTC label and has no entry in the index -> char table, so
    rebuilding the reference text from such a target raised ``KeyError``.

    Args:
        t: text to encode.
        c2i: char -> index mapping (indices start at 1).

    Returns:
        List of indices, one per known character of ``t``, in order.
    """
    return [c2i[c] for c in t if c in c2i]

# The vocabulary is built from the training transcriptions only; the extra
# slot in vocab_size accounts for index 0 (blank).
char2idx, idx2char = build_vocab(df_tr['transcription'])
vocab_size = 1 + len(char2idx)

# Feature-extraction transforms shared by every dataset instance.
resampler = Resample(orig_freq=24000, new_freq=16000)
mel_spec = MelSpectrogram(sample_rate=16000, n_mels=128)
spec_augment = nn.Sequential(
    FrequencyMasking(freq_mask_param=15),
    TimeMasking(time_mask_param=35),
)

def speed_perturb(wav, sr):
    """Resample ``wav`` to a randomly scaled sample rate.

    A factor is drawn from {0.9, 1.0, 1.1} and the waveform is resampled
    from ``sr`` to ``int(sr * factor)``.

    Returns:
        (resampled waveform, int(sr * factor)).

    NOTE(review): the returned rate is the *new* rate, so the audio only
    changes tempo if the caller later treats it as having some other rate
    -- confirm against the caller.
    """
    factor = random.choice([0.9, 1.0, 1.1])
    target_sr = int(sr * factor)
    return F.resample(wav, orig_freq=sr, new_freq=target_sr), target_sr

class ASRDataset(Dataset):
    """Dataset yielding ``(spectrogram, target)`` pairs for CTC training.

    Each item is a mel spectrogram of shape (frames, n_mels) and a 1-D long
    tensor of character indices for the row's transcription.

    Args:
        df: table with ``filename`` and ``transcription`` columns.
        base: directory that ``filename`` is relative to.
        c2i: char -> index mapping used to encode transcriptions.
        augment: when true, apply speed perturbation to the waveform and
            SpecAugment masking to the spectrogram.
    """

    def __init__(self, df, base, c2i, augment=False):
        self.df, self.base, self.c2i = df, base, c2i
        self.augment = augment

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        wav, sr = torchaudio.load(os.path.join(self.base, row.filename))
        # Down-mix multi-channel audio; the squeeze(0) below only removes a
        # channel axis of size 1.
        if wav.size(0) > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if self.augment:
            wav, sr = speed_perturb(wav, sr)
        # NOTE(review): `resampler` is a module-level transform with a fixed
        # source rate and is applied to anything that is not 16 kHz --
        # confirm that every file shares that source rate.
        if sr != 16000:
            wav = resampler(wav)
        spec = mel_spec(wav).squeeze(0)  # (n_mels, frames)
        if self.augment:
            # The torchaudio masking transforms expect (..., freq, time), so
            # masking must happen before the transpose. Masking the transposed
            # tensor swapped the axes: the "frequency" mask blanked time
            # frames and the "time" mask blanked mel bins.
            spec = spec_augment(spec)
        spec = spec.transpose(0, 1)  # (frames, n_mels)
        tgt = torch.tensor(text_to_indices(row.transcription, self.c2i), dtype=torch.long)
        return spec, tgt

def collate_fn(batch):
    """Collate ``(spec, target)`` pairs into a zero-padded batch.

    Args:
        batch: sequence of (spec, target) where spec is (frames, n_mels)
            and target is a 1-D index tensor.

    Returns:
        Tuple of
        - spectrograms padded with zeros to the longest item, shaped
          (batch, n_mels, max_frames),
        - number of frames of each item before padding,
        - all targets concatenated into one 1-D tensor,
        - length of each target.
    """
    specs, tgts = zip(*batch)
    n_frames = [s.size(0) for s in specs]
    padded = torch.zeros(len(specs), max(n_frames), specs[0].size(1))
    for k, (s, n) in enumerate(zip(specs, n_frames)):
        padded[k, :n] = s
    return (
        padded.transpose(1, 2),
        torch.tensor(n_frames),
        torch.cat(tgts),
        torch.tensor([t.size(0) for t in tgts]),
    )

# Training data is augmented and shuffled; dev data is read as-is, in order.
_train_set = ASRDataset(df_tr, DATA_DIR, char2idx, augment=True)
_dev_set = ASRDataset(df_dev, DATA_DIR, char2idx, augment=False)
tr_loader = DataLoader(_train_set, batch_size=16, shuffle=True, collate_fn=collate_fn)
dv_loader = DataLoader(_dev_set, batch_size=16, shuffle=False, collate_fn=collate_fn)

class ASRModel(nn.Module):
    """CNN + bidirectional LSTM producing per-frame log-probabilities.

    Args:
        V: number of output classes.

    ``forward`` takes a (B, F, T) tensor and returns (B, T, V)
    log-probabilities. Each of the three pooling layers halves only the F
    axis, and the LSTM input size is fixed at 64 * 16, so F must be 128.
    """

    def __init__(self, V):
        super().__init__()
        stages = []
        for c_in, c_out in ((1, 32), (32, 64), (64, 64)):
            stages += [
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d((2, 1)),
            ]
        self.cnn = nn.Sequential(*stages)
        self.rnn = nn.LSTM(
            64 * 16, 192, num_layers=2, batch_first=True, bidirectional=True
        )
        self.classifier = nn.Linear(192 * 2, V)

    def forward(self, x):
        feats = self.cnn(x.unsqueeze(1))  # (B, 1, F, T) -> (B, C, H, W)
        B, C, H, W = feats.size()
        # Make W the sequence axis and flatten channels x height into features.
        seq = feats.permute(0, 3, 1, 2).reshape(B, W, C * H)
        hidden, _ = self.rnn(seq)
        return self.classifier(hidden).log_softmax(2)

# Decoder alphabet in index order; the empty string occupies slot 0 (blank).
labels = [""]
labels.extend(idx2char[k] for k in range(1, vocab_size))
decoder = build_ctcdecoder(labels, LM_PATH)

def beam_decode(logp):
    """Return the text of the first beam for one utterance.

    ``logp`` is exponentiated, moved to CPU and converted to a NumPy array
    before being handed to the module-level ``decoder`` (beam width 50).
    """
    posterior = logp.exp().cpu().numpy()
    beams = decoder.decode_beams(posterior, beam_width=50)
    first_beam = beams[0]
    return first_beam[0]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ASRModel(vocab_size).to(device)
# The checkpoint is a plain state_dict, so restrict unpickling to tensors.
model.load_state_dict(torch.load(CHECKPOINT, map_location=device, weights_only=True))


def _dev_cer(net, loader):
    """Return the character error rate of ``net`` over ``loader``.

    Puts ``net`` in eval mode, beam-decodes every utterance and compares it
    with the reference rebuilt from the target indices.
    """
    net.eval()
    refs, hyps = [], []
    with torch.no_grad():
        for specs, lens, tgts, t_lens in loader:
            out = net(specs.to(device))
            off = 0
            for i, (n_frames, L) in enumerate(zip(lens.tolist(), t_lens.tolist())):
                # Decode only the real frames: outputs past `n_frames` come
                # from zero padding, are excluded from the CTC loss through
                # `lens`, and can add spurious characters to the hypothesis.
                hyps.append(beam_decode(out[i, :n_frames]))
                refs.append("".join(idx2char[x] for x in tgts[off:off + L].tolist()))
                off += L
    return cer(refs, hyps)


print("Baseline CER =", _dev_cer(model, dv_loader))

# Freeze the CNN front-end (optional); only the LSTM and classifier are updated.
for p in model.cnn.parameters():
    p.requires_grad = False

EPOCH = 20

opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                 lr=1e-5)
# One scheduler step per optimizer step, hence epochs * batches total steps.
sched = OneCycleLR(opt, max_lr=5e-5,
                   total_steps=EPOCH * len(tr_loader))
ctc = nn.CTCLoss(blank=0, zero_infinity=True)

best_cer, best_state = float('inf'), None
for ep in range(1, EPOCH + 1):
    model.train()
    for specs, lens, tgts, t_lens in tqdm(tr_loader, desc=f"Train {ep}"):
        specs, tgts = specs.to(device), tgts.to(device)
        opt.zero_grad()
        logit = model(specs)
        # CTCLoss expects (T, B, V); `lens` are the unpadded frame counts.
        loss = ctc(logit.permute(1, 0, 2), tgts, lens, t_lens)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        opt.step()
        sched.step()

    sc = _dev_cer(model, dv_loader)
    print(f"Epoch {ep} → CER={sc:.4f}")
    if sc < best_cer:
        # Snapshot the weights; state_dict() alone returns live references.
        best_cer, best_state = sc, copy.deepcopy(model.state_dict())

# best_state stays None when no epoch ran; keep the loaded weights then.
if best_state is not None:
    model.load_state_dict(best_state)
torch.save(model.state_dict(), SAVE_PATH)
print(f"Done. Best CER={best_cer:.4f}. Saved to {SAVE_PATH}")

In [None]:
!pip install transformers

import os, random, copy
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.functional as F
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import Resample, MelSpectrogram, FrequencyMasking, TimeMasking
from jiwer import cer
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
from pyctcdecode import build_ctcdecoder

# Second fine-tuning stage: continue from the checkpoint written by the
# previous stage and save under a new name.
DATA_DIR = '/kaggle/input/asr-numbers-recognition-in-russian'
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
DEV_CSV = os.path.join(DATA_DIR, 'dev.csv')
# Alternative starting point that was used before:
# '/kaggle/input/16_624model/pytorch/default/1/model_best_2.pth'
BEST_CKPT = 'model_finetuned.pth'
SAVE_PATH = 'model_finetuned_30ep_.pth'
LM_PATH = None

# Load both manifests and force the transcription column to str.
df_tr = pd.read_csv(TRAIN_CSV)
df_tr['transcription'] = df_tr['transcription'].astype(str)
df_dev = pd.read_csv(DEV_CSV)
df_dev['transcription'] = df_dev['transcription'].astype(str)

def build_vocab(texts):
    """Return (char -> index, index -> char) tables for the given strings.

    Indices follow sorted character order and start at 1, leaving 0 free
    for the CTC blank.
    """
    symbols = sorted(set(''.join(texts)))
    c2i, i2c = {}, {}
    for position, symbol in enumerate(symbols, start=1):
        c2i[symbol] = position
        i2c[position] = symbol
    return c2i, i2c

def text_to_indices(t, c2i):
    """Encode the string ``t`` as a list of vocabulary indices.

    Characters that are missing from ``c2i`` are skipped. They used to be
    encoded as 0, but 0 is the CTC blank: a blank inside a target sequence is
    not a valid CTC label and has no entry in the index -> char table, so
    rebuilding the reference text from such a target raised ``KeyError``.

    Args:
        t: text to encode.
        c2i: char -> index mapping (indices start at 1).

    Returns:
        List of indices, one per known character of ``t``, in order.
    """
    return [c2i[c] for c in t if c in c2i]

# The vocabulary is built from the training transcriptions only; the extra
# slot in vocab_size accounts for index 0 (blank).
char2idx, idx2char = build_vocab(df_tr['transcription'])
vocab_size = 1 + len(char2idx)

# Feature-extraction transforms; this stage uses narrower masks (10 / 20).
resampler = Resample(orig_freq=24000, new_freq=16000)
mel_spec = MelSpectrogram(sample_rate=16000, n_mels=128)
spec_augment = nn.Sequential(
    FrequencyMasking(freq_mask_param=10),
    TimeMasking(time_mask_param=20),
)
def speed_perturb(wav, sr):
    """Resample ``wav`` from ``sr`` to ``int(sr * r)``, r drawn from {0.9, 1.0, 1.1}.

    Returns:
        (resampled waveform, the new integer sample rate).

    NOTE(review): whether this changes tempo depends on how the caller
    interprets the returned rate -- confirm against the caller.
    """
    new_sr = int(sr * random.choice([0.9, 1.0, 1.1]))
    stretched = F.resample(wav, orig_freq=sr, new_freq=new_sr)
    return stretched, new_sr

class ASRDataset(Dataset):
    """Dataset yielding ``(spectrogram, target)`` pairs for CTC training.

    Each item is a mel spectrogram of shape (frames, n_mels) and a 1-D long
    tensor of character indices for the row's transcription.

    Args:
        df: table with ``filename`` and ``transcription`` columns.
        base: directory that ``filename`` is relative to.
        c2i: char -> index mapping used to encode transcriptions.
        augment: when true, apply speed perturbation to the waveform and
            SpecAugment masking to the spectrogram.
    """

    def __init__(self, df, base, c2i, augment=False):
        self.df, self.base, self.c2i = df, base, c2i
        self.augment = augment

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        wav, sr = torchaudio.load(os.path.join(self.base, row.filename))
        # Down-mix multi-channel audio; the squeeze(0) below only removes a
        # channel axis of size 1.
        if wav.size(0) > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if self.augment:
            wav, sr = speed_perturb(wav, sr)
        # NOTE(review): `resampler` is a module-level transform with a fixed
        # source rate and is applied to anything that is not 16 kHz --
        # confirm that every file shares that source rate.
        if sr != 16000:
            wav = resampler(wav)
        spec = mel_spec(wav).squeeze(0)  # (n_mels, frames)
        if self.augment:
            # The torchaudio masking transforms expect (..., freq, time), so
            # masking must happen before the transpose. Masking the transposed
            # tensor swapped the axes: the "frequency" mask blanked time
            # frames and the "time" mask blanked mel bins.
            spec = spec_augment(spec)
        spec = spec.transpose(0, 1)  # (frames, n_mels)
        tgt = torch.tensor(text_to_indices(row.transcription, self.c2i), dtype=torch.long)
        return spec, tgt

def collate_fn(batch):
    """Zero-pad a list of ``(spec, target)`` pairs into batch tensors.

    Returns:
        (padded specs as (batch, n_mels, max_frames), frame counts,
         concatenated targets, target lengths).
    """
    specs, tgts = zip(*batch)
    frame_counts = [spec.size(0) for spec in specs]
    n_mels = specs[0].size(1)
    canvas = torch.zeros(len(specs), max(frame_counts), n_mels)
    for slot, spec in zip(range(len(specs)), specs):
        canvas[slot, :spec.size(0)] = spec
    target_sizes = torch.tensor([tgt.size(0) for tgt in tgts])
    flat_targets = torch.cat(tgts)
    return canvas.transpose(1, 2), torch.tensor(frame_counts), flat_targets, target_sizes

# Training data is augmented and shuffled; dev data is read as-is, in order.
_train_set = ASRDataset(df_tr, DATA_DIR, char2idx, augment=True)
_dev_set = ASRDataset(df_dev, DATA_DIR, char2idx, augment=False)
tr_loader = DataLoader(_train_set, batch_size=16, shuffle=True, collate_fn=collate_fn)
dv_loader = DataLoader(_dev_set, batch_size=16, shuffle=False, collate_fn=collate_fn)

class ASRModel(nn.Module):
    """CNN + bidirectional LSTM producing per-frame log-probabilities.

    Same layout as the checkpointed model, with dropout 0.3 between the two
    LSTM layers.

    Args:
        V: number of output classes.

    ``forward`` takes a (B, F, T) tensor and returns (B, T, V)
    log-probabilities. Each of the three pooling layers halves only the F
    axis, and the LSTM input size is fixed at 64 * 16, so F must be 128.
    """

    def __init__(self, V):
        super().__init__()
        stages = []
        for c_in, c_out in ((1, 32), (32, 64), (64, 64)):
            stages += [
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d((2, 1)),
            ]
        self.cnn = nn.Sequential(*stages)
        self.rnn = nn.LSTM(
            64 * 16, 192, num_layers=2,
            batch_first=True, bidirectional=True, dropout=0.3,
        )
        self.classifier = nn.Linear(192 * 2, V)

    def forward(self, x):
        feats = self.cnn(x.unsqueeze(1))  # (B, 1, F, T) -> (B, C, H, W)
        B, C, H, W = feats.size()
        # Make W the sequence axis and flatten channels x height into features.
        seq = feats.permute(0, 3, 1, 2).reshape(B, W, C * H)
        hidden, _ = self.rnn(seq)
        return self.classifier(hidden).log_softmax(2)

# Decoder alphabet in index order; the empty string occupies slot 0 (blank).
labels = [""]
labels.extend(idx2char[k] for k in range(1, vocab_size))
decoder = build_ctcdecoder(labels, LM_PATH)
def beam_decode(logp):
    """Beam-search decode one utterance and return the first beam's text.

    ``logp`` is exponentiated and converted to a CPU NumPy array before it
    is passed to the module-level ``decoder`` with a beam width of 50.
    """
    as_probs = logp.exp().cpu().numpy()
    candidates = decoder.decode_beams(as_probs, beam_width=50)
    return candidates[0][0]

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ASRModel(vocab_size).to(DEVICE)
# The checkpoint is a plain state_dict, so restrict unpickling to tensors.
model.load_state_dict(torch.load(BEST_CKPT, map_location=DEVICE, weights_only=True))
# Every layer is trained in this stage.
for p in model.parameters():
    p.requires_grad = True

# Two parameter groups: the classifier head gets a larger learning rate than
# the CNN + LSTM backbone.
head_params = list(model.classifier.parameters())

backbone_params = [p for p in model.parameters() if all(p is not hp for hp in head_params)]

opt = optim.AdamW([
    {'params': backbone_params, 'lr': 1e-6, 'weight_decay': 1e-5},
    {'params': head_params,     'lr': 5e-5, 'weight_decay': 1e-5},
])
EPOCH = 50
# One scheduler step per optimizer step; the first 10% of steps are warm-up.
steps = EPOCH * len(tr_loader)
warm = int(0.1 * steps)
sched = get_linear_schedule_with_warmup(opt, num_warmup_steps=warm, num_training_steps=steps)
ctc = nn.CTCLoss(blank=0, zero_infinity=True)

best_cer, best_state = float('inf'), None
for ep in range(1, EPOCH + 1):
    model.train()
    for specs, lens, tgts, t_lens in tqdm(tr_loader, desc=f"Train {ep:02d}/{EPOCH:02d}"):
        specs, tgts = specs.to(DEVICE), tgts.to(DEVICE)
        opt.zero_grad()
        logits = model(specs)
        # CTCLoss expects (T, B, V); `lens` are the unpadded frame counts.
        loss = ctc(logits.permute(1, 0, 2), tgts, lens, t_lens)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        opt.step()
        sched.step()

    model.eval()
    refs, hyps = [], []
    with torch.no_grad():
        for specs, lens, tgts, t_lens in dv_loader:
            out = model(specs.to(DEVICE))
            off = 0
            for i, (n_frames, L) in enumerate(zip(lens.tolist(), t_lens.tolist())):
                # Decode only the real frames: outputs past `n_frames` come
                # from zero padding, are excluded from the CTC loss through
                # `lens`, and can add spurious characters to the hypothesis.
                hy = beam_decode(out[i, :n_frames])
                rf = "".join(idx2char[x] for x in tgts[off:off + L].tolist())
                hyps.append(hy)
                refs.append(rf)
                off += L
    sc = cer(refs, hyps)
    print(f"Epoch {ep:02d} → CER={sc:.4f}")
    if sc < best_cer:
        # Snapshot the weights; state_dict() alone returns live references.
        best_cer, best_state = sc, copy.deepcopy(model.state_dict())

# best_state stays None when no epoch ran; keep the loaded weights then.
if best_state is not None:
    model.load_state_dict(best_state)
torch.save(model.state_dict(), SAVE_PATH)
print(f"Done. Best CER={best_cer:.4f}. Saved to {SAVE_PATH}")

In [None]:
# Count all parameters and the subset that still receives gradients.
total_params = 0
trainable_params = 0
for _param in model.parameters():
    _count = _param.numel()
    total_params += _count
    if _param.requires_grad:
        trainable_params += _count

print(f"Всего параметров:      {total_params:,}")
print(f"Обучаемых параметров:  {trainable_params:,}")

# Let's build the CSV file for submission

In [2]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [3]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader
from jiwer import cer

from tqdm import tqdm
import logging


# INFO-level logging with a short time-of-day prefix.
logging.basicConfig(
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# Load the train/dev manifests and force transcriptions to str.
DATA_DIR = '/kaggle/input/asr-numbers-recognition-in-russian'
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_dev = pd.read_csv(os.path.join(DATA_DIR, 'dev.csv'))
for _frame in (df_train, df_dev):
    _frame['transcription'] = _frame['transcription'].astype(str)

def text_to_indices(text, char2idx):
    """Encode ``text`` as a list of indices looked up in ``char2idx``.

    Raises:
        KeyError: if ``text`` contains a character absent from ``char2idx``.
    """
    encoded = []
    for symbol in text:
        encoded.append(char2idx[symbol])
    return encoded

def build_vocab(transcripts):
    """Build char <-> index tables from a column of transcriptions.

    ``transcripts`` is cast with ``.astype(str)`` first. Characters are
    numbered from 1 in sorted order; 0 is reserved for the CTC blank.

    Returns:
        (char2idx, idx2char) dictionaries.
    """
    alphabet = sorted(set(''.join(transcripts.astype(str))))
    char2idx = dict(zip(alphabet, range(1, len(alphabet) + 1)))
    idx2char = dict(enumerate(alphabet, start=1))
    return char2idx, idx2char

# Vocabulary from the training transcriptions; one extra slot for the CTC blank.
char2idx, idx2char = build_vocab(df_train['transcription'])
vocab_size = 1 + len(char2idx)

# Feature-extraction transforms used by the dataset and the inference loop.
resampler = torchaudio.transforms.Resample(orig_freq=24000, new_freq=16000)
mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)

class ASRDataset(Dataset):
    """Dataset yielding ``(spectrogram, target)`` pairs, without augmentation.

    Args:
        df: table with ``filename`` and ``transcription`` columns.
        data_base: directory that ``filename`` is relative to.
        char2idx: char -> index mapping used to encode transcriptions.
    """

    def __init__(self, df, data_base, char2idx):
        self.df, self.base, self.char2idx = df, data_base, char2idx

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        audio_path = os.path.join(self.base, record['filename'])
        wav, sr = torchaudio.load(audio_path)
        # Anything that is not already 16 kHz goes through the shared resampler.
        wav = wav if sr == 16000 else resampler(wav)
        spec = mel_spec(wav).squeeze(0).transpose(0, 1)  # T x n_mels
        encoded = text_to_indices(record['transcription'], self.char2idx)
        return spec, torch.tensor(encoded, dtype=torch.long)

def collate_fn(batch):
    """Zero-pad ``(spec, target)`` pairs to a common length and batch them.

    Returns:
        (specs as (batch, n_mels, max_frames), frame counts before padding,
         concatenated targets, target lengths).
    """
    specs, targets = zip(*batch)
    frame_counts = [spec.size(0) for spec in specs]
    canvas = torch.zeros(len(specs), max(frame_counts), specs[0].size(1))
    for slot, (spec, count) in enumerate(zip(specs, frame_counts)):
        canvas[slot, :count] = spec
    target_sizes = [tgt.size(0) for tgt in targets]
    return (
        canvas.transpose(1, 2),
        torch.tensor(frame_counts),
        torch.cat(targets),
        torch.tensor(target_sizes),
    )

# Datasets and loaders: training batches are shuffled, dev batches are not.
train_ds, dev_ds = (
    ASRDataset(frame, DATA_DIR, char2idx) for frame in (df_train, df_dev)
)
train_loader = DataLoader(train_ds, collate_fn=collate_fn, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_ds, collate_fn=collate_fn, batch_size=16, shuffle=False)

class ASRModel(nn.Module):
    """CNN + bidirectional LSTM producing per-frame log-probabilities.

    Args:
        vocab_size: number of output classes.

    ``forward`` maps (B, n_mels, T) to (B, T, vocab_size) log-probabilities.
    The LSTM input size is fixed at 64 * 16, which matches 128 mel bins
    after three 2x poolings along the mel axis.
    """

    @staticmethod
    def _conv_stage(c_in, c_out):
        """One conv -> ReLU -> pool stage; pooling halves the mel axis only."""
        return [nn.Conv2d(c_in, c_out, 3, padding=1), nn.ReLU(), nn.MaxPool2d((2, 1))]

    def __init__(self, vocab_size):
        super().__init__()
        self.cnn = nn.Sequential(
            *self._conv_stage(1, 32),
            *self._conv_stage(32, 64),
            *self._conv_stage(64, 64),
        )
        self.rnn = nn.LSTM(
            input_size=64 * 16,
            hidden_size=192,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        self.classifier = nn.Linear(192 * 2, vocab_size)

    def forward(self, x):
        feats = self.cnn(x.unsqueeze(1))            # (B,1,n_mels,T) -> (B,64,16,T')
        B, C, H, W = feats.size()
        seq = feats.permute(0, 3, 1, 2).reshape(B, W, C * H)  # (B,T',1024)
        hidden, _ = self.rnn(seq)                   # (B,T',384)
        scores = self.classifier(hidden)            # (B,T',vocab)
        return scores.log_softmax(2)

# Instantiate the network and report its size in millions of parameters.
model = ASRModel(vocab_size)
num_params = 0
for _param in model.parameters():
    num_params += _param.numel()
print(f"Total parameters: {num_params/1e6:.2f}M")

# Move the model to the GPU when one is available, then build the optimizer
# and loss on top of the moved parameters.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)



Total parameters: 2.82M


In [4]:
# Restore the fine-tuned weights. The file holds a plain state_dict, so
# weights_only=True restricts unpickling to tensors and primitive containers
# instead of running arbitrary pickle code from the checkpoint.
model.load_state_dict(
    torch.load(
        '/kaggle/input/cer_0.1592/pytorch/default/1/model_finetuned_30ep_.pth',
        map_location=device,
        weights_only=True,
    )
)
model.to(device)

  model.load_state_dict(torch.load('/kaggle/input/cer_0.1592/pytorch/default/1/model_finetuned_30ep_.pth', map_location=device))


ASRModel(
  (cnn): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(1024, 192, num_layers=2, batch_first=True, bidirectional=True)
  (classifier): Linear(in_features=384, out_features=11, bias=True)
)

In [6]:
def greedy_decode(log_probs, blank=0):
    """Best-path CTC decoding of one utterance.

    Takes the arg-max class of every frame of ``log_probs`` (T, vocab),
    collapses runs of identical indices, drops ``blank`` and maps the rest
    to characters through the module-level ``idx2char``.
    """
    best_path = log_probs.argmax(dim=1).cpu().tolist()
    kept = []
    last = None
    for current in best_path:
        is_repeat = current == last
        last = current
        if is_repeat or current == blank:
            continue
        kept.append(current)
    return ''.join(map(idx2char.__getitem__, kept))

In [7]:
import os
import pandas as pd
import torch
import torchaudio
from tqdm import tqdm


BASE     = '/kaggle/input/asr-numbers-recognition-in-russian'
TEST_CSV = os.path.join(BASE, 'test.csv')
OUT_CSV  = 'predictions.csv'

df = pd.read_csv(TEST_CSV)
# Validate the input explicitly: `assert` is stripped when Python runs with -O.
if 'filename' not in df.columns:
    raise KeyError(f"'filename' column not found in {TEST_CSV}")

# Transcribe every test file, one at a time, with greedy CTC decoding.
model.eval()
preds = []
with torch.no_grad():  # entered once for the whole loop, not per file
    for fn in tqdm(df['filename'], desc='Inference'):
        wav, sr = torchaudio.load(os.path.join(BASE, fn))
        if sr != 16000:
            wav = resampler(wav)
        # Normalise to a batch of one: (1, n_mels, frames).
        spec = mel_spec(wav).squeeze(0).unsqueeze(0).to(device)
        logp = model(spec)
        preds.append(greedy_decode(logp[0]))

assert len(preds) == len(df), f"inference count mismatch: {len(preds)} vs {len(df)}"

out = pd.DataFrame({
    'filename': df['filename'],
    'transcription': preds
})

# Parse predictions as numbers; empty or unparseable strings become NaN.
numeric = pd.to_numeric(out['transcription'], errors='coerce')

# Count the failures BEFORE they are replaced with 0. The count used to be
# taken after fillna(), so it always reported 0.
n_null = int(numeric.isna().sum())
print(f"Nulls in transcription: {n_null}")

# Submission values are non-negative integers; failed predictions become 0.
out['transcription'] = numeric.fillna(0).astype(int).clip(lower=0)

out.to_csv(OUT_CSV, index=False)
print(f"Saved → {OUT_CSV}")

Inference: 100%|██████████| 2582/2582 [01:41<00:00, 25.32it/s]

Nulls in transcription: 0
Saved → predictions.csv



