In [23]:
!pip install torch torchaudio librosa jiwer pandas numpy tqdm scikit-learn




In [24]:
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch.nn as nn
from jiwer import wer

import os, json, subprocess, textwrap

# Put your key here for private notebook use, or set it in Colab env variables:
os.environ["MOZ_API_KEY"] = "354e3416db01207d8792e39bafc0078b5e228e64bdda21fecbc8c012e8f1e7a5"

assert "MOZ_API_KEY" in os.environ and os.environ["MOZ_API_KEY"].strip(), "Set MOZ_API_KEY env var first!"
MOZ_API_KEY = os.environ["MOZ_API_KEY"].strip()

DATASET_ID = "cmflnuzw71qkz8x3kil3tgjvk"
OUT_TAR = "common_voice_tr_scripted_23_0.tar.gz"

# 1) Create download session -> get token
cmd1 = f'''
curl -s -X POST "https://datacollective.mozillafoundation.org/api/datasets/{DATASET_ID}/download" \
  -H "Authorization: Bearer {MOZ_API_KEY}" \
  -H "Content-Type: application/json"
'''
resp = subprocess.check_output(cmd1, shell=True).decode("utf-8")
print(resp)

data = json.loads(resp)
download_token = data.get("download_token") or data.get("token") or data.get("downloadToken")
assert download_token, f"Could not find download token in response: {data}"
print("DOWNLOAD TOKEN:", download_token)

# 2) Download the dataset tar.gz
cmd2 = f'''
curl -L -X GET "https://datacollective.mozillafoundation.org/api/datasets/{DATASET_ID}/download/{download_token}" \
  -H "Authorization: Bearer {MOZ_API_KEY}" \
  -o "{OUT_TAR}"
'''
subprocess.check_call(cmd2, shell=True)

print("Downloaded:", OUT_TAR, "size(bytes)=", os.path.getsize(OUT_TAR))

{"downloadToken":"dlt_1a6b998d-8583-408e-b1d6-d9c81002c5ae","downloadUrl":"https://datacollective.mozillafoundation.org/api/datasets/cmflnuzw71qkz8x3kil3tgjvk/download/dlt_1a6b998d-8583-408e-b1d6-d9c81002c5ae","expiresAt":"2025-12-15T07:24:10.660Z","sizeBytes":"2933767497","contentType":"application/gzip","filename":"mcv-scripted-tr-v23.0.tar.gz","checksum":null}
DOWNLOAD TOKEN: dlt_1a6b998d-8583-408e-b1d6-d9c81002c5ae
Downloaded: common_voice_tr_scripted_23_0.tar.gz size(bytes)= 2933767497


In [25]:
import tarfile

EXTRACT_DIR = "/content/cv_tr"
os.makedirs(EXTRACT_DIR, exist_ok=True)

with tarfile.open(OUT_TAR, "r:gz") as tar:
    tar.extractall(EXTRACT_DIR)

print("âœ… Extracted")




  tar.extractall(EXTRACT_DIR)


âœ… Extracted


In [26]:
import glob
import os

tsvs = glob.glob(EXTRACT_DIR + "/**/*.tsv", recursive=True)

def find_one(name):
    for p in tsvs:
        if os.path.basename(p).lower() == f"{name}.tsv":
            return p
    for p in tsvs:
        if name in os.path.basename(p).lower():
            return p
    return None

train_tsv = find_one("train")
dev_tsv   = find_one("dev") or find_one("valid")
test_tsv  = find_one("test")

assert train_tsv and dev_tsv and test_tsv

clips_dirs = [d for d in glob.glob(EXTRACT_DIR + "/**/clips", recursive=True)]
CLIPS_DIR = clips_dirs[0]

print("train:", train_tsv)
print("dev:", dev_tsv)
print("test:", test_tsv)
print("clips:", CLIPS_DIR)


train: /content/cv_tr/cv-corpus-23.0-2025-09-05/tr/train.tsv
dev: /content/cv_tr/cv-corpus-23.0-2025-09-05/tr/dev.tsv
test: /content/cv_tr/cv-corpus-23.0-2025-09-05/tr/test.tsv
clips: /content/cv_tr/cv-corpus-23.0-2025-09-05/tr/clips


In [27]:
import pandas as pd


def prepare_df(tsv_path):
    df = pd.read_csv(tsv_path, sep="\t")
    df = df[["path", "sentence"]].dropna()
    df["audio"] = df["path"].apply(lambda x: os.path.join(CLIPS_DIR, x))
    df = df.drop(columns=["path"])
    return df

train_df = prepare_df(train_tsv).head(2000)
dev_df   = prepare_df(dev_tsv).head(500)
test_df  = prepare_df(test_tsv).head(500)

print(len(train_df), len(dev_df), len(test_df))



2000 500 500


In [28]:
# TÃ¼rkÃ§e + basic chars
CHARS = list(" abcÃ§defgÄŸhÄ±ijklmnoÃ¶prsÅŸtuÃ¼vyz")
char2idx = {c: i+1 for i, c in enumerate(CHARS)}  # 0 reserved for blank
idx2char = {i: c for c, i in char2idx.items()}

BLANK = 0
VOCAB_SIZE = len(char2idx) + 1

def text_to_int(text):
    text = text.lower()
    return [char2idx[c] for c in text if c in char2idx]



In [29]:
class SpeechDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]["audio"]
        text = self.df.iloc[idx]["sentence"]

        wav, sr = librosa.load(audio_path, sr=16000)

        mel = librosa.feature.melspectrogram(
            y=wav,
            sr=sr,
            n_mels=80
        )
        mel = librosa.power_to_db(mel)
        mel = torch.tensor(mel, dtype=torch.float32)  # [n_mels, T]

        text_encoded = text_to_int(text)

        return mel, text_encoded


In [30]:
SAMPLE_RATE = 16000
N_MELS = 128

def load_audio(path):
    audio, _ = librosa.load(path, sr=SAMPLE_RATE)
    return audio

def extract_mel(audio):
    mel = librosa.feature.melspectrogram(
        y=audio, sr=SAMPLE_RATE, n_mels=N_MELS
    )
    mel = librosa.power_to_db(mel, ref=np.max)
    return torch.tensor(mel, dtype=torch.float)


In [31]:
def encode_text(text):
    text = text.lower()
    return torch.tensor([char2idx[c] for c in text if c in char2idx], dtype=torch.long)


In [32]:
class CNNSpeechDataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        audio = load_audio(row.audio)
        mel = extract_mel(audio)
        text = encode_text(row.sentence)
        return mel, text


In [38]:
from torch.nn.utils.rnn import pad_sequence
import torch
def collate_fn(batch):
    mels, texts, mel_lens, text_lens = [], [], [], []

    for mel, text in batch:
        mel = mel.transpose(0, 1)   # [T, n_mels]

        mels.append(mel)
        texts.append(torch.tensor(text, dtype=torch.long))

        # ðŸ”¥ CRITICAL FIX
        mel_lens.append(mel.shape[0] // 4)

        text_lens.append(len(text))

    mels = pad_sequence(mels, batch_first=True)
    texts = pad_sequence(texts, batch_first=True)

    return (
        mels,
        texts,
        torch.tensor(mel_lens, dtype=torch.long),
        torch.tensor(text_lens, dtype=torch.long),
    )



In [39]:
train_dataset = SpeechDataset(train_df)
dev_dataset   = SpeechDataset(dev_df)

train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn
)

dev_loader = DataLoader(
    dev_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_fn
)


In [40]:
class CNN_CTC(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),

            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
        )

        self.fc = nn.Linear(64 * 20, vocab_size)

    def forward(self, x):
        # x: [B, T, n_mels]
        x = x.unsqueeze(1)          # [B, 1, T, n_mels]
        x = self.cnn(x)             # [B, C, T', F']
        x = x.permute(0, 2, 1, 3)   # [B, T', C, F']
        x = x.flatten(2)            # [B, T', C*F']
        x = self.fc(x)              # [B, T', vocab]
        return x



In [41]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CNN_CTC(VOCAB_SIZE).to(device)
criterion = nn.CTCLoss(blank=BLANK, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for mels, texts, mel_lens, text_lens in tqdm(train_loader):
        mels = mels.to(device)
        texts = texts.to(device)

        logits = model(mels)
        logits = logits.permute(1, 0, 2)  # ðŸ”¥ [T, B, vocab]

        loss = criterion(
            logits.log_softmax(2),
            texts,
            mel_lens,
            text_lens
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {total_loss / len(train_loader):.4f}")


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:25<00:00,  9.83it/s]


Epoch 1/10 | Loss: 1.8656


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.02it/s]


Epoch 2/10 | Loss: 1.4909


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.04it/s]


Epoch 3/10 | Loss: 1.4620


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.11it/s]


Epoch 4/10 | Loss: 1.4496


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.05it/s]


Epoch 5/10 | Loss: 1.4328


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.10it/s]


Epoch 6/10 | Loss: 1.4275


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.16it/s]


Epoch 7/10 | Loss: 1.4091


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.22it/s]


Epoch 8/10 | Loss: 1.4037


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:24<00:00, 10.07it/s]


Epoch 9/10 | Loss: 1.3909


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250/250 [00:25<00:00,  9.91it/s]

Epoch 10/10 | Loss: 1.3790





In [42]:
def greedy_decode(logits):
    pred = logits.argmax(dim=-1)
    prev = BLANK
    output = []

    for p in pred:
        if p != prev and p != BLANK:
            output.append(idx2char[p.item()])
        prev = p

    return "".join(output)



In [43]:
from jiwer import wer

model.eval()
refs, hyps = [], []

with torch.no_grad():
    for mels, texts, mel_lens, text_lens in dev_loader:
        mels = mels.to(device)

        logits = model(mels)
        logits = logits.permute(1, 0, 2)

        for i in range(logits.shape[1]):
            hyp = greedy_decode(logits[:, i])
            ref = "".join([idx2char[t.item()] for t in texts[i] if t.item() != 0])

            refs.append(ref)
            hyps.append(hyp)

print("WER:", wer(refs, hyps))



WER: 1.022703457911282
