In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/speaker-training-data/Speaker_9/S_9_5.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_53.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_18.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_66.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_33.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_23.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_67.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_84.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_100.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_78.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_45.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_63.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_34.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_16.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_87.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_29.wav
/kaggle/input/speaker-training-data/Speaker_9/S_9_59.wav
/kaggle/input/speaker-training-

In [3]:
import os
import numpy as np
import librosa

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Paths
TRAIN_DIR = "/kaggle/input/speaker-training-data"

# Audio parameters
SR = 16000       # target sample rate (Hz) used when loading audio
N_MELS = 40      # number of mel bands in the spectrogram
DURATION = 3.0   # clip length in seconds; audio is cut or padded to this

# Model / training parameters
EMBED_DIM = 128
BATCH_SIZE = 16
EPOCHS = 35
LR = 1e-3

# Reproducibility: seed every RNG this notebook draws from so that weight
# initialisation and DataLoader shuffling repeat across "Restart & Run All".
# torch.manual_seed covers the CPU and CUDA generators. Seeding alone does not
# guarantee bit-identical results on GPU (see the PyTorch reproducibility notes).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [4]:
# Feature Extraction
def extract_logmel(file_path, sr=SR, n_mels=N_MELS, duration=DURATION):
    """Turn an audio file into a fixed-length, standardised log-mel spectrogram.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate the audio is loaded at.
        n_mels: Number of mel bands.
        duration: Clip length in seconds; the waveform is cut or zero-padded
            at the end to exactly ``int(sr * duration)`` samples.

    Returns:
        A 2-D ``float32`` array (mel bands x frames) with zero mean and unit
        variance computed over the whole spectrogram.
    """
    waveform, _ = librosa.load(file_path, sr=sr)

    # Force a fixed number of samples: clip long recordings, zero-pad short ones.
    n_samples = int(sr * duration)
    missing = n_samples - len(waveform)
    if missing < 0:
        waveform = waveform[:n_samples]
    else:
        waveform = np.pad(waveform, (0, missing))

    # Mel power spectrogram converted to decibels.
    spec_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
    )

    # Per-utterance standardisation; the epsilon keeps a constant spectrogram
    # (e.g. pure silence) from dividing by zero.
    centered = spec_db - spec_db.mean()
    scale = spec_db.std() + 1e-8
    return (centered / scale).astype(np.float32)

In [5]:
# Dataset
class SpeakerDataset(Dataset):
    """Map-style dataset yielding ``(log-mel tensor, speaker label)`` pairs.

    Features are computed on demand by ``extract_logmel`` every time an item
    is requested; nothing is cached between accesses.

    Args:
        files: Sequence of audio file paths.
        labels: Sequence of labels aligned index-for-index with ``files``.

    Raises:
        ValueError: If ``files`` and ``labels`` have different lengths.
    """

    def __init__(self, files, labels):
        # Fail fast: a length mismatch would otherwise surface only as an
        # IndexError mid-epoch, or silently leave trailing labels unused.
        if len(files) != len(labels):
            raise ValueError(
                f"files and labels must have the same length "
                f"(got {len(files)} files and {len(labels)} labels)"
            )
        self.files = files
        self.labels = labels

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(mel, label)`` for item ``idx``.

        ``mel`` is the spectrogram from ``extract_logmel`` with a leading
        channel axis added, i.e. shape ``(1, mel bands, frames)``.
        """
        mel = extract_logmel(self.files[idx])
        # Conv2d layers expect a channel dimension, so add one in front.
        mel = torch.tensor(mel).unsqueeze(0)
        label = self.labels[idx]
        return mel, label

In [6]:
# Build train / val / test splits PER SPEAKER
train_files, train_labels = [], []
val_files, val_labels = [], []
test_files, test_labels = [], []

# Every sub-directory of TRAIN_DIR is one speaker. Non-directory entries are
# skipped because listing them as a speaker folder below would raise.
speakers = sorted(
    entry
    for entry in os.listdir(TRAIN_DIR)
    if os.path.isdir(os.path.join(TRAIN_DIR, entry))
)
spk2id = {spk: idx for idx, spk in enumerate(speakers)}

for spk, idx in spk2id.items():
    spk_dir = os.path.join(TRAIN_DIR, spk)

    # os.listdir returns names in arbitrary order. Sorting makes the seeded
    # splits below select the same files on every run and every filesystem,
    # so the held-out test files never drift into the training set.
    files = sorted(
        os.path.join(spk_dir, f)
        for f in os.listdir(spk_dir)
        if f.lower().endswith(".wav")
    )

    # Per-speaker split: hold out 20% for test, then 10% of the rest for validation.
    train_f, test_f = train_test_split(files, test_size=0.2, random_state=42)
    train_f, val_f  = train_test_split(train_f, test_size=0.1, random_state=42)

    # Append this speaker's files and a matching run of its integer label.
    train_files += train_f
    train_labels += [idx] * len(train_f)

    val_files += val_f
    val_labels += [idx] * len(val_f)

    test_files += test_f
    test_labels += [idx] * len(test_f)

# Sanity check: no file may appear in more than one split.
assert not (set(train_files) & set(val_files)), "train/val overlap"
assert not (set(train_files) & set(test_files)), "train/test overlap"
assert not (set(val_files) & set(test_files)), "val/test overlap"

print("Speakers:", len(spk2id))
print("Train:", len(train_files), "Val:", len(val_files), "Test:", len(test_files))

Speakers: 27
Train: 1944 Val: 216 Test: 544


In [7]:
# Create datasets
# One dataset per split; each item is featurised when it is accessed.
train_dataset = SpeakerDataset(files=train_files, labels=train_labels)
val_dataset = SpeakerDataset(files=val_files, labels=val_labels)
test_dataset = SpeakerDataset(files=test_files, labels=test_labels)  # used later in verification

# Training batches are reshuffled every epoch and the final short batch is
# dropped; validation keeps file order and every sample.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [8]:
# Model
class SpeakerCNN(nn.Module):
    """Small CNN mapping a one-channel spectrogram to a speaker embedding.

    Two conv + batch-norm + ReLU + max-pool stages feed an adaptive average
    pool to 8x8, followed by two linear layers that produce the embedding.
    A final linear layer turns the embedding into per-speaker logits.

    Args:
        num_speakers: Number of output classes of the classifier layer.
        embedding_dim: Size of the embedding vector.
    """

    def __init__(self, num_speakers, embedding_dim=EMBED_DIM):
        super().__init__()

        # Convolutional front end (3x3 kernels; padding keeps the spatial size).
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=32)

        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=64)

        # Shared 2x down-sampling, then a fixed 8x8 grid so the linear layer
        # size does not depend on the input's height and width.
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.adapool = nn.AdaptiveAvgPool2d(output_size=(8, 8))

        # Embedding head.
        self.fc1 = nn.Linear(in_features=64 * 8 * 8, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=embedding_dim)

        # classifier (training only)
        self.classifier = nn.Linear(in_features=embedding_dim, out_features=num_speakers)

    def forward(self, x):
        """Return ``(emb, logits)`` for a batch of one-channel inputs.

        ``emb`` is L2-normalised along dim 1; ``logits`` are computed from the
        normalised embedding.
        """
        stages = ((self.conv1, self.bn1), (self.conv2, self.bn2))
        for conv, bn in stages:
            x = self.pool(F.relu(bn(conv(x))))

        pooled = self.adapool(x)
        flat = pooled.view(pooled.size(0), -1)

        hidden = F.relu(self.fc1(flat))
        emb = F.normalize(self.fc2(hidden), p=2, dim=1)

        return emb, self.classifier(emb)

In [9]:
# Training setup
# One output class per speaker id; the network is moved to the selected device.
model = SpeakerCNN(num_speakers=len(spk2id))
model = model.to(device)

# Adam over all model parameters, scored with cross-entropy on the speaker logits.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [10]:
# ---------------------------------------------------------
# Training loop (with validation)
# ---------------------------------------------------------
# Each epoch makes one optimisation pass over train_loader and then scores
# val_loader with gradients disabled. Losses are reported as per-sample means:
# each batch's mean loss is weighted by its batch size, so a shorter final
# batch does not count as much as a full one.
for epoch in range(EPOCHS):
    model.train()
    train_loss_sum = 0.0
    train_count = 0

    for mel, label in train_loader:
        mel = mel.to(device)
        label = label.to(device)

        optimizer.zero_grad()

        _, logits = model(mel)
        loss = criterion(logits, label)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; scale back to a sum before accumulating.
        n_batch = label.size(0)
        train_loss_sum += loss.item() * n_batch
        train_count += n_batch

    # NaN (rather than a ZeroDivisionError) if the loader yielded nothing.
    train_loss = train_loss_sum / train_count if train_count else float("nan")

    # ---------------- Validation ----------------
    model.eval()
    val_loss_sum = 0.0
    val_count = 0

    with torch.no_grad():
        for mel, label in val_loader:
            mel = mel.to(device)
            label = label.to(device)

            _, logits = model(mel)
            loss = criterion(logits, label)

            n_batch = label.size(0)
            val_loss_sum += loss.item() * n_batch
            val_count += n_batch

    val_loss = val_loss_sum / val_count if val_count else float("nan")

    print(f"Epoch [{epoch+1}/{EPOCHS}] - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

Epoch [1/35] - Train Loss: 2.9160 | Val Loss: 2.5057
Epoch [2/35] - Train Loss: 2.1848 | Val Loss: 1.8249
Epoch [3/35] - Train Loss: 1.5360 | Val Loss: 1.2471
Epoch [4/35] - Train Loss: 1.0282 | Val Loss: 0.8241
Epoch [5/35] - Train Loss: 0.6768 | Val Loss: 0.5459
Epoch [6/35] - Train Loss: 0.4560 | Val Loss: 0.3755
Epoch [7/35] - Train Loss: 0.3191 | Val Loss: 0.2704
Epoch [8/35] - Train Loss: 0.2337 | Val Loss: 0.2026
Epoch [9/35] - Train Loss: 0.1779 | Val Loss: 0.1569
Epoch [10/35] - Train Loss: 0.1396 | Val Loss: 0.1253
Epoch [11/35] - Train Loss: 0.1124 | Val Loss: 0.1025
Epoch [12/35] - Train Loss: 0.0924 | Val Loss: 0.0846
Epoch [13/35] - Train Loss: 0.0772 | Val Loss: 0.0716
Epoch [14/35] - Train Loss: 0.0654 | Val Loss: 0.0608
Epoch [15/35] - Train Loss: 0.0561 | Val Loss: 0.0523
Epoch [16/35] - Train Loss: 0.0486 | Val Loss: 0.0456
Epoch [17/35] - Train Loss: 0.0424 | Val Loss: 0.0401
Epoch [18/35] - Train Loss: 0.0373 | Val Loss: 0.0356
Epoch [19/35] - Train Loss: 0.0330 | 

In [11]:
# Save model
# Only the weights (state_dict) are written, not the full module object.
MODEL_PATH = "/kaggle/working/speaker_cnn_trained.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, MODEL_PATH)
print("Model saved to:", MODEL_PATH)

Model saved to: /kaggle/working/speaker_cnn_trained.pth
