In [None]:
! pip install sounddevice torchaudio ipywidgets

### 1. I am initializing the listening procedure. The program listens to me through the laptop microphone and detects the class. The detected class and, optionally, its confidence level are displayed on the screen. A Jupyter notebook is used in this scenario.

In [None]:
# ▀▀▀ 1. Importy i konfiguracja ▀▀▀
import time, warnings
import numpy as np
import sounddevice as sd
import torch, torchaudio
from IPython.display import clear_output, display

# model utils (u Ciebie w repo)
from src.utils.utils_model import prepare_model

# --- Audio capture ---------------------------------------------------------
SAMPLE_RATE = 16_000                              # samples per second
DURATION_SEC = 1.0                                # length of one recorded fragment, in seconds
FRAME_LEN = int(DURATION_SEC * SAMPLE_RATE)       # samples per recorded fragment

# --- Log-mel front end -----------------------------------------------------
N_MELS = 80
N_FFT = 400
HOP = 160
TARGET_FRAMES = 501                               # time frames every spectrogram is padded/cropped to

# --- Classifier ------------------------------------------------------------
CLASSES = ["allowed", "non-allowed"]              # index -> label shown to the user
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ▀▀▀ 2. Model ▀▀▀
# Configuration handed to the project helper `prepare_model`. The three blocks
# differ only in their `out_ch` value (16, 32, 32), so they are generated.
model_cfg = dict(
    model_name="flexible_cnn",
    model_params=dict(
        num_classes=2,
        input_height=80,
        input_time=TARGET_FRAMES,
        blocks_cfg=[
            dict(out_ch=width, activation="ReLU", use_bn=False, dropout_p=0.0, skip=False, is_bn_pre_act=False)
            for width in (16, 32, 32)
        ],
    ),
    checkpoint_path="best_model.pt",
    init=None,
    freeze_backbone=False,
)

# Build the network, move it to the selected device and switch to inference mode.
model = prepare_model(model_cfg).to(device)
model.eval()

# ▀▀▀ 3. Log-mel processing ▀▀▀
# Small constant added before divisions and logarithms to avoid division by zero / log(0).
EPS = 1e-9

# Mel power-spectrogram transform shared by every call to wav_to_logmel.
mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP,
    n_mels=N_MELS,
    power=2.0,
)
def wav_to_logmel(wave: torch.Tensor) -> torch.Tensor:
    """Convert a waveform into a fixed-width log-mel spectrogram.

    The input is averaged over its first dimension when it has more than one
    dimension, scaled to unit RMS, passed through the module-level `mel_spec`
    transform and log-compressed. The time axis is then zero-padded on the
    right, or cropped, to exactly TARGET_FRAMES frames.

    Args:
        wave: waveform tensor; the caller in this notebook passes shape [1, N].

    Returns:
        Log-mel tensor; [N_MELS, TARGET_FRAMES] for a [1, N] input.
    """
    # Collapse to a single channel (keeps a leading dimension of size 1).
    mono = wave.mean(dim=0, keepdim=True) if wave.dim() > 1 else wave

    # RMS normalisation; EPS guards against an all-zero recording.
    level = mono.pow(2).mean().sqrt()
    normalised = mono / (level + EPS)

    log_mel = torch.log(mel_spec(normalised) + EPS).squeeze(0)

    # Fit the time axis to TARGET_FRAMES.
    # NOTE(review): the padding value is 0 in the log domain, not log(EPS) —
    # confirm this matches the preprocessing used for training.
    missing = TARGET_FRAMES - log_mel.shape[-1]
    if missing > 0:
        return torch.nn.functional.pad(log_mel, (0, missing))
    return log_mel[..., :TARGET_FRAMES]

# ▀▀▀ 4. Listening loop ▀▀▀
def listen():
    """Record fragments from the microphone in a loop and show the predicted class.

    Each iteration records FRAME_LEN samples from the default input device,
    converts them to a log-mel spectrogram with `wav_to_logmel`, runs the
    module-level `model` on it and replaces the cell output with the predicted
    label and its softmax confidence.

    The loop never returns on its own; it ends when the kernel is interrupted
    (KeyboardInterrupt propagates to the caller).
    """
    # Local import so this cell works without touching the notebook's import cell.
    from IPython.display import Markdown

    print("▶️  Listening…  (Ctrl-C w komórce, aby przerwać)")
    while True:
        # sd.rec starts a non-blocking recording; sd.wait blocks until it is finished.
        audio = sd.rec(FRAME_LEN, samplerate=SAMPLE_RATE,
                       channels=1, dtype="float32")
        sd.wait()
        wave = torch.from_numpy(audio.T)                # [1, N]

        # Add batch and channel dimensions in front of the spectrogram.
        spec = wav_to_logmel(wave).unsqueeze(0).unsqueeze(0).to(device)
        with torch.no_grad():
            probs = torch.softmax(model(spec), dim=-1)[0].cpu()
        conf, idx = torch.max(probs, dim=0)

        clear_output(wait=True)
        # display() of a plain str prints its quoted repr with literal asterisks;
        # wrapping the text in Markdown renders the label in bold as intended.
        display(Markdown(f"🎤  **{CLASSES[int(idx)]}**   |   confidence: {conf.item()*100:.1f}%"))
        time.sleep(0.05)

# Start the listening loop. Interrupting the kernel raises KeyboardInterrupt,
# which is caught here so the cell ends with a short message instead of a traceback.
try:
    listen()
except KeyboardInterrupt:
    print("⏹️  Nasłuch zatrzymany.")


In [None]:
import time
import torch
import torchaudio
import sounddevice as sd
import numpy as np
from IPython.display import clear_output, display
from src.utils.utils_model import prepare_model

# ====================== 1. PARAMETERS ===========================
SAMPLE_RATE     = 16_000
DURATION        = 1.0          # seconds
FRAME_LEN       = int(DURATION * SAMPLE_RATE)
DEVICE          = None         # audio input device passed to sd.rec; None = default microphone
USE_MIC         = False         # False → record_audio() returns synthetic noise instead of recording
N_MELS          = 80
N_FFT           = 400
HOP_LENGTH      = 160
TARGET_FRAMES   = 501
EPS             = 1e-9
CLASSES         = ["allowed", "non-allowed"]

# ====================== 2. MODEL ================================
# Hyper-parameters of the run whose checkpoint is loaded below; every one of
# them is embedded in the checkpoint file name.
optim_name = "adamw"
lr = 1e-3
weight_decay = 1e-1
activation = "ReLU"
dropout_p = 0.0
use_bn = True
is_bn_pre_act = True
skip = False

model_params = {
    'model_name': 'flexible_cnn',
    'model_params': {
        'num_classes': 2,
        'input_height': 80,
        'input_time': 501,
        # (out_ch, skip) per block: only the last block follows the `skip` setting.
        'blocks_cfg': [
            dict(out_ch=width, use_bn=use_bn, dropout_p=dropout_p, skip=block_skip,
                 activation=activation, is_bn_pre_act=is_bn_pre_act)
            for width, block_skip in ((16, False), (32, False), (32, skip))
        ],
    },
    'checkpoint_path': f"models/best_model_{optim_name}_{lr}_{weight_decay}_{activation}_{dropout_p}_{use_bn}_{is_bn_pre_act}_{skip}.pt",
    'init': None
}

# Prefer the GPU when one is available; build the model in inference mode.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = prepare_model(model_params).to(device)
model.eval()

# ====================== 3. TRANSFORM ============================
# Mel-spectrogram transform shared by every call to waveform_to_logmel.
mel_spec = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=N_MELS,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    power=2.0,  # exponent applied to the magnitude spectrogram (2 = power spectrogram)
)

def waveform_to_logmel(waveform: torch.Tensor) -> torch.Tensor:
    """Turn a waveform into a log-mel spectrogram with exactly TARGET_FRAMES frames.

    Processing order: average over the first dimension (inputs with more than
    one dimension only), right-pad with zeros up to N_FFT samples, scale to
    unit RMS, apply the module-level `mel_spec` transform, take the log, then
    zero-pad or crop the time axis to TARGET_FRAMES.

    Args:
        waveform: audio tensor; `record_audio` supplies shape [1, N].

    Returns:
        Log-mel tensor whose last dimension has length TARGET_FRAMES.
    """
    signal = waveform
    if signal.dim() > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # The transform needs at least N_FFT samples, so very short inputs are padded first.
    shortfall = N_FFT - signal.shape[-1]
    if shortfall > 0:
        signal = torch.nn.functional.pad(signal, (0, shortfall))

    # RMS normalisation (EPS protects against an all-zero signal).
    signal = signal / (signal.pow(2).mean().sqrt() + EPS)

    logmel = torch.log(mel_spec(signal) + EPS).squeeze(0)

    # Pad on the right or crop so that the time axis has TARGET_FRAMES entries.
    extra = TARGET_FRAMES - logmel.shape[-1]
    if extra > 0:
        return torch.nn.functional.pad(logmel, (0, extra))
    return logmel[..., :TARGET_FRAMES]

# ====================== 4. RECORDING ============================
def record_audio():
    """Return one fragment of audio as a tensor of shape [1, FRAME_LEN].

    With USE_MIC disabled, low-amplitude Gaussian noise is returned so the rest
    of the pipeline can be exercised without a microphone. Otherwise FRAME_LEN
    samples are recorded from the input device selected by DEVICE.
    """
    if not USE_MIC:
        return 0.01 * torch.randn(1, FRAME_LEN)  # fallback

    recording = sd.rec(
        FRAME_LEN,
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="float32",
        device=DEVICE,
    )
    sd.wait()  # block until the recording has finished
    return torch.from_numpy(recording.T)

# ====================== 5. LISTENING ============================
def listen_loop():
    """Classify audio fragments in an endless loop and show the latest result.

    Each iteration obtains a waveform from `record_audio`, converts it with
    `waveform_to_logmel`, runs the module-level `model` and replaces the cell
    output with the predicted label and its softmax confidence.

    The loop only ends when the kernel is interrupted (KeyboardInterrupt
    propagates to the caller).
    """
    # Local import so this cell works without touching the notebook's import cell.
    from IPython.display import Markdown

    print("▶️  Listening…  (Ctrl-C to stop)")
    while True:
        waveform = record_audio()
        spec = waveform_to_logmel(waveform)                # [80, 501]
        spec = spec.unsqueeze(0).unsqueeze(0).to(device)   # [1, 1, 80, 501]

        with torch.no_grad():
            logits = model(spec)
            probs = torch.softmax(logits, dim=-1)[0]

        conf, idx = torch.max(probs, dim=0)
        label = CLASSES[int(idx)]

        clear_output(wait=True)
        # display() of a plain str prints its quoted repr with literal asterisks;
        # wrapping the text in Markdown renders the label in bold as intended.
        display(Markdown(f"🎤  **{label}**  |  confidence: {conf.item()*100:.1f}%"))
        time.sleep(0.1)

# Run the loop until the kernel is interrupted; the KeyboardInterrupt is caught
# so the cell finishes with a short message rather than a traceback.
try:
    listen_loop()
except KeyboardInterrupt:
    print("⏹️  Zatrzymano nasłuch.")


### 2. I am running the appropriate fragment of the code to add one more person to class 1. The system may take some time to process the new data (i.e., to update the model).

In [7]:
import random

import pandas as pd

In [8]:
def choose_new_speaker(all_speakers, old_speakers):
    """
    Pick, at random, one speaker that is not among the already known ones.

    Args:
        all_speakers: iterable of candidate speaker identifiers.
        old_speakers: container of identifiers that must be excluded.

    Returns:
        One element of `all_speakers` that is not in `old_speakers`.

    Raises:
        ValueError: if every candidate is already in `old_speakers`.
    """
    candidates = []
    for speaker in all_speakers:
        if speaker in old_speakers:
            continue
        candidates.append(speaker)

    if len(candidates) == 0:
        raise ValueError("Brak dostępnych nowych speakerów.")
    return random.choice(candidates)

def get_spectrogram_df(phase):
    """
    Write the spectrogram index CSV that corresponds to an audio index CSV.

    Reads 'data/{phase}_df.csv', rewrites its `filename` column so that every
    literal 'speech' becomes 'spectrograms_dataset' and every literal '.wav'
    becomes '.pt', and saves the result (without the index) to
    'data/{phase}_spectogram_df.csv'. Nothing is returned.
    """
    source_path = f'data/{phase}_df.csv'
    target_path = f'data/{phase}_spectogram_df.csv'

    index_df = pd.read_csv(source_path)
    # Both replacements are plain-text (regex=False), so the '.' is matched literally.
    index_df['filename'] = (
        index_df['filename']
        .str.replace('speech', 'spectrograms_dataset', regex=False)
        .str.replace('.wav', '.pt', regex=False)
    )
    index_df.to_csv(target_path, index=False)

In [9]:
import pandas as pd
import random

def create_new_speaker_spectogram_data():
    """
    Select one new speaker and write its train/val/test index CSVs.

    Steps:
      1. Read "train_index.csv" and "test_index.csv", keep the filename,
         speaker and gender columns and derive an `id` column from the
         fifth-from-last '-'-separated token of each filename.
      2. Keep only speakers that have exactly two distinct `id` values.
      3. Randomly choose a speaker that does not occur in
         'data/train_spectogram_df.csv', label all of its rows with 1 and
         save them to 'data/new_speaker_df.csv'.
      4. Rows with the speaker's first-seen `id` form the training split; the
         remaining rows are shuffled with a fixed seed and assigned
         alternately to the validation and test splits.
      5. Save the three splits under 'data/' and create the matching
         spectrogram index files through `get_spectrogram_df`.

    Raises:
        ValueError: propagated from `choose_new_speaker` when no unseen
            speaker is available.
    """
    df_train = pd.read_csv("train_index.csv")
    df_test = pd.read_csv("test_index.csv")
    df = pd.concat([df_train, df_test], ignore_index=True)
    # .copy() so that adding the `id` column below never writes into a view
    # of the concatenated frame.
    df = df[['filename', 'speaker', 'gender']].copy()
    df['id'] = df.filename.apply(lambda x: x.split('-')[-5])
    df_filtered = df[df.groupby("speaker")["id"].transform("nunique") == 2]

    old_speakers = pd.read_csv('data/train_spectogram_df.csv').speaker.unique().tolist()
    new_speaker = choose_new_speaker(df_filtered.speaker.unique(), old_speakers)

    new_speaker_data = df_filtered[df_filtered.speaker == new_speaker].copy()
    new_speaker_data['label'] = 1
    new_speaker_data.to_csv('data/new_speaker_df.csv', index=False)
    get_spectrogram_df('new_speaker')

    # Train/test split by the two `id` values of the speaker:
    # grp is 0 for the first id encountered and 1 for the second one.
    new_speaker_data["grp"] = (
        new_speaker_data.groupby("speaker")["id"]
        .transform(lambda x: pd.factorize(x)[0])
    )
    new_speaker_data_train = new_speaker_data[new_speaker_data["grp"] == 0].drop(columns="grp").copy()
    new_speaker_data_test = new_speaker_data[new_speaker_data["grp"] == 1].drop(columns="grp").copy()

    # Validation/test split of the held-out rows.
    # The frame contains exactly one speaker, so shuffling it directly gives the
    # same result as the former groupby("speaker").apply(sample) while avoiding
    # the pandas deprecation of apply() operating on the grouping column (newer
    # pandas drops that column, which would break the groupby below).
    new_speaker_data_test_shuffled = new_speaker_data_test.sample(frac=1, random_state=83)
    # Alternate rows between the two parts: even positions -> val, odd -> test.
    new_speaker_data_test_shuffled["part"] = (
        new_speaker_data_test_shuffled.groupby("speaker").cumcount() % 2
    )
    new_speaker_data_val = new_speaker_data_test_shuffled[new_speaker_data_test_shuffled["part"] == 0].drop(columns="part").copy()
    new_speaker_data_test = new_speaker_data_test_shuffled[new_speaker_data_test_shuffled["part"] == 1].drop(columns="part").copy()

    new_speaker_data_train.to_csv('data/new_speaker_train_df.csv', index=False)
    new_speaker_data_val.to_csv('data/new_speaker_val_df.csv', index=False)
    new_speaker_data_test.to_csv('data/new_speaker_test_df.csv', index=False)

    get_spectrogram_df('new_speaker_train')
    get_spectrogram_df('new_speaker_val')
    get_spectrogram_df('new_speaker_test')


In [10]:
# Choose a new speaker and write its index CSVs under data/.
create_new_speaker_spectogram_data()

  .apply(lambda g: g.sample(frac=1, random_state=83))


In [11]:
def create_new_datasets():
    """
    Combine the new speaker's splits with data from the existing splits.

    Reads the old and the new-speaker train/val/test CSVs from 'data/'.
    The training set is the new speaker's training rows followed by a replay
    sample of 10 % of every old speaker's training rows (fixed seed), taken
    first from the label-1 rows and then from the label-0 rows. The validation
    and test sets are the new speaker's rows followed by the complete old
    validation / test sets (an earlier 10 % per-speaker subsampling of these
    two sets is no longer applied).

    The results are written to 'data/1new_speaker_{train,val,test}_df.csv'.
    Nothing is returned.
    """
    old_train = pd.read_csv("data/train_df.csv")
    old_val = pd.read_csv("data/val_df.csv")
    old_test = pd.read_csv("data/test_df.csv")
    new_train = pd.read_csv("data/new_speaker_train_df.csv")
    new_val = pd.read_csv("data/new_speaker_val_df.csv")
    new_test = pd.read_csv("data/new_speaker_test_df.csv")

    # Per-speaker replay sample; the label order (1, then 0) fixes the row order of the output.
    replay_parts = [
        speaker_rows.sample(frac=0.10, random_state=42)
        for label in (1, 0)
        for _, speaker_rows in old_train[old_train['label'] == label].groupby("speaker")
    ]
    replay_train = pd.concat(replay_parts)

    # (output path, new-speaker rows, old rows) for each split.
    outputs = (
        ('data/1new_speaker_train_df.csv', new_train, replay_train),
        ('data/1new_speaker_val_df.csv', new_val, old_val),
        ('data/1new_speaker_test_df.csv', new_test, old_test),
    )
    combined = [
        (path, pd.concat([new_rows, old_rows], ignore_index=True))
        for path, new_rows, old_rows in outputs
    ]
    for path, frame in combined:
        frame.to_csv(path, index=False)

In [12]:
# Merge the new speaker's splits with the existing data and write the combined CSVs.
create_new_datasets()

In [13]:
# Create the spectrogram index file for each combined split (train, val, test).
for split_name in ('train', 'val', 'test'):
    get_spectrogram_df(f'1new_speaker_{split_name}')

## Then run the retraining job
### `sbatch run_main_new_speaker.sh`

In [1]:
import numpy as np
import torch
from tqdm import tqdm

from src.utils.utils_model import prepare_model
from src.utils.utils_data import prepare_loaders

# Device name used for the model and for every batch in evaluate_model.
device = "cuda" if torch.cuda.is_available() else "cpu"  

In [2]:
# Configuration of the model retrained with the additional speaker.
# The three blocks differ only in `out_ch` (16, 32, 32), so they are generated.
# NOTE(review): checkpoint_path is an absolute path on one user's cluster
# account — consider making it configurable.
model_cfg = dict(
    model_name="flexible_cnn",
    model_params=dict(
        num_classes=2,
        input_height=80,
        input_time=501,
        blocks_cfg=[
            dict(out_ch=width, activation="ReLU", use_bn=True, dropout_p=0.0, skip=False, is_bn_pre_act=True)
            for width in (16, 32, 32)
        ],
    ),
    checkpoint_path="/net/people/plgrid/plgkrzepk/newGithub/FCV_project/models/best_model__one_speaker_adamw_0.0001_0.1_ReLU_0.0_True_True_False.pt",
    init=None,
    freeze_backbone=False,
)

In [3]:
# Build the model from model_cfg with the project helper, move it to `device`
# and switch it to evaluation mode.
# NOTE(review): prepare_model presumably loads the weights from
# model_cfg["checkpoint_path"] — confirm in src.utils.utils_model.
model = prepare_model(model_cfg).to(device).eval()

  checkpoint = torch.load(checkpoint_path, map_location=device)


In [6]:

# Loader configuration for the original (pre-update) spectrogram splits.
data_params = dict(
    dataset_name='voices_spectograms',
    dataset_params=dict(
        custom_root='/net/pr2/projects/plgrid/plggdnnp/datasets/VOiCES_devkit',
        df_train_path='data/train_spectogram_df.csv',
        df_val_path='data/val_spectogram_df.csv',
        df_test_path='data/test_spectogram_df.csv',
        # NOTE(review): the previous comment on this flag mentioned CIFAR10,
        # which looks copied from another project — confirm which transform
        # the spectrogram dataset applies when this is True.
        use_transform=True,
    ),
    loader_params=dict(batch_size=128, pin_memory=True, num_workers=12),
)

# Only the test loader is needed for the evaluation below.
loaders = prepare_loaders(data_params)
test_loader = loaders['test']

import torch.nn.functional as F
from sklearn.metrics import f1_score
def evaluate_model(model, test_loader):
    """Compute the binary F1 score of `model` on `test_loader`.

    The model is put into evaluation mode and run without gradient tracking.
    Every batch is moved to the module-level `device`; the predicted class is
    the index of the largest output along dimension 1.

    Args:
        model: network to evaluate.
        test_loader: iterable yielding (inputs, labels) batches.

    Returns:
        F1 score computed with average='binary'.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            batch_preds = torch.max(logits, 1)[1]   # indices of the per-row maxima
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(labels.cpu().numpy())

    # 'binary' suits the two-class setup here; 'macro' would be the choice for multi-class.
    return f1_score(expected, predicted, average='binary')

# Evaluate the loaded model on the test loader built in this cell
# (data/test_spectogram_df.csv) and report the F1 score as a percentage.
f1 = evaluate_model(model, test_loader)
print(f"Model F1-score on test set: {f1 * 100:.2f}%")

100%|██████████| 9/9 [00:56<00:00,  6.30s/it]

Model F1-score on test set: 76.77%





In [4]:
from src.utils.utils_data import prepare_loaders
# Loader configuration for the splits that contain only the newly added speaker.
data_params = dict(
    dataset_name='voices_spectograms',
    dataset_params=dict(
        custom_root='/net/pr2/projects/plgrid/plggdnnp/datasets/VOiCES_devkit',
        df_train_path='data/new_speaker_train_spectogram_df.csv',
        df_val_path='data/new_speaker_val_spectogram_df.csv',
        df_test_path='data/new_speaker_test_spectogram_df.csv',
        # NOTE(review): the previous comment on this flag mentioned CIFAR10,
        # which looks copied from another project — confirm which transform
        # the spectrogram dataset applies when this is True.
        use_transform=True,
    ),
    loader_params=dict(batch_size=128, pin_memory=True, num_workers=12),
)

# Only the test loader is needed for the evaluation below.
loaders = prepare_loaders(data_params)
test_loader = loaders['test']

import torch.nn.functional as F
from sklearn.metrics import f1_score
def evaluate_model(model, test_loader):
    """Compute the binary F1 score of `model` on `test_loader`.

    The model is put into evaluation mode and run without gradient tracking.
    Every batch is moved to the module-level `device`; the predicted class is
    the index of the largest output along dimension 1.

    Args:
        model: network to evaluate.
        test_loader: iterable yielding (inputs, labels) batches.

    Returns:
        F1 score computed with average='binary'.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            batch_preds = torch.max(logits, 1)[1]   # indices of the per-row maxima
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(labels.cpu().numpy())

    # 'binary' suits the two-class setup here; 'macro' would be the choice for multi-class.
    return f1_score(expected, predicted, average='binary')

# Evaluate the same model on the new speaker's test split
# (data/new_speaker_test_spectogram_df.csv) and report the F1 score as a percentage.
f1 = evaluate_model(model, test_loader)
print(f"Model F1-score on test set: {f1 * 100:.2f}%")

100%|██████████| 1/1 [00:06<00:00,  6.33s/it]

Model F1-score on test set: 76.92%



