# Filter-based Adversarial Examples Demo
What you need: 
- Model checkpoint. 
- At least one dataset (please download them from the official sources)  

We provide model checkpoints for each dataset, as well as listening examples here:  
[https://rwth-aachen.sciebo.de/s/zEaaHkzQpTRZA8d](https://rwth-aachen.sciebo.de/s/zEaaHkzQpTRZA8d).  
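Run only the cells below that match your model and dataset of choice: one model cell and one dataset cell. Each model cell overwrites `model` and each dataset cell overwrites `data_module`, so the cell you ran last is the one that takes effect.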

In [None]:
# Step up one directory so that the project packages imported below (training,
# data, attacks) and the relative "./demos/..." paths resolve. Presumably the
# notebook lives in a subfolder of the repository root -- verify for your setup.
%cd ..
# Device string reused by the cells below (checkpoint loading, input tensors, attack).
DEVICE = "cuda" # Set to "cpu" if no cuda device available

## Load Model

### CNN14

In [None]:
from training.cnn14_adv_train import CNN14Adv
CNN14_CHECKPOINT = "./demos/cnn14_speech.ckpt" # Must correspond to model type and dataset

# Remap the checkpoint's tensors onto DEVICE while loading (the PaSST cell does
# the same), so a checkpoint saved on a GPU also loads on a CPU-only machine.
model = CNN14Adv.load_from_checkpoint(CNN14_CHECKPOINT, map_location=DEVICE)

### PaSST

In [None]:
from training.passt_adv_train import PasstAdv
PASST_CHECKPOINT = "./demos/passt_esc50.ckpt" # Must correspond to model type and dataset

# map_location remaps the tensors stored in the checkpoint onto DEVICE while loading.
model = PasstAdv.load_from_checkpoint(PASST_CHECKPOINT, map_location=DEVICE)
if DEVICE == "cpu": # Bugfix
    # Presumably preemphasis_coef is not covered by map_location (e.g. it is a
    # plain tensor attribute rather than a registered buffer), so it is moved to
    # the CPU by hand -- TODO confirm against the model's mel front end.
    model.mel.preemphasis_coef = model.mel.preemphasis_coef.cpu()

## Load Dataset

### ESC-50

In [None]:
from data.esc50 import ESC50DataModule
# Location of the downloaded ESC-50 dataset, relative to the working directory;
# adjust it to your local copy.
ESC50DIR = "../ESC-50/"
data_module = ESC50DataModule(dir=ESC50DIR, batch_size=1, num_workers=1)
# Set the data module up for the "test" stage; the attack cell reads
# data_module.test_dataset.
data_module.setup("test")
# Tag used in the names of the exported .wav files.
dataset = "ESC-50"
# Sampling rate in Hz handed to soundfile when exporting audio. Assumed to match
# the rate of the samples the data module returns -- TODO confirm.
sr = 32000

### NSynth

In [None]:
from data.nsynth import NSynthDataModule
# Location of the downloaded NSynth dataset, relative to the working directory;
# adjust it to your local copy.
NSYNTHDIR = "../nsynth/"
data_module = NSynthDataModule(dir=NSYNTHDIR, batch_size=1, num_workers=1)
# Set the data module up for the "test" stage; the attack cell reads
# data_module.test_dataset.
data_module.setup("test")
# Tag used in the names of the exported .wav files.
dataset = "NSynth"
# Sampling rate in Hz handed to soundfile when exporting audio. Assumed to match
# the rate of the samples the data module returns -- TODO confirm.
sr = 32000

### Speech Commands

In [None]:
from data.speech import SpeechCommandsDataModule
# Location of the downloaded Speech Commands dataset, relative to the working
# directory; adjust it to your local copy.
SPEECHDIR = "../SpeechCommands/speech_commands_v0.02/"
data_module = SpeechCommandsDataModule(dir=SPEECHDIR, batch_size=1, num_workers=1)
# Set the data module up for the "test" stage; the attack cell reads
# data_module.test_dataset.
data_module.setup("test")
# Tag used in the names of the exported .wav files.
dataset = "SpeechCommands"
# Sampling rate in Hz handed to soundfile when exporting audio. Assumed to match
# the rate of the samples the data module returns -- TODO confirm.
sr = 16000

## Run (Untargeted) Attack

Note: Targeted attacks will be added at a later date

In [None]:
import numpy as np
from attacks.filter_pgd import run_pgd_batched

DATA_IDX = 0  # Index of the test sample to attack
EPS = 0.1  # Passed to the attack as `eps`; the step size `alpha` is EPS / 10

# Fetch one (waveform, label) pair and give both a leading batch dimension.
sample, target = data_module.test_dataset[DATA_IDX]
waveform_batch = sample.unsqueeze(0).to(DEVICE)
labels = target.unsqueeze(0).to(DEVICE)

# The attack is run on the output of the model's mel front end.
x = model.mel(waveform_batch)

# To see what is happening inside, use run_pgd_batched(..., verbose=True)
res_dict = run_pgd_batched(
    model,
    x,
    labels,
    eps=EPS,
    alpha=EPS / 10,
    max_iters=10,
    restarts=10,
    device=DEVICE,
)

# Model predictions before and after attack: the class with the highest score.
scores_before = model(x.unsqueeze(1)).detach().cpu().numpy()
pred_before = data_module.class_map[np.argmax(scores_before)]
print(f"Prediction before: {pred_before}")

scores_after = model(res_dict['perturbed_inputs']).detach().cpu().numpy()
pred_after = data_module.class_map[np.argmax(scores_after)]
print(f"Prediction after: {pred_after}")

In [None]:
# To see the filter, run this
# (a notebook cell echoes the value of its last expression).
res_dict['filters']

## Transform Mel Filters back to STFT-Space

In [None]:
import librosa
import numpy as np
def mel_filter_to_fft(mel_filter, n_fft=2048, *, sr=22050, fmin=0.0, fmax=None):
    """Converts the mel filter to STFT-space.

    As the mel spectrogram is not losslessly invertible,
    we instead convert the filter to be STFT-compatible:
    every FFT bin receives the gain of the mel bin whose frequency
    (as given by librosa.mel_frequencies) is nearest to the bin's own
    frequency, so the result can be multiplied onto an STFT spectrogram.

    Args:
        mel_filter (np.ndarray): one-dimensional vector containing the mel filter
        n_fft (int, optional): See librosa.stft. Defaults to 2048.
        sr (float, optional): Sampling rate in Hz of the audio the STFT is
            computed from. Defaults to 22050, librosa's own default, which
            is what this function implicitly used before.
        fmin (float, optional): Lowest mel frequency in Hz. Defaults to 0.0.
        fmax (float, optional): Highest mel frequency in Hz.
            Defaults to the Nyquist frequency ``sr / 2``.

    Returns:
        np.ndarray: The converted filter in STFT-space, one gain per FFT bin.

    Raises:
        ValueError: If mel_filter is not a non-empty one-dimensional vector.
    """
    mel_filter = np.asarray(mel_filter)
    if mel_filter.ndim != 1 or mel_filter.size == 0:
        raise ValueError(
            "mel_filter must be a non-empty one-dimensional vector, "
            f"got shape {mel_filter.shape}"
        )
    if fmax is None:
        fmax = sr / 2

    mel_freqs = librosa.mel_frequencies(n_mels=len(mel_filter), fmin=fmin, fmax=fmax)
    fft_freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # Distance of every FFT bin (rows) to every mel bin (columns); argmin over
    # the columns picks the nearest mel bin (the first one on ties).
    nearest_mel_bin = np.abs(fft_freqs[:, np.newaxis] - mel_freqs[np.newaxis, :]).argmin(axis=1)
    return mel_filter[nearest_mel_bin]

N_FFT = 2048
mel_filter = res_dict['filters'][0].detach().cpu().numpy()
waveform = sample.cpu().numpy()
# Build the frequency axes from the real sampling rate of the audio rather than
# librosa's 22050 Hz default, so that each FFT bin is matched to the right mel bin.
# NOTE(review): fmin/fmax stay at 0 Hz and sr/2; pass the settings of the model's
# mel front end here if they differ.
fft_filter = mel_filter_to_fft(mel_filter, n_fft=N_FFT, sr=sr)
S = librosa.stft(y=waveform, n_fft=N_FFT)
# Apply the filter to STFT spectrogram
S_filtered = S * fft_filter[:, np.newaxis]
# Inverse STFT to get waveform; `length` keeps it exactly as long as the original.
y_filtered = librosa.istft(S_filtered, n_fft=N_FFT, length=waveform.shape[-1])

import soundfile as sf
sf.write(f'demos/{dataset}_{DATA_IDX}_original_{pred_before}.wav', waveform, samplerate=sr, subtype='PCM_16')
sf.write(f'demos/{dataset}_{DATA_IDX}_filtered_eps{EPS}_{pred_after}.wav', y_filtered, samplerate=sr, subtype='PCM_16')