In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
# library to read audio in torch
import torchaudio
from torch.nn import CTCLoss
from jiwer import wer
import soundfile as sf

# Shared module-level helpers used by the loss code in this notebook.
# NOTE: despite its name, `softmax` returns *log*-probabilities (LogSoftmax),
# normalised over axis 1.
softmax = torch.nn.LogSoftmax(1)
# CTC criterion; the keyword values below are PyTorch's defaults, spelled out.
ctcloss = CTCLoss(blank=0, reduction="mean", zero_infinity=False)

Importing model & dataset

In [2]:
# Load the pretrained processor and CTC model from a single checkpoint name.
# Use "facebook/wav2vec2-large-960h" here to switch to the large variant.
checkpoint = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# #size for wav2vec2-large-960h
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
# print("Size of wav2vec2-large-960h: ", model.num_parameters())
# print(model)
# #size for wav2vec2-base-960h
# model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# print("Size of wav2vec2-base-960h: ", model.num_parameters())
# print(model)

In [4]:
# Load the validation split of the dummy LibriSpeech dataset.
ds = load_dataset(
    "patrickvonplaten/librispeech_asr_dummy",
    "clean",
    split="validation",
)

# Example of tokenizing an audio file: the first clip as a batch of one.
input_values = processor(
    ds[0]["audio"]["array"],
    sampling_rate=ds[0]["audio"]["sampling_rate"],
    padding="longest",
    return_tensors="pt",
).input_values

In [5]:
# Pick one clip and run it through the processor.
# Open question: can the audio be processed *after* the noise is added while
# still back-propagating the loss?
sampling_rate = ds[0]["audio"]["sampling_rate"]
audio = processor(
    ds[0]["audio"]["array"],
    sampling_rate=sampling_rate,
    padding="longest",
    return_tensors="pt",
).input_values

# Sentence the attacked model should output.
target_text = "THE CAT IS INSIDE MY BAG AND IT ROLLS ON THE FLOOR"
# One token per character, with spaces written as "|" (assumes a single sentence).
target = list(target_text.replace(" ", "|"))
# Token ids for the target, as a tensor (named "logits" for historical reasons).
target_logits = torch.tensor(processor.tokenizer.convert_tokens_to_ids(target))

# Move model and tensors to the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
audio = audio.to(device)
target_logits = target_logits.to(device)

In [6]:
def ctc_loss(logits, targets):
    """Evaluate the module-level ``ctcloss`` criterion for a single sequence.

    logits: log-probabilities whose first axis is the frame (time) axis
    targets: target ids whose first axis has the target length

    The input and target lengths are taken from the first dimension of each
    argument. Only exercised with batch_size = 1.
    """
    n_frames = logits.shape[0]
    n_targets = targets.shape[0]
    return ctcloss(
        logits,
        targets,
        torch.tensor([n_frames]),
        torch.tensor([n_targets]),
    )

In [7]:
def loss_function(audio, noise, target_logits, model, reg_weight, ctc_weight, eps):
    """
    Weighted attack objective for ``audio + noise``.

    The loss is ``reg_weight * ||noise||_2 + ctc_weight * CTC(model(audio + noise), target)``.
    The relative loudness of the noise is measured but is not part of the loss;
    it only decides whether a loss is returned at all.

    audio: original audio after processing (for now)
    noise: perturbation added to the audio (the tensor being optimised)
    target_logits: target ids handed to ``ctc_loss`` for the sentence to attack towards
    model: model to be attacked; its output must expose ``.logits``
    reg_weight: weight of the noise-norm regularisation term
    ctc_weight: weight of the CTC term
    eps: upper bound on the relative noise level ``dB_x_delta``

    Returns ``(loss, ctc_value, noise_norm, dB_x_delta)`` with the last three as
    Python floats while ``dB_x_delta < eps``; otherwise prints the four values
    and returns ``(None, None, None, None)``.
    """
    # Loudness of signal and noise, each measured on the min-shifted tensor.
    # NOTE(review): this is a peak-to-peak measure, not max|x| — confirm intent.
    dB_x = (20 * torch.log10(audio - audio.min())).max()
    dB_delta = (20 * torch.log10(noise - noise.min())).max()
    dB_x_delta = dB_delta - dB_x

    # Forward pass on the perturbed input; drop the batch axis, take
    # log-probabilities, then re-insert a batch axis of one in the middle.
    audio_perturbed = audio + noise
    log_probs = softmax(model(audio_perturbed).logits[0]).unsqueeze(1)

    ctc_loss_value = ctc_loss(log_probs, target_logits)
    noise_reg = noise.norm(p=2)
    loss = reg_weight * noise_reg + ctc_weight * ctc_loss_value

    if not (dB_x_delta < eps):
        # Noise budget exhausted: report the values and signal the caller to stop.
        print(loss.item(), ctc_loss_value.item(), noise_reg.item(), dB_x_delta.item())
        return None, None, None, None
    return loss, ctc_loss_value.item(), noise_reg.item(), dB_x_delta.item()


In [8]:
# Adversarial perturbation, optimised directly and initialised to all zeros.
noise = torch.zeros_like(audio)
noise.requires_grad = True

lr = 0.05
optimizer = torch.optim.Adam([noise], lr=lr)
reg_weight = 0           # weight of the noise-norm penalty (raised after the first hit)
ctc_weight = 1           # weight of the CTC term
eps = 10                 # loss_function returns None once dB_x_delta reaches this
# N_iter = 1000
losses = []              # per-iteration [loss, ctc, noise norm, dB_x_delta], rounded
min_loss = 100000
min_noise = None         # copy of the noise that produced `min_loss`
desired_noise_db = -60   # stop once the target is predicted below this level

i = 0
hashit = False           # True after the target sentence was predicted once
while True:
    optimizer.zero_grad()
    loss, ctc_loss_value, noise_reg, dB_x_delta = loss_function(audio, noise, target_logits, model, reg_weight, ctc_weight, eps)
    if loss is None:
        print("Maximum noise level reached, stopping.")
        break

    itemized_loss = loss.item()
    if itemized_loss < min_loss:
        # Snapshot BEFORE the optimizer step so the stored noise is the one this
        # loss was computed for. clone() is required: on CPU,
        # detach().cpu().numpy() shares memory with `noise` and would silently
        # follow every later update.
        min_loss = itemized_loss
        min_noise = noise.detach().clone().cpu().numpy()

    loss.backward()
    optimizer.step()

    losses_to_observe = [round(itemized_loss,2), round(ctc_loss_value,2), round(noise_reg,2), round(dB_x_delta,2)]
    losses.append(losses_to_observe)

    if i % 10 == 0:
        print()
        print("Iteration: ", i+1)
        print("  Overall loss: ", round(itemized_loss,2))
        print("  CTC loss: ", round(ctc_loss_value,2))
        print("  Noise regularization: ", round(noise_reg,2))
        print("  dB_x_delta: ", round(dB_x_delta,2))
        print(f"  reg_weight: {reg_weight}, ctc_weight: {ctc_weight}, lr: {optimizer.param_groups[0]['lr']}")
        # Monitoring only: no gradients needed, so skip building the graph.
        with torch.no_grad():
            test = audio + noise
            logits = model(test).logits
        predicted_sentence = processor.batch_decode(torch.argmax(logits, dim=-1))[0]
        print("  Current prediction:", predicted_sentence)

        if predicted_sentence == target_text:
            # Attack currently succeeds: push harder on shrinking the noise.
            reg_weight *= 1.05

            if not hashit:
                print("===============================")
                print("  !!!! FIRST HIT !!!!")
                print("===============================")
                hashit = True
                print("  Adjusting objective...")
                # Switch from "reach the target" to "reach it quietly".
                reg_weight = 0.1
                ctc_weight = 1
                optimizer.param_groups[0]["lr"] = 0.001

            if dB_x_delta < desired_noise_db:
                print("===============================")
                print("  !!!! DESIRED NOISE LEVEL REACHED !!!!")
                print("  Happily exiting...")
                break
        else:
            # Attack currently fails: relax the noise penalty.
            reg_weight *= 0.95


    i += 1

  return F.conv1d(input, weight, bias, self.stride,



Iteration:  1
  Overall loss:  26.68
  CTC loss:  26.68
  Noise regularization:  0.0
  dB_x_delta:  -inf
  reg_weight: 0, ctc_weight: 1, lr: 0.05
  Current prediction: BECAUSE HE IS SLEEPING INSTEAD OF CONQUERING THE LOVELY RUSE PRINCESS HAS BECOME A FIDDLE WITHOUT A BO OH PORA SHAGGY SITS THERE ACOOING DOVE

Iteration:  11
  Overall loss:  4.54
  CTC loss:  4.54
  Noise regularization:  84.7
  dB_x_delta:  -23.47
  reg_weight: 0.0, ctc_weight: 1, lr: 0.05
  Current prediction: THE GOSRILEEPIN IMSEM ONCLAN THEROVLEROS ANSO LITTLE BOR OR PORFAGETSE OCUNDOF

Iteration:  21
  Overall loss:  2.32
  CTC loss:  2.32
  Noise regularization:  111.04
  dB_x_delta:  -19.45
  reg_weight: 0.0, ctc_weight: 1, lr: 0.05
  Current prediction: THE CORSU OLYRU ANDITBO OF TORFEGE U

Iteration:  31
  Overall loss:  1.66
  CTC loss:  1.66
  Noise regularization:  122.58
  dB_x_delta:  -17.34
  reg_weight: 0.0, ctc_weight: 1, lr: 0.05
  Current prediction: THE COSU Y RUG AND ITBOLL OFTOEG

Iteration:  41
 

KeyboardInterrupt: 

Test the last noise:

In [9]:
# Evaluate the noise as it stands after the optimisation loop
# (alternative: test = audio + torch.tensor(min_noise).to(device)).
test = audio + noise
logits = model(test).logits
# Greedy decoding: most likely token per frame, then print the transcription list.
print(processor.batch_decode(logits.argmax(dim=-1)))

['THE CAT IS INSIDE MY BAG AND IT ROLLS ON THE FLOOR']


In [10]:
# Write the perturbed input (audio + noise) to disk, detached from autograd and on CPU.
# NOTE(review): `test` is derived from the processor output rather than the raw
# waveform — confirm its value range is suitable for playback.
torchaudio.save("test.wav", test.detach().cpu(), sampling_rate)

In [None]:
# Fetch the tokenizer vocabulary and print it for inspection.
processor_dict = processor.tokenizer.get_vocab()
print(processor_dict)

In [None]:
# Deliberate stop: raising here keeps a "Run All" from executing the scratch cells below.
raise Exception("stop here as below is trash")

### EVERYTHING BELOW IS TRASH DO NOT BOTHER WITH IT

unless you need inspiration or something

In [None]:
# Process the first clip, move it to `device`, and keep the model's logits; then display them.
res = model(processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest", sampling_rate = ds[0]["audio"]["sampling_rate"]).input_values.to(device)).logits
res

In [None]:
# `res` still carries the batch axis here (the argmax below runs over axis 2),
# so the module-level `softmax` (LogSoftmax over axis 1) would normalise over the
# wrong axis and could change the per-frame argmax. Normalise over the same
# axis the argmax uses instead.
res = torch.log_softmax(res, dim=2)
# Most likely token id per frame.
res = torch.argmax(res, dim=2)
sentence = processor.batch_decode(res)
sentence

In [None]:
# Display the current value of `res`.
res

In [None]:
# Character-by-character token ids for the target sentence.
# NOTE(review): spaces are passed through unchanged here, whereas the attack
# setup cell replaces " " with "|" before converting — confirm which is intended.
processor.tokenizer.convert_tokens_to_ids([c for c in "THE CAT IS INSIDE MY BAG AND IT ROLLS ON THE FLOOR"])

Test the best noise:

In [None]:
# Rebuild the perturbed input from `min_noise`, the noise recorded for the lowest loss.
test = audio + torch.tensor(min_noise).to(device)
# test = audio + noise
logits = model(test).logits
# Print the greedy (argmax) transcription.
print(processor.batch_decode(torch.argmax(logits, dim=-1)))
# Save the perturbed input to disk.
torchaudio.save("best_test.wav", test.detach().cpu(), sampling_rate)

In [None]:
# Character-level token ids for the reference transcript of sample 0.
# NOTE(review): spaces are not mapped to "|" here, unlike the attack setup cell.
target_sentence = list(ds[0]["text"])
target_logits = torch.tensor(processor.tokenizer.convert_tokens_to_ids(target_sentence))

# Same for sample 5, used as a mismatching target.
test_target_sentence = list(ds[5]['text'])
test_target_logits = torch.tensor(processor.tokenizer.convert_tokens_to_ids(test_target_sentence))

In [None]:
# NOTE(review): `output_logits` is only assigned in a later cell of this
# notebook, so this cell relies on out-of-order execution.
softmaxed_logits = softmax(output_logits)

# Decode the argmax prediction and print it next to the dataset transcript.
predicted_sentence = processor.decode(torch.argmax(softmaxed_logits, dim=-1))
print(predicted_sentence)
print(ds[0]['text'])

In [None]:
# NOTE(review): the `ctc_loss` helper defined earlier in this notebook takes two
# arguments (logits, targets); this four-argument form matches how `ctcloss` is
# called elsewhere — confirm which callable was intended.
ctc_loss(output_logits.unsqueeze(1), target_logits.unsqueeze(0), torch.tensor([output_logits.shape[0]]), torch.tensor([target_logits.shape[0]]))

In [None]:
# Display the target length wrapped in a nested (1 x 1) tensor.
torch.tensor([[target_logits.shape[0]]])

In [None]:
# Run the model on `audio`, drop the batch axis and apply the module-level
# `softmax` (a LogSoftmax over axis 1).
output_logits = model(audio).logits
output_logits = softmax(output_logits[0])

# Decode the argmax prediction and print it.
predicted_sentence = processor.decode(torch.argmax(output_logits, dim=-1))
print(predicted_sentence)

In [None]:
# Token ids for the transcript of a different sample, used as a mismatching target.
# test_target_sentence = ds[1]['text']
test_target_sentence = ds[2]['text']
test_target_sentence = [c for c in test_target_sentence]
test_target_logits = processor.tokenizer.convert_tokens_to_ids(test_target_sentence)
test_target_logits = torch.tensor(test_target_logits)

# NOTE(review): `ctc_loss` reads the target length from targets.shape[0], which
# is 1 after unsqueeze(0) — confirm the intended target shape.
ctc_loss(output_logits, test_target_logits.unsqueeze(0))

In [None]:
# Display the shape of the mismatching target id tensor.
test_target_logits.shape

In [None]:
# Tokenize the first clip (batch of one).
input_values = processor(
    ds[0]["audio"]["array"],
    sampling_rate=ds[0]["audio"]['sampling_rate'],
    padding="longest",
    return_tensors="pt",
).input_values

# Forward pass, then greedy decoding: argmax id per frame -> text.
logits = model(input_values).logits
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)

In [None]:
# Save the raw waveform of sample 0 with a leading axis added.
# NOTE: this overwrites the global `audio` and the file "test.wav".
audio = torch.Tensor(ds[0]["audio"]["array"]).unsqueeze(0)
rate = ds[0]["audio"]["sampling_rate"]
torchaudio.save("test.wav", audio, rate)

In [None]:
# Same transcription pipeline as above, but without passing `sampling_rate` to the processor.
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values  # Batch size 1
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

In [None]:
# Reference sentence kept for comparison.
good_transcription = 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'

# NOTE: rebinds `transcription` to a plain string (other cells assign it a
# batch_decode result), so `transcription[0]` is now a single character.
transcription = "THE CAT JUMPED OVER THE FOX WHERE IT HAS SHOWN US THE WORLD"

In [None]:
# Token ids for the characters of `transcription[0]`, as a (1, S) long tensor.
# NOTE(review): if `transcription` is the plain string assigned in the cell
# above, `transcription[0]` is only its first character — confirm.
target = processor.tokenizer.convert_tokens_to_ids([c for c in transcription[0]])
target = torch.Tensor(target).unsqueeze(0).long()
# Lengths handed to the CTC criterion.
target_shape = target[0].shape[0]
logits_shape = logits[0].shape[0]

In [None]:
# CTC criterion on the un-batched log-probabilities and target ids, with lengths given as lists.
ctcloss(log_probs=softmax(logits[0]), targets=target[0], input_lengths=[logits_shape], target_lengths=[target_shape])

In [None]:
# Display the per-frame argmax token ids.
predicted_ids

In [None]:
# Map the per-frame ids of the first sequence back to their tokens and display them.
predicted_characters = processor.tokenizer.convert_ids_to_tokens(predicted_ids[0].tolist())
predicted_characters

In [None]:
# Split `transcription[0]` into a list of characters and display it.
transcription_list = [c for c in transcription[0]]
transcription_list

In [None]:
# Token ids for the character list built above.
processor.tokenizer.convert_tokens_to_ids(transcription_list)

In [None]:
import itertools
# Collapse runs of identical consecutive ids: keep one key per run.
result = [key for key, _run in itertools.groupby(predicted_ids[0])]
# Drop id 0 (treated as the blank here).
result = [tok for tok in result if tok != 0]
# Number of tokens that remain.
len(result)

In [None]:
# Count the frames whose predicted id is greater than 0.
(predicted_ids>0).sum()

In [None]:
# Attempt to convert the transcription via the processor.
# NOTE(review): elsewhere in this notebook the processor is called this way with
# audio arrays, not text — confirm this produces what the name suggests.
transcription_logits = processor(transcription, return_tensors="pt", padding="longest").input_values

In [None]:
# Load the two extra recordings from disk (paths are relative to the working directory).
files = ["extra2a.wav", "extra2b.wav"]
# Read each file into a waveform tensor plus its sample rate.
# NOTE: this overwrites the global `audio` / `rate`.
audio, rate = torchaudio.load(files[0])
audio2, rate2 = torchaudio.load(files[1])

In [None]:
# Prepare the first row of the first recording (audio[0]) as model input.
input_values = processor(audio[0], return_tensors="pt", padding="longest", sampling_rate=rate).input_values  # Batch size 1

In [None]:
# Display the shape of the loaded waveform.
audio.shape

In [None]:
# Display the shape of the processed model input.
input_values.shape

In [None]:
# Forward pass on the prepared input.
logits = model(input_values).logits

# Greedy decoding: most likely token id per frame.
predicted_ids = torch.argmax(logits, dim=-1)

# Convert the ids to text and display the result.
transcription = processor.batch_decode(predicted_ids)
transcription

In [None]:
# Transcribe a single recording end to end.
file = "extra_0a.wav"
audio, rate = torchaudio.load(file)
# Prepare the first row of the waveform (audio[0]) as a batch of one.
input_values = processor(
    audio[0],
    sampling_rate=rate,
    padding="longest",
    return_tensors="pt",
).input_values
logits = model(input_values).logits

# Greedy decoding: argmax id per frame, then ids -> text.
predicted_ids = logits.argmax(dim=-1)

transcription = processor.batch_decode(predicted_ids)
transcription

In [None]:
# Display the shape of the logits for the first sequence.
logits[0].shape

Goal: Minimize 

$dB_x(\delta) + c \cdot l(x+\delta, t)$

where 

$dB_x(\delta)$ is the strength of the noise compared to the signal

$c$: tradeoff parameter between being adversarial and being close to the original signal

$l(x+\delta, t)$: the loss between the model's prediction on the perturbed signal $x+\delta$ and the adversarial target sentence $t$

we define $l$ as the CTC-loss, so we can say:

$l(x+\delta, t) = -\log \Pr(t \mid x+\delta)$

In [None]:
target = ["THE CAT IS INSIDE MY BAG AND IT ROLLS ON THE FLOOR"]
# Assumes `target` is a list holding exactly one sentence.
# Split it into characters, writing spaces as "|" exactly as the attack setup
# cell does; a bare " " would otherwise be looked up as a token of its own.
target = list(target[0].replace(" ", "|"))
# Convert the characters to a tensor of token ids.
target_logits = processor.tokenizer.convert_tokens_to_ids(target)
target_logits = torch.tensor(target_logits)

# Prepare model, audio and target on the compute device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
audio = audio.to(device)
# NOTE: overwrites the global `audio` with the processed version, so this cell
# is not idempotent.
audio = processor(audio, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
audio = audio[0].to(device)
target_logits = target_logits.to(device)

In [None]:
# minimize dB_x(delta) - logPr(t|f(x+delta)), such that dB_x(delta) < eps
# delta = argmin dB_x(delta) - logPr(t|f(x+delta))
# dB_x(delta) = dB(delta) - dB(x) = 20*log10(||delta||_2 / ||x||_2)
# x = audio
# t = target transcription
# f = model
# eps = max distortion
# delta = perturbation


# define loss function
def loss_function(audio, noise, target_logits, model, eps, ctc_constant):
    """Return the weighted CTC loss of ``audio + noise`` against the target ids.

    NOTE: this redefines (shadows) the ``loss_function`` of the earlier attack
    cell and has a different signature.

    audio: clean input fed to ``model``
    noise: perturbation added to ``audio`` (the tensor being optimised)
    target_logits: 1-D tensor of target token ids
    model: callable whose result exposes ``.logits`` with a leading batch axis
    eps: distortion bound; accepted for interface compatibility but not
        enforced (the previous version returned the loss on both sides of the
        check as well)
    ctc_constant: weight applied to the CTC term

    Returns a scalar tensor that is meant to be *minimised*.
    """
    audio_perturbed = audio + noise

    logits = model(audio_perturbed).logits
    logits = logits[0]  # remove batch dimension -> (time, classes)
    # Functional API keeps this function independent of module-level state.
    log_probs = torch.nn.functional.log_softmax(logits, dim=1)

    # The CTC loss already IS -log Pr(t | x + delta). It must therefore enter the
    # objective with a positive sign; negating it (as before) drives the
    # optimiser away from the target transcription.
    nll = torch.nn.functional.ctc_loss(
        log_probs.unsqueeze(1),  # (time, batch=1, classes)
        target_logits,
        torch.tensor([log_probs.shape[0]]),
        torch.tensor([target_logits.shape[0]]),
    )
    return ctc_constant * nll

In [None]:
# Distortion bound handed to loss_function.
eps = 10
# Number of optimisation steps.
n = 5000
# Adam learning rate.
lr = 1e-1
# Perturbation to optimise: starts at zero, i.e. the input starts as the clean audio.
noise = torch.zeros_like(audio).requires_grad_(True)
# Optimise the perturbation only; the model parameters are not passed to Adam.
optimizer = torch.optim.Adam([noise], lr=lr)
ctc_constant = 1

# Run n optimisation steps.
for i in range(n):
    # Reset gradients from the previous step.
    optimizer.zero_grad()
    # Evaluate the objective for the current perturbation.
    loss = loss_function(audio, noise, target_logits, model, eps, ctc_constant=ctc_constant)
    # A None loss is the stop signal.
    if loss is None:
        break
    # NOTE: prints on every iteration (up to n times), not only at the end.
    print("final loss")
    print(loss)
    # Back-propagate to the perturbation.
    loss.backward()
    # Update the perturbation.
    optimizer.step()
#    print(audio_pert)
# Save the adversarial example (audio + noise) to disk.
audio_pert = (audio+noise).detach().to("cpu")
torchaudio.save("adversarial_one.wav", audio_pert, rate)

In [None]:
# Display the last computed loss.
loss

In [None]:
# Display the dtype of `audio`.
audio.dtype

In [None]:
# Display the perturbed audio tensor.
audio_pert

In [None]:
# Display the perturbed audio tensor (same output as the previous cell).
audio_pert

In [None]:
# Display the current `audio` tensor.
audio

In [None]:
# Softmax over axis 1 of the model's logits.
# NOTE(review): other cells drop the batch axis (logits[0]) before normalising
# over axis 1; here the logits are used as returned — confirm the intended axis.
# NOTE(review): `perturbed_audio` is only assigned in a later cell.
softmaxed = torch.nn.Softmax(dim=1)
probs = softmaxed(model(perturbed_audio).logits)

In [None]:
# Sum of all entries of `probs` (sanity check on the normalisation).
probs.sum()

In [None]:
# Alternative target sentence; the conversion to tokens was never written.
target_sentence = "THE WILL BURN YOU TO A CRISP"
# TODO: convert to tokens

In [None]:
# NOTE(review): in the optimisation cell `audio_pert` is assigned (audio + noise),
# so this sum would contain `audio` twice — confirm what `audio_pert` holds here.
perturbed_audio = audio + audio_pert
# Ratio of the two norms in dB; the value is computed but not used below.
dB_x = 20 * torch.log10(torch.norm(audio) / torch.norm(audio_pert))
# Forward pass and greedy decoding of the perturbed input.
logits = model(perturbed_audio).logits
pred = processor.batch_decode(torch.argmax(logits, dim=-1))

#print(target)
print(pred)

In [None]:
# Display the Python type of the decoded prediction.
type(pred)