In [12]:
# Execute this code block to install dependencies when running on colab
try:
    import torch
except:
    from os.path import exists
    from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
    platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
    cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
    accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

    ! pip install -q http://download.pytorch.org/whl/{accelerator}/torch-1.0.0-{platform}-linux_x86_64.whl torchvision

try: 
    import torchbearer
except:
    ! pip install torchbearer

In [38]:
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchbearer
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torchaudio.datasets import LIBRISPEECH
from torchbearer import Trial
import numpy as np
import torchaudio
import torchaudio.functional as Fa
import torchaudio.transforms as T
import scipy as sp
from scipy import signal
import pytorch_lightning as pl

In [39]:
# fix random seed for reproducibility
seed = 7
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [69]:
# load data
trainset = LIBRISPEECH(".", download=True)
testset = LIBRISPEECH(".", download=True)

In [66]:
n = len(trainset)
nn = 161
print(n , " ", nn)

28539   161


The out to our network is the Short-Term Fourier Transform (STFT) of the last 2 seconds of the
speech signal.

To calculate the STFT:

*   we use a hamming window length of 320 samples
*   hop length of 160 samples
*   FFT size of 320

Resulting in an **out dimension** of 2 × 161 × 204. 

We use a 13 layer convolutional network.

The network outputs a waveform of 0.5 seconds, sampled at 16kHz. 

In [59]:
# trainloader = DataLoader(stftTrainSet, batch_size=32, shuffle=True)
print(len(trainset))
print(trainset[0][0])
# it = iter(trainloader)
# print(next(it)[0])
# print(next(it)[1])

28539
tensor([[-0.0065, -0.0055, -0.0062,  ...,  0.0033,  0.0005, -0.0095]])


In [70]:
stftTrainSet = torch.zeros(n, 161, 201)
# print(stftTrainSet)
i=0
for s in trainset :
  # print(s[2])
  # print(len(s))
  short = s[0][:, -s[1]*2 :] # takes last 2 seconds
  if short.size() != nn:
    short = torch.from_numpy(np.pad(short, [(0, 0), (0, 32000 - [*short.size()][1])], 'constant'))
  stftTrainSet[i]=(torch.stft(short, n_fft=320, hop_length=160, win_length=320, window=torch.hann_window(320), center=True, pad_mode="reflect", return_complex=True ))
  i += 1

print(stftTrainSet[0].shape)
trainloader = DataLoader(stftTrainSet, batch_size=32, shuffle=True)

torch.Size([161, 201])


In [54]:
print(len(trainset))
print(trainset[0][0])
trainloader = DataLoader(stftTrainSet, batch_size=32, shuffle=True)
print(len(trainloader))
it = iter(trainloader)
print(next(it)[0])
print(next(it)[1])

892
tensor([[-5.0163e-01, -4.1474e-01, -4.1267e-01,  ...,  1.5079e-01,
          1.5291e-01, -4.0820e-02],
        [ 2.9828e-01,  1.9604e-01,  2.1379e-01,  ..., -7.3055e-02,
         -9.7550e-02,  1.1011e-01],
        [-9.4276e-02,  9.0133e-03,  5.5531e-03,  ..., -1.5590e-02,
          3.4145e-02, -1.2937e-01],
        ...,
        [-6.7391e-04,  1.7671e-03, -4.4277e-04,  ...,  1.0176e-03,
         -2.2972e-04,  5.9798e-04],
        [ 2.4803e-04, -1.3281e-03, -2.7625e-04,  ..., -1.0205e-03,
         -3.5502e-05, -9.1248e-04],
        [-1.6239e-04,  1.0773e-03,  8.9604e-04,  ...,  1.4257e-03,
          1.3685e-04,  8.5086e-04]])
tensor([[ 1.0264e+00, -1.4137e-01, -1.5015e-01,  ..., -1.6829e-03,
         -5.4164e-03,  3.0341e-02],
        [-1.5221e+00, -8.0741e-01, -6.4040e-01,  ...,  9.8537e-03,
          3.8811e-03, -4.3285e-02],
        [-1.7463e-01,  3.0923e+00,  2.4873e+00,  ..., -2.9448e-02,
          1.8498e-02,  5.3689e-02],
        ...,
        [ 9.5461e-04, -3.0313e-03, -6.1324

In [52]:
# shortTestSet = np.empty([n,nn])
# print(shortTestSet)

# i=0

# for s in testset :

#   short = s[0][:, -s[1]*2 :]
#   if short.size() != nn:
#     short = np.pad(short, [(0, 0), (0, 32000 - [*short.size()][1])], 'constant')
#   shortTestSet[i] = signal.stft(short, nperseg = 320, noverlap = 160, nfft = 320)[0]
#   i += 1

# print(shortTestSet)
# testset = torch.from_numpy(shortTestSet)
# del shortTestSet
# print(testset)
# testloader = DataLoader(testset, batch_size=32, shuffle=True)

The learning rate started at 1.5 * $10^4$ and decreased using an exponential learning rate scheduler, with a learning anneal gamma value of 0.99.

A.2 NETWORK ARCHITECTURE

The architecture is comprised of :

* 8 down-sampling convolutional blocks 
* 4 up-sampling convolutional blocks 
* linear layer

The down-sampling convolutional block is comprised of a reflection padding, followed a 2d convolution layer, followed by a 2d batch norm, followed by a prelu activation function. The first
downsampling block has 64 channels, the second through the seventh have 128, and finally the last
one has 256 channels. 

**The last conv block also has a leaky relu instead of prelu. Here the signal is
reshaped into a one-hot vector.**

The upsampling blocks are comprised of 1-dimensional ConvTranspose 1d, and a leaky relu activation function. The first has 64 channels, the second 32, the third 16, the fourth, 1. Finally, the linear
layer is followed by a tanh activation function.

In [56]:
class CamoCNN(pl.LightningModule):
    def __init__(self):
        super(CamoCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, (5,5), padding=0)
        self.conv2 = nn.Conv2d(64, 128, (5,5), padding=0)
        self.conv3 = nn.Conv2d(128, 128, (5,5), padding=0)
        self.conv4 = nn.Conv2d(128, 128, (5,5), padding=0)
        self.conv5 = nn.Conv2d(128, 128, (5,5), padding=0)
        self.conv6 = nn.Conv2d(128, 128, (5,5), padding=0)
        self.conv7 = nn.Conv2d(128, 128, (5,5), padding=0)
        self.conv8 = nn.Conv2d(128, 256, (5,5), padding=0)

        self.convTrans1 = nn.ConvTranspose1d(256, 64, (5,5), padding=0)
        self.convTrans2 = nn.ConvTranspose1d(64, 32, (5,5), padding=0)
        self.convTrans3 = nn.ConvTranspose1d(32, 16, (5,5), padding=0)
        self.convTrans4 = nn.ConvTranspose1d(16, 1, (5,5), padding=0)
        # ouutput from the convolution layer is 1+ (16+0-5)/1 = 12
        self.fc1 = nn.Linear(12, 1)
            
    def forward(self, input):
       
      # DOWN-SAMPLING BLOCKS

      out = F.pad(input, pad=(2,2,2,2), mode='reflect')
      out = self.conv1(out)
      out = F.batch_norm(out)
      out = F.prelu(out)

      out = F.pad(out, pad=(2,2,2,2), mode='reflect')
      out = self.conv2(out)
      out = F.batch_norm(out)
      out = F.prelu(out)

      out = F.pad(out, pad=(2,2,2,2), mode='reflect')
      out = self.conv3(out)
      out = F.batch_norm(out)
      out = F.prelu(out)

      out = F.pad(out, pad=(2,2,2,2), mode='reflect')
      out = self.conv4(out)
      out = F.batch_norm(out)
      out = F.prelu(out)

      out = F.pad(out, pad=(2,2,2,2), mode='reflect')
      out = self.conv5(out)
      out = F.batch_norm(out)
      out = F.prelu(out)

      out = F.pad(out, pad=(2,2,2,2), mode='reflect')
      out = self.conv6(out)
      out = F.batch_norm(out)
      out = F.prelu(out)

      out = F.pad(out, pad=(2,2,2,2), mode='reflect')
      out = self.conv7(out)
      out = F.batch_norm(out)
      out = F.prelu(out)

      out = F.pad(out, pad=(2,2,2,2), mode='reflect')
      out = self.conv8(out)
      out = F.batch_norm(out) 
      out = F.leaky_relu(out)

      out = out.view(out.shape[0], -1)

      #UP-SAMPLING BLOCKS
      out = self.convTrans1(out)
      out = F.leaky_relu(out)

      out = self.convTrans2(out)
      out = F.leaky_relu(out)

      out = self.convTrans3(out)
      out = F.leaky_relu(out)

      out = self.convTrans4(out)
      out = F.leaky_relu(out)

      #LINEAR LAYER
      out = self.fc1(out)
      out = F.tanh(out)

      return out
    
    def loss_fn(self,out,target):
      return nn.CTCLoss()(out, target, out.size, target.size)
    
    def configure_optimizers(self):
      LR = 1.5*10**(-4)
      gamma = 0.99
      optimizer = torch.optim.lr_scheduler.ExponentialLR(optimizer=torch.optim.SGD(self.parameters(),lr=LR), gamma=gamma)
      return optimizer

    def training_step(self,batch,target):
      
      x, y = batch
      # print(x,target)
      prediction = self(x)
      # print(prediction)
      loss = self.loss_fn(target,prediction)
      self.log('train_loss', loss)
      return loss       

    def validation_step(self,batch,target):
      x, y = batch
      prediction = self(x)
      loss = self.loss_fn(target,prediction)
      prediction = nn.Softmax(-1)(prediction) 
      logits = torch.argmax(prediction,dim=1)
      accu = self.accuracy(logits, target)        
      self.log('valid_loss', loss)
      self.log('train_acc_step', accu)
      return loss, accu

In [57]:
model = CamoCNN()
print(model.training_step(trainloader))

AttributeError: 'int' object has no attribute 'Conv2d'

In [9]:
model = CamoCNN()

trainer = pl.Trainer(gpus=1, max_epochs=100, deterministic=True)

trainer.fit(model, trainloader)
trainer.test(model, testloader)

NameError: name 'Camo' is not defined

In order to train our model, we need to compute the loss between the predicted speech and the ground-truth speech, 

meaning that in training, we need to attack the entire speech signal, not just a small segment. 

We therefore need to schedule our forward and backward passes such that we have computed the attack for the entire
segment before we calculate the gradients.

In [101]:
Epoch = 4
learningRate = 1.5*10**(-4)
gamma = 0.99

In [None]:
# build the model
model = CamoCNN()

# define the loss function and the optimiser
loss_function = nn.CrossEntropyLoss()
optimiser = optim.Adam(model.parameters())

device = "cuda:0" if torch.cuda.is_available() else "cpu"
trial = Trial(model, optimiser, loss_function, metrics=['loss', 'accuracy']).to(device)
trial.with_generators(trainloader, test_generator=testloader)
trial.run(epochs=10)
results = trial.evaluate(data_key=torchbearer.TEST_DATA)
print(results)