**SOW-MKI49: Neural Information Processing Systems**  
*Weeks 4 and 5: Assignment (225 points + 30 bonus points)*  
Author: Umut

In [0]:
# Group number: 2
# Parsia Basimfar, student 1 number: 1022274
# Luca Missoni, student 2 number: s1028912

In [1]:
from chainer import ChainList, optimizers, serializers
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np

**WaveNet component (75 points)**

* Implement missing parts of the call method (y and z). **25 points**
* Implement residual block class. **50 points**

---
Reminder:

* One convolution layer that has 61 kernels of size 2 with no nonlinearities.
![alt text](http://i67.tinypic.com/21mgi2w.png)
![alt text](http://i67.tinypic.com/292n04y.png)
---



In [12]:
class _WaveNet(ChainList):
    def __init__(self):
        links = (L.Convolution2D(61, 61, (1, 2)),)  # Layer 1
        links += tuple(_ResidualBlock((1, 2 ** (i % 6))) for i in range(6))
        links += (L.Convolution2D(512, 512, 1),
                  L.Convolution2D(512, 3843, 1))

        super(_WaveNet, self).__init__(*links)

    def __call__(self, x):
        y = (self[0](F.pad(x, ((0, 0), (0, 0), (0, 0), (1, 0)), 'constant')),)
        z = 0

        for i in range(1, len(self) - 2):
            y = self[i](y[0])
            z += F.relu(y[1])

        y, z = F.split_axis(self[-1](F.relu(self[-2](z))), (3721,), 1)
        
        return F.reshape(y, (y.shape[0], 61, 61, y.shape[3])), \
               F.reshape(z, (z.shape[0], 2, 61, z.shape[3]))


class _ResidualBlock(ChainList):  # what is happening inside every block (of the six blocks)
    def __init__(self, dilation):
        super(_ResidualBlock, self).__init__(

            L.DilatedConvolution2D(61, 122, ksize=(1, 2), dilate=dilation),  # Layer 2 #gated activation unit
            L.Convolution2D(61, 573, ksize=1))

    def __call__(self, x):
        h = F.split_axis(self[0](F.pad(x, ((0, 0), (0, 0), (0, 0), (self[0].dilate[1], 0)), 'constant')), 2, 1)
        y = F.sigmoid(h[0]) * F.tanh(h[1])
        y = F.split_axis(self[1](y), (61,), 1)
        return x + y[0], y[1]


**CRF-RNN component (50 points)**

* Implement missing parts of the call method (z). **25 points**
* Why is z not normalized in the last iteration? **25 points**

---

Reminder:

![alt text](http://i68.tinypic.com/sy6mix.png)

---

In [13]:
class _CRF(ChainList):
    def __init__(self):
        super(_CRF, self).__init__(L.ConvolutionND(1, 2, 2, 1, nobias=True))

    def __call__(self, x, y):

        z = F.softmax(y)

        for i in range(5):

            z = -y - self[0](F.batch_matmul(z, x))

            if i < 4:
                z = F.softmax(z)

        return z


# Question's answer:
    # z is not normalized in the last iteration because it is normalized in the forward
    # propagation during the training of the WaveCRF model.

**WaveCRF model (50 points)**

1. Implement missing parts of the call method (k, psi_u and Q_hat). **20 points**
2. Implement missing parts of the save and load methods (save and load model). **10 points**
3. Implement missing parts of the test and train methods (forward and/or backward propagate). **20 points**

In [14]:
class WaveCRF(object):
    def __init__(self):
        self.log = {('test', 'accuracy'): (), ('test', 'loss'): (), ('training', 'accuracy'): (),
                    ('training', 'loss'): ()}
        self.model = ChainList(_WaveNet(), _CRF())
        self.optimizer = optimizers.Adam(0.0002, 0.5)

        self.optimizer.setup(self.model)

    def __call__(self, x):
        k, psi_u = self.model[0](x)

        Q_hat = self.model[1](F.reshape(F.transpose(k, (0, 3, 1, 2)), (-1, 61, 61)),
                              F.reshape(F.transpose(psi_u, (0, 3, 1, 2)), (-1, 2, 61)))

        return F.transpose(F.reshape(Q_hat, (x.shape[0], x.shape[3], 2, 61)), (0, 2, 3, 1))

    @classmethod
    def load(cls, directory):
        self = cls()
        self.log = np.load('{}/log.npy'.format(directory))

        # Load model
        serializers.load_npz('{}/model.npz'.format(directory), self.model)
        serializers.load_npz('{}/optimizer.npz'.format(directory), self.optimizer)

        return self

    def save(self, directory):
        np.save('{}/log.npy'.format(directory), self.log)
        # Save model
        serializers.save_npz('{}/optimizer.npz'.format(directory), self.optimizer)

    def test(self, Q, x):
        with chainer.using_config('train', False):
            Q_hat = self(x)
            loss = F.softmax_cross_entropy(Q_hat, Q)
            # Forward prop

            self.log['test', 'accuracy'] += (float(F.accuracy(Q_hat, Q).data),)
            self.log['test', 'loss'] += (float(loss.data),)
            return Q_hat

    def train(self, Q, x):
        # Forward prop
        # Forward prop
        Q_hat = self(x)
        loss = F.softmax_cross_entropy(Q_hat, Q)
        # Backprop
        # Backprop
        # Backprop

        self.model.cleargrads()
        loss.backward()
        self.optimizer.update()

        self.log['training', 'accuracy'] += (float(F.accuracy(Q_hat, Q).data),)
        self.log['training', 'loss'] += (float(loss.data),)


In [15]:
%matplotlib inline

import IPython
import chainer
import matplotlib
import numpy
import os
import pickle
import random
import tqdm

In [16]:
batch_size = 30
epochs = 70


In [17]:
with open('piano_rolls.p', 'rb') as f:
    piano_rolls = pickle.load(f)

keys = sorted(piano_rolls.keys())

random.seed(6)
random.shuffle(keys)

test_set = dict((key, piano_rolls[key]) for key in keys[:int(0.1 * len(keys))])
training_set = dict((key, piano_rolls[key]) for key in keys[int(0.1 * len(keys)):])
training_set_keys = list(training_set.keys())

In [18]:
waveCRF = WaveCRF()

waveCRF.model.to_gpu()

<chainer.link.ChainList at 0x20482078668>

In [19]:
for epoch in tqdm.tnrange(epochs):
    print('epoch {}'.format(epoch))
    random.shuffle(training_set_keys)
    
    batch = ()

    for key in tqdm.tqdm_notebook(training_set_keys, leave = False):
        i = random.randint(0, training_set[key].shape[1] - 80)
        batch += (training_set[key][32 : 93, i : i + 80],)

        if len(batch) == batch_size:
            batch = waveCRF.model.xp.array(batch)

            waveCRF.train(batch[:, :, 1:].astype('i'), batch[:, :, None, :-1].astype('f'))

            batch = ()

    for key in tqdm.tqdm_notebook(test_set, leave = False):
        batch = waveCRF.model.xp.array((test_set[key][32 : 93],))

        waveCRF.test(batch[:, :, 1:].astype('i'), batch[:, :, None, :-1].astype('f'))

    IPython.display.clear_output()

    for i, key in enumerate(waveCRF.log):
        matplotlib.pyplot.subplot(221 + i)
        if np.size(numpy.array(waveCRF.log[key])) % 2 == 0:
            matplotlib.pyplot.plot(numpy.array(waveCRF.log[key]).reshape(2, -1).mean(1))
        else: 
            matplotlib.pyplot.plot(numpy.array(waveCRF.log[key]).reshape(1, -1).mean(1))
        matplotlib.pyplot.xlabel('iteration')
        matplotlib.pyplot.ylabel(key)
    matplotlib.pyplot.tight_layout()
    matplotlib.pyplot.show()
    os.makedirs('Models/WaveCRF2/{}'.format(epoch))
    waveCRF.save('Models/WaveCRF2/{}'.format(epoch))
    
    
    # For this part I had to add the conditional to the plotting to avoid errors. 
    # Ignore the Error Message Below




TypeError: not all arguments converted during string formatting

In [None]:
###### TEST ###########


test_set_keys = list(test_set.keys())
root = 
finalModel=WaveCRF()
finalModel.load('Models/WaveCRF2/69')
key=[1,10,79,100] # choose whatever key from the test set
size=80 # time points
     
batch1= finalModel.model.xp.array((test_set[test_set_keys[key[0]]][32 : 93],))[:,:,:size] # we use just the 80 first points of the 61
batch2= finalModel.model.xp.array((test_set[test_set_keys[key[1]]][32 : 93],))[:,:,:size] # we use just the 80 first points of the 61
batch3= finalModel.model.xp.array((test_set[test_set_keys[key[2]]][32 : 93],))[:,:,:size] # we use just the 80 first points of the 61
batch4= finalModel.model.xp.array((test_set[test_set_keys[key[3]]][32 : 93],))[:,:,:size] # we use just the 80 first points of the 61
batch=[batch1,batch2,batch3,batch4]

batchEnd=[];

length=len(batch1[0,0,:])
for i in range(0,4):
    batchT=[]
    batchT=batch[i]
    for k in range(0,length): #we predict 80 seconds if front of the sample that we chose. From the test samplewe keep the 80 first seconds

        sample=finalModel.test(batchT.astype('i'), batchT[:, :, None, :].astype('f'))
        sample_soft=F.softmax(sample.data)
        fsample=sample_soft.data[:, :,:,size-1]

        test=sample_soft.data[0, 1,:,size-1]

        batchN= np.random.rand(1, 61) < test 
        batchT[0,:,size-1]=batchN[0,:]
        batchT[0,:,:size-1]=batchT[0,:,1:size]
        
    batch[i]=batchT


    
batch11= finalModel.model.xp.array((test_set[test_set_keys[key[0]]][32 : 93],))[:,:,:2*size] # we use just the 80 first points of the 61
batch11[0,:,size:2*size]=batch[0]
batch22= finalModel.model.xp.array((test_set[test_set_keys[key[1]]][32 : 93],))[:,:,:2*size] # we use just the 80 first points of the 61
batch22[0,:,size:2*size]=batch[1]
batch33= finalModel.model.xp.array((test_set[test_set_keys[key[2]]][32 : 93],))[:,:,:2*size] # we use just the 80 first points of the 61
batch33[0,:,size:2*size]=batch[2]
batch44= finalModel.model.xp.array((test_set[test_set_keys[key[3]]][32 : 93],))[:,:,:2*size] # we use just the 80 first points of the 61
batch44[0,:,size:2*size]=batch[3]
batchFinal=[batch11,batch22,batch33,batch44];


In [None]:
import midi

sample = np.load("nipsData.npy")
sample1 = sample[0,0,:,:]
sample2 = sample[0,1,:,:]


piano_roll = sample1 > .4
midi.utils.midiwrite('piano_roll.mid', piano_roll.T, (32, 93), 0.25) 
'''


I could not get around the module has no object named utils ImportError


In [None]:
**Bonus question (30 points)**

* Discuss how you can improve the model (you can talk about different architectures or different ways to encode the inputs, etc.) **10 points**
* Discuss the assumptions behind the meanfield approximation and its shortcomings. **10 points**
* Prove that the iterative update equation (CRF-RNN component) is differentiable so that we can backpropagate through them. **10 points**


In [None]:
#1 (begin)
# For example one way to improve the model was proposed by the same authors of the Wavenet. As sequential
# generation is rather slow one can use parallel generation and use the Original Wavenet as a guide to train the 
# Parallel Wavenet. 
#end

#2 (begin)
# One of the assumptions is that the posterior distribution is drawn from an independent distribution. For example
# a shortcoming in this case is that notes in a classical music do not follow each other independently, rather,
# there exists some covariance between each note. The prior distribution is also optimized in a way so that Kull_
# back-Liebler divergence is minimum. This is also a shortcoming as with the increase in sample size the entropy a
# also increases which may lead to less accurate 'Bachian' regenerations 