In [37]:
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn #neural network module
import torch.optim as optim #optimizer
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import time as time
from tqdm import tqdm
import gc

In [38]:
# Select the compute device once: CUDA when a GPU is visible, CPU otherwise.
import os
# Debugging aid: makes CUDA kernel launches synchronous so errors surface at the failing call.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('GPU available' if device.type == "cuda" else 'GPU not available')

GPU available


In [39]:
from google.colab import drive

# Mount Google Drive inside the Colab VM.
DRIVE_MOUNT = '/content/drive'
drive.mount(DRIVE_MOUNT)

# Dataset locations are derived from the mount point rather than from a path relative
# to the current working directory, so they stay valid if the notebook changes directory.
datasetpath = os.path.realpath(os.path.join(DRIVE_MOUNT, 'MyDrive', 'Dataset_CP.pt'))
PolDataset = os.path.realpath(os.path.join(DRIVE_MOUNT, 'MyDrive', 'PolyphonicDataset.pt'))

#Loading monophonic and polyphonic classes
class MonophonicDataset(Dataset):
    """Random subset of the monophonic dataset stored at ``datasetpath``.

    The file is expected to map an instrument name to a sequence of samples.
    ``__getitem__`` reads the keys ``'Bars'`` (pair: previous bar, current bar),
    ``'Program'`` and ``'Tempo'`` from each sample.
    """

    def __init__(self, Instruments, fraction=0.7):
        """Load the file and keep a random ``fraction`` of the samples of each instrument.

        Args:
            Instruments: iterable of keys to select from the loaded mapping.
            fraction: share of each instrument's samples to keep, in [0, 1].

        Raises:
            ValueError: if ``fraction`` is outside [0, 1].
        """
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(f"fraction must be in [0, 1], got {fraction}")

        # NOTE(security): weights_only=False unpickles arbitrary Python objects;
        # only load dataset files you trust. Matches how PolyphonicDataset loads its file.
        DS = torch.load(datasetpath, weights_only=False)
        self.Data = []
        self.Instruments = Instruments

        for inst in Instruments:
            samples = DS[inst]
            n_keep = int(len(samples) * fraction)
            # Draw indices WITHOUT replacement so the subset holds no duplicated samples.
            picked = np.random.choice(len(samples), n_keep, replace=False)
            self.Data.extend(samples[i] for i in picked)

        # The full mapping is no longer needed once the subset has been copied out.
        del DS
        gc.collect()

    def __len__(self):
        """Number of samples kept across all selected instruments."""
        return len(self.Data)

    def __getitem__(self, idx):
        """Return ``(Bars, PreviousBars, Cond1D)`` for sample ``idx``.

        ``Bars``/``PreviousBars`` are the densified second/first entries of ``'Bars'``;
        ``Cond1D`` is an integer tensor ``[tempo, program]`` on the same device as ``Bars``.
        """
        PreviousBars = self.Data[idx]['Bars'][0].to_dense()
        Bars = self.Data[idx]['Bars'][1].to_dense()

        prog = self.Data[idx]['Program']
        tempo = self.Data[idx]['Tempo'][0]

        Cond1D = torch.tensor([tempo, prog], dtype=torch.int, device=Bars.device)
        return Bars, PreviousBars, Cond1D




class PolyphonicDataset(Dataset):
    """All samples of the selected genres from the file at ``PolDataset``.

    Each sample is read through the keys ``'Bars'`` (pair: previous bar, current bar),
    ``'Program'`` and ``'Tempo'``.
    """

    def __init__(self, Genre):
        """Load the file and concatenate the samples of every genre in ``Genre``."""
        # weights_only=False unpickles arbitrary objects; only use with trusted files.
        archive = torch.load(PolDataset, weights_only=False)
        self.Data = []
        self.Genre = Genre

        for gen in Genre:
            self.Data.extend(archive[gen])

        # Release the full mapping once the selected genres have been copied out.
        del archive
        gc.collect()

    def __len__(self):
        """Number of samples across the selected genres."""
        return len(self.Data)

    def __getitem__(self, idx):
        """Return ``(Bars, PreviousBars, Cond1D)`` for sample ``idx``.

        ``Cond1D`` is a float tensor ``[tempo, program_0, program_1, ...]`` where tempo is
        min-max scaled with bounds (60, 200) and each program with bounds (1, 128).
        """
        record = self.Data[idx]
        bar_pair = record['Bars']
        previous_bar = bar_pair[0].to_dense()
        current_bar = bar_pair[1].to_dense()

        programs = record['Program'][0]
        tempo = record['Tempo'][0]

        tempo_lo, tempo_hi = 60, 200
        program_lo, program_hi = 1, 128

        # Min-max scaling of the conditioning values.
        condition = [(tempo - tempo_lo) / (tempo_hi - tempo_lo)]
        for program in programs:
            condition.append((program - program_lo) / (program_hi - program_lo))

        Cond1D = torch.tensor(condition, dtype=torch.float, device=current_bar.device)
        return current_bar, previous_bar, Cond1D

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Concatenation

Definition of the concatenation functions that are used in the hidden layers to concatenate the output and the 1_d and 2_d conditions.

1_d conditioning vector of shape $[n,1]$ with an output of shape $[batch,features,a,b]$:
* first we have to duplicate the vector $a\cdot b$ times to get a tensor of shape $[batch,n,a,b]$
* then we can concatenate the two tensors in the depth dimension (i.e dim=1)

2_d conditioning tensor with the same shape as the output $[batch,features,a,b]$ except for the depth dimension (this is guaranteed by how the conditioner CNN is built):
* first we check that the dimensions are correct
* we concatenate the two tensors in the depth dimension (i.e dim=1)

In [40]:
def conv_cond_concat(x, y):
    """Tile a per-sample condition vector over a feature map and append it as channels.

    x: feature map [batch, n_features, a, b]
    y: condition   [batch, n]
    returns        [batch, n_features + n, a, b]
    """
    batch, height, width = x.shape[0], x.shape[2], x.shape[3]
    n_cond = y.shape[1]

    # [batch, n] -> [batch, n, 1, 1] -> broadcast (no copy) to [batch, n, a, b]
    tiled = y.view(batch, n_cond, 1, 1).expand(batch, n_cond, height, width)

    return torch.cat((x, tiled), dim=1)

def conv_prev_concat(x, y):
    """Append a 2-D conditioning feature map to ``x`` along the channel axis.

    x: feature map      [batch, n_features, a, b]
    y: conditioning map [batch or 1, c, a, b] (broadcast over the batch axis if needed)
    returns             [batch, n_features + c, a, b]

    Raises:
        ValueError: if the spatial dimensions (a, b) of ``x`` and ``y`` differ.
            Previously the mismatch was only printed and ``None`` was returned,
            which surfaced later as an unrelated error.
    """
    x_shapes = x.shape
    y_shapes = y.shape
    if x_shapes[2:] != y_shapes[2:]:
        raise ValueError(
            f"spatial dimensions differ: x has {tuple(x_shapes[2:])}, y has {tuple(y_shapes[2:])}"
        )

    y2 = y.expand(x_shapes[0], y_shapes[1], x_shapes[2], x_shapes[3])
    return torch.cat((x, y2), dim=1)

### The Generator and the Conditioner

The generator uses `ConvTranspose2d` (upsampling) layers to produce an image from a seed (random noise). It starts with a `Linear` (dense) layer that takes this seed, concatenated with the 1_d condition, as input and transforms it into a tensor that is reshaped to $[batch, n\_hlayers, 5, 3]$; it then upsamples several times until it reaches the desired bar size of $[instrument,128,16]$. We use the `ReLU` activation for each layer, except the output layer, which uses `Sigmoid` to predict pixel values in the [0, 1] range.

Coupled to the generator there is the conditioner, which uses `Conv2d` (downsampling) layers to produce the 2_d tensors that carry information from the previous bar. The conditioner can be viewed as the reverse of the generator because it uses filters with the same shapes as the ones in the generator. In this case we use the `LeakyReLU` activation for each layer.

In [41]:
class Generator(nn.Module):
    """Conditional DCGAN-style generator with a built-in previous-bar conditioner.

    The generator maps a noise vector plus a 1-D condition to a bar of shape
    [batch, instrument_size, 128, 16]. At every upsampling stage the feature map is
    concatenated (channel axis) with the tiled 1-D condition and with the conditioner
    feature map of matching spatial size computed from the previous bar.

    Args:
        input_size: length of the noise vector.
        cond_1d_size: length of the 1-D condition vector.
        instrument_size: number of channels of a bar (input of the conditioner, output of the generator).
        n_hlayers: number of feature channels of the generator's hidden layers.
        n_2dhlayers: number of feature channels of the conditioner's layers.

    The spatial sizes in the comments below hold for a previous bar of size 128x16.
    """

    def __init__(self, input_size, cond_1d_size, instrument_size=1, n_hlayers=128, n_2dhlayers=16):
        super().__init__()

        self.input_size = input_size
        self.instrument_size = instrument_size
        self.cond1d_dim = cond_1d_size
        self.nhlayers = n_hlayers
        self.n_2dhlayers = n_2dhlayers

        # Every transposed convolution sees: hidden features + tiled 1-D condition + 2-D condition.
        cat_channels = n_hlayers + cond_1d_size + n_2dhlayers

        # ---- generator layers ----
        # ReLU everywhere except the output layer (DCGAN guideline); BatchNorm is only
        # active in ff1 and cnn2 (it is disabled in cnn1 and cnn3).
        self.ff1 = nn.Sequential(
            nn.Linear(input_size + cond_1d_size, n_hlayers*5*3),      # [batch, n_hlayers*5*3]
            nn.BatchNorm1d(n_hlayers*5*3),
            nn.ReLU()
            )

        # input (after reshape + conditions): [batch, cat_channels, 5, 3]
        self.cnn1 = nn.Sequential(
            nn.ConvTranspose2d(cat_channels, n_hlayers, kernel_size=(5,5), stride=2, bias=False, padding=0),   # [batch, n_hlayers, 13, 9]
            nn.ReLU()
            )
        # input: [batch, cat_channels, 13, 9]
        self.cnn2 = nn.Sequential(
            nn.ConvTranspose2d(cat_channels, n_hlayers, kernel_size=(5,5), stride=(2,1), bias=False, padding=(0,1), output_padding=(1,0)),   # [batch, n_hlayers, 30, 11]
            nn.BatchNorm2d(n_hlayers),
            nn.ReLU()
            )
        # input: [batch, cat_channels, 30, 11]
        self.cnn3 = nn.Sequential(
            nn.ConvTranspose2d(cat_channels, n_hlayers, kernel_size=(5,5), stride=(2,1), bias=False, padding=0, output_padding=(1,0)),   # [batch, n_hlayers, 64, 15]
            nn.ReLU()
            )
        # input: [batch, cat_channels, 64, 15]
        self.cnn4 = nn.Sequential(
            nn.ConvTranspose2d(cat_channels, instrument_size, kernel_size=(6,6), stride=(2,1), bias=False, padding=(2,2)),   # [batch, instrument_size, 128, 16]
            # Sigmoid: the target matrices carry no velocity, only values in (0, 1).
            nn.Sigmoid()
            )

        # ---- conditioner layers ----
        # LeakyReLU as in the MidiNet model (default slope 0.01; the original paper uses 0.2).
        # These mirror the generator's layers so each output matches one generator stage.
        self.h0_prev = nn.Sequential(
            nn.Conv2d(instrument_size, n_2dhlayers, kernel_size=(6,6), stride=(2,1), padding=(2,2)),   # [batch, n_2dhlayers, 64, 15]
            nn.BatchNorm2d(n_2dhlayers),
            nn.LeakyReLU()
            )
        self.h1_prev = nn.Sequential(
            nn.Conv2d(n_2dhlayers, n_2dhlayers, kernel_size=(5,5), stride=(2,1), padding=0),           # [batch, n_2dhlayers, 30, 11]
            nn.LeakyReLU()
            )
        self.h2_prev = nn.Sequential(
            nn.Conv2d(n_2dhlayers, n_2dhlayers, kernel_size=(5,5), stride=(2,1), padding=(0,1)),       # [batch, n_2dhlayers, 13, 9]
            nn.BatchNorm2d(n_2dhlayers),
            nn.LeakyReLU()
            )
        self.h3_prev = nn.Sequential(
            nn.Conv2d(n_2dhlayers, n_2dhlayers, kernel_size=(5,5), stride=(2,2), padding=0),           # [batch, n_2dhlayers, 5, 3]
            nn.LeakyReLU()
            )

    def forward(self, z, prev_bar, cond_1d, batch_size=None):
        """Generate a bar.

        Args:
            z: noise, [batch, input_size].
            prev_bar: previous bar, [batch, instrument_size, 128, 16].
            cond_1d: 1-D condition, [batch, cond_1d_size].
            batch_size: kept for backward compatibility and ignored; the batch size
                is read from ``z`` so a stale value can no longer break the reshape.

        Returns:
            Tensor [batch, instrument_size, 128, 16] with values in (0, 1).
        """
        # 2-D conditions, from the largest to the smallest spatial size
        cond0 = self.h0_prev(prev_bar)          # [batch, n_2dhlayers, 64, 15]
        cond1 = self.h1_prev(cond0)             # [batch, n_2dhlayers, 30, 11]
        cond2 = self.h2_prev(cond1)             # [batch, n_2dhlayers, 13, 9]
        cond3 = self.h3_prev(cond2)             # [batch, n_2dhlayers, 5, 3]

        latent = torch.cat((z, cond_1d), dim=1)             # [batch, input_size+cond_1d_size]

        h1 = self.ff1(latent)                               # [batch, n_hlayers*5*3]
        h1 = h1.reshape(z.shape[0], self.nhlayers, 5, 3)    # [batch, n_hlayers, 5, 3]
        h1 = conv_cond_concat(h1, cond_1d)                  # + cond_1d_size channels
        h1 = conv_prev_concat(h1, cond3)                    # + n_2dhlayers channels

        h2 = self.cnn1(h1)                                  # [batch, n_hlayers, 13, 9]
        h2 = conv_cond_concat(h2, cond_1d)
        h2 = conv_prev_concat(h2, cond2)

        h3 = self.cnn2(h2)                                  # [batch, n_hlayers, 30, 11]
        h3 = conv_cond_concat(h3, cond_1d)
        h3 = conv_prev_concat(h3, cond1)

        h4 = self.cnn3(h3)                                  # [batch, n_hlayers, 64, 15]
        h4 = conv_cond_concat(h4, cond_1d)
        h4 = conv_prev_concat(h4, cond0)

        out = self.cnn4(h4)                                 # [batch, instrument_size, 128, 16]

        return out

### The Discriminator

The discriminator uses `Conv2d` (downsampling) layers to produce a scalar output from a bar input. It starts with three `Conv2d` layers that reduce the size of the input, then uses one `Linear` (dense) layer. We use the `LeakyReLU` activation for each convolutional layer; the output layer has no activation of its own, because the `Sigmoid` that maps it to a real/fake probability in the [0, 1] range is already included in the loss function (`BCEWithLogitsLoss`), which is computed on the raw logits.

In [42]:
class Discriminator(nn.Module):
    """Conditional discriminator: scores a bar given the previous bar and a 1-D condition.

    Args:
        cond_1d_size: length of the 1-D condition vector.
        instrument_size: number of channels of a bar.

    The spatial sizes in the comments hold for 128x16 bars; the final linear layer
    expects 64*8*2 = 1024 flattened features, so other bar sizes are not supported.
    """

    def __init__(self, cond_1d_size, instrument_size=1):
        super().__init__()

        self.instrument_size = instrument_size
        self.cond1d_dim = cond_1d_size

        # No BatchNorm on the first layer (DCGAN guideline); it is also disabled on cnn2.
        # input: bar + previous bar + tiled condition -> [batch, 2*instrument_size+cond_1d_size, 128, 16]
        self.cnn1 = nn.Sequential(
            nn.Conv2d(2*instrument_size + cond_1d_size, 32, kernel_size=(5,5), stride=(3,1), padding=0),   # [batch, 32, 42, 12]
            nn.LeakyReLU()
        )
        # input: [batch, 32+cond_1d_size, 42, 12]
        self.cnn2 = nn.Sequential(
            nn.Conv2d(32 + cond_1d_size, 128, kernel_size=(5,5), stride=(2,1), padding=0),                 # [batch, 128, 19, 8]
            nn.LeakyReLU()
        )
        # input: [batch, 128+cond_1d_size, 19, 8]
        self.cnn3 = nn.Sequential(
            nn.Conv2d(128 + cond_1d_size, 64, kernel_size=(5,5), stride=(2,3), padding=0),                 # [batch, 64, 8, 2]
            nn.BatchNorm2d(64),
            nn.LeakyReLU()
        )

        # input: flattened features + condition -> [batch, 1024+cond_1d_size]
        # No sigmoid here: the loss (BCE with logits) applies it.
        self.ffnn1 = nn.Linear(1024 + cond_1d_size, 1)

    def forward(self, x, prev_bar, cond_1d):
        """Score a batch of bars.

        Returns:
            (probability, logit, feature_map): the sigmoid of the logit [batch, 1],
            the raw logit [batch, 1], and the output of the first convolution block
            (taken before the condition is appended), used for feature matching.
        """
        stacked = conv_prev_concat(conv_cond_concat(x, cond_1d), prev_bar)   # [batch, 2*instrument_size+cond_1d_size, 128, 16]

        feature_map = self.cnn1(stacked)                                     # [batch, 32, 42, 12]

        hidden = self.cnn2(conv_cond_concat(feature_map, cond_1d))           # [batch, 128, 19, 8]
        hidden = self.cnn3(conv_cond_concat(hidden, cond_1d))                # [batch, 64, 8, 2]

        flat = torch.cat((torch.flatten(hidden, 1), cond_1d), dim=1)         # [batch, 1024+cond_1d_size]

        logit = self.ffnn1(flat)                                             # [batch, 1]
        probability = torch.sigmoid(logit)

        return probability, logit, feature_map



### Weights initialization

Is this ok?

In [43]:
def weights_init(m):
    """Initialise one module in place; meant to be used with ``model.apply(weights_init)``.

    Dispatch is by class name: names containing 'Conv' or 'Linear' get Xavier-uniform
    weights; names containing 'BatchNorm' get weights ~ N(1.0, 0.2) and zero bias.
    Biases of Conv/Linear layers are left untouched.
    NOTE(review): the DCGAN reference initialisation uses std 0.02 — confirm 0.2 is intended.
    """
    layer_name = type(m).__name__

    for marker in ('Conv', 'Linear'):
        if marker in layer_name:
            nn.init.xavier_uniform_(m.weight.data)

    if 'Linear' not in layer_name and 'BatchNorm' in layer_name:
        nn.init.normal_(m.weight.data, 1.0, 0.2)
        nn.init.constant_(m.bias.data, 0)

### Creation of the model

In [44]:
# Generator for 4-instrument bars: 256-d noise, 5-d 1-D condition.
generator = Generator(
    input_size=256,
    cond_1d_size=5,
    instrument_size=4,
    n_hlayers=256,
    n_2dhlayers=16,
)
# apply() visits every sub-module and returns the model, which the cell then displays.
generator.apply(weights_init)

Generator(
  (ff1): Sequential(
    (0): Linear(in_features=261, out_features=3840, bias=True)
    (1): BatchNorm1d(3840, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (cnn1): Sequential(
    (0): ConvTranspose2d(277, 256, kernel_size=(5, 5), stride=(2, 2), bias=False)
    (1): ReLU()
  )
  (cnn2): Sequential(
    (0): ConvTranspose2d(277, 256, kernel_size=(5, 5), stride=(2, 1), padding=(0, 1), output_padding=(1, 0), bias=False)
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (cnn3): Sequential(
    (0): ConvTranspose2d(277, 256, kernel_size=(5, 5), stride=(2, 1), output_padding=(1, 0), bias=False)
    (1): ReLU()
  )
  (cnn4): Sequential(
    (0): ConvTranspose2d(277, 4, kernel_size=(6, 6), stride=(2, 1), padding=(2, 2), bias=False)
    (1): Sigmoid()
  )
  (h0_prev): Sequential(
    (0): Conv2d(4, 16, kernel_size=(6, 6), stride=(2, 1), padding=(2, 2))
    (1): BatchNorm2d(16, eps=1

In [45]:
# Discriminator matching the generator above: 5-d 1-D condition, 4-instrument bars.
discriminator = Discriminator(
    cond_1d_size=5,
    instrument_size=4,
)
# apply() visits every sub-module and returns the model, which the cell then displays.
discriminator.apply(weights_init)

Discriminator(
  (cnn1): Sequential(
    (0): Conv2d(13, 32, kernel_size=(5, 5), stride=(3, 1))
    (1): LeakyReLU(negative_slope=0.01)
  )
  (cnn2): Sequential(
    (0): Conv2d(37, 128, kernel_size=(5, 5), stride=(2, 1))
    (1): LeakyReLU(negative_slope=0.01)
  )
  (cnn3): Sequential(
    (0): Conv2d(133, 64, kernel_size=(5, 5), stride=(2, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01)
  )
  (ffnn1): Linear(in_features=1029, out_features=1, bias=True)
)

### Dimension testing

Produce a noise vector of size `[10, 100]`, a noise 1d condition vector of size `[10, 15]`, and a noise 2d condition tensor of size `[10, 1, 128, 16]`. Note that we need a 1d and a 2d condition for each sample in the batch. Then we use the (as yet **untrained**) generator to create an image of expected output shape $[10,1,128,16]$.

Then use the (yet **untrained**) discriminator to classify the generated images as real or fake. The model will be trained to output the probability that the image is real in the first output component, thus we expect an output vector of size `[10, 1]` with $x_i \in [0,1]$.

In [46]:
# Disabled shape smoke test, kept as a bare string literal so none of it executes
# (being the last expression of the cell, the string itself is what gets displayed).
# NOTE(review): the sizes used below (noise [10, 100], prev_bar with 1 channel) do not
# match the models instantiated above (input_size=256, instrument_size=4) — update them
# before re-enabling this check.
'''############################ input (batch_size=10, vector_size=100) ###############################
noise = torch.normal(0, 1, [10, 100])
print(noise.shape)
############################ conditions ###############################
cond_1d =  torch.normal(0,1,[10,5])
prev_bar = torch.normal(0, 1, [10, 1, 128, 16])
############################ generator ###############################
generated_bar = generator(noise, prev_bar, cond_1d, batch_size=10).detach()
print(generated_bar.shape)
############################ discriminator ###############################
decision, __, __= discriminator(generated_bar, prev_bar, cond_1d)
print(decision)'''

'############################ input (batch_size=10, vector_size=100) ###############################\nnoise = torch.normal(0, 1, [10, 100])\nprint(noise.shape)\n############################ conditions ###############################\ncond_1d =  torch.normal(0,1,[10,5])\nprev_bar = torch.normal(0, 1, [10, 1, 128, 16])\n############################ generator ###############################\ngenerated_bar = generator(noise, prev_bar, cond_1d, batch_size=10).detach()\nprint(generated_bar.shape)\n############################ discriminator ###############################\ndecision, __, __= discriminator(generated_bar, prev_bar, cond_1d)\nprint(decision)'

### Discriminator loss

This method quantifies how well the discriminator is able to distinguish real images from fakes. It compares the discriminator's predictions on real images to an array of 1s, and the discriminator's predictions on fake (generated) images to an array of 0s.
The discriminator loss is of the form:

$\frac{1}{m}\sum_{i=1}^{m}-[\log D(\boldsymbol{x}^{(i)}) +\log(1-D(G(\boldsymbol{z}^{(i)})))]$

We implement one-sided label smoothing to penalize over-confidence and improve the convergence of the training. To do so, the target for the discriminator's predictions on real images is changed from an array of 1s to an array of (1-$\alpha$)s, and the loss function becomes:

$\frac{1}{m}\sum_{i=1}^{m}-[(1-\alpha) \log D(\boldsymbol{x}^{(i)}) +\alpha \log (1-D(\boldsymbol{x}^{(i)}))+\log(1-D(G(\boldsymbol{z}^{(i)})))]$

In [47]:
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss), batch-averaged.
cross_entropy = nn.BCEWithLogitsLoss(reduction='mean')
# Mean squared error, used for the feature-matching regularisers of the generator loss.
MSE = nn.MSELoss(reduction='mean')

In [48]:
def discriminator_loss(real_output, fake_output, device, alpha=0.1):
    """Discriminator loss with one-sided label smoothing.

    Args:
        real_output: discriminator logits for real samples.
        fake_output: discriminator logits for generated samples.
        device: device on which the target tensors are created.
        alpha: smoothing amount; real targets are ``1 - alpha`` (alpha=0 disables smoothing),
            fake targets stay at 0.

    Returns:
        Sum of the cross-entropy on real samples and the cross-entropy on fake samples.
    """
    smoothed_real = torch.full_like(real_output, 1.0 - alpha, device=device)
    all_fake = torch.zeros_like(fake_output, device=device)

    return cross_entropy(real_output, smoothed_real) + cross_entropy(fake_output, all_fake)

### Generator loss

The generator's loss quantifies how well it was able to trick the discriminator. Intuitively, if the generator is performing well, the discriminator will classify the fake images as real (or 1).
The generator loss is of the form:

$\frac{1}{m}\sum_{i=1}^{m}\log(1-D(G(\boldsymbol{z}^{(i)})))$

However this loss has some convergence issues due to vanishing gradients. So instead we use the following loss which has the same trend but stronger gradient when the discriminator is too good at recognizing fake samples.

$\frac{1}{m}\sum_{i=1}^{m}-\log(D(G(\boldsymbol{z}^{(i)})))$

Which is the Binary crossentropy between $D(G(\boldsymbol{z}^{(i)}))$ and the probability distribution that has $y^{(i)} = 1 \forall i$, i.e. we are forcing the generator to produce samples that will make the discriminator predict that fake samples are real.

Moreover we add a regularizer term so-called feature matching such that the distributions of the real and generated data are enforced to be close.

$\lambda_1 ||E_{x \sim p(x)} [x] - E_{z\sim p(z)} [G(z)] ||^2 + \lambda_2 ||E_{x \sim p(x)} [f(x)] - E_{z \sim p(z)} [f(G(z))] ||^2$


In [49]:
def generator_loss(fake_output, real_bar, fake_bar, real_f, fake_f, device, lambda1=0.008, lambda2=0.005):
    """Non-saturating generator loss plus two feature-matching regularisers.

    Args:
        fake_output: discriminator logits for generated samples.
        real_bar, fake_bar: real and generated bars (averaged over the batch axis).
        real_f, fake_f: discriminator feature maps for real and generated bars
            (averaged over the batch axis).
        device: device on which the all-ones target is created.
        lambda1: weight of the data-space matching term.
        lambda2: weight of the feature-space matching term.

    Returns:
        adversarial term + lambda1 * MSE(batch means of bars) + lambda2 * MSE(batch means of features).
    """
    # The generator wants the discriminator to answer "real" (target 1) on its samples.
    adversarial = cross_entropy(fake_output, torch.ones_like(fake_output, device=device))

    data_gap = MSE(real_bar.mean(dim=0), fake_bar.mean(dim=0))
    feature_gap = MSE(real_f.mean(dim=0), fake_f.mean(dim=0))

    return adversarial + lambda1*data_gap + lambda2*feature_gap

### Optimizers

Training a DCGAN is very difficult, so we decided to use the Adam optimizer as suggested by the paper. Note that Adam combines momentum with RMSprop-style normalization of the update velocities. The discriminator and the generator need two different optimizers (the conditioner is included in the generator training).

In [50]:
# Move both networks to the selected device before building the optimizers,
# so the optimizers reference the parameters that will actually be trained.
generator = generator.to(device)
discriminator = discriminator.to(device)

# One Adam optimizer per network; the conditioner layers live inside the generator.
# NOTE(review): default Adam betas are used; the DCGAN paper suggests beta1=0.5 — confirm intent.
gen_opt = optim.Adam(params=generator.parameters(), lr=2e-4)
dis_opt = optim.Adam(params=discriminator.parameters(), lr=2e-4)

### Training loop

In [51]:
# Training hyper-parameters.
EPOCHS = 20
noise_dim = 256      # must equal the generator's input_size
BATCH_SIZE = 72

# drop_last=True keeps every batch at exactly BATCH_SIZE samples.
Data = PolyphonicDataset(Genre=['rock'])
dataloader = DataLoader(
    dataset=Data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    drop_last=True,
)

In [52]:
# Earlier version of train_step, disabled by wrapping it in a string literal (nothing here
# executes; the cell only displays the string). It performs one discriminator update
# followed by one generator update per batch. The active train_step is defined in a later cell.
'''def train_step(bars, cond_1d, prev_bar, generator, discriminator, batch_size, noise_dim, device, dis_opt, gen_opt):
    noise = torch.randn([batch_size, noise_dim], device=device)
    prev_bar = prev_bar + torch.randn_like(prev_bar) * 0.05
    prev_bar = torch.clamp(prev_bar, 0, 1)

    # Generate Images
    generated_bars = generator(noise, prev_bar, cond_1d, batch_size)

    # Classify true and fake images (remember to stop gradient using detach when predicting fake images)
    output, D, fm = discriminator(bars, prev_bar, cond_1d)
    fake_output, fake_D, fake_fm = discriminator(generated_bars.detach(), prev_bar, cond_1d)

    # Compute discriminator loss and update discriminator
    disc_loss = discriminator_loss(D, fake_D, device)
    discriminator.zero_grad()
    #dis_opt.zero_grad()
    disc_loss.backward()
    dis_opt.step()

    # Compute predictions where gradients can flow to the generator of fake samples
    output, D, fm = discriminator(bars, prev_bar, cond_1d)
    fake_output, fake_D, fake_fm = discriminator(generated_bars, prev_bar, cond_1d)

    # Compute generator loss and update generator
    gen_loss = generator_loss(fake_D, bars, generated_bars, fm, fake_fm, device)
    generator.zero_grad()
    #gen_opt.zero_grad()
    gen_loss.backward()
    gen_opt.step()

    return gen_loss, disc_loss'''

'def train_step(bars, cond_1d, prev_bar, generator, discriminator, batch_size, noise_dim, device, dis_opt, gen_opt):\n    noise = torch.randn([batch_size, noise_dim], device=device)\n    prev_bar = prev_bar + torch.randn_like(prev_bar) * 0.05\n    prev_bar = torch.clamp(prev_bar, 0, 1)\n\n    # Generate Images\n    generated_bars = generator(noise, prev_bar, cond_1d, batch_size)\n\n    # Classify true and fake images (remember to stop gradient using detach when predicting fake images)\n    output, D, fm = discriminator(bars, prev_bar, cond_1d)\n    fake_output, fake_D, fake_fm = discriminator(generated_bars.detach(), prev_bar, cond_1d)\n\n    # Compute discriminator loss and update discriminator\n    disc_loss = discriminator_loss(D, fake_D, device)\n    discriminator.zero_grad()\n    #dis_opt.zero_grad()\n    disc_loss.backward()\n    dis_opt.step()\n\n    # Compute predictions where gradients can flow to the generator of fake samples\n    output, D, fm = discriminator(bars, prev_bar,

In [53]:
def train_step(bars, cond_1d, prev_bar, generator, discriminator, batch_size,
               noise_dim, device, dis_opt, gen_opt, epoch, gen_steps=4):
    """Run one discriminator update followed by ``gen_steps`` generator updates on a batch.

    Args:
        bars: real bars, [batch, instruments, 128, 16].
        cond_1d: 1-D conditions, [batch, cond_1d_size].
        prev_bar: previous bars, same shape as ``bars``.
        generator, discriminator: the two networks (already on ``device``).
        batch_size: kept for backward compatibility; the number of samples is read
            from ``bars`` so a partial last batch cannot cause a shape mismatch.
        noise_dim: length of the generator's noise vector.
        device: device on which the step runs.
        dis_opt, gen_opt: optimizers of the discriminator and of the generator.
        epoch: current epoch index (currently unused).
        gen_steps: number of generator updates per discriminator update (default 4,
            the value that was previously hard-coded).

    Returns:
        (mean generator loss over the ``gen_steps`` updates, discriminator loss), as floats.

    Raises:
        ValueError: if ``gen_steps`` is smaller than 1.
    """
    if gen_steps < 1:
        raise ValueError(f"gen_steps must be >= 1, got {gen_steps}")

    # --- Ensure all tensors are on the correct device ---
    bars = bars.to(device)
    cond_1d = cond_1d.to(device)
    prev_bar = prev_bar.to(device)
    n_samples = bars.shape[0]

    # Slightly perturb the previous bar, then bring it back into [0, 1].
    prev_bar = prev_bar + torch.randn_like(prev_bar) * 0.05
    prev_bar = torch.clamp(prev_bar, 0, 1)

    # --- Discriminator update ---
    noise = torch.randn([n_samples, noise_dim], device=device)
    generated_bars = generator(noise, prev_bar, cond_1d, n_samples)

    _, real_D, _ = discriminator(bars, prev_bar, cond_1d)
    # detach(): the discriminator update must not back-propagate into the generator.
    _, fake_D, _ = discriminator(generated_bars.detach(), prev_bar, cond_1d)

    disc_loss = discriminator_loss(real_D, fake_D, device)
    discriminator.zero_grad()
    disc_loss.backward()
    dis_opt.step()

    # --- Generator updates (gen_steps per discriminator update) ---
    # The real-sample features only depend on the discriminator, which does not change
    # inside the loop below, so they are computed once instead of once per generator step.
    with torch.no_grad():
        _, _, real_fm = discriminator(bars, prev_bar, cond_1d)

    gen_losses = []
    for _ in range(gen_steps):
        noise = torch.randn([n_samples, noise_dim], device=device)
        generated_bars = generator(noise, prev_bar, cond_1d, n_samples)
        _, fake_D, fake_fm = discriminator(generated_bars, prev_bar, cond_1d)

        gen_loss = generator_loss(fake_D, bars, generated_bars, real_fm, fake_fm, device)
        generator.zero_grad()
        gen_loss.backward()
        gen_opt.step()
        gen_losses.append(gen_loss.item())

    return sum(gen_losses) / len(gen_losses), disc_loss.item()

Assuming that each item in the dataloader is a bar + the previous bar + the 1d condition on the instruments used.

In [None]:
# Per-epoch mean losses, plotted after training.
gloss = []
dloss = []

for epoch in range(EPOCHS):

    start = time.time()
    generator.train()
    discriminator.train()
    gen_losses, disc_losses = [], []
    print('#################')
    print(f"Epoch: {epoch+1}")

    iterator = tqdm(dataloader)
    for bar_batch, prev_bar_batch, instrument_batch in iterator:
        # Cast every tensor of the batch to float32 and move it to the training device.
        bar_batch, prev_bar_batch, instrument_batch = (
            batch.to(dtype=torch.float32, device=device)
            for batch in (bar_batch, prev_bar_batch, instrument_batch)
        )

        gen_loss, disc_loss = train_step(
            bar_batch, instrument_batch, prev_bar_batch,
            generator, discriminator, BATCH_SIZE, noise_dim,
            device, dis_opt, gen_opt, epoch,
        )
        gen_losses.append(gen_loss)
        disc_losses.append(disc_loss)

        # Show the latest batch losses in the progress bar.
        iterator.set_description('Discriminator loss: {}, Generator loss: {}'.format(disc_loss, gen_loss))

    gloss.append(np.mean(gen_losses))
    dloss.append(np.mean(disc_losses))
    print(f'dLoss: {dloss[-1]}, gLoss: {gloss[-1]}')

#################
Epoch: 1


Discriminator loss: 1.3876826763153076, Generator loss: 0.7854041904211044: 100%|██████████| 653/653 [02:13<00:00,  4.90it/s]


dLoss: 1.5820877657915147, gLoss: 0.7130663146309064
#################
Epoch: 2


Discriminator loss: 1.3792245388031006, Generator loss: 0.8020652532577515: 100%|██████████| 653/653 [02:13<00:00,  4.90it/s]


dLoss: 1.382564717741144, gLoss: 0.7924090687293919
#################
Epoch: 3


Discriminator loss: 1.3787972927093506, Generator loss: 0.7991109192371368:  51%|█████▏    | 336/653 [01:08<01:05,  4.87it/s]

### Weights and loss analysis

First, let's plot the two losses over the epochs: if training works correctly, the generator loss should decrease and the discriminator loss should increase.

In [None]:
# Per-epoch mean losses of the two networks; labelled so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(gloss, label='Generator loss')
ax.plot(dloss, label='Discriminator loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean loss')
ax.set_title('Training losses per epoch')
ax.legend()
plt.show()

Then we can analyze the weights distribution

In [None]:
# Weight distributions of representative layers of the trained networks.
# (The previous version referenced an undefined `net` with fc1/fc2/out layers.)
layers_to_plot = {
    'Generator ff1 (Linear) weights': generator.ff1[0],
    'Generator cnn4 (ConvTranspose2d) weights': generator.cnn4[0],
    'Discriminator ffnn1 (Linear) weights': discriminator.ffnn1,
}

# Weights histogram
fig, axs = plt.subplots(len(layers_to_plot), 1, figsize=(12,8))
for ax, (title, layer) in zip(axs, layers_to_plot.items()):
    ax.hist(layer.weight.detach().cpu().numpy().flatten(), 50)
    ax.set_title(title)
    ax.grid()
plt.tight_layout()
plt.show()

In [None]:
# We did this in lab 2; it is more involved here because the generator takes a noise
# vector plus conditions instead of a scalar. (The previous version called an undefined
# `net` with scalar inputs.) The conditions are taken from the first dataset sample.

def get_activation(layer, input, output):
    """Forward hook: store the sigmoid of the hooked layer's output in the global `activation`."""
    global activation
    activation = torch.sigmoid(output)

### Register hook
hook_handle = generator.cnn1.register_forward_hook(get_activation)

### Analyze activations
generator = generator.to(device)
generator.eval()

sample = Data[0]    # (bar, previous bar, 1-D condition)
sample_prev_bar = sample[1].unsqueeze(0).to(dtype=torch.float32, device=device)
sample_cond = sample[2].unsqueeze(0).to(dtype=torch.float32, device=device)

channel_activations = []
try:
    with torch.no_grad():
        for _k in range(3):
            z = torch.randn([1, noise_dim], device=device)
            generator(z, sample_prev_bar, sample_cond, 1)
            # Average over batch and spatial axes -> one value per cnn1 output channel.
            channel_activations.append(activation.mean(dim=(0, 2, 3)).cpu().numpy())
finally:
    ### Remove hook (even if the forward pass fails)
    hook_handle.remove()

### Plot activations
fig, axs = plt.subplots(3, 1, figsize=(12,6))
for k, (ax, act) in enumerate(zip(axs, channel_activations)):
    ax.stem(act)
    ax.set_title('cnn1 activations (sigmoid, mean per channel) for noise sample %d' % (k + 1))
plt.tight_layout()
plt.show()

#### Generator output

In [None]:
# Generate one bar from random noise, conditioned on the first dataset sample.
# (The conditions were left unassigned before, which made the cell a syntax error.)
generator.eval()
noise = torch.randn([1, noise_dim], device=device)

sample = Data[0]    # (bar, previous bar, 1-D condition)
instrument = sample[2].unsqueeze(0).to(dtype=torch.float32, device=device)
prev_bar = sample[1].unsqueeze(0).to(dtype=torch.float32, device=device)

with torch.no_grad():
    generated_bar = generator(noise, prev_bar, instrument, 1)

generated_bar = generated_bar.cpu().numpy()

Save networks and optimizers states

In [None]:
# Persist the parameters of both networks ...
torch.save(obj=discriminator.state_dict(), f='Traddiscriminator_parameters.torch')
torch.save(obj=generator.state_dict(), f='Tradgenerator_parameters.torch')
print('Saved Model')

# ... and the optimizer states, so training can be resumed.
torch.save(obj=dis_opt.state_dict(), f='Traddis_opt_state.torch')
torch.save(obj=gen_opt.state_dict(), f='Tradgen_opt_state.torch')
print('Saved Optimizer')

Reload

In [None]:
# Rebuild the networks with the SAME hyper-parameters used when they were trained and
# load the files written by the save cell ('Trad*' names). weights_init is not applied:
# every parameter is overwritten by the checkpoint.
discriminator = Discriminator(cond_1d_size=5, instrument_size=4)
discriminator.to(device)
# Load the state dict previously saved (map_location lets a GPU checkpoint load on CPU)
discriminator_state_dict = torch.load('Traddiscriminator_parameters.torch', map_location=device)
# Update the network parameters
discriminator.load_state_dict(discriminator_state_dict)

generator = Generator(input_size=256, cond_1d_size=5, instrument_size=4, n_hlayers=256, n_2dhlayers=16)
generator.to(device)
# Load the state dict previously saved
generator_state_dict = torch.load('Tradgenerator_parameters.torch', map_location=device)
# Update the network parameters
generator.load_state_dict(generator_state_dict)


# The optimizers are created after the models are on the device, then restored.
dis_opt = optim.Adam(discriminator.parameters(), lr=2e-4)
# Load the state dict previously saved
dis_opt_state_dict = torch.load('Traddis_opt_state.torch', map_location=device)
# Update the optimizer state
dis_opt.load_state_dict(dis_opt_state_dict)

gen_opt = optim.Adam(generator.parameters(), lr=2e-4)
# Load the state dict previously saved
gen_opt_state_dict = torch.load('Tradgen_opt_state.torch', map_location=device)
# Update the optimizer state
gen_opt.load_state_dict(gen_opt_state_dict)