<a href="https://colab.research.google.com/github/Biradeep/Application-of-U-Net-Neural-Network-to-Cavitation-Phenomena/blob/master/WandbUnetTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive; the project code and data used further down are read
# from paths below this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
!pip install wandb --upgrade

Collecting wandb
  Downloading wandb-0.12.0-py2.py3-none-any.whl (1.6 MB)
[?25l[K     |▏                               | 10 kB 26.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 31.4 MB/s eta 0:00:01[K     |▋                               | 30 kB 22.3 MB/s eta 0:00:01[K     |▉                               | 40 kB 17.7 MB/s eta 0:00:01[K     |█                               | 51 kB 9.1 MB/s eta 0:00:01[K     |█▏                              | 61 kB 9.4 MB/s eta 0:00:01[K     |█▍                              | 71 kB 9.0 MB/s eta 0:00:01[K     |█▋                              | 81 kB 10.1 MB/s eta 0:00:01[K     |█▉                              | 92 kB 10.3 MB/s eta 0:00:01[K     |██                              | 102 kB 8.4 MB/s eta 0:00:01[K     |██▏                             | 112 kB 8.4 MB/s eta 0:00:01[K     |██▍                             | 122 kB 8.4 MB/s eta 0:00:01[K     |██▋                             | 133 kB 8.4 MB/s eta 0:00:01

In [3]:
import wandb

# Authenticate this session with Weights & Biases. In the recorded run this
# prompted for an API key and stored it in /root/.netrc (see the cell output).
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
import sys

# Extra import locations needed by this notebook.
_SITE_PACKAGES = '/usr/local/lib/python3.7/site-packages/'
_DFP_TRAIN_DIR = '/content/gdrive/MyDrive/Deep-Flow-Prediction/train'

# Guard every addition so that re-running this cell does not pile up duplicate
# entries on sys.path.
if _SITE_PACKAGES not in sys.path:
    sys.path.append(_SITE_PACKAGES)

# The project directory always goes to the front so its modules take precedence
# (presumably DfpNet, dataset and utils, imported in the next cell, live there).
if _DFP_TRAIN_DIR in sys.path:
    sys.path.remove(_DFP_TRAIN_DIR)
sys.path.insert(0, _DFP_TRAIN_DIR)

In [5]:
import os, sys, random
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.optim as optim

from DfpNet import TurbNetG, weights_init
import dataset
import utils

In [6]:
# Nominal run hyperparameters collected in one mapping.
# NOTE(review): the training cell further down defines its own settings
# (e.g. batch_size = 10) and does not read this dict.
config = {
    "batch_size": 1,
    "iterations": 10000,
    "learning_rate": 0.0006,
    "expo": 5,
    "architecture": "CNN",
}

In [7]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device          = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Start a W&B run in the "unet-training" project; the wandb.log calls in the
# training cell report to this run.
wandb.init(project="unet-training")
# Handle to the run's config object (no hyperparameters are passed to init here).
cfg = wandb.config

[34m[1mwandb[0m: Currently logged in as: [33mbiradeep[0m (use `wandb login --relogin` to force relogin)


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def weights_init(m):
    """Re-initialise the parameters of one layer in place.

    Intended to be handed to ``Module.apply``. The decision is made on the
    layer's class name:

    * name contains ``'Conv'``      -> weights drawn from a normal
      distribution with mean 0.0 and std 0.02 (the bias is not touched);
    * name contains ``'BatchNorm'`` -> weights drawn from a normal
      distribution with mean 1.0 and std 0.02, bias set to 0;
    * anything else                 -> left unchanged.
    """
    layer_kind = type(m).__name__
    if 'Conv' in layer_kind:
        m.weight.data.normal_(0.0, 0.02)
        return
    if 'BatchNorm' in layer_kind:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

def blockUNet(in_c, out_c, name, transposed=False, bn=True, relu=True, size=4, pad=1, dropout=0.):
    """Assemble one U-Net stage as an ``nn.Sequential``.

    Submodules are registered under ``name``-prefixed keys in this order:
    activation -> convolution -> optional batch norm -> optional dropout.

    Args:
        in_c: number of input channels of the convolution.
        out_c: number of output channels of the convolution.
        name: prefix used for every submodule key.
        transposed: False builds an encoder stage (stride-2 convolution with
            kernel ``size``); True builds a decoder stage (x2 bilinear
            upsampling followed by a stride-1 convolution with kernel ``size-1``).
        bn: append ``BatchNorm2d(out_c)`` when True.
        relu: True -> in-place ``ReLU``; False -> in-place ``LeakyReLU(0.2)``.
        size: kernel size (reduced by one for decoder stages).
        pad: padding of the convolution.
        dropout: probability for an in-place ``Dropout2d``; added only if > 0.

    Returns:
        The assembled ``nn.Sequential``.
    """
    stage = nn.Sequential()

    # The activation comes first and operates in place on the incoming tensor.
    if relu:
        act_key, act = '%s_relu' % name, nn.ReLU(inplace=True)
    else:
        act_key, act = '%s_leakyrelu' % name, nn.LeakyReLU(0.2, inplace=True)
    stage.add_module(act_key, act)

    if transposed:
        # Decoder: upsample, then convolve (old default was nearest neighbor).
        stage.add_module('%s_upsam' % name, nn.Upsample(scale_factor=2, mode='bilinear'))
        # Kernel size is reduced by one for the upsampling (i.e. decoder) part.
        decoder_conv = nn.Conv2d(in_c, out_c, kernel_size=(size-1), stride=1, padding=pad, bias=True)
        stage.add_module('%s_tconv' % name, decoder_conv)
    else:
        # Encoder: a strided convolution halves the resolution.
        encoder_conv = nn.Conv2d(in_c, out_c, kernel_size=size, stride=2, padding=pad, bias=True)
        stage.add_module('%s_conv' % name, encoder_conv)

    if bn:
        stage.add_module('%s_bn' % name, nn.BatchNorm2d(out_c))
    if dropout>0.:
        stage.add_module('%s_dropout' % name, nn.Dropout2d( dropout, inplace=True))
    return stage
    
# generator model
class TurbNetG(nn.Module):
    """U-Net style generator mapping a 3-channel field to a 3-channel field.

    Seven stride-2 encoder stages (``layer1`` .. ``layer6``, including
    ``layer2b``) are mirrored by seven decoder stages (``dlayer6`` ..
    ``dlayer1``). Every decoder stage except the innermost one receives the
    previous decoder output concatenated, along the channel axis, with the
    output of the matching encoder stage.

    Note that each ``blockUNet`` stage starts with an in-place activation, so
    the tensor handed to a stage is modified by it; the skip tensors therefore
    reach the decoder already activated.

    Args:
        channelExponent: the base channel count is ``2 ** channelExponent``.
        dropout: dropout probability forwarded to every ``blockUNet`` stage.
    """

    def __init__(self, channelExponent=6, dropout=0.):
        super().__init__()
        c = int(2 ** channelExponent + 0.5)  # base channel count

        # First encoder stage: a bare strided convolution, no activation or norm.
        self.layer1 = nn.Sequential()
        self.layer1.add_module('layer1_conv', nn.Conv2d(3, c, 4, 2, 1, bias=True))

        # Remaining encoder stages share everything except widths and kernel setup.
        enc = dict(transposed=False, relu=False, dropout=dropout)
        self.layer2 = blockUNet(c, c*2, 'layer2', bn=True, **enc)
        self.layer2b = blockUNet(c*2, c*2, 'layer2b', bn=True, **enc)
        self.layer3 = blockUNet(c*2, c*4, 'layer3', bn=True, **enc)
        # layer4 had a kernel size of 2 in the original version
        # (cf https://arxiv.org/abs/1810.08217); it is size 4 now for
        # encoder/decoder symmetry. Change it back to 2 to reproduce the
        # old/original results.
        self.layer4 = blockUNet(c*4, c*8, 'layer4', bn=True, size=4, **enc)
        self.layer5 = blockUNet(c*8, c*8, 'layer5', bn=True, size=2, pad=0, **enc)
        self.layer6 = blockUNet(c*8, c*8, 'layer6', bn=False, size=2, pad=0, **enc)

        # Decoder stages; blockUNet reduces the kernel size by one internally.
        # Input widths are doubled wherever a skip connection is concatenated.
        dec = dict(transposed=True, bn=True, relu=True, dropout=dropout)
        self.dlayer6 = blockUNet(c*8, c*8, 'dlayer6', size=2, pad=0, **dec)
        self.dlayer5 = blockUNet(c*16, c*8, 'dlayer5', size=2, pad=0, **dec)
        self.dlayer4 = blockUNet(c*16, c*4, 'dlayer4', **dec)
        self.dlayer3 = blockUNet(c*8, c*2, 'dlayer3', **dec)
        self.dlayer2b = blockUNet(c*4, c*2, 'dlayer2b', **dec)
        self.dlayer2 = blockUNet(c*4, c, 'dlayer2', **dec)

        # Last decoder stage: ReLU followed by a transposed convolution back to 3 channels.
        self.dlayer1 = nn.Sequential()
        self.dlayer1.add_module('dlayer1_relu', nn.ReLU(inplace=True))
        self.dlayer1.add_module('dlayer1_tconv', nn.ConvTranspose2d(c*2, 3, 4, 2, 1, bias=True))

    def forward(self, x):
        """Run the encoder, then the decoder with skip connections; return the last decoder output."""
        encoder = (self.layer1, self.layer2, self.layer2b, self.layer3,
                   self.layer4, self.layer5, self.layer6)
        decoder_with_skip = (self.dlayer5, self.dlayer4, self.dlayer3,
                             self.dlayer2b, self.dlayer2, self.dlayer1)

        # Encoder pass: remember every stage output for the skip connections.
        skips = []
        feat = x
        for stage in encoder:
            feat = stage(feat)
            skips.append(feat)

        # The bottleneck output feeds dlayer6 directly and is not used as a skip.
        skips.pop()
        feat = self.dlayer6(feat)

        # Decoder pass: pair each stage with the most recent unused encoder output.
        for stage in decoder_with_skip:
            feat = stage(torch.cat([feat, skips.pop()], 1))
        return feat

In [10]:
# Build a generator with the default channelExponent (6, i.e. 64 base channels
# as shown in the repr below).
net = TurbNetG()

# Re-initialise conv / batch-norm parameters, then move the model to `device`.
# The last expression displays the module structure as the cell output.
net.apply(weights_init)
net.to(device)

TurbNetG(
  (layer1): Sequential(
    (layer1_conv): Conv2d(3, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  )
  (layer2): Sequential(
    (layer2_leakyrelu): LeakyReLU(negative_slope=0.2, inplace=True)
    (layer2_conv): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (layer2_bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (layer2b): Sequential(
    (layer2b_leakyrelu): LeakyReLU(negative_slope=0.2, inplace=True)
    (layer2b_conv): Conv2d(128, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (layer2b_bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (layer3): Sequential(
    (layer3_leakyrelu): LeakyReLU(negative_slope=0.2, inplace=True)
    (layer3_conv): Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (layer3_bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (layer4): Sequential(
    (lay

In [11]:
from dataset import TurbDataset
expo = 5
# On a fresh kernel the name `dataset` refers to the imported *module*, so it
# must not be handed to DataLoader. Build the test set explicitly instead
# (same arguments the training cell uses) and keep the module name untouched.
testDataset = TurbDataset(None, mode=TurbDataset.TEST, dataDirTest=r"/content/gdrive/MyDrive/data/test")
testLoader = DataLoader(testDataset, batch_size=1, shuffle=False)

In [12]:
# Number of output channels evaluated per sample; the training cell unpacks
# one relative error per channel (E_P, E_V_x, E_V_y) and logs one image each.
n_var = 3

In [13]:
import time
from utils import relative_error

In [None]:
######## Settings ########

# number of training iterations
iterations = 10000
# batch size
batch_size = 10
# learning rate, generator
lrG = 0.0006
# decay learning rate?
decayLr = True
# channel exponent to control network size
expo = 5
# data set config
prop=None # by default, use all from "../data/train"
#prop=[1000,0.75,0,0.25] # mix data from multiple directories
# save txt files with per epoch loss?
saveL1 = False
# directory with the held-out test samples that are evaluated after every epoch
testDataDir = r"/content/gdrive/MyDrive/data/test"
# add a target/prediction image row to the W&B table for every N-th test sample
imageLogInterval = 30

##########################

prefix = ""
# Under Jupyter/Colab sys.argv[1] is the kernel launcher's own "-f" option, not
# a user-supplied prefix (the recorded run printed "Output prefix: -f").
# Only accept an argument that does not look like a command-line flag.
if len(sys.argv)>1 and not sys.argv[1].startswith("-"):
    prefix = sys.argv[1]
    print("Output prefix: {}".format(prefix))

dropout    = 0.      # note, the original runs from https://arxiv.org/abs/1810.08217 used slight dropout, but the effect is minimal; conv layers "shouldn't need" dropout, hence set to 0 here.
doLoad     = ""      # optional, path to pre-trained model

print("LR: {}".format(lrG))
print("LR decay: {}".format(decayLr))
print("Iterations: {}".format(iterations))
print("Dropout: {}".format(dropout))

##########################

seed = random.randint(0, 2**32 - 1)
print("Random seed: {}".format(seed))
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
#torch.backends.cudnn.deterministic=True # warning, slower

# Record the settings actually used for this run (including the seed) in W&B;
# allow_val_change lets a re-run of this cell overwrite earlier values.
wandb.config.update(dict(
    iterations=iterations,
    batch_size=batch_size,
    learning_rate=lrG,
    decay_lr=decayLr,
    expo=expo,
    dropout=dropout,
    seed=seed,
), allow_val_change=True)


def _toDevice(batch):
    """Unpack an (inputs, targets) batch and return both as float tensors on `device`."""
    batch_inputs, batch_targets = batch
    return batch_inputs.float().to(device), batch_targets.float().to(device)


# create pytorch data object with dfp dataset
data = TurbDataset(prop, shuffle=1)
trainLoader = DataLoader(data, batch_size=batch_size, shuffle=True, drop_last=True)
print("Training batches: {}".format(len(trainLoader)))
dataValidation = dataset.ValiDataset(data)
valiLoader = DataLoader(dataValidation, batch_size=batch_size, shuffle=False, drop_last=True)
print("Validation batches: {}".format(len(valiLoader)))

# The test set does not change between epochs, so load it once up front.
# It is deliberately NOT bound to the name `dataset`: that would shadow the
# imported module and break `dataset.ValiDataset` when this cell is re-run.
testDataset = TurbDataset(None, mode=TurbDataset.TEST, dataDirTest=testDataDir)
testLoader = DataLoader(testDataset, batch_size=1, shuffle=False)

if len(trainLoader) == 0:
    raise ValueError("No full training batch available: fewer samples than batch_size={}".format(batch_size))
if len(testLoader) == 0:
    raise ValueError("No test samples found in {}".format(testDataDir))

# setup training
epochs = int(iterations/len(trainLoader) + 0.5)
netG = TurbNetG(channelExponent=expo, dropout=dropout)
print(netG) # print full net
params = sum(p.numel() for p in netG.parameters() if p.requires_grad)
print("Initialized TurbNet with {} trainable params ".format(params))

netG.apply(weights_init)
if len(doLoad)>0:
    netG.load_state_dict(torch.load(doLoad, map_location=device))
    print("Loaded model "+doLoad)
netG.to(device)

criterionL1 = nn.L1Loss()
criterionL1.to(device)

optimizerG = optim.Adam(netG.parameters(), lr=lrG, betas=(0.5, 0.999), weight_decay=0.0)

##########################

columns = ['epoch','id', 'Pressure','V_x','V_y']
test_table= wandb.Table(columns=columns)
start_time = time.time()
for epoch in range(epochs):
    print("Starting epoch {} / {}".format((epoch+1),epochs))

    # compute LR decay (depends on the epoch only, so once per epoch is enough)
    if decayLr:
        currLr = utils.computeLR(epoch, epochs, lrG*0.1, lrG)
        if currLr < lrG:
            for g in optimizerG.param_groups:
                g['lr'] = currLr

    # ---- training ----
    netG.train()
    L1_accum = 0.0

    for i, traindata in enumerate(trainLoader, 0):
        inputs, targets = _toDevice(traindata)

        netG.zero_grad()
        gen_out = netG(inputs)

        lossL1 = criterionL1(gen_out, targets)
        lossL1.backward()

        optimizerG.step()

        lossL1viz = lossL1.item()
        L1_accum += lossL1viz

        if i==len(trainLoader)-1:
            logline = "Epoch: {}, batch-idx: {}, L1: {}\n".format(epoch, i, lossL1viz)
            print(logline)

    # mean training loss of this epoch
    L1_accum /= len(trainLoader)

    # ---- validation ----
    # Assumes the validation set yields (inputs, targets) pairs like the
    # training set it is derived from -- TODO confirm against dataset.ValiDataset.
    netG.eval()
    L1val_accum = 0.0
    with torch.no_grad():
        for validata in valiLoader:
            inputs, targets = _toDevice(validata)
            L1val_accum += criterionL1(netG(inputs), targets).item()
    # drop_last can leave the validation loader without a single batch
    L1val_accum /= max(len(valiLoader), 1)

    epoch_metrics = {'Train_loss': L1_accum, 'Epoch': epoch}
    if len(valiLoader) > 0:
        epoch_metrics['Val_loss'] = L1val_accum
    wandb.log(epoch_metrics)

    # data for graph plotting
    if saveL1:
        if epoch==0:
            utils.resetLog(prefix + "L1.txt"   )
            utils.resetLog(prefix + "L1val.txt")
        utils.log(prefix + "L1.txt"   , "{} ".format(L1_accum), False)
        utils.log(prefix + "L1val.txt", "{} ".format(L1val_accum), False)

    # ---- test: L1 loss and relative error per channel ----
    R_Error         = torch.zeros(size=(1,n_var),device=device)
    test_loss_accum = 0.0

    counter = 0
    with torch.no_grad():
        for testdata in testLoader:
            counter += 1
            inputs, targets = _toDevice(testdata)

            outputs = netG(inputs)

            test_loss_accum += criterionL1(outputs, targets).item()
            # third input channel is handed to relative_error as its third argument
            R_Error         += relative_error(outputs, targets, inputs[:,2,:,:])

            # log target (left) next to prediction (right) for every N-th sample
            if counter % imageLogInterval == 0:
                outputs_np = outputs.cpu().numpy()[0]
                targets_np = targets.cpu().numpy()[0]

                images = []
                for ch in range(n_var):
                    x = targets_np[ch].T
                    y = outputs_np[ch].T
                    images.append(wandb.Image(np.concatenate((x,y),axis=1)))

                test_table.add_data(epoch,counter,*images)

    wandb.log({'Channels':test_table})

    # plain Python floats rather than device tensors for the logged metrics
    E_P, E_V_x, E_V_y  = (R_Error[0]/len(testLoader)).tolist()

    wandb.log({'Test_loss':test_loss_accum/len(testLoader),'Epoch':epoch,
               'E_P':E_P,'E_V_x':E_V_x,'E_V_y':E_V_y})


end_time = time.time()
total_time = end_time - start_time
print("Time: ", total_time)

torch.save(netG.state_dict(), prefix + "ModelGNew6.pth" )

Output prefix: -f
LR: 0.0006
LR decay: True
Iterations: 10000
Dropout: 0.0
Random seed: 3190650814
