In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import math

In [2]:
def try_gpu():
    """
    If GPU is available, return torch.device as cuda:0; else return torch.device
    as cpu.
    """
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    return device

In [3]:
device=try_gpu()
print(device)

cuda:0


In [None]:
"""
According to Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting
https://arxiv.org/pdf/1506.04214.pdf

    ConvlutionalLSTM unit which has the following update rule:
        it ​= σ(W_xi * ​xt ​​+ W_hi * ​h(t−1) ​+ W_ci⊙c(t-1)+b_i​)
        ft​ = σ(W_xf * ​xt ​+ W_hf * ​h(t−1) ​+ W_cf⊙c(t-1)+b_f​)
        ct ​= ft​ ⊙ c(t−1) ​+ it ​⊙ ​tanh(W_xc * ​xt ​​+ W_hc * ​h(t−1) ​+ b_c​)  //c candidate
        ot ​= σ(W_xo * ​xt ​+ W_ho ​* h(t−1) ​+ W_co⊙c(t)+b_o​)
        
        ht ​= ot​ ⊙ tanh(ct​)​
        *:convolution operator
        ⊙:Hadamard product
"""

'\nAccording to Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting\nhttps://arxiv.org/pdf/1506.04214.pdf\n\n    ConvlutionalLSTM unit which has the following update rule:\n        it \u200b= σ(W_xi * \u200bxt \u200b\u200b+ W_hi * \u200bh(t−1) \u200b+ W_ci⊙c(t-1)+b_i\u200b)\n        ft\u200b = σ(W_xf * \u200bxt \u200b+ W_hf * \u200bh(t−1) \u200b+ W_cf⊙c(t-1)+b_f\u200b)\n        ct \u200b= ft\u200b ⊙ c(t−1) \u200b+ it \u200b⊙ \u200btanh(W_xc * \u200bxt \u200b\u200b+ W_hc * \u200bh(t−1) \u200b+ b_c\u200b)  //c candidate\n        ot \u200b= σ(W_xo * \u200bxt \u200b+ W_ho \u200b* h(t−1) \u200b+ W_co⊙c(t)+b_o\u200b)\n        \n        ht \u200b= ot\u200b ⊙ tanh(ct\u200b)\u200b\n        *:convolution operator\n        ⊙:Hadamard product\n'

In [None]:
##Basic CONVLSTM model

#function:init, init hidden,reset_parameter and forward
class convLSTM(nn.Module):
    """
    ConvLSTM based on https://github.com/Hzzone/Precipitation-Nowcasting/blob/master/nowcasting/models/convLSTM.py"""
    def __init__(self,input_channels,hidden_channels,kernel_size=3,out_width=64,out_height=64,batch_first=False,bias=False,return_all_layers=False):
        
        #input channels, #of conv kernerls(hidden channels), convkernel size
        #output width, output height (also hidden width and height)are the same(64) with input according to MetNet paper
        
        
        super().__init__()
        
        self.input_channels = input_channels#defalut 256
        self.hidden_channels = hidden_channels#default 384
        self.kernel_size = kernel_size#default 3
        self.width=out_width#default 64
        self.height=out_height#default 64
        #self.batch_first = batch_first
        #self.bias = bias
        #self.return_all_layers = return_all_layers
        
        
        #W_xi,W_hi,W_xf,W_hf,W_xc,W_hc,W_xo,W_ho
        self.conv = nn.Conv2d(in_channels=input_channels + hidden_channels,##calculate the input xt and hidden ht together
                               out_channels=hidden_channels*4,##in order to cut it into gates according to channels
                               kernel_size=kernel_size,
                               stride=1,
                               padding=1)
        """according to MetNet Paper, kernerlsize=3, in order 
        to keep output rows and cols the same with input, default: stride=1 an padding=1"""
        
        self.reset_params()
        

        

    def init_hidden(self, inputs):
        #input shape (time_steps, bsz, 256, 64, 64)
        """usage of torch new https://stackoverflow.com/questions/49263588/pytorch-beginner-tensor-new-method"""
        c0 = inputs.new(size=(inputs.size(1), self.hidden_channels, self.height, self.width))
        h0 = inputs.new(size=(inputs.size(1), self.hidden_channels, self.height, self.width))
        return h0, c0#dim(bsz,#filter,height,width)


    def reset_params(self):
        """
        Initialize network parameters.
        
        std = 1.0 
        self.Wci.data.uniform_(-std, std)
        self.Wcf.data.uniform_(-std, std)
        self.Wco.data.uniform_(-std, std)
"""
        # if using requires_grad flag, torch.save will not save parameters in deed although it may be updated every epoch.
        # Howerver, if you use declare an optimizer like Adam(model.parameters()),
        # parameters will not be updated forever.
        self.Wci = nn.Parameter(torch.zeros(1, self.hidden_channels, self.height, self.width))
        self.Wcf = nn.Parameter(torch.zeros(1, self.hidden_channels, self.height, self.width))
        self.Wco = nn.Parameter(torch.zeros(1, self.hidden_channels, self.height, self.width))


    def forward(self, inputs):
        """
        Expected input shape [seq_len, bsz, channels, height, width]
        input shape (seq_len, bsz, 256, 64, 64)
        output shape (seq_len, bsz, 384, 64, 64)
        """
        
        #input shape (time_steps, bsz, 256, 64, 64)
        time_steps = len(inputs)
        self.hidden = self.init_hidden(inputs)##initial shapes of cell and hidden
        h, c = self.hidden
        
        print("shape h",h.size())
        print("shape c",c.size())

        if inputs is None:
            x = torch.zeros((h.size(0), self.in_channels, self.height, self.width), dtype=torch.float)
            #(time_steps, bsz, 256, 64, 64)
        
        outputs = []
        for index in range(time_steps):#time_step
            # initial inputs
            x = inputs[index]
            #x,(bsz, 256, 64, 64) h,(bsz, hidden_channels, 64, 64) dim1:channel
            cat_xh = torch.cat([x, h], dim=1)#(bsz, 256+hidden_channels, 64, 64)
            conv_xh = self.conv(cat_xh)
            i, f, tmp_c, o = torch.chunk(conv_xh, 4, dim=1)###cut according to channel
            
           
            # conv lstm equations
            i = torch.sigmoid(i+self.Wci*c)
            f = torch.sigmoid(f+self.Wcf*c)
            c = f*c + i*torch.tanh(tmp_c)
            o = torch.sigmoid(o+self.Wco*c)
            h = o*torch.tanh(c)


            outputs.append(h)
       # print("outputsize",outputs.size())
        outputs = torch.stack(outputs)#(time_steps, bsz, 384, 64, 64)convert list into tensor
        print("outputsize",outputs.size())

        return outputs, (h, c)#h,c last sequence

In [None]:
class ConvLSTMForecaster(nn.Module):
    # def __init__(self,input_channels,hidden_channels,kernel_size,out_width=64,out_height=64,batch_first=False,bias=False,return_all_layers=False):
    def __init__(self, 
            in_channels: int,#default 256
            output_shape: tuple,#output shape (384, 64, 64)
            channels: tuple,#(hidden_channels384,32,)
            last_ts: bool = True,
            kernel_size: int = 3,
            last_relu: bool = True):
        super().__init__()

        self.last_ts = last_ts
        self.forcatser = convLSTM(input_channels=in_channels, hidden_channels=channels[0], kernel_size=kernel_size,
                        out_width=output_shape[1], out_height=output_shape[2])
        #output shape (384, 64, 64)
        self.out_layer1 = nn.Conv2d(channels[0], channels[1], kernel_size=1)#default padding=0, stride=1
        self.out_layer2 = nn.Conv2d(channels[1], output_shape[0], 1)##output shape[0] 384
        self.height_weight = output_shape[1:]#(64,64)
        self.last_relu = last_relu
        self.relu = torch.nn.ReLU()

    def forward(self, inputs):
        #inputs = inputs.permute(1,0,2,3,4) # time_steps first(depending on the dimension of input
        out, _ = self.forcatser(inputs)#the last sequence output
        
        if self.last_ts:
            out = out[-1]
        else:
            out = out.permute(1,0,2,3,4) # bsz_first
            bsz = len(out)
            print("forcaster bsz",bsz)
            out = out.contiguous().view(bsz, -1, *self.height_weight) #view()=reshape(),the precondition of view is contiguous
            # use all time steps

        out = self.out_layer1(out)
        print("out1",out.size())
        out = self.out_layer2(out)
        print("out2",out.size())
        if self.last_relu:
            out = self.relu(out)
        return out

In [None]:
#ConvLSTM Model array size test.

time_steps, batch_size, channels,height,width = 2, 3, 256,64,64
test_dataset= torch.randn((time_steps, batch_size, channels,height,width))

#print("test_dataset",test_dataset)

In [None]:
##initilization of model
_, _, input_channels,batch_height,batch_width=test_dataset.size()
#print("sizes",input_channels)
hidden_channels=200
#(input_channels,hidden_channels,kernel_size,out_width=64,out_height=64,batch_first=False,bias=False,return_all_layers=False)
conv_lstm=convLSTM(input_channels,hidden_channels)

In [None]:
outputset,(h,c)=conv_lstm(test_dataset)


print("output test",outputset[1].size())
print("output test size",outputset.size())
print("output test h",h.size())
print("output test c",c.size())


shape h torch.Size([3, 200, 64, 64])
shape c torch.Size([3, 200, 64, 64])
outputsize torch.Size([2, 3, 200, 64, 64])
output test torch.Size([3, 200, 64, 64])
output test size torch.Size([2, 3, 200, 64, 64])
output test h torch.Size([3, 200, 64, 64])
output test c torch.Size([3, 200, 64, 64])


In [None]:
#optimizer = torch.optim.SGD(conv_lstm.parameters(), lr=0.001, momentum=0.9)
for i,parameter in enumerate(conv_lstm.parameters()):
    print(i)
    print(parameter.size())

0
torch.Size([1, 200, 64, 64])
1
torch.Size([1, 200, 64, 64])
2
torch.Size([1, 200, 64, 64])
3
torch.Size([800, 456, 3, 3])
4
torch.Size([800])


In [None]:
#convforcatser initialization
convforcaster=ConvLSTMForecaster(in_channels=256,output_shape=(384,64,64),channels=(384,32))


In [None]:
output=convforcaster(test_dataset)

shape h torch.Size([3, 384, 64, 64])
shape c torch.Size([3, 384, 64, 64])
outputsize torch.Size([2, 3, 384, 64, 64])
out1 torch.Size([3, 32, 64, 64])
out2 torch.Size([3, 384, 64, 64])


In [None]:
print("output",type(output))
print("output",output.size())

output <class 'torch.Tensor'>
output torch.Size([3, 384, 64, 64])


In [None]:
for i,parameter in enumerate(convforcaster.parameters()):
    print(i)
    print(parameter.size())

0
torch.Size([1, 384, 64, 64])
1
torch.Size([1, 384, 64, 64])
2
torch.Size([1, 384, 64, 64])
3
torch.Size([1536, 640, 3, 3])
4
torch.Size([1536])
5
torch.Size([32, 384, 1, 1])
6
torch.Size([32])
7
torch.Size([384, 32, 1, 1])
8
torch.Size([384])


In [None]:
#####################################################
'''
in_channels = 1 # Black-white images in MNIST digits
hidden_channels = [5, 6]
out_features = 10 

# Training parameters
learning_rate = 0.001
epochs = 3 

# Initialize network
net = Net(in_channels, hidden_channels, out_features)
optimizer = torch.optim.SGD(net.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

# Define list to store losses and performances of each iteration
train_losses = []
train_accs = []
test_accs = []

# Try using gpu instead of cpu
device = try_gpu()

for epoch in range(epochs):

    # Network in training mode and to device
    net.train()
    net.to(device)

    # Training loop
    for i, (x_batch, y_batch) in enumerate(train_loader):

        # Set to same device
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        # Set the gradients to zero
        optimizer.zero_grad()

        # Perform forward pass
        y_pred = net(x_batch)

        # Compute the loss
        loss = criterion(y_pred, y_batch)
        train_losses.append(loss)
        
        # Backward computation and update
        loss.backward()
        optimizer.step()

    # Compute train and test error
    train_acc = 100*evaluate_accuracy(train_loader, net.to('cpu'))
    test_acc = 100*evaluate_accuracy(test_loader, net.to('cpu'))
    
    # Development of performance
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    # Print performance
    print('Epoch: {:.0f}'.format(epoch+1))
    print('Accuracy of train set: {:.00f}%'.format(train_acc))
    print('Accuracy of test set: {:.00f}%'.format(test_acc))
    print('')
    #######################zero grad##################################

    def train(train_loader, model, optimizer, criterion, device):
    """
    Trains network for one epoch in batches.

    Args:
        train_loader: Data loader for training set.
        model: Neural network model.
        optimizer: Optimizer (e.g. SGD).
        criterion: Loss function (e.g. cross-entropy loss).
    """
  
    avg_loss = 0
    correct = 0
    total = 0

    # Iterate through batches
    for i, data in enumerate(train_loader):#######################i:0-40,40份？？
        # Get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        #print("input",input.size())#torch.Size([100, 40])
        # Move data to target device
        inputs, labels = inputs.to(device), labels.to(device)
        print("i",i)
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)#####################
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Keep track of loss and accuracy
        avg_loss += loss
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return avg_loss / len(train_loader), 100 * correct / total'''
#################################without zero_grad  Adam#############################################

In [None]:
##################defining optimizer#####################
##latitude-weighted Root-Mean-Squared-Error(RMSE)

In [None]:
###############################train step,test step.################################
##structure according to assignment6 solution
def train(train_loader, model, optimizer, criterion, device):
    """
    Trains network for one epoch in batches.
    Args:
        train_loader: Data loader for training set.
        model: Neural network model(ConvLSTM).
        optimizer: Optimizer (Adam).
        criterion: Loss function (latitude_weighted RMSE).
    """
  
    avg_loss = 0
    correct = 0
    total = 0

    # Iterate through batches
    for i, data in enumerate(train_loader):#######################i:0-40,40份？？
        # Get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        #print("input",input.size())#torch.Size([100, 40])
        # Move data to target device
        inputs, labels = inputs.to(device), labels.to(device)
        # Zero the parameter gradients
        optimizer.zero_grad()###

        # Forward + backward + optimize
        outputs = model(inputs)#####################
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Keep track of loss and accuracy
        avg_loss += loss
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return avg_loss / len(train_loader), 100 * correct / total