In [2]:
import argparse
import random
import time

import numpy as np
import torch
from torch.utils.data import DataLoader, Subset
import torchvision.transforms as T

import np_transforms as NP_T
from datasets import WebcamTSeq
from utils import show_images, sort_seqs_by_len
import plotter
from collections import OrderedDict
from turtle import forward

import torch
from torch import nn
from torch.nn.utils import rnn
import torch.nn.functional as F

# from torchsummary import summary
from torchinfo import summary

In [3]:
def get_data_loaders(args_path, args_shape, train_transform, args_gamma, args_batch_size, file_name, args_max_len):
    train_data = WebcamTSeq(path=args_path, out_shape=args_shape, transform=train_transform, gamma=args_gamma, max_len=args_max_len, file_name=file_name)
    train_loader = DataLoader(train_data,
                            batch_size=args_batch_size,
                            shuffle=False)  # shuffle the data at the beginning of each epoch

    del train_data

    return train_loader, None

In [4]:
file_list = ['164']
data = WebcamTSeq(path='./data/WebCamT', out_shape=[120, 160], transform=NP_T.ToTensor(), gamma=1e3, max_len=6, file_name='164')

In [5]:
test_loader = DataLoader(data,batch_size=16,shuffle=False)

In [6]:
for i, (X,mask,density,count,_,seq_len) in enumerate(test_loader):
    print(X.shape)
    break

torch.Size([16, 6, 3, 120, 160])


In [7]:
class FCN(nn.Module):
    def __init__(self, image_dim = None):
        super(FCN, self).__init__()
        self.image_dim = image_dim
        # FCN layer
        self.fcn_blocks = nn.ModuleList()
        self.fcn_blocks.append(
            nn.Sequential(OrderedDict([
                ('Conv1_1', nn.Conv2d(3, 64, (3, 3), padding=1)),
                ('ReLU1_1', nn.ReLU()),
                ('Conv1_2', nn.Conv2d(64, 64, (3, 3), padding=1)),
                ('ReLU1_2', nn.ReLU()),
                ('MaxPool1', nn.MaxPool2d((2, 2))),
                ('Conv2_1', nn.Conv2d(64, 128, (3, 3), padding=1)),
                ('ReLU2_1', nn.ReLU()),
                ('Conv2_2', nn.Conv2d(128, 128, (3, 3), padding=1)),
                ('ReLU2_2', nn.ReLU()),
                ('MaxPool2', nn.MaxPool2d((2, 2))),
            ])))
        self.fcn_blocks.append(
            nn.Sequential(OrderedDict([
                ('Conv3_1', nn.Conv2d(128, 256, (3, 3), padding=1)),
                ('ReLU3_1', nn.ReLU()),
                ('Conv3_2', nn.Conv2d(256, 256, (3, 3), padding=1)),
                ('ReLU3_2', nn.ReLU()),
                ('Atrous1', nn.Conv2d(256, 256, (3, 3), dilation=2, padding=2)),
                ('ReLU_A1', nn.ReLU()),
            ])))
        self.fcn_blocks.append(
            nn.Sequential(OrderedDict([
                ('Conv4_1', nn.Conv2d(256, 256, (3, 3), padding=1)),
                ('ReLU4_1', nn.ReLU()),
                ('Conv4_2', nn.Conv2d(256, 256, (3, 3), padding=1)),
                ('ReLU4_2', nn.ReLU()),
                ('Atrous2', nn.Conv2d(256, 512, (3, 3), dilation=2, padding=2)),
                ('ReLU_A2', nn.ReLU()),
            ])))
        self.fcn_blocks.append(
            nn.Sequential(OrderedDict([
                ('Atrous3', nn.Conv2d(512, 512, (3, 3), dilation=2, padding=2)),
                ('ReLU_A3', nn.ReLU()),
                ('Atrous4', nn.Conv2d(512, 512, (3, 3), dilation=2, padding=2)),
                ('ReLU_A4', nn.ReLU()),
            ])))
        self.fcn_blocks.append(
            nn.Sequential(OrderedDict([
                ('Conv5', nn.Conv2d(1408, 512, (1, 1))),  # 1408 = 128 + 256 + 512 + 512 (hyper-atrous combination)
                ('ReLU5', nn.ReLU()),
                ('Deconv1', nn.ConvTranspose2d(512, 256, (3, 3), stride=2, padding=1, output_padding=1)),
                ('ReLU_D1', nn.ReLU()),
                ('Deconv2', nn.ConvTranspose2d(256, 64, (3, 3), stride=2, padding=1, output_padding=1)),
                ('ReLU_D2', nn.ReLU()),
                ('Conv6', nn.Conv2d(64, 1, (1, 1))),
            ])))
    
    def forward(self, X, mask=None):
        
        # X shape = N, L, C, H ,W 
        N, L, C, H, W = X.shape
        X = X.reshape(N*L,C,H,W)
        if mask is not None :
            mask = mask.reshape(N*L,1,H,W)
            X = X * mask
            
        h1 = self.fcn_blocks[0](X)
        h2 = self.fcn_blocks[1](h1)
        h3 = self.fcn_blocks[2](h2)
        h4 = self.fcn_blocks[3](h3)
        h = torch.cat((h1, h2, h3, h4), dim=1) # hyper-atrous combination
        h = self.fcn_blocks[4](h)
        density = h.reshape(N,L,1,H,W)
        if mask is not None :
            h = h * mask
        
        return density, h.sum(dim=(1,2,3)).reshape(N,L) # density & count

In [8]:
model_FCN = FCN([120,160])
summary(model_FCN, (16,6,3,120,160),device='cpu')

Layer (type:depth-idx)                   Output Shape              Param #
FCN                                      --                        --
├─ModuleList: 1-1                        --                        --
│    └─Sequential: 2-1                   [96, 128, 30, 40]         --
│    │    └─Conv2d: 3-1                  [96, 64, 120, 160]        1,792
│    │    └─ReLU: 3-2                    [96, 64, 120, 160]        --
│    │    └─Conv2d: 3-3                  [96, 64, 120, 160]        36,928
│    │    └─ReLU: 3-4                    [96, 64, 120, 160]        --
│    │    └─MaxPool2d: 3-5               [96, 64, 60, 80]          --
│    │    └─Conv2d: 3-6                  [96, 128, 60, 80]         73,856
│    │    └─ReLU: 3-7                    [96, 128, 60, 80]         --
│    │    └─Conv2d: 3-8                  [96, 128, 60, 80]         147,584
│    │    └─ReLU: 3-9                    [96, 128, 60, 80]         --
│    │    └─MaxPool2d: 3-10              [96, 128, 30, 40]         --

In [32]:
for i, (X,mask,density,count,_,seq_len) in enumerate(test_loader):
    print(mask.shape)
    print(density.shape)
    print(count.shape)
    density, count = model_FCN(X,mask)
    print(density.shape)
    print(count.shape)
    break

torch.Size([16, 6, 1, 120, 160])
torch.Size([16, 6, 1, 120, 160])
torch.Size([16, 6])
torch.Size([16, 6, 1, 120, 160])
torch.Size([16, 6])


In [10]:
class BiLSTM_Encoder(nn.Module):
    def __init__(self, image_dim = None):
        super(BiLSTM_Encoder, self).__init__()
        self.image_dim = image_dim
        # Bidirectional LSTM layer
        H,W = self.image_dim
        # lstm, input size = H*W, hidden state size = 100 but bidirectional so double
        self.lstm_block = nn.LSTM(H*W,100,num_layers = 3, bidirectional = True, batch_first=True) 
        self.fc = nn.Linear(200,100) # (enc_hid_dim * 2, dec_hid_dim)
        
    def forward(self, density):
        # X shape = N, L, 1, H, W
        # count shape = N, L
        N,L,C,H,W = density.shape
        # count_FCN = density.sum(dim=(2,3,4)).reshape(N,L)
        
        h = density.reshape(N,L,-1)
        h, (hidden, cell) = self.lstm_block(h)
        # hidden [-2,:,:] = forward last hidden state, hidden[-1,:,:] = backward last hidden state
        
        return h, hidden, cell

In [11]:
model_encoder = BiLSTM_Encoder(image_dim = (120,160))
summary(model_encoder, [(16,6,1,120,160)],device='cpu')

Layer (type:depth-idx)                   Output Shape              Param #
BiLSTM_Encoder                           --                        --
├─LSTM: 1-1                              [16, 6, 200]              15,924,800
├─Linear: 1-2                            --                        20,100
Total params: 15,944,900
Trainable params: 15,944,900
Non-trainable params: 0
Total mult-adds (G): 1.53
Input size (MB): 7.37
Forward/backward pass size (MB): 0.15
Params size (MB): 63.78
Estimated Total Size (MB): 71.31

In [14]:
for i, (X,mask,density,count,_,seq_len) in enumerate(test_loader):
    density, count = model_FCN(X,mask)
    en_h, en_hidden, en_cell = model_encoder(density)
    print(en_h.shape) # 2 * hidden_dim
    print(en_hidden.shape)
    print(en_cell.shape)
    break

torch.Size([16, 6, 200])
torch.Size([6, 16, 100])
torch.Size([6, 16, 100])


In [15]:
class Decoder(nn.Module):
    def __init__(self, image_dim = None):
        super(Decoder, self).__init__()
        self.image_dim = image_dim
        self.seq_len = seq_len
        H,W = self.image_dim
        self.lstm_block = nn.LSTM(H*W,100,num_layers = 3, bidirectional = True, batch_first=True)
        
        
    def forward(self, density, hidden, cell):
        # density shape = N, L, 1, H, W 
        # count shape = N, L
        N,L,C,H,W = density.shape
       
        h = density.reshape(N,L,-1)
        h, (hidden, cell) = self.lstm_block(h, (hidden, cell))
        
        return h, hidden, cell

In [16]:
model_decoder = Decoder(image_dim = (120,160))
summary(model_decoder, [(16,6,1,120,160),(6,16,100),(6,16,100)],device='cpu')

Layer (type:depth-idx)                   Output Shape              Param #
Decoder                                  --                        --
├─LSTM: 1-1                              [16, 6, 200]              15,924,800
Total params: 15,924,800
Trainable params: 15,924,800
Non-trainable params: 0
Total mult-adds (G): 1.53
Input size (MB): 7.45
Forward/backward pass size (MB): 0.15
Params size (MB): 63.70
Estimated Total Size (MB): 71.30

In [17]:
for i, (X,mask,density,count,_,seq_len) in enumerate(test_loader):
    density, count = model_FCN(X,mask)
    print(density[:,:-1].shape)
    print(density[:,-1].unsqueeze(1).shape)
    print(count[:,:-1].shape)
    print(count[:,-1].unsqueeze(1).shape)
    en_h, en_hidden, en_cell = model_encoder(density[:,:-1])
    de_h, de_hidden, de_cell = model_decoder(density[:,-1].unsqueeze(1), en_hidden, en_cell)
    print(de_h.shape) # 2 * hidden_dim
    print(de_hidden.shape)
    print(de_cell.shape)
    break

torch.Size([16, 5, 1, 120, 160])
torch.Size([16, 1, 1, 120, 160])
torch.Size([16, 5])
torch.Size([16, 1])
torch.Size([16, 1, 200])
torch.Size([6, 16, 100])
torch.Size([6, 16, 100])


In [18]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
        self.W = nn.Linear(400,200)
        self.tanh = nn.Tanh()
        self.v = nn.Linear(200,1)
    def forward(self, x):
        # x shape = N, 200
        N,D = x.shape
        h = self.W(x)
        h = self.tanh(h)
        a = self.v(h)
        return a

In [19]:
a = Attention()

In [24]:
for i, (X,mask,density,count,_,seq_len) in enumerate(test_loader):
    density, count = model_FCN(X,mask)
    en_h, en_hidden, en_cell = model_encoder(density[:,:-1])
    de_h, de_hidden, de_cell = model_decoder(density[:,-1].unsqueeze(1), en_hidden, en_cell)
    # matrix multiplication of 
    de_h_view = de_h.view(de_h.shape[0], de_h.shape[2], -1)
    score = torch.bmm(en_h, de_h_view)
    print(score.shape)
    att_dis = F.softmax(score, dim=1)
    print(att_dis.shape)
    att_val = torch.sum(en_h * att_dis, dim=1)
    print(att_val.shape)
    con = torch.cat((att_val, de_h.squeeze(1)), dim=1)
    print(con.shape)
    out = a(con)
    print(out.shape)
    pred = count[:,-1].unsqueeze(1) + out
    print(pred.shape)
    print(pred)
    break

torch.Size([16, 5, 1])
torch.Size([16, 5, 1])
torch.Size([16, 200])
torch.Size([16, 400])
torch.Size([16, 1])
torch.Size([16, 1])
tensor([[367774.4688],
        [366877.9375],
        [365970.4062],
        [367946.8125],
        [367126.5938],
        [365834.3125],
        [365419.9062],
        [366542.6875],
        [365299.0938],
        [366884.7188],
        [366753.1562],
        [366379.2500],
        [366732.5000],
        [365885.1250],
        [366124.0312],
        [366938.3750]], grad_fn=<AddBackward0>)


In [33]:
class FCN_BLA(nn.Module):
    def __init__(self, FCN, Encoder, Decoder, image_dim = None, seq_len = 5):
        super(FCN_BLA, self).__init__()
        self.image_dim = image_dim
        self.seq_len = seq_len
        self.FCN = FCN(image_dim = image_dim)
        self.Encoder = Encoder(image_dim = image_dim)
        self.Decoder = Decoder(image_dim = image_dim)

        self.W = nn.Linear(400,200)
        self.tanh = nn.Tanh()
        self.v = nn.Linear(200,1)
    
    def forward(self, X, mask):
        # X shape = N, L, C, H, W
        # mask shape = N, L
        density, count = self.FCN(X,mask)
        en_h, en_hidden, en_cell = self.Encoder(density[:,:-1])
        de_h, de_hidden, de_cell = self.Decoder(density[:,-1].unsqueeze(1), en_hidden, en_cell)
        # Add attention
        de_h_view = de_h.view(de_h.shape[0], de_h.shape[2], -1)
        score = torch.bmm(en_h, de_h_view)
        att_dis = F.softmax(score, dim=1)
        att_val = torch.sum(en_h * att_dis, dim=1)
        con = torch.cat((att_val, de_h.squeeze(1)), dim=1)
        out = self.v(self.tanh(self.W(con)))

        pred = count[:,-1].unsqueeze(1) + out
        return density, pred
        
    

In [34]:
model = FCN_BLA(FCN, BiLSTM_Encoder, Decoder, image_dim = (120,160), seq_len = 5)

In [35]:
summary(model_FCN, [(16,6,3,120,160),(16,6,1,120,160)],device='cpu')

Layer (type:depth-idx)                   Output Shape              Param #
FCN                                      --                        --
├─ModuleList: 1-1                        --                        --
│    └─Sequential: 2-1                   [96, 128, 30, 40]         --
│    │    └─Conv2d: 3-1                  [96, 64, 120, 160]        1,792
│    │    └─ReLU: 3-2                    [96, 64, 120, 160]        --
│    │    └─Conv2d: 3-3                  [96, 64, 120, 160]        36,928
│    │    └─ReLU: 3-4                    [96, 64, 120, 160]        --
│    │    └─MaxPool2d: 3-5               [96, 64, 60, 80]          --
│    │    └─Conv2d: 3-6                  [96, 128, 60, 80]         73,856
│    │    └─ReLU: 3-7                    [96, 128, 60, 80]         --
│    │    └─Conv2d: 3-8                  [96, 128, 60, 80]         147,584
│    │    └─ReLU: 3-9                    [96, 128, 60, 80]         --
│    │    └─MaxPool2d: 3-10              [96, 128, 30, 40]         --

In [37]:
for i, (X,mask,density,count,_,seq_len) in enumerate(test_loader):
    density_pred, count_pred = model(X,mask)
    print(density_pred.shape)
    print(count_pred.shape)
    N = torch.sum(seq_len)
    density_loss = torch.sum((density_pred - density)**2)/(2*N)
    count_loss = torch.sum((count_pred[:,-1].unsqueeze(1) - count)**2)/2
    loss = density_loss + 0.2 * count_loss
    print(density_loss, count_loss, loss)
    break

torch.Size([16, 6, 1, 120, 160])
torch.Size([16, 1])
tensor(39.0529, grad_fn=<DivBackward0>) tensor(2.3514e+10, grad_fn=<DivBackward0>) tensor(4.7028e+09, grad_fn=<AddBackward0>)
