<a href="https://colab.research.google.com/github/Dkepffl/Advanced_Analysis/blob/main/Transformer/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Import Packages**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision.transforms as transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import DataLoader

import numpy as np

## **Data loader**

In [None]:
# Dataset location and loading configuration.
path = './datasets/'

# Only convert the images to tensors; no other preprocessing is applied.
transform = transforms.Compose([transforms.ToTensor()])

# CIFAR-100 train / test splits, stored under `path`.
train_data = CIFAR100(root=path, train=True, transform=transform, download=True)
test_data = CIFAR100(root=path, train=False, transform=transform, download=True)

batch_size = 100

# The training loader reshuffles; the test loader keeps a fixed order.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
test_loader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./datasets/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:04<00:00, 39.3MB/s]


Extracting ./datasets/cifar-100-python.tar.gz to ./datasets/
Files already downloaded and verified


In [None]:
input_shape = train_data[0][0].shape # shape of one image tensor; expected value: [3, 32, 32]
output_shape = len(train_data.classes) # number of class labels; expected value: 100

## **Transformer 구현**

In [None]:
##########################################
#### there is nothing to do upto here ####
##########################################

### **Positional Encoding**
```python
# argument 정리
pe = PositionalEncoding(device, max_len, d_model)
pos_emb = pe(x)
```

In [None]:
# refer to Section 3.5 in the paper
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding (Section 3.5 of the paper).

    A table of shape [max_len, d_model] is computed once at construction;
    ``forward`` slices the first ``seq_len`` rows of it.

    Args:
        device: device on which the table is created.
        max_len: number of positions to precompute.
        d_model: width of each positional vector.
    """

    def __init__(self, device, max_len=512, d_model=16):
        super().__init__()

        self.device = device
        self.max_len = max_len
        self.d_model = d_model

        # pos: [max_len, 1]; ii holds the even indices 2i: [ceil(d_model / 2)]
        pos = torch.arange(0, max_len, dtype=torch.float, device=device).unsqueeze(-1)
        ii = torch.arange(0, d_model, step=2, dtype=torch.float, device=device)

        # Generating 2i directly avoids a separate d_model // 2 computation;
        # the angle is computed once and shared by the sine and cosine columns.
        angle = pos / (10000 ** (ii / d_model))

        pos_enc = torch.zeros(max_len, d_model, device=device)
        pos_enc[:, 0::2] = torch.sin(angle)
        # With an odd d_model there is one fewer cosine column than sine column.
        pos_enc[:, 1::2] = torch.cos(angle[:, : d_model // 2])

        # A buffer follows the module through .to()/.cuda() and never receives
        # gradients; persistent=False keeps it out of the state_dict.
        self.register_buffer("pos_enc", pos_enc, persistent=False)

    def forward(self, x):
        """Return the positional encodings for the positions present in ``x``.

        Args:
            x: transformed input embedding where
                x.shape = [batch_size, seq_len, data_dim]

        Returns:
            Tensor of shape [seq_len, d_model]. Only the encoding is returned;
            adding it to the embedding is left to the caller.

        Raises:
            ValueError: if seq_len exceeds ``max_len``.
        """
        seq_len = x.shape[1]
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )

        return self.pos_enc[:seq_len, :]

#### **idea**

In [28]:
# Tiny 4x4 positional-encoding table built with the exponent 2*i/d_model,
# where i runs over 0 .. d_model/2 - 1.
temp = torch.zeros(4, 4, requires_grad=False)

temp2 = torch.arange(4).unsqueeze(-1)  # rows: position index, shape (4, 1)
temp3 = torch.arange(0, 4 // 2, 1)  # columns: frequency index i, shape (2,)

_angle = temp2 / (10000 ** ((2 * temp3) / 4))  # broadcasts to shape (4, 2)
temp[:, 0::2] = torch.sin(_angle)  # even columns
temp[:, 1::2] = torch.cos(_angle)  # odd columns

temp

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996]])

In [29]:
# Version 2: same 4x4 table, but the even indices 2*i are generated directly
# instead of multiplying i by 2 inside the exponent.
temp = torch.zeros(4, 4, requires_grad=False)

temp2 = torch.arange(4).unsqueeze(-1)  # rows: position index, shape (4, 1)
temp3 = torch.arange(0, 4, 2)  # columns: the values 2*i, shape (2,)

_angle = temp2 / (10000 ** (temp3 / 4))  # broadcasts to shape (4, 2)
temp[:, 0::2] = torch.sin(_angle)  # even columns
temp[:, 1::2] = torch.cos(_angle)  # odd columns

temp

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0100,  0.9999],
        [ 0.9093, -0.4161,  0.0200,  0.9998],
        [ 0.1411, -0.9900,  0.0300,  0.9996]])

두 코드 결과 동일

#### **Reference**

- 논문에서 참고하라고 한 논문
  - [Jonas Gehring, Michael Auli, David Grangier, Denis Yarats, and Yann N. Dauphin. Convolutional sequence to sequence learning. arXiv preprint arXiv:1705.03122v2, 2017.](https://arxiv.org/pdf/1705.03122)

### **ScaledDotProductAttention**
$$Attention(Q, K, V) = softmax(\frac{QK^{T}}{\sqrt {d_{k}}})V$$
```python
attention = ScaledDotProductAttention()
attention_value = attention(q, k, v, mask=None)
```

In [None]:
# refer to Section 3.2.1 and Fig 2 (left) in the paper
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V."""

    def __init__(self):
        super().__init__()

        # Normalise over the key axis (last axis of the score matrix).
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Compute the attention value from transformed query, key and value.

        Args:
            q, k, v: transformed query, key, value with
                q.shape, k.shape, v.shape = [batch_size, num_head, seq_len, d=d_model/num_head]
            mask: optional masking matrix broadcastable to the score matrix
                [batch_size, num_head, q_len, k_len]. Where the mask is False
                (or 0) the score is killed; elsewhere it is left untouched.

        Returns:
            Tensor with the same leading shape as ``q`` and the last dimension
            of ``v``.
        """
        k_t = k.transpose(-1, -2)  # [batch_size, num_head, d, seq_len]

        # q.shape[-1] is a Python int, so scale with ** 0.5 (torch.sqrt needs a tensor).
        score = torch.matmul(q, k_t) / (q.shape[-1] ** 0.5)

        if mask is not None:
            # As in the paper, masked scores are set to -inf so that they get
            # exactly zero weight after the softmax.
            score = score.masked_fill(mask == 0, float("-inf"))

        weight = self.softmax(score)
        attention_value = torch.matmul(weight, v)

        return attention_value

#### **idea**

In [54]:
# Toy tensors for trying out how a boolean mask can be applied.
temp_mask = torch.tensor(
    [[True, False, True],
     [False, True, False]],
    dtype=torch.bool,
)
temp_x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.int32)

# Target: entries of temp_x are zeroed wherever temp_mask is False.
temp_ans = torch.tensor([[1, 0, 3], [0, 5, 0]], dtype=torch.int32)
temp_ans

tensor([[1, 0, 3],
        [0, 5, 0]], dtype=torch.int32)

In [39]:
# Boolean-mask indexing selects the entries of temp_x where temp_mask is True and
# returns them as a flat 1-D tensor, so the original 2-D layout is lost.
# NOTE(review): the recorded output below does not match the temp_x / temp_mask
# defined in cell In [54]; it presumably comes from an earlier definition
# (this cell is In [39]) -- re-run to confirm.
temp_x[temp_mask]

tensor([2, 2, 3], dtype=torch.int32)

In [52]:
# Element-wise product: False positions of the mask become 0 and the tensor
# keeps its shape (the output matches temp_ans).
torch.mul(temp_mask, temp_x) # elementwise

tensor([[1, 0, 3],
        [0, 5, 0]], dtype=torch.int32)

### **Multi Head Attention(수정 필요)**
$$MultiHead(Q, K, V) = Concat(head_1, ... , head_h)W^{O}$$

where $head_i = Attention(QW^{Q}_{i}, KW^{K}_{i}, VW^{V}_{i})$

```python
multiheadattention = MultiHeadAttention(d_model, num_head)
output = multiheadattention(q, k, v, mask=None)
```

In [None]:
# refer to Section 3.2.2 and Fig 2 (right) in the paper
class MultiHeadAttention(nn.Module):
    """Multi-head attention (Section 3.2.2, Fig. 2 right).

    Args:
        d_model: model width; must be divisible by ``num_head``.
        num_head: number of attention heads h.
    """

    def __init__(self, d_model=16, num_head=4):
        super().__init__()
        assert d_model % num_head == 0, "check if d_model is divisible by num_head"

        # dimensions
        self.d_model = d_model
        self.num_head = num_head  # number of heads: h
        self.d = d_model // num_head  # d_k = d_v = d_model / h

        # module
        self.attention = ScaledDotProductAttention()

        # Learnable projections; each Linear holds the W_i of all heads at once.
        self.w_q = nn.Linear(self.d_model, self.d_model)
        self.w_k = nn.Linear(self.d_model, self.d_model)
        self.w_v = nn.Linear(self.d_model, self.d_model)

        self.w_o = nn.Linear(self.d * self.num_head, self.d_model)

    def _split_heads(self, x, batch_size):
        """Reshape [batch_size, len, d_model] into [batch_size, num_head, len, d]."""
        # -1 lets every input keep its own sequence length, so key/value may
        # differ in length from the query (encoder-decoder attention).
        return x.reshape(batch_size, -1, self.num_head, self.d).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Compute the multi-head attention value.

        The inputs are pre-transformed, so they are projected inside this
        module before being split into heads.

        Args:
            q, k, v: pre-transformed query, key, value with
                q.shape, k.shape, v.shape = [batch_size, seq_len, d_model]
            mask: masking matrix; if the index has value False, kill the
                value; else, leave the value.

        Returns:
            Tensor of shape [batch_size, query seq_len, d_model].
        """
        batch_size = q.shape[0]

        # ScaledDotProductAttention expects [batch_size, num_head, seq_len, d].
        QW = self._split_heads(self.w_q(q), batch_size)
        KW = self._split_heads(self.w_k(k), batch_size)
        VW = self._split_heads(self.w_v(v), batch_size)

        head = self.attention(QW, KW, VW, mask)

        # Concatenate the heads: [batch_size, num_head, seq_len, d] -> [batch_size, seq_len, d_model]
        con_head = head.transpose(1, 2).reshape(batch_size, -1, self.d_model)

        output = self.w_o(con_head)

        return output

#### **Idea**

In [61]:
# The integers 0..119 laid out as a (4, 5, 6) tensor, used to see how reshape
# rearranges elements.
temp = torch.arange(120).reshape(4, 5, 6)
temp

tensor([[[  0,   1,   2,   3,   4,   5],
         [  6,   7,   8,   9,  10,  11],
         [ 12,  13,  14,  15,  16,  17],
         [ 18,  19,  20,  21,  22,  23],
         [ 24,  25,  26,  27,  28,  29]],

        [[ 30,  31,  32,  33,  34,  35],
         [ 36,  37,  38,  39,  40,  41],
         [ 42,  43,  44,  45,  46,  47],
         [ 48,  49,  50,  51,  52,  53],
         [ 54,  55,  56,  57,  58,  59]],

        [[ 60,  61,  62,  63,  64,  65],
         [ 66,  67,  68,  69,  70,  71],
         [ 72,  73,  74,  75,  76,  77],
         [ 78,  79,  80,  81,  82,  83],
         [ 84,  85,  86,  87,  88,  89]],

        [[ 90,  91,  92,  93,  94,  95],
         [ 96,  97,  98,  99, 100, 101],
         [102, 103, 104, 105, 106, 107],
         [108, 109, 110, 111, 112, 113],
         [114, 115, 116, 117, 118, 119]]])

In [65]:
# Split the last axis (size 6) into (2, 3); -1 lets torch infer the remaining
# axis (5), giving shape (4, 5, 2, 3). This mirrors splitting d_model into
# (num_head, d) in MultiHeadAttention.
temp.reshape((4,-1,2,3))

tensor([[[[  0,   1,   2],
          [  3,   4,   5]],

         [[  6,   7,   8],
          [  9,  10,  11]],

         [[ 12,  13,  14],
          [ 15,  16,  17]],

         [[ 18,  19,  20],
          [ 21,  22,  23]],

         [[ 24,  25,  26],
          [ 27,  28,  29]]],


        [[[ 30,  31,  32],
          [ 33,  34,  35]],

         [[ 36,  37,  38],
          [ 39,  40,  41]],

         [[ 42,  43,  44],
          [ 45,  46,  47]],

         [[ 48,  49,  50],
          [ 51,  52,  53]],

         [[ 54,  55,  56],
          [ 57,  58,  59]]],


        [[[ 60,  61,  62],
          [ 63,  64,  65]],

         [[ 66,  67,  68],
          [ 69,  70,  71]],

         [[ 72,  73,  74],
          [ 75,  76,  77]],

         [[ 78,  79,  80],
          [ 81,  82,  83]],

         [[ 84,  85,  86],
          [ 87,  88,  89]]],


        [[[ 90,  91,  92],
          [ 93,  94,  95]],

         [[ 96,  97,  98],
          [ 99, 100, 101]],

         [[102, 103, 104],
          [105,

### **PositionwiseFeedForwardNetwork**

In [None]:
# refer to Section 3.3 in the paper
class PositionwiseFeedForwardNetwork(nn.Module):
    """Position-wise feed-forward network: FFN(x) = max(0, x W1 + b1) W2 + b2.

    Args:
        d_model: input and output width.
        d_ff: width of the hidden layer.
    """

    def __init__(self, d_model=16, d_ff=32):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.linear1 = nn.Linear(self.d_model, self.d_ff)  # W1, b1
        self.linear2 = nn.Linear(self.d_ff, self.d_model)  # W2, b2

    def forward(self, x):
        """Apply the two linear maps with a ReLU in between.

        Args:
            x: tensor whose last dimension is d_model.

        Returns:
            Tensor with the same shape as ``x``.
        """
        hidden = torch.relu(self.linear1(x))  # max(0, x W1 + b1) is exactly ReLU
        output = self.linear2(hidden)

        return output

### **Masking**

This
 masking, combined with fact that the output embeddings are offset by one position, ensures that the
 predictions for position i can depend only on the known outputs at positions less than i.

네?
이거 랜덤입니까? True, False로 이루어진?

In [None]:
class Masking(nn.Module):
    """Build the look-ahead (causal) mask for decoder self-attention.

    The mask is deterministic, not random: position i may attend to
    positions <= i only, i.e. it is a lower-triangular matrix of True.

    Args:
        device: device requested at construction; stored for reference. The
            mask itself is created on the device of the input tensor.
    """

    def __init__(self, device):
        super().__init__()
        self.device = device

    def forward(self, x):
        """Return the causal mask for the sequence length of ``x``.

        Args:
            x: tensor with x.shape = [batch_size, seq_len, data_dim]

        Returns:
            Bool tensor of shape [seq_len, seq_len]; entry (i, j) is True when
            j <= i (keep the value) and False otherwise (kill the value). It
            broadcasts over the batch and head axes of the attention scores.
        """
        seq_len = x.shape[1]
        mask = torch.ones(seq_len, seq_len, device=x.device).tril().bool()

        return mask

### **Layer Normalization**
- [Pytorch LayerNorm Document](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html)

In [None]:
# do not use torch.nn.LayerNorm
class LayerNormalization(nn.Module):
    """Layer normalization over the last (feature) dimension.

    Implemented by hand; torch.nn.LayerNorm is deliberately not used.

    Args:
        d_model: size of the normalized (last) dimension.
        eps: small constant added to the variance for numerical stability.
    """

    def __init__(self, d_model=16, eps=1e-5):
        super().__init__()
        self.d_model = d_model
        self.eps = eps
        # Learnable scale and shift, initialized to 1 and 0 as in the PyTorch
        # LayerNorm documentation.
        self.gamma = nn.Parameter(torch.ones(self.d_model))
        self.beta = nn.Parameter(torch.zeros(self.d_model))

    def forward(self, x):
        """Normalize every feature vector of ``x`` independently.

        Args:
            x: tensor whose last dimension is d_model.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Statistics are taken per position over the feature axis, not over
        # the whole batch; keepdim=True keeps them broadcastable against x.
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, unbiased=False, keepdim=True)

        normed = (x - mean) / torch.sqrt(var + self.eps)
        normed = normed * self.gamma + self.beta

        return normed

### **Layerwise Encoder&Decoder**

 **Residual Dropout** We apply dropout [27] to the output of each sub-layer, before it is added to the
 sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the
 positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of
 $P_{drop} = 0.1$.

In [None]:
class EncoderLayer(nn.Module):
    """A single encoder block (Section 3.1, Figure 1).

    Consists of multi-head self-attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and layer
    normalization.

    Args:
        d_model: model width.
        num_head: number of attention heads.
        d_ff: hidden width of the feed-forward network.
        drop_prob: dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model=16, num_head=4, d_ff=32, drop_prob=0.1):
        super().__init__()
        self.multiattention = MultiHeadAttention(d_model, num_head)
        self.feedforward = PositionwiseFeedForwardNetwork(d_model, d_ff)
        # Each sub-layer gets its own normalization parameters.
        self.norm1 = LayerNormalization(d_model)
        self.norm2 = LayerNormalization(d_model)
        self.dropout = nn.Dropout(drop_prob)

    def forward(self, enc):
        """Run the two sub-layers on ``enc``.

        Args:
            enc: tensor whose last dimension is d_model; it is passed to the
                attention module as query, key and value.

        Returns:
            Tensor with the same shape as ``enc``.
        """
        # Sub-layer 1: self-attention, so query = key = value = enc.
        attended = self.multiattention(enc, enc, enc)
        # Residual dropout: dropout on the sub-layer output before Add & Norm.
        hidden = self.norm1(enc + self.dropout(attended))

        # Sub-layer 2: feed forward.
        transformed = self.feedforward(hidden)
        output = self.norm2(hidden + self.dropout(transformed))

        return output

In [None]:
class DecoderLayer(nn.Module):
    """A single decoder block (Section 3.1, Figure 1).

    Consists of masked multi-head self-attention, multi-head attention over
    the encoder output and a position-wise feed-forward network, each followed
    by dropout, a residual connection and layer normalization.

    Args:
        d_model: model width.
        num_head: number of attention heads.
        d_ff: hidden width of the feed-forward network.
        drop_prob: dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model=16, num_head=4, d_ff=32, drop_prob=0.1):
        super().__init__()
        self.multiattention1 = MultiHeadAttention(d_model, num_head)  # masked self-attention
        self.multiattention2 = MultiHeadAttention(d_model, num_head)  # encoder-decoder attention
        self.feedforward = PositionwiseFeedForwardNetwork(d_model, d_ff)

        # Each sub-layer gets its own normalization parameters.
        self.norm1 = LayerNormalization(d_model)
        self.norm2 = LayerNormalization(d_model)
        self.norm3 = LayerNormalization(d_model)

        self.dropout = nn.Dropout(drop_prob)

    def forward(self, enc_output, dec, dec_mask):
        """Run the three sub-layers.

        Args:
            enc_output: output of the encoder stack, used as key and value of
                the second attention.
            dec: decoder hidden states, used as query/key/value of the first
                attention.
            dec_mask: mask passed to the first (masked) attention.

        Returns:
            Tensor with the same shape as ``dec``.
        """
        # Sub-layer 1: masked self-attention, so query = key = value = dec.
        self_attended = self.multiattention1(dec, dec, dec, dec_mask)
        hidden = self.norm1(dec + self.dropout(self_attended))

        # Sub-layer 2: queries come from the decoder, keys and values from the
        # encoder output.
        cross_attended = self.multiattention2(hidden, enc_output, enc_output)
        hidden = self.norm2(hidden + self.dropout(cross_attended))

        # Sub-layer 3: feed forward.
        transformed = self.feedforward(hidden)
        output = self.norm3(hidden + self.dropout(transformed))

        return output

#### **찐 Encoder&Decoder**

In [None]:
class Encoder(nn.Module):
    """The whole encoder, i.e. the left side of Figure 1 (Section 3.1).

    In this homework the encoder inputs are not tokens; they are already
    embeddings in the input dimension. There is therefore no embedding layer:
    the input is mapped to the hidden dimension by a single linear
    transformation, the positional encoding is added, and the result goes
    through ``num_layer`` encoder blocks.

    Args:
        device: device handed to the positional encoding.
        input_dim: size of the last dimension of the input.
        num_layer: number of stacked encoder blocks.
        max_len: maximum sequence length of the positional encoding.
        d_model: model width.
        num_head: number of attention heads.
        d_ff: hidden width of the feed-forward networks.
        drop_prob: dropout probability.
    """

    def __init__(self, device, input_dim=3, num_layer=3, max_len=512, d_model=16, num_head=4, d_ff=32, drop_prob=.1):
        super().__init__()
        self.num_layer = num_layer

        self.input_transform = nn.Linear(input_dim, d_model)
        self.positionalencoding = PositionalEncoding(device, max_len, d_model)
        self.dropout = nn.Dropout(drop_prob)

        # One independent block per layer; reusing a single instance would
        # share its weights across the whole stack.
        self.encoderlayers = nn.ModuleList(
            [EncoderLayer(d_model, num_head, d_ff, drop_prob) for _ in range(num_layer)]
        )

    def forward(self, x):
        """Encode ``x``.

        Args:
            x: tensor of shape [batch_size, seq_len, input_dim].

        Returns:
            Tensor of shape [batch_size, seq_len, d_model].
        """
        hidden = self.input_transform(x)
        # The positional encoding is added to the embedding (broadcast over
        # the batch axis), then dropout is applied to the sum.
        hidden = self.dropout(hidden + self.positionalencoding(hidden))

        for layer in self.encoderlayers:
            hidden = layer(hidden)

        return hidden

In [None]:
# refer to Section 3.1 and Figure 1 in the paper
# this is a whole decoder, i.e., the right side of Figure 1, consists of the following as well
# input embedding, positional encoding, linear classifier
class Decoder(nn.Module):
    """The whole decoder stack, including the final linear classifier.

    In this homework the decoder inputs are not tokens; they are already
    embeddings in the input dimension. There is therefore no embedding layer:
    the input is mapped to the hidden dimension by a single linear
    transformation.

    Args:
        device: device handed to the positional encoding.
        input_dim: size of the last dimension of the input and of the output.
        num_layer: number of stacked decoder blocks (N).
        max_len: maximum sequence length of the positional encoding.
        d_model: model width.
        num_head: number of attention heads.
        d_ff: hidden width of the feed-forward networks.
        drop_prob: dropout probability.
    """

    def __init__(self, device, input_dim=3, num_layer=3, max_len=512, d_model=16, num_head=4, d_ff=32, drop_prob=.1):
        super().__init__()
        self.num_layer = num_layer  # N

        self.input_transform = nn.Linear(input_dim, d_model)
        self.positionalencoding = PositionalEncoding(device, max_len, d_model)
        self.dropout = nn.Dropout(drop_prob)

        # One independent block per layer, so weights are not shared.
        self.decoderlayers = nn.ModuleList(
            [DecoderLayer(d_model, num_head, d_ff, drop_prob) for _ in range(num_layer)]
        )

        # Maps back to the input dimension: the training loop compares the
        # output with targets of shape [batch_size, seq_len, 3].
        self.classifier = nn.Linear(d_model, input_dim)

    def forward(self, enc_output, y, y_mask):
        """Decode ``y`` while attending to ``enc_output``.

        Args:
            enc_output: output of the encoder stack.
            y: decoder input of shape [batch_size, seq_len, input_dim].
            y_mask: mask for the masked self-attention.

        Returns:
            Raw logits of shape [batch_size, seq_len, input_dim].
        """
        hidden = self.input_transform(y)
        hidden = self.dropout(hidden + self.positionalencoding(hidden))

        # Every block attends to the same encoder output; only the decoder
        # state is passed from one block to the next.
        for layer in self.decoderlayers:
            hidden = layer(enc_output, hidden, y_mask)

        output = self.classifier(hidden)

        return output

### **Transformer**

In [None]:
# refer to Section 3.1 and Figure 1 in the paper
# sum up encoder and decoder
class Transformer(nn.Module):
    """Encoder-decoder Transformer.

    Args:
        device: device handed to the masking and positional-encoding modules.
        input_dim: size of the last dimension of the inputs.
        num_layer: number of encoder blocks and of decoder blocks.
        max_len: maximum sequence length of the positional encoding.
        d_model: model width.
        num_head: number of attention heads.
        d_ff: hidden width of the feed-forward networks.
        drop_prob: dropout probability.
    """

    def __init__(self,device, input_dim=3, num_layer=3, max_len=512, d_model=16, num_head=4, d_ff=32, drop_prob=.1):
        super().__init__()
        self.masking = Masking(device)
        self.encoder = Encoder(device,input_dim,num_layer,max_len,d_model,num_head,d_ff,drop_prob)
        self.decoder = Decoder(device,input_dim,num_layer,max_len,d_model,num_head,d_ff,drop_prob)

    def forward(self,x,y):
        """Encode ``x`` and decode ``y`` against it.

        Args:
            x: encoder input.
            y: decoder input (the target shifted right by the caller).

        Returns:
            The decoder output, unchanged. No softmax or sigmoid is applied
            here because the loss used in this notebook (BCEWithLogitsLoss)
            expects raw logits. The final linear classifier is listed among
            the decoder's components, so no layer is created inside forward,
            where it would be re-initialized on every call and never trained.
        """
        y_mask = self.masking(y)

        enc_output = self.encoder(x)
        dec_output = self.decoder(enc_output, y, y_mask)

        return dec_output

NameError: name 'nn' is not defined

## **Model Train Setting**

In [None]:
##########################################
#### there is nothing to do from here ####
##########################################

In [None]:
class ScheduledOptimizer:
    """Wrap an optimizer with the warm-up learning-rate schedule of the paper.

    lr = d_model^-0.5 * min(step_num^-0.5, step_num * warmup_steps^-1.5)

    Args:
        optimizer: optimizer whose parameter groups receive the scheduled lr.
        d_model: model width used in the schedule.
        warmup_steps: number of steps over which the lr increases linearly.
    """

    def __init__(self,optimizer,d_model=16,warmup_steps=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_num = 0
        # Defined from the start so that reading ``lr`` before the first
        # update does not raise AttributeError.
        self.lr = 0.0

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self.optimizer.zero_grad()

    def update_parameter_and_learning_rate(self):
        """Advance the schedule by one step, then update the parameters.

        The learning rate is written into the optimizer before ``step()`` so
        that every update, including the very first one, uses the scheduled
        value rather than the optimizer's default.
        """
        self.step_num += 1
        self.lr = self.d_model**(-.5) * min(self.step_num**(-.5),self.step_num*self.warmup_steps**(-1.5))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.lr
        self.optimizer.step()

### **Setting**

In [None]:
# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# modify num_layer, d_model, num_head, d_ff while debugging your code
model = Transformer(
    device=device,
    input_dim=3,
    num_layer=3,
    max_len=512,
    d_model=16,
    num_head=4,
    d_ff=32,
    drop_prob=.1,
).to(device)

# Loss on raw logits, Adam with the given betas/eps, and the warm-up schedule.
loss = nn.BCEWithLogitsLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), betas=(.9, .98), eps=1e-9)
scheduled_optimizer = ScheduledOptimizer(optimizer, d_model=16)

In [None]:
# Number of training epochs
num_epoch = 1 # kept at 1 for now, only to check that everything runs

## **Train Model**

In [None]:
# Loss histories, one entry per epoch.
train_loss_list, test_loss_list = [], []

# Count only the parameters that require gradients.
total_params = sum(
    map(torch.numel, filter(lambda p: p.requires_grad, model.parameters()))
)
print("num_param:", total_params)

### **Train/Evaluate 함수(일단 모델 돌아가면 수정)**

반복문 길어서 보기 힘들면 함수로 따로 빼기

In [None]:
# Disabled draft of a train() helper, kept as a bare string so that it is not executed.
# NOTE(review): the draft reads `i` (the epoch index), which is not a parameter of
# train(); it would have to be passed in or be a global before this can be enabled.
'''
def train():
# train
    model.train()

    # initialize(epoch마다)
    total_loss = 0
    count = 0

    for batch_idx, (image, label) in enumerate(train_loader):
        image = image.reshape(-1,3,1024).transpose(1,2)
        x, y = image[:,:512,:].to(device), image[:,512:,:].to(device)

        y_ = torch.zeros([y.shape[0],1,3],requires_grad=False).to(device)
        y_ = torch.cat([y_,y[:,:-1,:]],dim=1)

        logit = model.forward(x,y_)
        cost = loss(logit, y)

        total_loss += cost.item() * y.shape[0] * y.shape[1] * y.shape[2]

        scheduled_optimizer.zero_grad()
        cost.backward()
        scheduled_optimizer.update_parameter_and_learning_rate()

    ave_loss = total_loss/len(train_data)
    train_loss_list.append(ave_loss)

    if i % 1 == 0:
        print("\nEpoch %d Train: %.3f w/ Learning Rate: %.5f"%(i,ave_loss, scheduled_optimizer.lr))
'''

In [None]:
# Disabled draft of an evaluate() helper, kept as a bare string so that it is not executed.
# NOTE(review): the draft reads `i` (the epoch index), which is not a parameter of
# evaluate(); it would have to be passed in or be a global before this can be enabled.
'''
def evaluate():
  ## test
    model.eval()

    total_loss = 0
    count = 0

    with torch.no_grad():
        for batch_idx, (image, label) in enumerate(test_loader):

            image = image.reshape(-1,3,1024).transpose(1,2)
            x, y = image[:,:512,:].to(device), image[:,512:,:].to(device)

            y_ = torch.zeros([y.shape[0],1,3],requires_grad=False).to(device)
            y_ = torch.cat([y_,y[:,:-1,:]],dim=1)

            logit = model.forward(x,y_)
            cost = loss(logit, y)

            total_loss += cost.item() * y.shape[0] * y.shape[1] * y.shape[2]

    ave_loss = total_loss/len(test_data)
    test_loss_list.append(ave_loss)

    if i % 1 == 0:
        print("Epoch %d Test: %.3f"%(i,ave_loss))
'''

In [None]:
# Disabled draft of the epoch loop that would call the train()/evaluate() helpers;
# kept as a bare string so that it is not executed.
'''
for i in range(num_epoch):
  train()
  evaluate()
'''

### **학습 결과**

In [None]:
for i in range(num_epoch):
    # ---- train ----
    model.train()

    # Running loss, reset at the start of every epoch.
    total_loss = 0

    for image, label in train_loader:
        # Turn each image into a sequence of 1024 steps with 3 channels each.
        image = image.reshape(-1,3,1024).transpose(1,2)
        # First 512 steps feed the encoder; the last 512 are the decoder targets.
        x, y = image[:,:512,:].to(device), image[:,512:,:].to(device)

        # Decoder input: the target shifted right by one step, starting from
        # an all-zero step.
        y_ = torch.zeros([y.shape[0],1,3],requires_grad=False).to(device)
        y_ = torch.cat([y_,y[:,:-1,:]],dim=1)

        # Call the module itself rather than .forward() so that registered
        # hooks are run.
        logit = model(x,y_)
        cost = loss(logit, y)

        # `cost` is a mean over all elements; scale by the element count to
        # accumulate a sum over the epoch.
        total_loss += cost.item() * y.numel()

        scheduled_optimizer.zero_grad()
        cost.backward()
        scheduled_optimizer.update_parameter_and_learning_rate()

    ave_loss = total_loss/len(train_data)
    train_loss_list.append(ave_loss)

    if i % 1 == 0:
        print("\nEpoch %d Train: %.3f w/ Learning Rate: %.5f"%(i,ave_loss, scheduled_optimizer.lr))

    # ---- test ----
    model.eval()

    total_loss = 0

    with torch.no_grad():
        for image, label in test_loader:

            image = image.reshape(-1,3,1024).transpose(1,2)
            x, y = image[:,:512,:].to(device), image[:,512:,:].to(device)

            y_ = torch.zeros([y.shape[0],1,3],requires_grad=False).to(device)
            y_ = torch.cat([y_,y[:,:-1,:]],dim=1)

            logit = model(x,y_)
            cost = loss(logit, y)

            total_loss += cost.item() * y.numel()

    ave_loss = total_loss/len(test_data)
    test_loss_list.append(ave_loss)

    if i % 1 == 0:
        print("Epoch %d Test: %.3f"%(i,ave_loss))