# SAN - Mindspore

## 导入依赖

In [1]:
import os
import numpy as np
import re
import sys
import random
import unicodedata
import math

from mindspore import Tensor, nn, Model, context, DatasetHelper, ms_function
from mindspore.train.serialization import load_param_into_net, load_checkpoint, save_checkpoint
from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor
from mindspore import dataset as ds
from mindspore.mindrecord import FileWriter
from mindspore import Parameter
from mindspore.nn.loss.loss import _Loss
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from mindspore.common import dtype as mstype

import moxing as mox
mox.file.copy_parallel(src_url="s3://nlp---3190101095/nlp/vqa/data/annotations/", dst_url='./data/annotations/')
mox.file.copy_parallel(src_url="s3://nlp---3190101095/nlp/vqa/data/questions/", dst_url='./data/questions/')
mox.file.copy_parallel(src_url="s3://nlp---3190101095/nlp/vqa/VQA/", dst_url='./VQA/')
mox.file.copy_parallel(src_url="s3://nlp---3190101095/nlp/vqa/train_feat.pkl", dst_url='./train_feat.pkl')
mox.file.copy_parallel(src_url="s3://nlp---3190101095/nlp/vqa/val_feat.pkl", dst_url='./val_feat.pkl')

from VQA.preprocess_data import preProcess
from VQA.vqa import *
from VQA.vqaEval import *

context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')

INFO:root:Using MoXing-v2.0.0.rc2.4b57a67b-4b57a67b
INFO:root:Using OBS-Python-SDK-3.20.9.1


In [5]:
from easydict import EasyDict as edict

# CONFIG
cfg = edict({
    'trainFeatFile': './train_feat.pkl',
    'valFeatFile': './val_feat.pkl',
    'trainAnnFile': './data/annotations/train.json',
    'trainQuesFile': './data/questions/train.json',
    'valAnnFile': './data/annotations/val.json',
    'valQuesFile': './data/questions/val.json',
    'resultFile': 'result.json',
    'trainSize': 4096,
    'valSize': 1024,
    'max_seq_length': 12,
    'hidden_size': 1024,
    'batch_size': 32,
    'eval_batch_size': 1,
    'learning_rate': 0.001,
    'momentum': 0.9,
    'num_epochs': 15,
    'vocab_size': 0,
    'checkpointFile': './san.ckpt'
})

In [6]:
EOS = "<eos>"
SOS = "<sos>"
MAX_SEQ_LEN=12

def prepareVocab(data):
    vocab = set(' '.join(data).split(' '))
    id2word = [EOS] + [SOS] + list(vocab)
    word2id = {c:i for i,c in enumerate(id2word)}
    vocab_size = len(id2word)
    print("Finish prepare Vocab. Size: %d" %(vocab_size))
    cfg.vocab_size = vocab_size
    return id2word, word2id

def wordEncode(text, word2id):
    data = [1] + [int(word2id[word]) for word in text.split(' ')] + [0]
    #将短句子扩充到统一的长度
    num = MAX_SEQ_LEN + 1 - len(data)
    if(num >= 0):
        data += [0]*num
    else:
        data = data[:MAX_SEQ_LEN] + [0]
    return data

def wordDecode(arr, id2word):
    out = []
    for x in arr:
        if x == 0:
            break
        out.append(id2word[x])
    return ' '.join(out)

In [7]:
class DataProvision:
    def __init__(self, ques, ans, feat):
        self.feat = feat
        self.ques = ques
        self.ans = ans

    def __getitem__(self, index):
#         print(np.array(self.feat[index], dtype=np.float32).shape)
        return np.array(self.ques[index][1:], dtype=np.int32), \
               np.array(self.ans[index][:-1], dtype=np.int32), \
               np.array(self.feat[index], dtype=np.float32), \
               np.array(self.ans[index][1:], dtype=np.int32)

    def __len__(self):
        return len(self.ques)

train_questions, train_answers, train_image_feature, val_questions, val_question_ids, val_image_feature, val_answers = preProcess(cfg)
id2word, word2id = prepareVocab(train_answers + train_questions + val_questions + val_answers)
train_questions = np.array([wordEncode(ques, word2id) for ques in train_questions])
train_answers = np.array([wordEncode(ans, word2id) for ans in train_answers])
provider = DataProvision(train_questions, train_answers, train_image_feature)
ds_train = ds.GeneratorDataset(source=provider, column_names=['src', 'dst', 'feat', 'label'])
ds_train = ds_train.batch(cfg.batch_size)
print(ds_train.get_dataset_size())

loading VQA annotations and questions into memory...
0:00:01.328212
creating index...
index created!
finished processing 0 in train
finished processing 1000 in train
finished processing 2000 in train
finished processing 3000 in train
finished processing 4000 in train
finished processing train
loading VQA annotations and questions into memory...
0:00:00.735394
creating index...
index created!
finished processing 0 in train
finished processing 1000 in train
finished processing val
finished processing features. Miss Images:  1989
Finish prepare Vocab. Size: 3055
128


In [8]:
def gru_default_state(batch_size, input_size, hidden_size, num_layers=1, bidirectional=False):
    '''Weight init for gru cell'''
    stdv = 1 / math.sqrt(hidden_size)
    weight_i = Parameter(Tensor(
        np.random.uniform(-stdv, stdv, (input_size, 3*hidden_size)).astype(np.float32)), 
                         name='weight_i')
    weight_h = Parameter(Tensor(
        np.random.uniform(-stdv, stdv, (hidden_size, 3*hidden_size)).astype(np.float32)), 
                         name='weight_h')
    bias_i = Parameter(Tensor(
        np.random.uniform(-stdv, stdv, (3*hidden_size)).astype(np.float32)), name='bias_i')
    bias_h = Parameter(Tensor(
        np.random.uniform(-stdv, stdv, (3*hidden_size)).astype(np.float32)), name='bias_h')
    return weight_i, weight_h, bias_i, bias_h

class GRU(nn.Cell):
    def __init__(self, config, is_training=True):
        super(GRU, self).__init__()
        if is_training:
            self.batch_size = config.batch_size
        else:
            self.batch_size = config.eval_batch_size
        self.hidden_size = config.hidden_size
        self.weight_i, self.weight_h, self.bias_i, self.bias_h = \
            gru_default_state(self.batch_size, self.hidden_size, self.hidden_size)
        self.rnn = P.DynamicGRUV2()
        self.cast = P.Cast()

    def construct(self, x, hidden):
        x = self.cast(x, mstype.float16)
        y1, h1, _, _, _, _ = self.rnn(x, self.weight_i, self.weight_h, self.bias_i, self.bias_h, None, hidden)
        return y1, h1
    
class Attention(nn.Cell):
    def __init__(self, config, is_training=True):
        super(Attention, self).__init__()
        self.hidden_size = config.hidden_size
        self.attnq = nn.Dense(self.hidden_size, self.hidden_size)
        self.attni = nn.Dense(self.hidden_size, self.hidden_size)
        self.attnp = nn.Dense(self.hidden_size, 1, activation = "softmax")
        self.add = P.Add()
        self.mul = P.Mul()
        self.tanh = nn.Tanh()
        self.trans = P.Transpose()
        self.perm = (1, 0, 2)

    def construct(self, question, img):
        i_attn = self.attni(img)
        q_attn = self.attnq(question)
        i_attn = self.trans(i_attn, self.perm)
        i_attn = self.add(i_attn, q_attn)
        i_attn = self.trans(i_attn, self.perm)
        ha = self.tanh(i_attn)
        p = self.attnp(ha)
        u = self.mul(p,img).sum(axis=1)
        u = self.add(u, question)
        return u

class SAN(nn.Cell):
    def __init__(self, config, is_training=True):
        super(SAN, self).__init__()
        self.hidden_size = config.hidden_size
        self.attn_1 = Attention(config = config, is_training = is_training)
        self.attn_2 = Attention(config = config, is_training = is_training)
        self.attn_3 = Attention(config = config, is_training = is_training)
        self.trans = P.Transpose()
        self.dense = nn.Dense(2048, config.hidden_size)
        self.tanh = nn.Tanh()

    def construct(self, question, img):
        tmp = self.dense(img)
        tmp = self.tanh(tmp)
        u_1 = self.attn_1(question, tmp)
        u_2 = self.attn_2(u_1, tmp)

        return u_2
        
class Encoder(nn.Cell):
    def __init__(self, config, is_training=True):
        super(Encoder, self).__init__()
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        if is_training:
            self.batch_size = config.batch_size
        else:
            self.batch_size = config.eval_batch_size

        self.trans = P.Transpose()
        self.perm = (1, 0, 2)
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.gru = GRU(config, is_training=is_training).to_float(mstype.float16)
        self.h = Tensor(np.zeros((self.batch_size, self.hidden_size)).astype(np.float16))
        self.cast = P.Cast()

    def construct(self, encoder_input):
        embeddings = self.embedding(encoder_input)
        embeddings = self.trans(embeddings, self.perm)
        output, hidden = self.gru(embeddings, self.h)
        return self.cast(output, mstype.float32), hidden

class Decoder(nn.Cell):
    def __init__(self, config, is_training=True, dropout=0.1):
        super(Decoder, self).__init__()

        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.max_len = config.max_seq_length

        self.trans = P.Transpose()
        self.perm = (1, 0, 2)
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.dropout = nn.Dropout(1-dropout)
        self.attn = nn.Dense(self.hidden_size, self.max_len)
        self.softmax = nn.Softmax(axis=2)
        self.bmm = P.BatchMatMul()
        self.concat = P.Concat(axis=2)
        self.attn_combine = nn.Dense(self.hidden_size * 2, self.hidden_size)

        self.gru = GRU(config, is_training=is_training).to_float(mstype.float16)
        self.out = nn.Dense(self.hidden_size, self.vocab_size)
        self.logsoftmax = nn.LogSoftmax(axis=2)
        self.cast = P.Cast()

    def construct(self, decoder_input, hidden, encoder_output):
        embeddings = self.embedding(decoder_input)
        embeddings = self.dropout(embeddings)

        embeddings = self.trans(embeddings, self.perm)
        output, hidden = self.gru(embeddings, hidden)
        output = self.cast(output, mstype.float32)
        output = self.out(output)
        output = self.logsoftmax(output)

        return output, hidden

class Seq2Seq(nn.Cell):
    def __init__(self, config, is_train=True):
        super(Seq2Seq, self).__init__()
        self.max_len = config.max_seq_length
        self.is_train = is_train

        self.encoder = Encoder(config, is_train)
        self.decoder = Decoder(config, is_train)
        self.expanddims = P.ExpandDims()
        self.squeeze = P.Squeeze(axis=0)
        self.argmax = P.ArgMaxWithValue(axis=int(2), keep_dims=True)
        self.concat = P.Concat(axis=1)
        self.concat2 = P.Concat(axis=0)
        self.select = P.Select()
        self.san = SAN(config,is_train)

    def construct(self, src, dst, img):
        encoder_output, hidden = self.encoder(src)
        san_out = self.san(encoder_output[-1], img)
        
        decoder_hidden = san_out
        if self.is_train:
            outputs, _ = self.decoder(dst, decoder_hidden, san_out)
        else:
            decoder_input = dst[::,0:1:1]
            decoder_outputs = ()
            for i in range(0, self.max_len):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, san_out)
                decoder_hidden = self.squeeze(decoder_hidden)
                decoder_output, _ = self.argmax(decoder_output)
                decoder_output = self.squeeze(decoder_output)
                decoder_outputs += (decoder_output,)
                decoder_input = decoder_output
            outputs = self.concat(decoder_outputs)
        return outputs

class NLLLoss(_Loss):
    '''
       NLLLoss function
    '''
    def __init__(self, reduction='mean'):
        super(NLLLoss, self).__init__(reduction)
        self.one_hot = P.OneHot()
        self.reduce_sum = P.ReduceSum()

    def construct(self, logits, label):
        label_one_hot = self.one_hot(label, F.shape(logits)[-1], F.scalar_to_array(1.0), 
                                     F.scalar_to_array(0.0))
        loss = self.reduce_sum(-1.0 * logits * label_one_hot, (1,))
        return self.get_loss(loss)
    
class WithLossCell(nn.Cell):
    def __init__(self, backbone, config):
        super(WithLossCell, self).__init__(auto_prefix=False)
        self._backbone = backbone
        self.batch_size = config.batch_size
        self.onehot = nn.OneHot(depth=config.vocab_size)
        self._loss_fn = NLLLoss()
        self.max_len = config.max_seq_length
        self.squeeze = P.Squeeze()
        self.cast = P.Cast()
        self.argmax = P.ArgMaxWithValue(axis=1, keep_dims=True)
        self.print = P.Print()

    def construct(self, src, dst, feat, label):
        out = self._backbone(src, dst, feat)
        loss_total = 0
        for i in range(self.batch_size):
            loss = self._loss_fn(self.squeeze(out[::,i:i+1:1,::]), 
                                 self.squeeze(label[i:i+1:1, ::]))
            loss_total += loss
        loss = loss_total / self.batch_size
        return loss

In [9]:
network = Seq2Seq(cfg)
network = WithLossCell(network, cfg)
optimizer = nn.Adam(network.trainable_params(), learning_rate=cfg.learning_rate, beta1=0.9, beta2=0.98)
model = Model(network, optimizer=optimizer)



In [10]:
loss_cb = LossMonitor()
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
callbacks = [time_cb, loss_cb]

model.train(cfg.num_epochs, ds_train, callbacks=callbacks, dataset_sink_mode=False)
save_checkpoint(model.train_network, cfg.checkpointFile)
mox.file.copy_parallel(src_url=cfg.checkpointFile, dst_url='s3://nlp---3190101095/nlp/san.ckpt')

epoch: 1 step: 1, loss is 38.09493
epoch: 1 step: 2, loss is 22.368027
epoch: 1 step: 3, loss is 7.829624
epoch: 1 step: 4, loss is 7.099731
epoch: 1 step: 5, loss is 7.2276335
epoch: 1 step: 6, loss is 10.766493
epoch: 1 step: 7, loss is 5.778524
epoch: 1 step: 8, loss is 6.500427
epoch: 1 step: 9, loss is 5.923667
epoch: 1 step: 10, loss is 10.6613035
epoch: 1 step: 11, loss is 5.900267
epoch: 1 step: 12, loss is 14.860427
epoch: 1 step: 13, loss is 8.539547
epoch: 1 step: 14, loss is 6.819925
epoch: 1 step: 15, loss is 9.511694
epoch: 1 step: 16, loss is 11.066891
epoch: 1 step: 17, loss is 9.313786
epoch: 1 step: 18, loss is 5.9087367
epoch: 1 step: 19, loss is 14.609064
epoch: 1 step: 20, loss is 13.766233
epoch: 1 step: 21, loss is 19.339743
epoch: 1 step: 22, loss is 9.903465
epoch: 1 step: 23, loss is 6.839871
epoch: 1 step: 24, loss is 9.718821
epoch: 1 step: 25, loss is 8.96584
epoch: 1 step: 26, loss is 10.745544
epoch: 1 step: 27, loss is 10.298734
epoch: 1 step: 28, loss i

In [11]:
evalList = []
mask = [1] + [0 for i in range(MAX_SEQ_LEN - 1)]

provider = DataProvision(val_questions, val_answers, val_image_feature)
question = []
decoder_in = []
feat = []
label = []
max_loss = 0
sum_loss = 0
for id, q in enumerate(val_questions):
    question.append(wordEncode(q, word2id)[1:])
    decoder_in.append(mask)
    feat.append(val_image_feature[id])
    label.append(wordEncode(val_answers[id], word2id)[1:])
    if id % 32 < 31:
        continue
    question = Tensor(np.array(question, dtype=np.int32))
    decoder_in = Tensor(np.array(decoder_in).astype(np.int32))
    feat = Tensor(np.array(feat, dtype=np.float32))
    label = Tensor(np.array(label, dtype=np.int32))
    output = model.train_network(question, decoder_in, feat, label)
    output = output.asnumpy()
    max_loss = max(max_loss, output)
    sum_loss += output
#     evalList.append({"answer": wordDecode(output, id2word), "question_id": val_question_ids[id]})
    question = []
    decoder_in = []
    feat = []
    label = []
    
print("Max Loss: ", max_loss)
print("Avg Loss: ", sum_loss * 32 / len(val_questions))

# json.dump(evalList, open(cfg.resultFile, "w"))

# vqa = VQA(cfg.valAnnFile, cfg.valQuesFile)
# vqaEval = VQAEval(vqa, vqa.loadRes(cfg.resultFile, cfg.valQuesFile))
# vqaEval.evaluate()
# print("Overall Accuracy is: %.02f\n" %(vqaEval.accuracy['overall']))

Max Loss:  65.89157
Avg Loss:  36.70064777135849
