In [1]:
import os
import math
from itertools import chain
import gensim
import numpy as np
import mindspore
from mindspore.mindrecord import FileWriter
import jieba
import pandas as pd



In [2]:
import mindspore.dataset as mds
def create_dataset(base_path, batch_size, num_epochs, is_train):
    """Build a shuffled, batched and repeated pipeline over an aclImdb MindRecord file.

    Args:
        base_path: Directory that contains the MindRecord files.
        batch_size: Number of rows per batch; an incomplete trailing batch is dropped.
        num_epochs: Value forwarded to ``repeat(count=...)`` on the batched dataset.
        is_train: When truthy read ``aclImdb_train.mindrecord0``, otherwise
            ``aclImdb_test.mindrecord0``.

    Returns:
        The dataset object produced by ``MindDataset(...).shuffle().batch().repeat()``.
    """
    columns_list = ["feature", "label"]
    num_consumer = 4

    file_name = "aclImdb_train.mindrecord0" if is_train else "aclImdb_test.mindrecord0"
    path = os.path.join(base_path, file_name)

    dataset = mds.MindDataset(path, columns_list=columns_list, num_parallel_workers=num_consumer)
    # Shuffle buffer spans the whole dataset (size is taken before batching).
    dataset = dataset.shuffle(buffer_size=dataset.get_dataset_size())
    dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
    dataset = dataset.repeat(count=num_epochs)
    return dataset
# Default pipelines used by the first training cells (batch size 32, repeat count 10).
dataset_train = create_dataset(base_path="./mindrecord", batch_size=32, num_epochs=10, is_train=True)
# NOTE(review): is_train=True makes this read "aclImdb_train.mindrecord0" inside
# ./mindrecordtest -- confirm that is the intended file name for the unlabeled test set.
dataset_real_test = create_dataset(base_path="./mindrecordtest", batch_size=32, num_epochs=10, is_train=True)

In [3]:
# Load the embedding matrix from a whitespace-separated text file and cast it to float32.
embedding_weight_path = os.path.join("./embedding/", "weight.txt")
embedding_tabel = np.loadtxt(embedding_weight_path).astype(np.float32)

In [4]:
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""LSTM."""
import math

import numpy as np

from mindspore import Tensor, nn, context, Parameter, ParameterTuple
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P

# Device targets for which SentimentNet builds its encoder from StackLSTM
# instead of using nn.LSTM with num_layers directly.
STACK_LSTM_DEVICE = ["CPU"]


# Zero-initialise the hidden state (h) and the cell state (c).
def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
    """Create zero-filled initial states for a multi-layer LSTM.

    Args:
        batch_size: Size of the second axis of each state tensor.
        hidden_size: Size of the last axis of each state tensor.
        num_layers: Number of LSTM layers.
        bidirectional: When truthy the first axis is doubled (two directions).

    Returns:
        Tuple ``(h, c)`` of two float32 tensors, each shaped
        ``(num_layers * num_directions, batch_size, hidden_size)``.
    """
    direction_count = 2 if bidirectional else 1
    state_shape = (num_layers * direction_count, batch_size, hidden_size)
    hidden_state = Tensor(np.zeros(state_shape, dtype=np.float32))
    cell_state = Tensor(np.zeros(state_shape, dtype=np.float32))
    return hidden_state, cell_state


def stack_lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
    """Create per-layer zero-filled initial states for a stacked LSTM.

    Args:
        batch_size: Size of the second axis of each state tensor.
        hidden_size: Size of the last axis of each state tensor.
        num_layers: Number of stacked layers; one (h, c) pair is produced per layer.
        bidirectional: When truthy each state has two directions on its first axis.

    Returns:
        Tuple ``(h, c)`` where ``h`` and ``c`` are each a tuple of ``num_layers``
        float32 tensors shaped ``(num_directions, batch_size, hidden_size)``.
    """
    num_directions = 2 if bidirectional else 1
    layer_shape = (num_directions, batch_size, hidden_size)

    def _zero_state():
        return Tensor(np.zeros(layer_shape).astype(np.float32))

    # h and c are built as two independent sequences. The previous
    # ``h_list = c_list = []`` bound both names to one list, so each returned
    # tuple ended up holding 2 * num_layers tensors.
    h = tuple(_zero_state() for _ in range(num_layers))
    c = tuple(_zero_state() for _ in range(num_layers))
    return h, c


class StackLSTM(nn.Cell):
    """
    Multi-layer LSTM assembled from one single-layer cell per level.

    Every layer owns a flat weight parameter kept in ``self.weight``; the
    layers themselves are kept in ``self.lstms``.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 has_bias=True,
                 batch_first=False,
                 dropout=0.0,
                 bidirectional=False):
        """Create ``num_layers`` LSTM layers and their flat weight parameters.

        Args:
            input_size: Feature size consumed by the first layer.
            hidden_size: Hidden size of every layer.
            num_layers: Number of stacked layers.
            has_bias: Whether bias entries are included in each flat weight.
            batch_first: Stored on the instance and forwarded to every layer.
            dropout: Forwarded to every layer.
            bidirectional: Whether each layer runs in both directions.
        """
        super().__init__()
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.transpose = P.Transpose()

        direction_count = 2 if bidirectional else 1

        # Layer 0 consumes the raw input; every later layer consumes the
        # (possibly direction-doubled) output of the layer below it.
        layer_input_sizes = [input_size] + [hidden_size * direction_count] * (num_layers - 1)

        # NOTE(review): construct() calls each layer as layer(x, h, c, weight) and
        # unpacks five results -- confirm nn.LSTM supports that call convention in
        # the installed MindSpore version.
        stacked_layers = [
            nn.LSTM(input_size=layer_input_sizes[idx],
                    hidden_size=hidden_size,
                    has_bias=has_bias,
                    batch_first=batch_first,
                    bidirectional=bidirectional,
                    dropout=dropout)
            for idx in range(num_layers)
        ]

        # One flat (weight_size, 1, 1) parameter per layer, drawn uniformly from
        # [-1/sqrt(hidden_size), 1/sqrt(hidden_size)).
        layer_weights = []
        for idx in range(num_layers):
            gate_block = direction_count * hidden_size * 4
            flat_size = (layer_input_sizes[idx] + hidden_size) * gate_block
            if has_bias:
                flat_size += gate_block

            bound = 1 / math.sqrt(hidden_size)
            values = np.random.uniform(-bound, bound, (flat_size, 1, 1)).astype(np.float32)

            layer_weights.append(
                Parameter(initializer(Tensor(values), values.shape), name="weight" + str(idx)))

        self.lstms = stacked_layers
        self.weight = ParameterTuple(tuple(layer_weights))

    def construct(self, x, hx):
        """Run ``x`` through every layer in order.

        ``hx`` is a pair ``(h, c)`` indexable by layer. Returns the final output
        together with ``(hn, cn)`` as produced by the last layer only.
        """
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        h_states, c_states = hx
        last_h = last_c = None
        # Feed each layer's output into the next one.
        for layer_idx in range(self.num_layers):
            x, last_h, last_c, _, _ = self.lstms[layer_idx](
                x, h_states[layer_idx], c_states[layer_idx], self.weight[layer_idx])
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        return x, (last_h, last_c)


class SentimentNet(nn.Cell):
    """Sentiment classifier: frozen embedding lookup, LSTM encoder, dense decoder."""

    def __init__(self,
                 vocab_size,
                 embed_size,
                 num_hiddens,
                 num_layers,
                 bidirectional,
                 num_classes,
                 weight,
                 batch_size):
        """Assemble the embedding, the encoder with its zero initial states, and the decoder.

        Args:
            vocab_size: Number of rows of the embedding table.
            embed_size: Width of each embedding vector.
            num_hiddens: Hidden size of the LSTM encoder.
            num_layers: Number of LSTM layers.
            bidirectional: Whether the encoder is bidirectional.
            num_classes: Number of output classes of the decoder.
            weight: Initial embedding table.
            batch_size: Batch size the initial h/c states are allocated for.
        """
        super().__init__()
        # Map word ids to vectors; the table is excluded from gradient updates.
        self.embedding = nn.Embedding(vocab_size, embed_size, embedding_table=weight)
        self.embedding.embedding_table.requires_grad = False
        self.trans = P.Transpose()
        self.perm = (1, 0, 2)

        # The encoder flavour depends on the device target configured at the
        # moment this constructor runs.
        needs_manual_stack = context.get_context("device_target") in STACK_LSTM_DEVICE
        if not needs_manual_stack:
            # Standard multi-layer LSTM.
            self.encoder = nn.LSTM(input_size=embed_size,
                                   hidden_size=num_hiddens,
                                   num_layers=num_layers,
                                   has_bias=True,
                                   bidirectional=bidirectional,
                                   dropout=0.0)
            self.h, self.c = lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional)
        else:
            # Layers stacked by hand.
            self.encoder = StackLSTM(input_size=embed_size,
                                     hidden_size=num_hiddens,
                                     num_layers=num_layers,
                                     has_bias=True,
                                     bidirectional=bidirectional,
                                     dropout=0.0)
            self.h, self.c = stack_lstm_default_state(batch_size, num_hiddens, num_layers, bidirectional)

        self.concat = P.Concat(1)
        # Two encoder slices are concatenated before decoding, so the decoder
        # input is 2x num_hiddens (4x when bidirectional).
        decoder_inputs = num_hiddens * (4 if bidirectional else 2)
        self.decoder = nn.Dense(decoder_inputs, num_classes)

    def construct(self, inputs):
        """Embed ``inputs``, encode them and return the decoder output."""
        # The original author's note gives the embedded input as (64, 500, 300) -- TODO confirm.
        embedded = self.trans(self.embedding(inputs), self.perm)
        output, _ = self.encoder(embedded, (self.h, self.c))
        # Concatenate the slices at index 0 and index 499 of the first axis.
        # NOTE(review): 499 is hard-coded; presumably the sequence length is 500 -- verify.
        encoding = self.concat((output[0], output[499]))
        return self.decoder(encoding)

In [5]:
from mindspore import Tensor, nn, Model, context, Parameter
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.nn import Accuracy
from mindspore.train.callback import LossMonitor, CheckpointConfig, ModelCheckpoint, TimeMonitor

In [5]:
# from mindspore.model_zoo.lstm import SentimentNet

# Configure the backend BEFORE building the network: SentimentNet.__init__ reads
# context.get_context("device_target") to choose its encoder, so setting the
# target after construction (as this cell used to do) is too late for that choice.
from mindspore import context
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")

# batch_size must equal the batch size dataset_train was created with, because
# the network allocates its initial LSTM states for exactly that many rows.
network = SentimentNet(vocab_size=embedding_tabel.shape[0],
                       embed_size=100,
                       num_hiddens=100,
                       num_layers=2,
                       bidirectional=False,
                       num_classes=2,
                       weight=Tensor(embedding_tabel),
                       batch_size=32)

loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
# Momentum optimizer: learning rate 0.1, momentum 0.9.
opt = nn.Momentum(network.trainable_params(), 0.1, 0.9)
loss_callback = LossMonitor(per_print_times=60)
model = Model(network, loss, opt, {'acc': Accuracy()})
config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)
checkpoint_cb = ModelCheckpoint(prefix="lstm", directory="./model", config=config_ck)

# NOTE(review): dataset_train already applies repeat(count=10); confirm that one
# "epoch" over the repeated pipeline is the intended amount of training.
model.train(1, dataset_train, callbacks=[checkpoint_cb, loss_callback], dataset_sink_mode=False)



epoch: 1 step: 60, loss is 0.7171497941017151
epoch: 1 step: 120, loss is 0.7218549251556396
epoch: 1 step: 180, loss is 0.776342511177063
epoch: 1 step: 240, loss is 0.6117372512817383
epoch: 1 step: 300, loss is 0.9299334287643433
epoch: 1 step: 360, loss is 0.8461623191833496
epoch: 1 step: 420, loss is 0.6753621101379395
epoch: 1 step: 480, loss is 1.687245488166809
epoch: 1 step: 540, loss is 0.6154614686965942
epoch: 1 step: 600, loss is 0.5699261426925659
epoch: 1 step: 660, loss is 0.6977465152740479
epoch: 1 step: 720, loss is 1.0413440465927124
epoch: 1 step: 780, loss is 0.6589983701705933
epoch: 1 step: 840, loss is 0.6349864602088928
epoch: 1 step: 900, loss is 0.7926942110061646
epoch: 1 step: 960, loss is 0.7042611837387085
epoch: 1 step: 1020, loss is 1.0392495393753052
epoch: 1 step: 1080, loss is 0.6065108776092529
epoch: 1 step: 1140, loss is 0.6942701935768127
epoch: 1 step: 1200, loss is 0.6698892116546631
epoch: 1 step: 1260, loss is 0.7141430377960205
epoch: 1 st

In [7]:
# Configure the backend BEFORE building the network: SentimentNet.__init__ reads
# context.get_context("device_target") to choose its encoder, so setting the
# target after construction (as this cell used to do) is too late for that choice.
from mindspore import context
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")

# NOTE(review): this cell reuses whatever dataset_train is currently bound in the
# kernel; batch_size below must match the batch size that pipeline was built with.
network = SentimentNet(vocab_size=embedding_tabel.shape[0],
                       embed_size=100,
                       num_hiddens=100,
                       num_layers=2,
                       bidirectional=False,
                       num_classes=2,
                       weight=Tensor(embedding_tabel),
                       batch_size=32)

loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Adam(network.trainable_params(), learning_rate=1e-3, weight_decay=1e-6)
loss_callback = LossMonitor(per_print_times=60)
model = Model(network, loss, opt, {'acc': Accuracy()})
config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)
checkpoint_cb = ModelCheckpoint(prefix="lstm", directory="./model", config=config_ck)

model.train(1, dataset_train, callbacks=[checkpoint_cb, loss_callback], dataset_sink_mode=False)



epoch: 1 step: 60, loss is 0.6436595916748047
epoch: 1 step: 120, loss is 0.6030970811843872
epoch: 1 step: 180, loss is 0.6438596844673157
epoch: 1 step: 240, loss is 0.7073056697845459
epoch: 1 step: 300, loss is 0.5240635871887207
epoch: 1 step: 360, loss is 0.6252241134643555
epoch: 1 step: 420, loss is 0.6011233329772949
epoch: 1 step: 480, loss is 0.6071441173553467
epoch: 1 step: 540, loss is 0.6463255286216736
epoch: 1 step: 600, loss is 0.5728106498718262
epoch: 1 step: 660, loss is 0.5444440841674805
epoch: 1 step: 720, loss is 0.6817660331726074
epoch: 1 step: 780, loss is 0.68641197681427
epoch: 1 step: 840, loss is 0.6923555731773376
epoch: 1 step: 900, loss is 0.7211882472038269
epoch: 1 step: 960, loss is 0.5435845255851746
epoch: 1 step: 1020, loss is 0.5553909540176392
epoch: 1 step: 1080, loss is 0.5065057277679443
epoch: 1 step: 1140, loss is 0.5389652848243713
epoch: 1 step: 1200, loss is 0.6151940822601318
epoch: 1 step: 1260, loss is 0.5626600980758667
epoch: 1 st

In [11]:
# Configure the backend BEFORE building the network: SentimentNet.__init__ reads
# context.get_context("device_target") to choose its encoder, so setting the
# target after construction (as this cell used to do) is too late for that choice.
from mindspore import context
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")

# The same batch_size feeds both the pipeline and the network so the initial
# LSTM states match the batches they receive.
batch_size = 64
dataset_train = create_dataset("./mindrecord", batch_size=batch_size, num_epochs=10, is_train=True)
network = SentimentNet(vocab_size=embedding_tabel.shape[0],
                       embed_size=100,
                       num_hiddens=100,
                       num_layers=2,
                       bidirectional=False,
                       num_classes=2,
                       weight=Tensor(embedding_tabel),
                       batch_size=batch_size)

loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Adam(network.trainable_params(), learning_rate=1e-4, weight_decay=1e-6)
loss_callback = LossMonitor(per_print_times=60)
model = Model(network, loss, opt, {'acc': Accuracy()})
config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)
checkpoint_cb = ModelCheckpoint(prefix="lstm", directory="./model", config=config_ck)

model.train(1, dataset_train, callbacks=[checkpoint_cb, loss_callback], dataset_sink_mode=False)



epoch: 1 step: 60, loss is 0.6783525943756104
epoch: 1 step: 120, loss is 0.6577904224395752
epoch: 1 step: 180, loss is 0.6689574718475342
epoch: 1 step: 240, loss is 0.6629035472869873
epoch: 1 step: 300, loss is 0.6530580520629883
epoch: 1 step: 360, loss is 0.6137760877609253
epoch: 1 step: 420, loss is 0.652942955493927
epoch: 1 step: 480, loss is 0.734551191329956
epoch: 1 step: 540, loss is 0.6338605880737305
epoch: 1 step: 600, loss is 0.6435796618461609
epoch: 1 step: 660, loss is 0.6808139681816101
epoch: 1 step: 720, loss is 0.6622568368911743
epoch: 1 step: 780, loss is 0.6049716472625732
epoch: 1 step: 840, loss is 0.5817291736602783
epoch: 1 step: 900, loss is 0.6636234521865845
epoch: 1 step: 960, loss is 0.6992130279541016
epoch: 1 step: 1020, loss is 0.6433334350585938
epoch: 1 step: 1080, loss is 0.6429691314697266
epoch: 1 step: 1140, loss is 0.6113216876983643
epoch: 1 step: 1200, loss is 0.6428802609443665
epoch: 1 step: 1260, loss is 0.6140112280845642
epoch: 1 st

In [39]:
# Configure the backend BEFORE building the network: SentimentNet.__init__ reads
# context.get_context("device_target") to choose its encoder, so setting the
# target after construction (as this cell used to do) is too late for that choice.
from mindspore import context
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")

# The same batch_size feeds both the pipeline and the network so the initial
# LSTM states match the batches they receive.
batch_size = 64
dataset_train = create_dataset("./mindrecord", batch_size=batch_size, num_epochs=10, is_train=True)
network = SentimentNet(vocab_size=embedding_tabel.shape[0],
                       embed_size=100,
                       num_hiddens=100,
                       num_layers=2,
                       bidirectional=False,
                       num_classes=2,
                       weight=Tensor(embedding_tabel),
                       batch_size=batch_size)

loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Adam(network.trainable_params(), learning_rate=2*1e-4, weight_decay=1e-6)
loss_callback = LossMonitor(per_print_times=60)
model = Model(network, loss, opt, {'acc': Accuracy()})
config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)
checkpoint_cb = ModelCheckpoint(prefix="lstm", directory="./model", config=config_ck)

# NOTE(review): the pipeline already repeats 10 times, so 3 "epochs" iterate the
# repeated pipeline three times -- confirm this is the intended training length.
model.train(3, dataset_train, callbacks=[checkpoint_cb, loss_callback], dataset_sink_mode=False)



epoch: 1 step: 60, loss is 0.663476288318634
epoch: 1 step: 120, loss is 0.6531549692153931
epoch: 1 step: 180, loss is 0.694236159324646
epoch: 1 step: 240, loss is 0.6823680400848389
epoch: 1 step: 300, loss is 0.6060357689857483
epoch: 1 step: 360, loss is 0.6931272149085999
epoch: 1 step: 420, loss is 0.6161444187164307
epoch: 1 step: 480, loss is 0.6625012159347534
epoch: 1 step: 540, loss is 0.6367723941802979
epoch: 1 step: 600, loss is 0.664122462272644
epoch: 1 step: 660, loss is 0.6331920027732849
epoch: 1 step: 720, loss is 0.6884821057319641
epoch: 1 step: 780, loss is 0.6616216897964478
epoch: 1 step: 840, loss is 0.6724331378936768
epoch: 1 step: 900, loss is 0.5720317363739014
epoch: 1 step: 960, loss is 0.6618954539299011
epoch: 1 step: 1020, loss is 0.6544455289840698
epoch: 1 step: 1080, loss is 0.6604650020599365
epoch: 1 step: 1140, loss is 0.6418930292129517
epoch: 1 step: 1200, loss is 0.6735744476318359
epoch: 1 step: 1260, loss is 0.6695458889007568
epoch: 1 ste

In [6]:
# Configure the backend BEFORE building the network: SentimentNet.__init__ reads
# context.get_context("device_target") to choose its encoder, so setting the
# target after construction (as this cell used to do) is too late for that choice.
from mindspore import context
context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="Ascend")

# The same batch_size feeds both the pipeline and the network so the initial
# LSTM states match the batches they receive.
batch_size = 64
dataset_train = create_dataset("./mindrecord", batch_size=batch_size, num_epochs=10, is_train=True)
network = SentimentNet(vocab_size=embedding_tabel.shape[0],
                       embed_size=100,
                       num_hiddens=100,
                       num_layers=2,
                       bidirectional=False,
                       num_classes=2,
                       weight=Tensor(embedding_tabel),
                       batch_size=batch_size)

loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Adam(network.trainable_params(), learning_rate=1e-5, weight_decay=1e-8)
loss_callback = LossMonitor(per_print_times=60)
model = Model(network, loss, opt, {'acc': Accuracy()})
config_ck = CheckpointConfig(save_checkpoint_steps=390, keep_checkpoint_max=10)
checkpoint_cb = ModelCheckpoint(prefix="lstm", directory="./model", config=config_ck)

# NOTE(review): the pipeline already repeats 10 times, so 3 "epochs" iterate the
# repeated pipeline three times -- confirm this is the intended training length.
model.train(3, dataset_train, callbacks=[checkpoint_cb, loss_callback], dataset_sink_mode=False)



epoch: 1 step: 60, loss is 0.6922612190246582
epoch: 1 step: 120, loss is 0.6918786764144897
epoch: 1 step: 180, loss is 0.6904754638671875
epoch: 1 step: 240, loss is 0.6885923147201538
epoch: 1 step: 300, loss is 0.6849395632743835
epoch: 1 step: 360, loss is 0.6889755129814148
epoch: 1 step: 420, loss is 0.6838794946670532
epoch: 1 step: 480, loss is 0.6855593919754028
epoch: 1 step: 540, loss is 0.6798254251480103
epoch: 1 step: 600, loss is 0.6766155958175659
epoch: 1 step: 660, loss is 0.6806970834732056
epoch: 1 step: 720, loss is 0.685994029045105
epoch: 1 step: 780, loss is 0.6762064695358276
epoch: 1 step: 840, loss is 0.6789414286613464
epoch: 1 step: 900, loss is 0.6843880414962769
epoch: 1 step: 960, loss is 0.6742408275604248
epoch: 1 step: 1020, loss is 0.6833506226539612
epoch: 1 step: 1080, loss is 0.6628250479698181
epoch: 1 step: 1140, loss is 0.6638712882995605
epoch: 1 step: 1200, loss is 0.6819545030593872
epoch: 1 step: 1260, loss is 0.6634235382080078
epoch: 1 s

模型评估

In [8]:
# Evaluate on the held-out split. num_epochs=1: create_dataset forwards this value
# to repeat(), so the previous value of 10 scored the same split ten times over.
# batch_size must match the batch size the current model's network was built with.
dataset_test = create_dataset("./mindrecord", batch_size=32, num_epochs=1, is_train=False)
acc = model.eval(dataset_test)
print("accuracy:{}".format(acc))

accuracy:{'acc': 0.7079044117647059}


In [10]:
# Evaluate on the held-out split. num_epochs=1: create_dataset forwards this value
# to repeat(), so the previous value of 10 scored the same split ten times over.
# Uses the batch_size bound by the most recently executed training cell.
dataset_test = create_dataset("./mindrecord", batch_size=batch_size, num_epochs=1, is_train=False)
acc = model.eval(dataset_test)
print("accuracy:{}".format(acc))

accuracy:{'acc': 0.6660845588235295}


In [13]:
# dataset_test = create_dataset("./mindrecord", batch_size=batch_size, num_epochs=10, is_train=False)
# acc = model.eval(dataset_test)
# print("accuracy:{:.3f}".format(acc))

测试集

In [11]:
# dir(model)
# Build the inference pipeline directly rather than through create_dataset(): that
# helper always shuffles and repeats, which would put the exported predictions in
# random order and write each of them num_epochs times.
# The path is the file create_dataset("./mindrecordtest", ..., is_train=True) resolved to.
real_test_path = os.path.join("./mindrecordtest", "aclImdb_train.mindrecord0")
dataset_real_test = mds.MindDataset(real_test_path,
                                    columns_list=["feature", "label"],
                                    num_parallel_workers=4,
                                    shuffle=False)
# drop_remainder stays True because SentimentNet allocates its initial LSTM states
# for exactly batch_size rows.
# NOTE(review): rows in a trailing partial batch therefore get no prediction --
# pad the dataset if every record needs one.
dataset_real_test = dataset_real_test.batch(batch_size=batch_size, drop_remainder=True)
def test_loop(model, dataset, loss_fn):
    preds_list=[]
    num_batches = dataset.get_dataset_size()
    # model.set_train(False)
    total, test_loss, correct = 0, 0, 0
    for data, label in dataset.create_tuple_iterator():
        pred = model.predict(data).asnumpy()
        preds = np.argmax(pred, axis=1)
        preds_list.extend(preds)
    return preds_list
# Predicted class index for every row of the real test pipeline (loss is passed but unused by test_loop).
preds_list = test_loop(model, dataset_real_test, loss_fn=loss)

In [12]:
# Write one prediction per line, without header or index column.
# NOTE(review): the target is a dot-file named ".comment_result.txt" in the working
# directory -- possibly a typo for "./comment_result.txt"; confirm before changing.
predictions_frame = pd.DataFrame(preds_list)
predictions_frame.to_csv('.comment_result.txt', header=False, index=False)

In [15]:
# preds_list