In [1]:
import math
import numpy as np
import pandas as pd
import os
import math
import random
import codecs
from pathlib import Path

import gensim
import numpy as np
import mindspore
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore import ops
from mindspore.train.model import Model
from mindspore.nn.metrics import Accuracy
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.ops import operations as ops

In [2]:
# Load pretrained word2vec vectors from the Google News binary file.
# The path is relative, so the file must be in the notebook's working directory.
from gensim.models.keyedvectors import KeyedVectors
gensim_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
class MovieReview:
    '''
    Movie-review sentiment dataset.

    Reads exactly two files from ``root_dir`` (one whose file name contains
    ``pos`` for positive reviews, the other for negative reviews), tokenizes
    every line, converts tokens to integer ids and splits the samples into a
    train and a test list.
    '''

    # Ordered (old, new) substitutions applied to every raw line. Order
    # matters: '--' has to be removed before '-' is turned into a space.
    _REPLACEMENTS = (
        ('\n', ''), ('"', ''), ('\'', ''), ('.', ''), (',', ''),
        ('[', ''), (']', ''), ('(', ''), (')', ''), (':', ''),
        ('--', ''), ('-', ' '), ('\\', ''),
        ('0', ''), ('1', ''), ('2', ''), ('3', ''), ('4', ''),
        ('5', ''), ('6', ''), ('7', ''), ('8', ''), ('9', ''),
        ('`', ''), ('=', ''), ('$', ''), ('/', ''), ('*', ''),
        (';', ''), ('<b>', ''), ('%', ''),
    )

    def __init__(self, root_dir, maxlen, split):
        '''
        input:
            root_dir: directory holding the two review files
            maxlen: number of token ids kept per sentence (pad / truncate)
            split: fraction of the data used for training, 0 < split < 1
        raises:
            ValueError: root_dir is not a directory, does not contain exactly
                two files, or split is outside (0, 1)
        '''
        self.path = root_dir
        # Sentiment labels: negative samples are 0, positive samples are 1.
        self.feelMap = {
            'neg':0,
            'pos':1
        }
        self.files = []

        self.doConvert = False

        # Validate the data directory.
        mypath = Path(self.path)
        if not mypath.is_dir():
            raise ValueError("please check the root_dir!")

        # Collect the files directly inside root_dir (sub-directories are
        # not descended into: only the first os.walk entry is used).
        for root, _, filenames in os.walk(self.path):
            self.files.extend(os.path.join(root, each) for each in filenames)
            break

        # Exactly two files are expected: the .neg file and the .pos file.
        if len(self.files) != 2:
            raise ValueError(
                "There are {} files in the root_dir".format(len(self.files)))

        # Corpus statistics, updated by read_data().
        self.word_num = 0
        self.minlen = float("inf")   # shortest sentence seen so far
        self.maxlen = float("-inf")  # longest sentence seen so far
        # Pos / Neg hold [sentence, label] pairs. The sentence is a token
        # list after read_data() and an id vector after text2vec().
        self.Pos = []
        self.Neg = []
        for filename in self.files:
            self.read_data(filename)

        self.text2vec(maxlen=maxlen)
        self.split_dataset(split=split)

    def read_data(self, filePath):
        '''
        Tokenize every line of filePath and append [tokens, label] to
        self.Pos or self.Neg.

        The label is derived from the file NAME only, so a directory whose
        path happens to contain "pos" cannot mislabel the negative file.
        '''
        label_key = 'pos' if 'pos' in os.path.basename(filePath) else 'neg'
        target = self.Pos if label_key == 'pos' else self.Neg
        label = self.feelMap[label_key]

        with open(filePath,'r',encoding='utf-8') as f:
            for sentence in f:
                # Strip punctuation, digits and markup residue.
                for old, new in self._REPLACEMENTS:
                    sentence = sentence.replace(old, new)
                # Split on single spaces and drop the empty strings left
                # behind by consecutive spaces, e.g.
                # ['simplistic', 'silly', 'and', 'tedious'].
                tokens = [token for token in sentence.split(' ') if token]
                if not tokens:
                    continue
                self.word_num += len(tokens)
                self.maxlen = max(self.maxlen, len(tokens))
                self.minlen = min(self.minlen, len(tokens))
                target.append([tokens, label])

    def text2vec(self, maxlen):
        '''
        Replace every token list in self.Pos / self.Neg with a list of
        exactly maxlen vocabulary ids, building self.Vocab on the way.

        Words beyond position maxlen are dropped and never enter the
        vocabulary.

        NOTE(review): unused positions keep the value 0, which is also the id
        given to the first word added to the vocabulary, so padding is
        indistinguishable from that word. Left unchanged because other cells
        and saved checkpoints rely on the current ids.
        '''
        # Vocab = {word : index}, indices assigned in order of first use.
        self.Vocab = dict()
        for SentenceLabel in self.Pos+self.Neg:
            vector = [0]*maxlen
            # SentenceLabel[0] is the token list, SentenceLabel[1] the label.
            for index, word in enumerate(SentenceLabel[0][:maxlen]):
                # setdefault evaluates len() before inserting, so a new word
                # receives the next free index.
                vector[index] = self.Vocab.setdefault(word, len(self.Vocab))
            SentenceLabel[0] = vector
        self.doConvert = True

    def split_dataset(self, split):
        '''
        Split the samples into self.train and self.test.

        Each class is cut into int(1/(1-split)) chunks of
        ceil((1-split)*len(class)) samples; one chunk per class becomes the
        test set and the remaining chunks, shuffled, the training set.

        NOTE(review): when 1/(1-split) is not an integer, the chunks do not
        cover the whole list and the trailing samples are left out of both
        sets.
        '''
        if not 0 < split < 1:
            raise ValueError(
                "split must be strictly between 0 and 1, got {}".format(split))

        trunk_pos_size = math.ceil((1-split)*len(self.Pos))
        trunk_neg_size = math.ceil((1-split)*len(self.Neg))
        trunk_num = int(1/(1-split))
        pos_temp = [self.Pos[index*trunk_pos_size:(index+1)*trunk_pos_size]
                    for index in range(trunk_num)]
        neg_temp = [self.Neg[index*trunk_neg_size:(index+1)*trunk_neg_size]
                    for index in range(trunk_num)]
        # The third chunk is the test chunk; fall back to the last chunk when
        # there are fewer than three so that small splits do not fail.
        test_index = min(2, trunk_num - 1)
        self.test = pos_temp.pop(test_index)+neg_temp.pop(test_index)
        self.train = [i for item in pos_temp+neg_temp for i in item]

        random.shuffle(self.train)

    def get_dict_len(self):
        '''
        Return the vocabulary size, or -1 if text2vec() has not run yet.
        '''
        if self.doConvert:
            return len(self.Vocab)
        else:
            print("Haven't finished Text2Vec")
            return -1

    def create_train_dataset(self, epoch_size, batch_size):
        '''
        Wrap self.train in a batched GeneratorDataset repeated epoch_size
        times. Incomplete final batches are dropped.
        '''
        dataset = ds.GeneratorDataset(
                                        source=Generator(input_list=self.train),
                                        column_names=["data","label"],
                                        shuffle=False
                                        )
        dataset=dataset.batch(batch_size=batch_size,drop_remainder=True)
        # Repeat the batched dataset once per requested epoch.
        dataset=dataset.repeat(epoch_size)
        return dataset

    def create_test_dataset(self, batch_size):
        '''
        Wrap self.test in a batched GeneratorDataset. Incomplete final
        batches are dropped.
        '''
        dataset = ds.GeneratorDataset(
                                        source=Generator(input_list=self.test),
                                        column_names=["data","label"],
                                        shuffle=False
                                        )
        dataset=dataset.batch(batch_size=batch_size,drop_remainder=True)
        return dataset

    def get_vocab(self):
        '''
        Return the word -> id dictionary itself (not a copy).
        '''
        return self.Vocab

In [4]:
class Generator():
    """Random-access source that serves [sentence_ids, label] pairs as int32 arrays."""

    def __init__(self, input_list):
        # Sequence of [sentence_ids, label] pairs; kept by reference.
        self.input_list = input_list

    def __len__(self):
        """Number of samples available."""
        return len(self.input_list)

    def __getitem__(self, item):
        """Return (data, label) for sample ``item``, both converted to int32 arrays."""
        sample = self.input_list[item]
        data = np.array(sample[0], dtype=np.int32)
        label = np.array(sample[1], dtype=np.int32)
        return (data, label)

In [5]:
# Build the movie-review dataset from ./data/: every sentence becomes 51 token
# ids and split=0.9 is the train fraction passed to MovieReview.
instance = MovieReview(root_dir='./data/', maxlen=51, split=0.9)
dataset = instance.create_train_dataset(batch_size=64,epoch_size=1)
# Number of batches in the training dataset.
batch_num = dataset.get_dataset_size() 

**使用预训练的词向量构造word_embeddings**

In [6]:
# Build the initial embedding matrix for the dataset vocabulary.
vocab=instance.get_vocab()
vocab_size=instance.get_dict_len()
# Take the vector width from the pretrained model instead of hard-coding it.
embedding_dim=gensim_model.vector_size
# word -> pretrained vector, restricted to the words the dataset actually
# uses. Looking words up directly avoids copying every pretrained vector into
# a Python list and dict first.
word2Vec={word: gensim_model[word] for word in vocab if word in gensim_model}
ori_embeddings=np.zeros((vocab_size,embedding_dim))
for word,index in vocab.items():
    if word in word2Vec:
        ori_embeddings[index,:]=word2Vec[word]
    else:
        # Words without a pretrained vector get a random one, drawn only when
        # it is actually needed.
        ori_embeddings[index,:]=np.random.randn(embedding_dim) * np.sqrt(2/embedding_dim)

**我们构建的word_embeddings中索引为0的单词对应的是'the',验证一下其词向量是否与gensim_model['the']相等**

In [7]:
# Compare the two vectors element by element. Calling .all() on each side
# would only compare two "every entry is non-zero" flags, which is True for
# almost any pair of vectors.
np.allclose(ori_embeddings[0], gensim_model['the'])

True

添加 `<pad>` 符号的词向量

In [8]:
# Append one all-zero row that serves as the <pad> embedding. Only the first
# vocab_size rows (the word vectors built above) are kept before appending, so
# running this cell again does not keep adding rows.
pad=np.zeros_like(ori_embeddings[0])
ori_embeddings=np.r_[ori_embeddings[:vocab_size],[pad]]

In [9]:
# Register '<pad>' under the next free id. vocab is the very dict returned by
# instance.get_vocab(), so instance.get_dict_len() grows by one as well.
# setdefault leaves an existing entry alone, so re-running this cell cannot
# move '<pad>' to an id outside the embedding table.
vocab.setdefault('<pad>', len(vocab))

In [10]:
# Shape of the embedding matrix after the <pad> row was appended.
ori_embeddings.shape

(18849, 300)

In [11]:
# Vocabulary size reported by the dataset; it counts '<pad>' because vocab is
# the same dict object as instance.Vocab.
instance.get_dict_len()

18849

# 模型训练

In [12]:
# Training / model configuration used by the cells below.
epoch_size=4            # number of training epochs
batch_size=64           # batch size (used for the test dataset)
num_classes=2           # negative / positive
weight_decay=3e-5       # passed to the Adam optimizer
data_path='./data/'     # dataset directory (not referenced by the visible cells)
keep_checkpoint_max=1   # how many checkpoint files to keep
checkpoint_path='./ckpt/train_textcnn-4_149.ckpt'  # loaded only when pre_trained is True
word_len=51             # token ids per sentence fed to the network
vec_length=300          # width of one word vector
pre_trained=False       # True: resume from checkpoint_path

In [13]:
# Per-step learning-rate schedule: linear warm-up, constant phase, then a
# stepped shrink. Each phase lasts a whole number of epochs and every epoch
# contributes batch_num consecutive steps with the same rate.
base_lr = 1e-3
warm_up_epochs = math.floor(epoch_size / 5)
shrink_epochs = math.floor(epoch_size * 3 / 5)
normal_epochs = epoch_size - warm_up_epochs - math.floor(epoch_size * 2 / 5)

# The epoch index is the OUTER loop so that the rate changes once per epoch.
# With the step loop outside, the values would alternate from step to step.
# When warm_up_epochs is 0 the list is empty and the division never runs.
warm_up = [base_lr / warm_up_epochs * (i + 1)
           for i in range(warm_up_epochs) for _ in range(batch_num)]
normal_run = [base_lr] * (normal_epochs * batch_num)
shrink = [base_lr / (16 * (i + 1))
          for i in range(shrink_epochs) for _ in range(batch_num)]
# NOTE(review): the phases can add up to more than epoch_size epochs (the
# shrink phase uses 3/5 while normal_epochs subtracts 2/5), so the list may be
# longer than the number of training steps; presumably only the first
# epoch_size * batch_num entries are consumed - confirm.
learning_rate = warm_up + normal_run + shrink

**调整卷积核需要调整下面这个cell的代码，注意更改featureMap和filter的时候记得更新dense的输入个数**  
接下来实验步骤：
1. 测试若干不同大小的单卷积核，寻找性能效果最佳的
2. 在性能最佳的单卷积核领域使用多卷积核组合，确定多卷积核组合
3. 测试不同大小的featureMap寻找性能最佳的规格，确定featureMap
4. 修改word-embeddings为non-static(mindspore未找出实现方法)

In [16]:
def _weight_variable(shape, factor=0.01):
    """Return a float32 Tensor of the given shape filled with N(0, 1) samples scaled by ``factor``."""
    scaled_noise = factor * np.random.randn(*shape).astype(np.float32)
    return Tensor(scaled_noise)

def make_conv_layer(kernel_size, out_channels=96):
    """
    Build a single-input-channel 2-D convolution with randomly initialised weights.

    Args:
        kernel_size: (height, width) of the convolution kernel.
        out_channels: number of filters, i.e. feature maps produced.
            Defaults to 96, the value this notebook uses everywhere.

    Returns:
        An nn.Conv2d with one input channel, a bias term and explicit padding
        of 1 (pad_mode="pad").
    """
    # Weight layout: (out_channels, in_channels, kernel_height, kernel_width).
    weight_shape = (out_channels, 1, *kernel_size)
    weight = _weight_variable(weight_shape)
    return nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=kernel_size, padding=1,
                     pad_mode="pad", weight_init=weight, has_bias=True)


class TextCNN(nn.Cell):
    """
    Text classifier: embedding lookup, four parallel conv/ReLU/max-pool
    branches, dropout and a dense output layer.

    The embedding table is initialised from the notebook-level array
    ``ori_embeddings``, which therefore has to exist before instantiation.
    """
    def __init__(self, vocab_len, word_len, num_classes, vec_length):
        """
        Args:
            vocab_len: number of rows of the embedding table.
            word_len: number of token ids per sentence.
            num_classes: size of the output layer.
            vec_length: width of one word vector.
        """
        super(TextCNN, self).__init__()
        self.vec_length = vec_length
        self.word_len = word_len
        self.num_classes = num_classes

        self.unsqueeze = ops.ExpandDims()
        # Embedding (lookup) layer initialised with the pretrained word vectors.
        # It maps every index id in the input to the matching row of the table.
        self.embedding = nn.Embedding(vocab_len, self.vec_length, embedding_table=Tensor((ori_embeddings),mindspore.float32))

        # Not used in construct().
        self.slice = ops.Slice()
        # Four convolution branches; all currently use kernel height 7.
        self.layer1 = self.make_layer(kernel_height=7)
        self.layer2 = self.make_layer(kernel_height=7)
        self.layer3 = self.make_layer(kernel_height=7)
        self.layer4 = self.make_layer(kernel_height=7)
        
        self.concat = ops.Concat(1)
        # Dense input size = 96 filters per branch * 4 branches; update it
        # whenever the number of filters or branches changes.
        self.fc = nn.Dense(96*4, self.num_classes)
        # Dropout with keep probability 0.5.
        self.drop = nn.Dropout(keep_prob=0.5)
        # Not used in construct().
        self.print = ops.Print()
        # Despite the attribute name, this is a max reduction (ReduceMax).
        self.reducemean = ops.ReduceMax(keep_dims=False)
        
    def make_layer(self, kernel_height):
        """Return one branch: convolution -> ReLU -> max pooling."""
        return nn.SequentialCell(
            [
                # Convolution of the requested height spanning the full word-vector width.
                make_conv_layer((kernel_height,self.vec_length)),
                # Activation function.
                nn.ReLU(),
                # Max pooling over the sentence axis.
                # NOTE(review): make_conv_layer pads by 1, so the pooled map is
                # presumably larger than 1x1; the ReduceMax in construct()
                # collapses whatever is left - confirm.
                nn.MaxPool2d(kernel_size=(self.word_len-kernel_height+1,1)),
            ]
        )

    def construct(self,x):
        """
        Forward pass.

        Assumes x holds token ids shaped (batch, word_len) - TODO confirm.
        Returns the output of the dense layer (one score per class).
        """
        # Insert a channel axis at position 1 before the lookup.
        x = self.unsqueeze(x, 1)
        # The embedding layer turns every token id into its word vector.
        x = self.embedding(x)
        # Each layer is convolution + activation + max pooling;
        # the same embedded input goes through all four branches.
        x1 = self.layer1(x)
        x2 = self.layer2(x)
        x3 = self.layer3(x)
        x4 = self.layer4(x)
        
        # Max over axes 2 and 3 (keep_dims=False removes those axes).
        x1 = self.reducemean(x1, (2, 3))
        x2 = self.reducemean(x2, (2, 3))
        x3 = self.reducemean(x3, (2, 3))
        x4 = self.reducemean(x4, (2, 3))
        
        # Concatenate the branch outputs along axis 1 as input to the dense layer.
        x = self.concat((x1, x2, x3,x4))
        #x=x4
        # Dropout to reduce overfitting.
        x = self.drop(x)
        x = self.fc(x)
        return x

In [17]:
# Instantiate the network; vocab_len is the dataset's current vocabulary size.
net = TextCNN(vocab_len=instance.get_dict_len(), word_len=word_len, 
              num_classes=num_classes, vec_length=vec_length)

In [18]:
# Continue training from checkpoint_path when pre_trained is True:
# read the saved parameters and copy them into net.
if pre_trained:
    param_dict = load_checkpoint(checkpoint_path)
    load_param_into_net(net, param_dict)

In [19]:
# Adam over the parameters that require gradients, driven by the per-step
# learning_rate list built above.
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), 
              learning_rate=learning_rate, weight_decay=weight_decay)
# Softmax cross-entropy on the logits. sparse=True presumably means labels
# are class indices rather than one-hot vectors - confirm against the
# MindSpore documentation.
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)

In [20]:
# Bundle network, loss, optimizer and an accuracy metric for training.
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc': Accuracy()})

In [21]:
# Save a checkpoint every half of the total number of training steps and keep
# at most keep_checkpoint_max files.
config_ck = CheckpointConfig(save_checkpoint_steps=int(epoch_size*batch_num/2), keep_checkpoint_max=keep_checkpoint_max)
time_cb = TimeMonitor(data_size=batch_num)
# Checkpoints are written to ./ckpt with the file-name prefix "train_textcnn".
ckpt_save_dir = "./ckpt"
ckpoint_cb = ModelCheckpoint(prefix="train_textcnn", directory=ckpt_save_dir, config=config_ck)
loss_cb = LossMonitor()

In [22]:
# Train for epoch_size epochs with the timing, checkpoint and loss callbacks.
model.train(epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])
print("train success")

epoch: 1 step: 1, loss is 0.6918609738349915
epoch: 1 step: 2, loss is 0.6988835334777832
epoch: 1 step: 3, loss is 0.6894785761833191
epoch: 1 step: 4, loss is 0.6904889345169067
epoch: 1 step: 5, loss is 0.6851993799209595
epoch: 1 step: 6, loss is 0.689205527305603
epoch: 1 step: 7, loss is 0.6756930351257324
epoch: 1 step: 8, loss is 0.7156516313552856
epoch: 1 step: 9, loss is 0.6862642765045166
epoch: 1 step: 10, loss is 0.6857807636260986
epoch: 1 step: 11, loss is 0.6806836128234863
epoch: 1 step: 12, loss is 0.6798306703567505
epoch: 1 step: 13, loss is 0.6879295706748962
epoch: 1 step: 14, loss is 0.6887317895889282
epoch: 1 step: 15, loss is 0.6886580586433411
epoch: 1 step: 16, loss is 0.670509934425354
epoch: 1 step: 17, loss is 0.6601541042327881
epoch: 1 step: 18, loss is 0.6621108055114746
epoch: 1 step: 19, loss is 0.6651834845542908
epoch: 1 step: 20, loss is 0.6475258469581604
epoch: 1 step: 21, loss is 0.6757026314735413
epoch: 1 step: 22, loss is 0.6601787209510803

epoch: 2 step: 28, loss is 0.2852814197540283
epoch: 2 step: 29, loss is 0.20917730033397675
epoch: 2 step: 30, loss is 0.19008232653141022
epoch: 2 step: 31, loss is 0.32483553886413574
epoch: 2 step: 32, loss is 0.26633816957473755
epoch: 2 step: 33, loss is 0.272802472114563
epoch: 2 step: 34, loss is 0.21644848585128784
epoch: 2 step: 35, loss is 0.24836848676204681
epoch: 2 step: 36, loss is 0.33973342180252075
epoch: 2 step: 37, loss is 0.3414191007614136
epoch: 2 step: 38, loss is 0.27293097972869873
epoch: 2 step: 39, loss is 0.34876102209091187
epoch: 2 step: 40, loss is 0.23776692152023315
epoch: 2 step: 41, loss is 0.2562500834465027
epoch: 2 step: 42, loss is 0.2170763611793518
epoch: 2 step: 43, loss is 0.2893419861793518
epoch: 2 step: 44, loss is 0.2904369831085205
epoch: 2 step: 45, loss is 0.22009125351905823
epoch: 2 step: 46, loss is 0.23954187333583832
epoch: 2 step: 47, loss is 0.22644761204719543
epoch: 2 step: 48, loss is 0.230582594871521
epoch: 2 step: 49, loss

epoch: 3 step: 53, loss is 0.02618754468858242
epoch: 3 step: 54, loss is 0.03742952644824982
epoch: 3 step: 55, loss is 0.09847600013017654
epoch: 3 step: 56, loss is 0.024839142337441444
epoch: 3 step: 57, loss is 0.04618028178811073
epoch: 3 step: 58, loss is 0.027759436517953873
epoch: 3 step: 59, loss is 0.039830561727285385
epoch: 3 step: 60, loss is 0.02173733338713646
epoch: 3 step: 61, loss is 0.10588110983371735
epoch: 3 step: 62, loss is 0.039596155285835266
epoch: 3 step: 63, loss is 0.08383962512016296
epoch: 3 step: 64, loss is 0.024854382500052452
epoch: 3 step: 65, loss is 0.02471567690372467
epoch: 3 step: 66, loss is 0.026110824197530746
epoch: 3 step: 67, loss is 0.043494872748851776
epoch: 3 step: 68, loss is 0.014806743711233139
epoch: 3 step: 69, loss is 0.068629689514637
epoch: 3 step: 70, loss is 0.011225176975131035
epoch: 3 step: 71, loss is 0.04176764935255051
epoch: 3 step: 72, loss is 0.021836794912815094
epoch: 3 step: 73, loss is 0.022302303463220596
epoc

epoch: 4 step: 74, loss is 0.004326532129198313
epoch: 4 step: 75, loss is 0.005442000459879637
epoch: 4 step: 76, loss is 0.006924111861735582
epoch: 4 step: 77, loss is 0.003923754207789898
epoch: 4 step: 78, loss is 0.00524840597063303
epoch: 4 step: 79, loss is 0.005871220491826534
epoch: 4 step: 80, loss is 0.007262077648192644
epoch: 4 step: 81, loss is 0.00391845079138875
epoch: 4 step: 82, loss is 0.005579745396971703
epoch: 4 step: 83, loss is 0.00516178272664547
epoch: 4 step: 84, loss is 0.009186436422169209
epoch: 4 step: 85, loss is 0.0052321976982057095
epoch: 4 step: 86, loss is 0.007362195756286383
epoch: 4 step: 87, loss is 0.0053515927866101265
epoch: 4 step: 88, loss is 0.004112340975552797
epoch: 4 step: 89, loss is 0.006318016909062862
epoch: 4 step: 90, loss is 0.006404126062989235
epoch: 4 step: 91, loss is 0.009488876909017563
epoch: 4 step: 92, loss is 0.005417576991021633
epoch: 4 step: 93, loss is 0.0051222206093370914
epoch: 4 step: 94, loss is 0.00543492147

# 测试评估

In [23]:
# Evaluate a saved checkpoint on the held-out test split.
checkpoint_path = './ckpt/train_textcnn_42-4_149.ckpt'
# The network below starts from fresh weights, so evaluating without a
# checkpoint would be meaningless: fail early if the path is cleared.
if checkpoint_path is None:
    raise ValueError("checkpoint_path must point to a trained checkpoint for evaluation")

dataset = instance.create_test_dataset(batch_size=batch_size)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
# No optimizer is created here: evaluation does not update any parameter.
net = TextCNN(vocab_len=instance.get_dict_len(),word_len=word_len,
                  num_classes=num_classes,vec_length=vec_length)

param_dict = load_checkpoint(checkpoint_path)
print("load checkpoint from [{}].".format(checkpoint_path))

load_param_into_net(net, param_dict)
# Switch the network to inference mode before evaluating.
net.set_train(False)
model = Model(net, loss_fn=loss, metrics={'acc': Accuracy()})

acc = model.eval(dataset)
print("accuracy: ", acc)

load checkpoint from [./ckpt/train_textcnn_42-4_149.ckpt].
accuracy:  {'acc': 0.8056640625}


# 在线测试

In [26]:
def preprocess(sentence, vocab=None, maxlen=None):
    """
    Turn a raw review into a fixed-length list of vocabulary ids.

    The text is lower-cased and stripped of the same punctuation, digits and
    markup residue as the training data, then split on spaces. Empty tokens
    produced by consecutive spaces are discarded, so they neither shift word
    positions nor get reported as unknown words.

    Args:
        sentence: the review text.
        vocab: word -> id mapping; defaults to ``instance.Vocab``.
        maxlen: length of the returned list; defaults to ``word_len``.

    Returns:
        A list of ``maxlen`` ints. Unknown words and unused positions are 0.
        Every unknown word is also reported on stdout.
    """
    if vocab is None:
        vocab = instance.Vocab
    if maxlen is None:
        maxlen = word_len

    # Ordered substitutions: '--' has to be removed before '-' becomes a space.
    replacements = (
        ('\n', ''), ('"', ''), ('\'', ''), ('.', ''), (',', ''),
        ('[', ''), (']', ''), ('(', ''), (')', ''), (':', ''),
        ('--', ''), ('-', ' '), ('\\', ''),
        ('0', ''), ('1', ''), ('2', ''), ('3', ''), ('4', ''),
        ('5', ''), ('6', ''), ('7', ''), ('8', ''), ('9', ''),
        ('`', ''), ('=', ''), ('$', ''), ('/', ''), ('*', ''),
        (';', ''), ('<b>', ''), ('%', ''),
    )
    sentence = sentence.lower().strip()
    for old, new in replacements:
        sentence = sentence.replace(old, new)

    tokens = [token for token in sentence.split(' ') if token]

    vector = [0]*maxlen
    for index, word in enumerate(tokens[:maxlen]):
        if word in vocab:
            vector[index] = vocab[word]
        else:
            print(word,"单词未出现在字典中")

    return vector

def inference(review_en):
    """Classify one English review with the notebook-level ``net`` and print the verdict."""
    token_ids = preprocess(review_en)
    # Wrap the single sentence in a batch of one.
    batch = Tensor(np.array([token_ids]).astype(np.int32))
    logits = net(batch)
    predicted = np.argmax(np.array(logits[0]))
    # Class 1 is the positive label; anything else is reported as negative.
    print("Positive comments" if predicted == 1 else "Negative comments")

In [32]:
# Try the classifier on a hand-written review.
review_en = "the movie make my heart relief"
inference(review_en)

Positive comments


In [33]:
# Try the classifier on a hand-written review.
review_en = "just boring and make me sleepy"
inference(review_en)

Negative comments


In [34]:
# Try the classifier on a hand-written review.
review_en = "interesting and funny movie"
inference(review_en)

Positive comments
