In [1]:
import torch
import torchvision
import torch.nn as nn
import json
import jieba
import time
import numpy as np
from numpy import math
from gensim.models import word2vec as word
import torchtext.vocab as Vocab
import torch.utils.data as Data
import matplotlib.pyplot as plt
import seaborn as sns

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Dataset directory. Written as a raw string so the Windows backslashes are
# never interpreted as escape sequences (the non-raw form only worked because
# Python keeps unrecognised escapes verbatim, which newer versions warn about).
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook elsewhere.
data_dir = r'E:\Jason\Documents\深度学习\实验五\dataset'

## 1 词向量训练

In [2]:
# Load the labelled training file and split it into training / validation parts.
with open(data_dir+'/virus_train.txt','r', encoding='utf-8') as load_file:
    # Keep the parsed JSON under its own name instead of rebinding the file handle.
    train_records = json.load(load_file)
# The first `train_valid_split` records are used for training, the rest for validation.
train_valid_split = 6606
load_train = train_records[:train_valid_split]
load_valid = train_records[train_valid_split:]
print('train: %d, valid: %d' % (len(load_train), len(load_valid)))

# Load the labelled evaluation file, used as the test set.
with open(data_dir+'/virus_eval_labeled.txt', 'r', encoding='utf-8') as load_file:
    load_test = json.load(load_file)
print('test: %d' % (len(load_test)))
# No explicit close() is needed: each `with` block closes its file on exit.

train: 6606, valid: 2000
test: 2000


In [3]:
# 分词(运行一次即可，保存至本地)
# stopwords = [line.strip() for line in open('stopword.txt', 'r', encoding='utf-8').readlines()]
# for sentence in load_train:
#     word_cut = jieba.cut(sentence['content'])
#     for words in word_cut:
#         if words not in stopwords:
#             if words != '\t':
#                 with open('word_cut.txt', 'a', errors='ignore', encoding='utf-8') as save_file:
#                     save_file.write(words+" ")
# save_file.close()

# for sentence in load_train:
#     word_cut = jieba.cut(sentence['content'])
#     with open('word_cut_nonestop.txt', 'a', errors='ignore', encoding='utf-8') as save_file:
#         save_file.write(" ".join(word_cut))
# save_file.close()

In [4]:
# Train word vectors on the pre-tokenized corpus file 'word_cut.txt'
# (`word` is the gensim word2vec module imported at the top of the notebook).
sentences = word.LineSentence('word_cut.txt')
# NOTE(review): the `size=` keyword and `wv.vocab` attribute look like the
# gensim 3.x API — confirm the installed gensim version before re-running.
model = word.Word2Vec(sentences, hs=1, min_count=1, window=3, size=100)
# Print the number of entries in the learned vocabulary.
print(len(model.wv.vocab))
# NOTE(review): as the last expression of the cell this displays the entire
# vocabulary mapping in the cell output.
model.wv.vocab

17766


 <gensim.models.keyedvectors.Vocab at 0x1fe7be5fac8>,
 '獾': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fb00>,
 '野生动物': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fb38>,
 '别去': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fb70>,
 '碰': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fba8>,
 '近期': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fc18>,
 '就医': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fc88>,
 'N95': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fcc0>,
 '起到': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fcf8>,
 '阻止': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fd30>,
 '飞沫': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fda0>,
 '作用': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fdd8>,
 '看好': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fe10>,
 '妈': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fe48>,
 'RDUnNFD': <gensim.models.keyedvectors.Vocab at 0x1fe7be5fe80>,
 '西安': <gensim.models.keyedvectors.Vocab at 0x1fe7be5ff28>,
 '肖战': <gensim.models.keyedvectors.Vocab 

## 2 数据预处理

In [5]:
# li = []
# suma = 0
# n = 0
# for f in load_train:
#     li.append(len(f['content']))
#     suma += len(f['content'])
#     n += 1
# la = sorted(li)
# print('max: %d, mean: %.4f' % (la[-1], suma/n))

# plt.plot(range(n), li)

In [13]:
# 分离特征和标签
def split_x_y(load_data):
    """Separate records into a list of texts and a parallel list of labels.

    Args:
        load_data: iterable of mappings, each providing a 'content' and a
            'label' entry.

    Returns:
        A tuple ``(X, y)`` of two lists: the 'content' values and the
        'label' values, both in input order.
    """
    # Single pass over the input; 'content' is read before 'label' per record.
    pairs = [(record['content'], record['label']) for record in load_data]
    features = [content for content, _ in pairs]
    targets = [label for _, label in pairs]
    return features, targets

# Split each dataset (train / validation / test) into raw texts and labels.
train_x, train_y = split_x_y(load_train)
valid_x, valid_y = split_x_y(load_valid)
test_x, test_y = split_x_y(load_test)

In [24]:
# 分词，并去掉停用词
def words_after_jieba(X, stopword_path='stopword.txt'):
    """Tokenize sentences with jieba and drop stop words and tab tokens.

    Args:
        X: iterable of raw sentence strings.
        stopword_path: path to a UTF-8 text file with one stop word per line
            (surrounding whitespace is stripped). Defaults to 'stopword.txt',
            the file the notebook has always used.

    Returns:
        A list with one entry per input sentence, each entry being the list
        of tokens that survived filtering, in their original order.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    # Read the stop words inside a `with` block so the file handle is closed,
    # and keep them in a set so each membership test is O(1).
    with open(stopword_path, 'r', encoding='utf-8') as stopword_file:
        stopwords = {line.strip() for line in stopword_file}

    result = []
    for sentence in X:
        words = [
            token
            for token in jieba.cut(sentence)
            # Tab tokens are dropped explicitly in addition to the stop words.
            if token not in stopwords and token != '\t'
        ]
        result.append(words)
    return result

# Tokenize and stop-word-filter the texts of each split
# (every call reads the stop-word file again).
train_X = words_after_jieba(train_x)
valid_X = words_after_jieba(valid_x)
test_X = words_after_jieba(test_x)


['天使']


In [45]:
# 词向量映射
def form_embedding(corpus, model, embedding_dim=None):
    """Index-encode a tokenized corpus and build the matching embedding matrix.

    Every distinct word in ``corpus`` gets an integer index starting at 1, in
    order of first appearance; index 0 is reserved and its row stays all-zero.
    Words the word2vec model knows get their vector copied into the matrix;
    words it does not know keep an all-zero row.

    NOTE(review): the index table is built from the corpus passed in, so
    separate calls (e.g. for train / valid / test) produce independent index
    spaces — confirm that is what downstream code expects.

    Args:
        corpus: sequence of sentences, each a sequence of word tokens. It is
            iterated twice, so it must be re-iterable (e.g. a list of lists).
        model: trained word2vec model; only ``model.wv`` is used, for
            membership tests and vector lookup.
        embedding_dim: width of the embedding matrix. When ``None`` (default)
            it is taken from ``model.wv.vector_size``.

    Returns:
        A tuple ``(embeddings, x)`` where ``embeddings`` is a float ndarray of
        shape ``(number_of_distinct_words + 1, embedding_dim)`` and ``x`` is
        the corpus with every word replaced by its integer index.
    """
    if embedding_dim is None:
        embedding_dim = model.wv.vector_size

    # Word -> index table (indices start at 1; 0 is the reserved row).
    w2vindex = {}
    for sentence in corpus:
        for word in sentence:
            if word not in w2vindex:
                w2vindex[word] = len(w2vindex) + 1

    # Rows start as zeros, so row 0 and every out-of-vocabulary word stay zero.
    embeddings = np.zeros(shape=(len(w2vindex) + 1, embedding_dim), dtype=float)
    for word, index in w2vindex.items():
        # Membership on the keyed vectors avoids the version-specific `vocab` dict.
        if word in model.wv:
            embeddings[index] = model.wv[word]

    # Replace every word of the corpus by its index.
    x = [[w2vindex[word] for word in sentence] for sentence in corpus]

    return embeddings, x

# Build an embedding matrix and an index-encoded corpus for each split.
# NOTE(review): form_embedding numbers the words of whatever corpus it is given,
# so the three calls below produce three independent index spaces — confirm
# this is intended before using valid/test indices with the training matrix.
train_embd, train_embd_x = form_embedding(train_X, model)
valid_embd, valid_embd_x = form_embedding(valid_X, model)
test_embd, test_embd_x = form_embedding(test_X, model)

## 3 构建LSTM网络

In [None]:
# 定义模型
class TextLSTM(nn.Module):
    def __init__():
        super().__init__()
    
    def forward():
