In [7]:
import collections
import os
import random
import time
from tqdm import tqdm
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [8]:
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple torchtext

# 读取数据

In [17]:
def read_data(file):
    data = []
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        # print(len(lines))  # 16000
        # print(type(lines))  # <class 'list'>
        # print(lines[0])  # 0	酸菜鱼不错
        for line in lines:
            mid_list = line.strip().split('\t')
            data.append(mid_list)
    return data

train_data = read_data('./data/Comments/train_shuffle.txt')
train_data[:10]

[['0', '酸菜鱼不错'],
 ['0', '轻食素食都是友善的饮食方式'],
 ['0', '完爆中午吃的农家乐'],
 ['1', '烤鱼很入味'],
 ['0', '有种入口即化的感觉'],
 ['0', '菜品一如既往的好'],
 ['0', '味道非常好'],
 ['0', '团购很优惠'],
 ['0', '咖喱牛腩不错'],
 ['0', '部分菜偏酸辣口']]

In [73]:
import jieba

def get_tokenized_comments(raw_data):
    '''
    @params:
        data: 数据的列表，列表中的每个元素为 [0/1标签, 文本字符串] 二元组
    @return: 切分词后的文本的列表，列表中的每个元素为切分后的词序列 -> [[word1, word2, ...], 0/1标签]
    '''
    data = []
    # data_label = []
    for one in raw_data:
        # data_label.append(one[0])
        data.append(list(jieba.cut(one[1], cut_all=False, HMM=False)))
    return data

def get_vocab_comments(data):
    '''
    @params:
        data: 同上
    @return: 数据集上的词典，Vocab 的实例（freqs, stoi, itos）
    '''
    mid_list = []
    for st in get_tokenized_comments(data):
        mid_list.extend(st)
    counter = collections.Counter(mid_list)
    return Vocab.Vocab(counter, min_freq=1)

vocab = get_vocab_comments(train_data)
print('# words in vocab:', len(vocab))

# words in vocab: 7817


In [75]:
mid = get_tokenized_comments(train_data)

In [76]:
max([len(one) for one in mid])

15

In [77]:
vocab.stoi['酸菜鱼']  # words to index

288

In [78]:
vocab.itos[288]

'酸菜鱼'

In [79]:
vocab.itos[0]

'<unk>'

In [80]:
vocab.itos[1]

'<pad>'

In [81]:
vocab.itos[2]

'的'

### 词典和词语的索引创建好后，就可以将数据集的文本从字符串的形式转换为单词下标序列的形式，以待之后的使用。

In [86]:
def preprocess_comments(data, vocab):
    '''
    @params:
        data: 同上，原始的读入数据
        vocab: 训练集上生成的词典
    @return:
        features: 单词下标序列，形状为 (n, max_l) 的整数张量
        labels: 情感标签，形状为 (n,) 的0/1整数张量
    '''
    max_len = 15  # 将每条评论通过截断或者补0，使得长度变成15

    def pad(x):
        return x[:max_len] if len(x) > max_len else x + [0] * (max_len - len(x))

    tokenized_data = get_tokenized_comments(data)  # [['酸菜鱼', '不错'], ...]
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([int(one[0]) for one in data])
    return features, labels

In [87]:
features, labels = preprocess_comments(train_data, vocab)

In [88]:
features.shape

torch.Size([16000, 15])

In [90]:
features[:10]

tensor([[ 288,    4,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 742,  272, 1544,    9,   10, 4685,    2, 7677, 2010,    0,    0,    0,
            0,    0,    0],
        [ 157,  505, 1124,   15,    2, 1912,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 482,    3,   71,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 200,  254,  354,  332,    2,   26,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [  49,   12,    2,    6,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [   7,   14,    6,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [  67,    3, 1028,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [  97,  298,    4,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [1382,   36

### 创建数据迭代器

利用 [`torch.utils.data.TensorDataset`](https://pytorch.org/docs/stable/data.html?highlight=tensor%20dataset#torch.utils.data.TensorDataset)，可以创建 PyTorch 格式的数据集，从而创建数据迭代器。

In [91]:
train_set = Data.TensorDataset(*preprocess_comments(train_data, vocab))
# test_set = Data.TensorDataset(*preprocess_comments(test_data, vocab))  # 测试集标签未知

# 上面的代码等价于下面的注释代码
# train_features, train_labels = preprocess_imdb(train_data, vocab)
# test_features, test_labels = preprocess_imdb(test_data, vocab)
# train_set = Data.TensorDataset(train_features, train_labels)
# test_set = Data.TensorDataset(test_features, test_labels)

# len(train_set) = features.shape[0] or labels.shape[0]
# train_set[index] = (features[index], labels[index])

batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
# test_iter = Data.DataLoader(test_set, batch_size)  # 测试集标签未知

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

X torch.Size([64, 15]) y torch.Size([64])
#batches: 250
