In [28]:
import collections
import random
import tarfile
import torch
import os
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import d2l_pytorch as d2l

In [7]:
# Restrict CUDA to device 0 and pick the GPU when one is available, else the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dataset directory. It can be overridden with the IMDB_DATA_ROOT environment
# variable so the notebook is not tied to a single machine; the original local
# path stays the default. os.path.join(path, '') guarantees a trailing
# separator, which any code concatenating onto DATA_ROOT relies on.
DATA_ROOT = os.path.join(
    os.environ.get('IMDB_DATA_ROOT', '/Users/szki/Code/Datasets/'), '')

In [8]:
# Path of the compressed IMDb archive (aclImdb_v1.tar.gz) expected under DATA_ROOT.
fname = os.path.join(DATA_ROOT, 'aclImdb_v1.tar.gz')

In [10]:
# Unpack the archive only when the 'aclImdb' folder is not already present
# under DATA_ROOT, so re-running the cell does not extract again.
already_extracted = os.path.exists(os.path.join(DATA_ROOT, 'aclImdb'))
if not already_extracted:
    print('从压缩包解压...')
    with tarfile.open(fname, 'r') as archive:
        archive.extractall(DATA_ROOT)

In [19]:
from tqdm import tqdm

def read_imdb(folder='train', data_root=os.path.join(DATA_ROOT, 'aclImdb')):
    """
    Load the reviews of one dataset split as [review, label] pairs.

    :param folder: sub-directory of `data_root` to read (e.g. 'train' or 'test')
    :param data_root: directory containing the extracted dataset; the default is
        built with os.path.join so it is correct whether or not DATA_ROOT ends
        with a path separator
    :return: shuffled list of [review, label]; each review is lower-cased with
        '\\n' and '\\r' replaced by spaces, label is 1 for 'pos' and 0 for 'neg'
    """
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        # One progress bar per label folder.
        for file in tqdm(os.listdir(folder_name)):
            # Read raw bytes and decode explicitly so the platform's default
            # encoding and newline translation never affect the text.
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', ' ').replace('\r', ' ').lower()
            data.append([review, 1 if label == 'pos' else 0])
    # Mix positive and negative samples; uses the global `random` state (unseeded here).
    random.shuffle(data)
    return data

In [21]:
# Load both splits; each call returns a shuffled list of [review, label] pairs.
train_data = read_imdb('train')
test_data = read_imdb('test')

100%|██████████| 12500/12500 [00:00<00:00, 23174.24it/s]
100%|██████████| 12500/12500 [00:00<00:00, 23859.43it/s]
100%|██████████| 12500/12500 [00:02<00:00, 4652.49it/s]
100%|██████████| 12500/12500 [00:00<00:00, 22666.96it/s]


In [43]:
# Sanity check: number of reviews in each split.
print(f'{len(train_data)} {len(test_data)}')

25000 25000


AttributeError: 'list' object has no attribute 'shape'

In [25]:
def get_tokenized_imdb(data):
    """
    Split every review into lower-cased tokens.

    :param data: list of [string, label]
    :return: list of token lists, one per review and in the same order as
        `data`. Splitting is on single spaces only, so consecutive spaces
        produce empty-string tokens.
    """
    tokenized = []
    for review, _ in data:
        pieces = review.split(' ')
        tokenized.append(list(map(str.lower, pieces)))
    return tokenized

In [31]:
def get_vocab_imdb(data):
    """
    Build a vocabulary from the token counts of all reviews in `data`.

    :param data: list of [string, label]
    :return: a `Vocab.Vocab` constructed from the token counter with `min_freq=5`
    """
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return Vocab.Vocab(token_counts, min_freq=5)

In [35]:
# Build the vocabulary from the training split only.
vocab = get_vocab_imdb(train_data)

In [40]:
# Report how many entries the vocabulary holds.
print(f'# words in vocab: {len(vocab)}')

# words in vocab: 46152


In [42]:
def preprocess_imdb(data, vocab, max_l=500):
    """
    Convert reviews into a fixed-length index tensor and a label tensor.

    :param data: list of [string, label]
    :param vocab: vocabulary object exposing a `stoi` token-to-index mapping
    :param max_l: length every review is truncated or right-padded to
        (defaults to 500, the value previously hard-coded)
    :return: (features, labels) where features has one row of `max_l` indices
        per review and labels has one entry per review
    """
    def pad(x):
        # Slicing truncates long sequences; for short ones the multiplier is
        # positive and appends the missing zeros (a non-positive multiplier
        # yields an empty list, so nothing is appended otherwise).
        # NOTE(review): padding uses index 0 -- with torchtext's default
        # specials this is presumably '<unk>' rather than '<pad>'; confirm
        # this is intended.
        return x[:max_l] + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [None]:
batch_size = 64
# Wrap the padded index tensor and the label tensor returned by preprocess_imdb
# so each dataset item is a (features, label) pair; an argument-less
# TensorDataset() would hold no data at all.
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))