In [1]:
'''Sentiment Analysis on Movie Reviews'''
import math
import torch
from itertools import chain
import pandas as pd
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Dataset, DataLoader
import time
import matplotlib.pyplot as plt

In [4]:
class SAData(Dataset):
    """Phrase/sentiment samples read from a tab-separated file, exposed as a train or validation split.

    The file is expected to provide a 'Phrase' column (model input) and a
    'Sentiment' column (label). The two splits are complementary: a row that
    is part of the training split is never part of the validation split.

    Args:
        train: truthy to expose the training split, falsy for the validation split.
        data_path: location of the tab-separated source file.
        train_frac: fraction of the rows assigned to the training split.
        random_state: seed of the random draw; both splits must be built with
            the same seed (and the same train_frac) to stay complementary.
    """

    def __init__(self, train, data_path='./data/train.tsv', train_frac=0.8, random_state=1):
        self.train = train
        full_table = pd.read_csv(data_path, sep='\t')

        # Pick the training rows at random rather than by position, so the
        # split is not biased by the row order of the file.
        train_part = full_table.sample(
            frac=train_frac, replace=False, random_state=random_state, axis=0
        )

        if self.train:
            chosen = train_part
        else:
            # Validation is exactly the set of rows the training draw did NOT
            # pick. Drawing a second, independent 20% sample from the full
            # table would not be the complement of the training rows, so
            # validation rows could also be training rows.
            chosen = full_table.drop(train_part.index)

        # Rebuild a 0..n-1 index so the stored frame no longer carries the
        # row labels of the source file.
        self.data = chosen.reset_index(drop=True)
        self.len = self.data.shape[0]
        self.x_data, self.y_data = self.data['Phrase'], self.data['Sentiment']

    def __getitem__(self, index):
        """Return the (phrase, sentiment) pair stored at position `index`."""
        # Positional access: an out-of-range position raises IndexError.
        return self.x_data.iloc[index], self.y_data.iloc[index]

    def __len__(self):
        """Return the number of samples in this split."""
        return self.len


In [5]:
# Build one dataset object per split: the training split first, then validation.
train_set, validation_set = SAData(train=True), SAData(train=False)

In [6]:
# Hyper Parameters
N_CHARS = 128  # number of ASCII codes
HIDDEN_SIZE = 128
N_LAYER = 2
BATCH_SIZE = 128
N_EPOCHS = 100
# Only request the GPU when CUDA is actually present, so the flag never
# claims a device that does not exist on the current machine.
USE_GPU = torch.cuda.is_available()
# Number of distinct sentiment labels observed in the training split.
N_CLASS = len(set(train_set.y_data))

In [8]:
# Batch iterators over the two splits.
# Training batches are reshuffled; worker settings are left at the DataLoader default.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
# Validation batches keep a fixed order, which makes the results easier to inspect.
validation_loader = DataLoader(validation_set, batch_size=BATCH_SIZE, shuffle=False)