In [2]:
%load_ext autoreload
%autoreload 2
import torch
from tree import RTree
from RAE import RAE
import os as os
import numpy as np
from logger import Logger
from sklearn.model_selection import train_test_split

### 读取数据并进行拆分，打包为 npz

In [30]:
# Root folder of the aclImdb corpus; the loaders below read its train/ and test/ sub-folders.
datapath = r'../datasets/aclImdb'
# The packed .npz splits are written next to the raw corpus.
save_dir = datapath

def get_data(datapath):
    """Read every review stored under ``datapath/pos`` and ``datapath/neg``.

    Args:
        datapath: Directory containing a ``pos`` and a ``neg`` sub-directory,
            each holding one UTF-8 encoded text file per review.

    Returns:
        Tuple ``(X_orig, Y_orig)`` of NumPy arrays: the review texts
        (all positives first, then all negatives) and the matching labels
        (1 for a positive review, 0 for a negative one).
    """
    def read_reviews(label_dir):
        # Each class is read on its own so that no file is dropped when the
        # two folders hold a different number of reviews. Sorting makes the
        # result independent of os.listdir's arbitrary ordering.
        folder = os.path.join(datapath, label_dir)
        names = sorted(os.listdir(folder))
        print(len(names))
        texts = []
        for name in names:
            with open(os.path.join(folder, name), encoding='utf-8') as f:
                texts.append(f.read())
        return texts

    pos_all = read_reviews('pos')
    neg_all = read_reviews('neg')

    X_orig = np.array(pos_all + neg_all)
    Y_orig = np.array([1] * len(pos_all) + [0] * len(neg_all))
    print("X_orig:", X_orig.shape)
    print("Y_orig:", Y_orig.shape)

    return X_orig, Y_orig

def generate_train_data(seed=1):
    """Pool the IMDB train/test folders, re-split them and save the splits.

    The reviews under ``datapath/train`` and ``datapath/test`` are merged,
    shuffled and divided into test (20%), validation (8%) and train (72%)
    subsets, which are written to ``save_dir`` as ``imdb_test.npz``,
    ``imdb_val.npz`` and ``imdb_train.npz`` (arrays stored under the keys
    ``x`` and ``y``).

    Args:
        seed: Seed for the shuffle and both splits, so repeated runs write
            identical files.
    """
    X_orig, Y_orig = get_data(datapath + r'/train')
    X_extra, Y_extra = get_data(datapath + r'/test')
    X = np.concatenate([X_orig, X_extra])
    Y = np.concatenate([Y_orig, Y_extra])

    # A dedicated generator makes the run reproducible without touching the
    # global NumPy state. (Assigning to `np.random.seed` would replace the
    # seeding function with an int and seed nothing.)
    rng = np.random.RandomState(seed)
    random_indexs = rng.permutation(len(X))
    X = X[random_indexs]
    Y = Y[random_indexs]

    # 20% of everything is held out for testing, then 10% of the remainder
    # becomes the validation set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=rng)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=rng)

    print("X_train:", X_train.shape)
    print("y_train:", y_train.shape)
    print("X_test:", X_test.shape)
    print("y_test:", y_test.shape)
    print("x_val:", X_val.shape)
    print("y_val:", y_val.shape)

    np.savez(save_dir + '/imdb_train', x=X_train, y=y_train)
    np.savez(save_dir + '/imdb_test', x=X_test, y=y_test)
    np.savez(save_dir + '/imdb_val', x=X_val, y=y_val)

# Build the splits and write imdb_train / imdb_test / imdb_val .npz files into save_dir.
generate_train_data()

12500
12500
X_orig: (25000,)
Y_orig: (25000,)
12500
12500
X_orig: (25000,)
Y_orig: (25000,)
X_train: (36000,)
y_train: (36000,)
X_test: (10000,)
y_test: (10000,)
x_val: (4000,)
y_val: (4000,)


### 数据的截断和填充 

In [16]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext import transforms as T
from torch.utils.data import TensorDataset
    
def build_dataset(reviews, labels, vocab, max_len=512):
    """Encode token sequences and labels into a fixed-length ``TensorDataset``.

    Every review is mapped to vocabulary indices, cut to at most ``max_len``
    entries and padded with the ``<pad>`` index up to ``max_len``.

    Args:
        reviews: Token sequences, one per review.
        labels: One label per review; converted with ``torch.tensor``.
        vocab: Vocabulary used for the index lookup; must contain ``<pad>``.
        max_len: Length every encoded review is truncated / padded to.

    Returns:
        ``TensorDataset`` pairing the encoded reviews with their labels.
    """
    pad_index = vocab['<pad>']
    pipeline = [
        T.VocabTransform(vocab=vocab),
        T.Truncate(max_seq_len=max_len),
        T.ToTensor(padding_value=pad_index),
        T.PadTransform(max_length=max_len, pad_value=pad_index),
    ]
    encoded = T.Sequential(*pipeline)(reviews)
    return TensorDataset(encoded, torch.tensor(labels))

def load_imdb(data_dir='../datasets/aclImdb'):
    """Load the packed IMDB splits and turn them into tensor datasets.

    Reads ``imdb_train.npz``, ``imdb_val.npz`` and ``imdb_test.npz`` from
    ``data_dir``, tokenises the reviews with torchtext's ``basic_english``
    tokenizer, builds the vocabulary from the training reviews only and
    encodes every split with ``build_dataset``.

    Args:
        data_dir: Directory holding the three ``.npz`` files (arrays stored
            under the keys ``x`` and ``y``).

    Returns:
        Tuple ``(train_data, val_data, test_data, vocab)``.
    """
    def read_split(file_name):
        # The context manager closes the archive once the arrays are copied out.
        with np.load(data_dir + '/' + file_name) as split:
            return split['x'].tolist(), split['y'].tolist()

    tokenizer = get_tokenizer('basic_english')

    def tokenize(texts):
        return [tokenizer(s) for s in texts]

    # Every split comes from its own file; reading the training archive for
    # validation/test as well would make the evaluation meaningless.
    texts_train, labels_train = read_split('imdb_train.npz')
    texts_val, labels_val = read_split('imdb_val.npz')
    texts_test, labels_test = read_split('imdb_test.npz')

    reviews_train = tokenize(texts_train)
    reviews_val = tokenize(texts_val)
    reviews_test = tokenize(texts_test)

    # The vocabulary is built from the training reviews only; tokens seen
    # fewer than 3 times, or not at all, fall back to the <unk> index.
    vocab = build_vocab_from_iterator(reviews_train, min_freq=3, specials=['<pad>', '<unk>', '<cls>', '<sep>'])
    vocab.set_default_index(vocab['<unk>'])

    train_data = build_dataset(reviews_train, labels_train, vocab)
    val_data = build_dataset(reviews_val, labels_val, vocab)
    test_data = build_dataset(reviews_test, labels_test, vocab)
    return train_data, val_data, test_data, vocab

# Build the train / validation / test datasets and the vocabulary used to encode them.
train_data, val_data,test_data, vocab=load_imdb()


### 训练

In [None]:
from torch.utils.data import DataLoader

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training batches are reshuffled every epoch; validation order does not matter.
train_dl = DataLoader(dataset=train_data, batch_size=200, shuffle=True)
val_dl = DataLoader(dataset=val_data, batch_size=200)

model = RAE(device=device,
            vocab_size=len(vocab),
            K=2).to(device)
params = model.parameters()
lossFunc2 = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=params, lr=1e-4)

# Sentinel reported until the first validation pass has run (every 50th epoch).
val_loss = -1
for epoch in range(1, 200):
    model.train()
    for step, (x, y) in enumerate(train_dl):
        x = x.to(device)
        y = y.to(device)
        # The model returns its predictions together with its own loss term,
        # which is added to the classification loss.
        pred, loss1 = model(x)
        loss2 = lossFunc2(pred, y)
        loss = loss1 + loss2
        # NOTE(review): abs() is kept from the original; confirm that loss1
        # can legitimately be negative, otherwise this call is unnecessary.
        loss = torch.abs(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(str(100*step/len(train_dl)), '%', '  train_loss:', loss.item(), end='\r')
    if epoch % 50 == 0:
        model.eval()
        batch_losses = []
        with torch.no_grad():
            # Iterate the DataLoader (batches), not the raw dataset, and
            # score each batch against its own labels.
            for vx, vy in val_dl:
                vx = vx.to(device)
                vy = vy.to(device)
                vpred, vloss1 = model(vx)
                vloss2 = lossFunc2(vpred, vy)
                vloss = torch.abs(vloss1 + vloss2)
                batch_losses.append(vloss.item())
        if batch_losses:
            # Mean over all validation batches rather than the last one only.
            val_loss = sum(batch_losses) / len(batch_losses)
    print('EPOCH ', str(epoch), ',  train_loss:', loss.item(), ',  val_loss:', val_loss)
    
    

In [20]:
a = torch.tensor([1, 2, 3, 4, 5])
# Move to the GPU only when one exists; an unconditional .cuda() raises on
# CPU-only machines.
b = torch.tensor([0, 2, 0, 4, 5]).to('cuda' if torch.cuda.is_available() else 'cpu')
b.tolist()

[0, 2, 0, 4, 5]