In [1]:
import torchtext

In [2]:
import os
from collections import Counter
from torch import nn
from torchtext import vocab
from torch import optim
from torch.utils import data
import time
import tarfile

In [3]:
import torch
os.environ["CUDA_VISIBLE_DEVICES"]="0"
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
os.listdir("./data/aclImdb")

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [5]:
ROOT_DIR="./data"

In [6]:
file_path=os.path.join(ROOT_DIR,"aclImdb")
file_name=os.path.join(ROOT_DIR,"aclImdb_v1.tar")
if not os.path.exists(file_path):
    with tarfile.open(file_name,"r") as f:
        f.extractall(ROOT_DIR)

In [7]:
def read_data(mode="train",path="./data/aclImdb"):
    raw_data=[]
    for label in ["pos","neg"]:
        tag=1 if label=="pos" else 0
        file_path=os.path.join(path,mode,label)
        for file in os.listdir(file_path):
            full_name=os.path.join(file_path,file)
            with open(full_name,"rb") as f:
                content=f.read().decode("utf-8").replace("\n"," ").lower().split()
                raw_data.append((content,tag))
    return raw_data

In [8]:
train_data=read_data("train")

In [9]:
test_data=read_data("test")

In [10]:
def get_vocab(dataset):
    counter=Counter([tk for st,_ in dataset for tk in st])
    all_vocab=vocab.Vocab(counter=counter,min_freq=5)
    return all_vocab

In [11]:
all_vocab=get_vocab(train_data)

In [12]:
import math
import numpy as np

# torchtext的vocab中自动添加unk,pad

In [13]:
content_lens=[len(d) for d,_ in train_data]

In [14]:
np.mean(content_lens)

233.78720000000001

In [15]:
MAX_LEN=300

In [16]:
def preprocess(data,vocab):
    def pad(x):
        return x[:MAX_LEN] if len(x)>MAX_LEN else x+[vocab.stoi["<pad>"]]*(MAX_LEN-len(x))
    features,labels=[],[]
    for f,l in data:
        features.append(pad([vocab.stoi[w] for w in f]))
        labels.append(l)
    return torch.tensor(features,dtype=torch.long),torch.tensor(labels,dtype=torch.int)

In [17]:
train_dataset=data.TensorDataset(*preprocess(train_data,all_vocab))
test_dataset=data.TensorDataset(*preprocess(test_data,all_vocab))
BATCH_SIZE=64
train_loader=data.DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
test_loader=data.DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True)

In [18]:
for s,t in train_loader:
    print(s.size(),t.size())
    break

torch.Size([64, 300]) torch.Size([64])


In [19]:
class RnnModel(nn.Module):
    def __init__(self,vocab_num,input_size,hidden_size,num_layer,bidirection=False):
        super(RnnModel,self).__init__()
        self.embedding=nn.Embedding(vocab_num,input_size)
        self.encoder=nn.LSTM(input_size=input_size,hidden_size=hidden_size,bias=True,num_layers=num_layer,bidirectional=bidirection)
        self.linear=nn.Linear(hidden_size*(4 if bidirection else 2),2)
    def forward(self,x):
        x=self.embedding(x.long()).transpose(1,0)#seq_len,batch_size,embeding_size
        outputs,_=self.encoder(x)#outputs:seq_len,batch_size,2*hidden_size
        hidden=torch.cat((outputs[0],outputs[-1]),dim=-1)#result:batch_size,4*hidden_size
        result=self.linear(hidden)
        return result

In [20]:
EMBEDDING_SIZE=100
HIDDEN_SIZE=200
NUM_LAYER=2
BIDIRECT=True
model=RnnModel(len(all_vocab.itos),input_size=EMBEDDING_SIZE,
              hidden_size=HIDDEN_SIZE,num_layer=NUM_LAYER,bidirection=BIDIRECT)

In [21]:
def get_word2vec(vocabulary):
    glove=vocab.GloVe(name="6B",dim=EMBEDDING_SIZE,cache=os.path.join(ROOT_DIR,"glove"))
    embedding_matrix=torch.zeros((len(vocabulary.itos),EMBEDDING_SIZE))
    pretrain_vector=glove.vectors
    oov=0
    for i,w in enumerate(vocabulary.stoi):
        try:
            index=glove.stoi[w]
            embedding_matrix[i,:]=pretrain_vector[index,:]
        except:
            oov+=1
    print("oov:",oov)
    return embedding_matrix        

In [22]:
model.embedding.weight.data.copy_(get_word2vec(all_vocab))
model.embedding.weight.requires_grad_(False)

oov: 365977


Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2512,  0.6499, -0.2465,  ...,  0.0659, -0.9114,  0.4129],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1752,  0.1468, -0.0800,  ...,  0.1581, -0.6230, -0.2806]])

In [23]:
lr,num_epochs=0.01,5
loss=nn.CrossEntropyLoss()
optimizer=optim.Adam(list(filter(lambda x:x.requires_grad,model.parameters())),lr=lr)

In [29]:
def train(net,train_loader,test_loader,lr,num_epoch,device):
    net=net.to(device)
    loss=nn.CrossEntropyLoss()
    optimizer=optim.Adam(list(filter(lambda x:x.requires_grad,net.parameters())),lr=lr)
    for epoch in range(num_epoch):
        loss_sum,n=0.0,0
        for X,Y in train_loader:
            Y_hat=net(X.to(device))
            l=loss(Y_hat,Y.long().to(device))
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            loss_sum+=l.cpu().item()
            n+=X.size(0)
            print(n)
        print("Epoch:%d,TrainLoss:%.2f"%(epoch+1,l_sum/n))
        loss_sum,n=0.0,0
        for X,Y in test_loader:
            with torch.no_grad():
                Y_hat=net(X.to(device))
                l=loss(Y_hat,Y.long().to(device))
                loss_sum+=l.cpu().item()
                n+=X.size(0)
        print("TestLoss:%.2f"%(l_sum/n))             

In [25]:
def predict(net,sentence,vocabulary):
    sentence_number=torch.tensor([[vocabulary.stoi[w] for k in sentence.trim().split()]])
    result=net(sentence_number).argmax(dim=-1).item()
    print("%s,predict_result:%s"%(sentence,"positive" if result==1 else "negative"))

In [None]:
train(model,train_loader,test_loader,lr,num_epochs,device)

64
128
192
256
320
384
448
512
576
640
704
768
832
896
960
1024
1088
1152
1216
1280
1344
1408
1472
1536
1600
1664
1728
1792
1856
1920
1984
2048
2112
2176
2240
2304
2368
2432
2496
2560
2624
2688
2752
2816
2880
2944
3008
3072
3136
3200
3264
3328
3392
3456
3520
3584
3648
3712
3776
3840
3904
3968
4032
4096
4160
4224
4288
4352
4416
4480
4544
4608
4672
4736
4800
4864
4928
4992
5056
5120
5184
5248
5312
5376
5440
5504
5568
5632
5696
5760
5824
5888
5952
6016
6080
6144
6208
6272
6336
6400
6464
6528
6592
6656
6720
6784
6848
6912
6976
7040
7104
7168
7232
7296
7360
7424
7488
7552
7616
7680
7744
7808
7872
7936
8000
8064
8128
8192
8256
8320
8384
8448
8512
8576
8640
8704
8768
8832
8896
8960
9024
9088
9152
9216
9280
9344
9408
9472
9536
9600
9664
9728
9792
9856
9920
9984
10048
10112
10176
10240
10304
10368
10432
10496
10560
10624
10688
10752
10816
10880
10944
11008
11072
11136
11200
11264
11328
11392
11456
11520
11584
11648
11712
11776
11840
11904
11968
12032
12096
12160
12224
12288
12352
12416
12480
12

In [None]:
class GlobalMaxPool(nn.Module):
    def __init__(self):
        super(GlobalMaxPool,self).__init__()
    def forward(self,x):
        return nn.functional.max_pool1d(x,kernel_size=x.size(-1))

In [None]:
class CNNModel(nn.Module):
    def __init__(self,vocab_size,embeding_size,kernel_sizes,channel_sizes,dropout=0.5):
        super(CNNModel,self).__init__()
        self.embeding=nn.Embedding(vocab_size,embeding_size)
        self.convs=nn.ModuleList()
        for kernel,channel in zip(kernel_sizes,channel_sizes):
            self.convs.append(nn.Conv1d(in_channels=embeding_size,out_channels=channel,kernel_size=kernel))
        self.dropout=nn.Dropout(dropout)
        self.maxpool=GlobalMaxPool()
        self.linear=nn.Linear(sum(channel_sizes),2)
    def forward(self,x):
        x=self.embeding(x).premute(0,2,1)
        conv_result=[]
        for conv in self.convs:
            conv_result.append(maxpool(nn.functional.relu(conv(x))).squeeze(-1))
        result=torch.cat(conv_result,dim=1)
        return self.linear(self.dropout(result))