# Text Classification by RNN

In [1]:
# library
import random
import string

import gensim
import numpy as np
import pandas as pd
# import nltk
# import re
import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision
# import matplotlib.pyplot as plt
# %matplotlib inline

In [2]:
# Load the pre-trained GoogleNews word2vec vectors from the gzip-compressed
# binary file in the working directory (presumably 300-dimensional, per the
# file name and VEC_LEN below — verify against the downloaded model).
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

## Data Loading

In [3]:
# Load the training set; the code below reads the 'Phrase' (text) and
# 'Sentiment' (integer label) columns.
data = pd.read_table('train.tsv', sep='\t')
data_y = np.array(data["Sentiment"])
N = len(data_y)

# Tokens discarded after whitespace splitting. Every character listed here is
# also in string.punctuation and is therefore already stripped below, so this
# filter only acts as a safety net.
stopwords = ('-',',','.',';',':',"'",'?')
texts = data['Phrase'].tolist()

# Translation table that deletes all ASCII punctuation. It is loop-invariant,
# so build it once rather than once per phrase.
remove = str.maketrans('', '', string.punctuation)

# words_ls[k] is the token list of texts[k]; it may be empty when a phrase
# consists solely of punctuation.
words_ls = []
for i, text in enumerate(texts, start=1):
    if i % 100 == 0:
        # Progress is reported against the real row count N instead of a
        # hard-coded dataset size.
        print("当前已完成%.2f" % (i*100/N)+'%',end='\r')
    words = [word for word in text.translate(remove).split() if word not in stopwords]
    words_ls.append(words)

# Length (in tokens) of the longest phrase; 0 when there are no phrases.
word_maxlen = max((len(words) for words in words_ls), default=0)

## RNN Architecture

In [4]:
# Hyper-parameters
# NOTE: train_rnn() declares its own keyword defaults, so EPOCH, BATCH_SIZE and
# LR only take effect if they are passed to it explicitly.

# Word-vector geometry
VEC_LEN = 300                # dimensionality of one word vector
INPUT_SIZE = VEC_LEN         # features fed to the LSTM at each time step
TIME_STEP = word_maxlen      # number of time steps = longest tokenized phrase

# Optimisation settings
EPOCH = 2
BATCH_SIZE = 200
LR = 0.01

In [36]:
class RNN(nn.Module):
    """Single-layer LSTM followed by a linear classification head.

    Args:
        input_size: Number of features per time step. When None (the default)
            the notebook-level ``INPUT_SIZE`` constant is used, so ``RNN()``
            builds the same network as before.
        hidden_size: Width of the LSTM hidden state. Defaults to 64.
        num_classes: Number of output logits. Defaults to 5.
    """

    def __init__(self, input_size=None, hidden_size=64, num_classes=5):
        super().__init__()

        # Resolve the global lazily so the class can be defined (and reused
        # with explicit sizes) even where INPUT_SIZE does not exist.
        if input_size is None:
            input_size = INPUT_SIZE

        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,   # tensors are laid out (batch, time_step, features)
        )

        # Maps the LSTM output at the last time step to the class logits.
        self.out = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: Float tensor of shape (batch, time_step, input_size).
        """
        # Omitting the initial state makes the LSTM start from all-zero
        # hidden and cell states.
        # r_out: (batch, time_step, hidden_size); the final (h_n, c_n) states
        # are not needed here.
        r_out, _ = self.rnn(x)

        # Classify from the output at the last time step. For this
        # single-layer, unidirectional LSTM it equals the final hidden state.
        return self.out(r_out[:, -1, :])

# Build a network with the default sizes and print its layer summary as a
# quick sanity check of the architecture.
rnn = RNN()
print(rnn)

RNN(
  (rnn): LSTM(300, 64, batch_first=True)
  (out): Linear(in_features=64, out_features=5, bias=True)
)


In [46]:
# Scratch check: a [1, 2, 3] shape argument yields a 3-D array of shape
# (1, 2, 3), the same layout wv_to_tensor_ls allocates for one phrase.
np.zeros([1,2,3])

array([[[0., 0., 0.],
        [0., 0., 0.]]])

## WV to Tensor

In [65]:
def wv_to_tensor_ls(inds,v_length=VEC_LEN,WV=word2vec):
    """Convert tokenized phrases into a list of word-vector tensors.

    Args:
        inds: Iterable of integer positions into the module-level ``words_ls``.
        v_length: Dimensionality of each word vector.
        WV: Word-vector lookup indexed by a word or by a list of words; it is
            expected to raise ``KeyError`` for unknown words.

    Returns:
        A list with one ``torch.float32`` tensor per index, each of shape
        ``(1, n_words, v_length)``. Phrases with no tokens yield a
        ``(1, 1, v_length)`` tensor of small random noise.
    """
    out = []
    for ind in inds:
        # int() accepts plain ints, numpy integers and 0-dim torch tensors.
        words = words_ls[int(ind)]
        n = len(words)

        if n > 0:
            wv = np.zeros([1, n, v_length])
            try:
                # Fast path: look the whole phrase up in one call.
                wv[0, :, :] = WV[words]
            except KeyError:
                # At least one word is out of vocabulary: fall back to
                # word-by-word lookup, using random noise for unknown words.
                for h, word in enumerate(words):
                    try:
                        wv[0, h, :] = WV[word]
                    except KeyError:
                        wv[0, h, :] = np.random.randn(v_length) / 10
        else:
            # Empty phrase: emit one random pseudo-word so the LSTM still gets
            # a time step. Sized by v_length (not the global VEC_LEN) so the
            # width matches the non-empty branch for any v_length.
            # NOTE(review): the noise scale here (/100) differs from the
            # out-of-vocabulary scale above (/10) — confirm this is intended.
            wv = np.random.randn(1, 1, v_length) / 100

        # numpy produces float64; the model weights are float32.
        out.append(torch.from_numpy(wv).to(torch.float32))

    return out

## Train & Test

In [103]:
def _forward_each(model, tensors):
    """Run ``model`` on each tensor in turn and stack the outputs along dim 0."""
    # Phrases have different lengths, so they are fed one at a time; a single
    # concatenation avoids re-copying the accumulated result on every step.
    return torch.cat([model(x) for x in tensors], dim=0)


def _test_accuracy(model, test_x, test_y):
    """Return the fraction of ``test_x`` whose arg-max class equals ``test_y``."""
    # No gradients are needed for evaluation; skipping autograd avoids building
    # a graph across the entire test set.
    with torch.no_grad():
        pred_y = _forward_each(model, test_x).argmax(dim=1)
    return (pred_y == test_y).sum().item() / float(test_y.size(0))


def train_rnn(EPOCH = 2 ,BATCH_SIZE = 100,LR = 0.01,wv=word2vec,eval_every=200):
    """Train a fresh ``RNN`` on a random 80/20 split of the loaded phrases.

    Args:
        EPOCH: Number of passes over the training split.
        BATCH_SIZE: Number of phrases per optimisation step.
        LR: Learning rate for the Adam optimiser.
        wv: Word-vector lookup forwarded to ``wv_to_tensor_ls``.
        eval_every: Evaluate on the held-out split and print a progress line
            every this many steps within an epoch.

    Returns:
        ``(rnn, accuracy)``: the trained model and its most recent accuracy on
        the held-out split. If no periodic evaluation happened (fewer than
        ``eval_every`` steps per epoch), the accuracy is computed once after
        training.

    Raises:
        ValueError: If ``eval_every`` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be a positive integer")

    rnn = RNN()
    optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all parameters
    loss_func = nn.CrossEntropyLoss()   # targets are class indices, not one-hot

    # Random 80/20 split over phrase indices. The builtin int replaces np.int,
    # which no longer exists in current NumPy releases.
    train_inds = random.sample(range(N), int(N * 0.8))
    test_inds = sorted(set(range(N)) - set(train_inds))

    # Column 0: phrase index into words_ls; column 1: sentiment label.
    mat = np.column_stack((np.arange(N), data_y))
    train_loader = Data.DataLoader(dataset=mat[train_inds,:], batch_size=BATCH_SIZE, shuffle=True)
    test_x = wv_to_tensor_ls(inds=mat[test_inds,0],WV=wv)   # list of tensors
    test_y = torch.from_numpy(mat[test_inds,1])

    accuracy = None
    for epoch in range(EPOCH):
        for step, batch_data in enumerate(train_loader):
            b_x_ind = batch_data[:,0]   # phrase indices of this batch
            b_y = batch_data[:,1]       # labels of this batch
            b_x = wv_to_tensor_ls(inds=b_x_ind,WV=wv)

            output = _forward_each(rnn, b_x)

            loss = loss_func(output, b_y)   # cross entropy loss
            optimizer.zero_grad()           # clear gradients for this training step
            loss.backward()                 # backpropagation, compute gradients
            optimizer.step()                # apply gradients

            if (step + 1) % eval_every == 0:
                accuracy = _test_accuracy(rnn, test_x, test_y)
                print('Epoch: ',epoch+1, '| Step: ', step+1, '| train loss: %.4f' % loss.item(), '| test accuracy: %.2f' % accuracy)

    # Short runs may never reach an evaluation step; make sure a real accuracy
    # is returned instead of failing on an unbound name.
    if accuracy is None:
        accuracy = _test_accuracy(rnn, test_x, test_y)
    return (rnn,accuracy)

In [98]:
# Smoke test: push one random 10-step "phrase" through the network twice and
# stack the two (1, 5) logit rows into a (2, 5) tensor, mirroring how
# train_rnn assembles a batch.
x1 = torch.from_numpy(np.random.randn(1, 10, 300)).to(torch.float32)
output = torch.cat([rnn(x1), rnn(x1)], dim=0)

In [99]:
# Scratch inspection of the stacked logits from the previous cell.
output
# Index of the largest logit per row, detached from autograd and squeezed.
torch.max(output, 1)[1].data.squeeze()
# Same indices without .data/.squeeze(); the captured output shows an identical result.
torch.max(output, 1)[1]

tensor([[-0.5919,  0.6655,  0.2829, -0.0249, -0.2737],
        [-0.5919,  0.6655,  0.2829, -0.0249, -0.2737]], grad_fn=<CatBackward>)

tensor([1, 1])

tensor([1, 1])

In [104]:
# Train with train_rnn's own defaults (EPOCH=2, BATCH_SIZE=100, LR=0.01); this
# rebinds the module-level `rnn` to the trained model.
rnn, acc=train_rnn()

Epoch:  1 | Step:  200 | train loss: 0.9738 | test accuracy: 0.62
Epoch:  1 | Step:  400 | train loss: 1.0327 | test accuracy: 0.63
Epoch:  1 | Step:  600 | train loss: 0.9101 | test accuracy: 0.63
Epoch:  1 | Step:  800 | train loss: 0.9056 | test accuracy: 0.64
Epoch:  1 | Step:  1000 | train loss: 0.8129 | test accuracy: 0.64
Epoch:  1 | Step:  1200 | train loss: 0.8013 | test accuracy: 0.64
Epoch:  2 | Step:  200 | train loss: 0.8466 | test accuracy: 0.64
Epoch:  2 | Step:  400 | train loss: 0.7765 | test accuracy: 0.65
Epoch:  2 | Step:  600 | train loss: 0.8576 | test accuracy: 0.65
Epoch:  2 | Step:  800 | train loss: 0.8357 | test accuracy: 0.65
Epoch:  2 | Step:  1000 | train loss: 0.8024 | test accuracy: 0.65
Epoch:  2 | Step:  1200 | train loss: 0.7829 | test accuracy: 0.66


In [106]:
# Held-out accuracy returned by train_rnn, rounded to four decimals.
round(acc,4)

0.6556

In [107]:
# Persist the whole module object (not just its weights).
# NOTE(review): pickling the full module ties the file to this RNN class
# definition (see the serialization warning in the output below); consider
# saving rnn.state_dict() instead if the file must outlive this notebook.
torch.save(rnn, 'rnn-g300.pkl')  # save entire net

  "type " + obj.__name__ + ". It won't be checked "
