## torchtext

In [1]:
import pandas as pd
import jieba
import os
from torchtext.data import *
from torchtext.vocab import *

In [2]:
stopwords = open("/Users/finup/Project/doc_cat/data/stop/stopword.txt", 'r').readlines()
stopwords = [i.replace('\n', '') for i in stopwords]
stopwords.append('LOTOzf')

### 将文本拼成一个文件

In [65]:
def concat_files(path):
    walk = list(os.walk(path))
    result = []
    cat_dict = {'文学出版':0, '校园':1, '女性':2, '体育':3}
    for path, _, files in walk[1:]:
        cat = cat_dict[path.split('/')[-1]]
        for f in files:
            try:
                with open(os.path.join(path, f), 'r', encoding='gbk') as of:
                    doc = of.read()
                    doc = (str(doc)).replace("\r\n", "").replace(" ", "").strip()
                    result.append([int(f.split('.')[0]), doc, cat])
            except UnicodeDecodeError:
                print(path, f)
    return result

train = concat_files('data/data')
test = concat_files('data/test1')

data/data/体育 300.txt


In [66]:
train_df = pd.DataFrame(train, columns=['file_idx', 'content', 'label'])
test_df = pd.DataFrame(test, columns=['file_idx', 'content', 'label'])

In [67]:
train_df.head()

Unnamed: 0,file_idx,content,label
0,289,转发微博贺超：今晚阅读李洱作品《白色的乌鸦》新星出版社20点品味书香文艺之声1066原文转发...,0
1,504,光明荐书《夜莺与玫瑰》是王尔德所著的童话作品经典选集，共收录了他的《夜莺与玫瑰》《幸福王子》...,0
2,262,《悟道》一书的编辑大人发来消息.1.卓越上《悟道》一书的在线阅读，已上线，请关注。2.当当看...,0
3,276,《人生的责任（一）》?对凡夫而言，死亡是一场灾难，因为它切断了生命和光明。但对凡夫而言，死亡...,0
4,510,《气候变化研究进展》计划于2012年出版“气候变化与第三极环境”专栏，主要内容为气候变化背景...,0


In [119]:
test_df

Unnamed: 0,file_idx,content,label
0,771,2011好书推荐《怪诞行为学》$LOTOzf$,0
1,770,陈思进撰写的《金融让谁富有》一书去年由中信出版社出版，据说该书揭开了华尔街金融机器掠夺全球财...,0
2,772,请《结网》的读者支持一下《结网》，非常感谢！$LOTOzf$刘江:2011年度原创技术图书评...,0
3,767,悬疑小说《凡·高密码》连载──凡?高的举世名画《向日葵》里埋藏着怎样的秘密？一个疯子的传奇人...,0
4,773,《F1速报》2011年度总集编需要您的参与，请各位读者和车迷们积极投票，并说出理由，我们会选...,0
5,788,2011好书推荐《南方传媒研究》《品牌策划营销与管理》《科学的广告》《品牌背后的故事》$LO...,0
6,777,在读《设计中的设计|全本》★★★★★“学习下呢”http:url.cn/0eF8rs$LOT...,0
7,776,在读《青春》★★★★“其实就是博客内容，但是我还是支持。哎，想看看台湾的完整未删减版呢”ht...,0
8,789,《苏世睿语》是一部好的人生哲理书。全书506页，50万字，共分8个部分：1.人生篇、2.生活...,0
9,774,《倒错的死角》的确是本不错的作品，自己读起来也是希望一口气看完的（非推理小说的确没有这样的待...,0


### 定义Field

重要的参数：
1. `sequential`：是否是可序列化数据（类似于字符串数据），默认值是 `True`；
2. `user_vocab`：是否使用 `Vocab` 对象，如果取 `False`，则该字段必须是数值类型；默认值是`True`；
3. `tokenize`：是一个 `function` 类型的对象（如 `string.cut` 、`jieba.cut` 等），用于对字符串进行分词；
4. `batch_first`：如果该属性的值取 `True`，则该字段返回的 `Tensor` 对象的第一维度是 `batch` 的大小；默认值是`False`；
5. `fix_length`：该字段是否是定长，如果取 `None` 则按同 `batch` 该字段的最大长度进行pad；
6. `stop_words`：停用词，在`tokenize`步会忽视;

重要函数：

`build_vocab`：为该`Field`创建`Vocab`；

In [69]:
CONTENT = Field(sequential=True, tokenize=jieba.cut, batch_first=True, fix_length=50, stop_words=stopwords)
LABEL = Field(sequential=False, use_vocab=False)

### Dataset

重要参数：
1. `examples`：`Example`对象列表;
2. `fields`：格式是`List(tuple(str, Field))`，其中 `str` 是 `Field` 对象的描述；

重要函数：
1. `split()`：此方法用于划分数据集，将数据集划分为train、test、valid数据集；
2. `split_ratio`：此参数为 `float` 或 `list` 类型，当参数为`float`类型时（参数取值要在[0, 1]），表示数据集中多少比例的数据划分到`train`（训练）数据集里，剩余的划分到`valid`（验证）数据集里；如果该参数是`list`类型（如[0.7, 0.2, 0.1]），表示`train`，`test`、`valid`数据集的比例；该参数默认值是 0.7

In [70]:
# get_dataset构造并返回Dataset所需的examples和fields
def get_dataset(csv_data, text_field, label_field, test=False):
    # id数据对训练在训练过程中没用，使用None指定其对应的field
    fields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("content", text_field), ("label", label_field)]       
    examples = []

    if test:
        # 如果为测试集，则不加载label
        for text in tqdm(csv_data['content']):
            examples.append(Example.fromlist([None, text, None], fields))
    else:
        for text, label in tqdm(zip(csv_data['content'], csv_data['label'])):
            examples.append(Example.fromlist([None, text, label], fields))
    return examples, fields

In [99]:
# 得到构建Dataset所需的examples和fields
train_examples, train_fields = get_dataset(train_df, CONTENT, LABEL)
test_examples, test_fields = get_dataset(test_df, CONTENT, LABEL)

3305it [00:02, 1584.64it/s]
200it [00:00, 1766.32it/s]


In [100]:
# 构建Dataset数据集
train_dataset = Dataset(train_examples, train_fields)
# train_dataset, valid_dataset = train_dataset.split(0.7)
test_dataset = Dataset(test_examples, test_fields)

### 构建迭代器

根据`batch_size` 生成 `Dataset` 的 `Iteratior`。常用的有 `Iterator` 和 `BucketIterator` 。其中 `BucketIterator` 是 `Iterator` 的子类，与 `Iterator` 相比，`BucketIterator` 会把相同或相近长度的数据（按 `sort_key`）属性进行排序，这样可以最小化 `pad`。

重要参数：
1. `dataset`：需要生成`Iterator`的数据集；
2. `batch_size`：每个 `batch`的大小；
3. `sort_key`：用来为每个 `Example` 进行排序的字段，默认是`None`；
4. `shuffle`：每次 `epoch` 是否进行 `shuffle`；

重要函数：
1. `splits()`：为数据集生成`Iterator`；
2. `datasets`：`Tuple` 类型的 `Dataset`，`Tuple`的第一个元素应该是`train`数据集的`Dataset`；
3. `batch_sizes`：`Tuple`类型，和`datasets`中的`Dataset`一一对应，表示各个数据集生成`batch`的大小；

In [101]:
# 同时对训练集和验证集进行迭代器的构建
train_iter, val_iter = BucketIterator.splits(
        (train_dataset, test_dataset), # 构建数据集所需的数据集
        batch_sizes=(8, 8),
        device=-1, # 如果使用gpu，此处将-1更换为GPU的编号
        sort_key=lambda x: len(x.content), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


### 构建词表

In [102]:
CONTENT.build_vocab(train_dataset)

### 查看构建的样本

In [117]:
e = list(val_iter)[20]

In [128]:
test_dataset.examples[5].content

['2011', '好书', '南方', '传媒', '营销', '广告', '背后', '故事']

In [105]:
e.content.shape

torch.Size([8, 50])

## BILSTM+ATTENTION

In [106]:
import joblib
from torch import nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from tensorboardX import SummaryWriter
import time

In [107]:
writer = SummaryWriter('log')

### 配置

In [141]:
class TRNNConfig(object):
    """RNN配置参数"""

    # 模型参数
    char_embedding_size = 100      # 词向量维度
    num_classes = 4        # 类别数
    vocab_size = 10000       # 词汇表达小

    rnn_layers= 1           # 隐藏层层数
    hidden_dims = 64        # 隐藏层神经元
    rnn = 'lstm'             # lstm 或 gru

    keep_dropout = 0.8 # dropout保留比例
    learning_rate = 1e-3    # 学习率

    batch_size = 8         # 每批训练大小
    num_epochs = 5          # 总迭代轮次

    l2_reg_lambda = 1       # l2正则
    
    print_per_batch = 10    # 每多少轮输出一次结果
    save_per_batch = 10      # 每多少轮存入tensorboard

### 网络结构定义

In [142]:
class TextBILSTM(nn.Module):
    
    def __init__(self,
                 config:TRNNConfig,
                 char_size=200):
        super(TextBILSTM, self).__init__()
        self.num_classes = config.num_classes
        self.learning_rate = config.learning_rate
        self.keep_dropout = config.keep_dropout
        self.char_embedding_size = config.char_embedding_size
        self.l2_reg_lambda = config.l2_reg_lambda
        self.hidden_dims = config.hidden_dims
        self.char_size = char_size
        self.rnn_layers = config.rnn_layers
        self.build_model()


    def build_model(self):
        # 初始化字向量
        self.char_embeddings = nn.Embedding(self.char_size, self.char_embedding_size)
        # 字向量参与更新
        self.char_embeddings.weight.requires_grad = True
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # 双层lstm
        self.lstm_net = nn.LSTM(self.char_embedding_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.keep_dropout,
                                bidirectional=True)
        # FC层
        # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes)
        self.fc_out = nn.Sequential(
#             nn.Dropout(self.keep_dropout),
#             nn.Linear(self.hidden_dims, self.hidden_dims),
#             nn.ReLU(inplace=True),
#             nn.Dropout(self.keep_dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] [8, 
        :return: [batch_size, n_hidden]
        '''
        # chunk的方法做的是对张量进行分块，返回一个张量列表。但如果指定轴的元素个数被chunks除不尽，最后一块的元素个数会少。
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        
        # h [batch_size, time_step, hidden_dims] [8, 200, 32]
        # 将双向LSTM的激活值相加
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        
        # 将最后一层隐藏层两个方向权重加起来
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        
        # [batch_size, 1, n_hidden] [8, 1, 32] 在下标1的位置增加1维
        lstm_hidden = lstm_hidden.unsqueeze(1)
        
        '''
        self.attention_layer 层会根据lstm层的输出产生一个用于计算attention的weights，
        而我希望self.attention_layer产生用于计算attention的weights的过程也进行参数学习，
        所以定义了个anttention_layer。
        以下几行为自定义attention层，将最后一层隐藏层作为attention权重，输出softmax_w
        '''
        # atten_w [batch_size, 1, hidden_dims] [8, 1, 32] attention层
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims] [8, 200, 32] Tanh激活
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step] [8, 1, 200] 每个词对应一个weight
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step] [8, 1, 200] softmax归一化
        softmax_w = F.softmax(atten_context, dim=-1)
        
        # context [batch_size, 1, hidden_dims] [8, 1, 32] 所有词加权到lstm的output上
        context = torch.bmm(softmax_w, h)
        # 减一维
        result = context.squeeze(1)
        return result
    
    
    def attention_net(self, lstm_out, lstm_hidden):
        
        # chunk的方法做的是对张量进行分块，返回一个张量列表。但如果指定轴的元素个数被chunks除不尽，最后一块的元素个数会少。
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        
        # h [batch_size, time_step, hidden_dims] 
        # 将双向LSTM的激活值相加
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        
        # 将最后一层隐藏层两个方向权重加起来
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        
        # [batch_size, 1, n_hidden]  在下标1的位置增加1维
        lstm_hidden = lstm_hidden.unsqueeze(1)
        
        # atten_score [batch_size, 1, time_step]  attention score层
        atten_score = torch.bmm(lstm_hidden, h.transpose(1, 2))
        
        # atten_weight [batch_size, 1, time_step] softmax归一化（attention distribution层）
        atten_weight = F.softmax(atten_score, dim=-1)
        
        # [batch_size, 1, n_hidden] 加权输出值
        context = torch.bmm(atten_weight, h)
        result = context.squeeze(1)
        return result
    

    def forward(self, char_id):
        '''
        :param char_id: torch.from_numpy(np.array(input[0])).long()
        词id的矩阵，本例中大小为 [batch_size=8, len_seq=200]
        '''
        
        # sen_char_input: [batch_size, len_seq, embedding_dim], torch.Size([8, 200, 100])
        # char_embeddings 后每个词都嵌入到一个100维的矩阵中，输出变为3维张量
        sen_char_input = self.char_embeddings(char_id)
        
        # input : [len_seq, batch_size, embedding_dim] 
        # 将张量1维2维位置调换，permute传入每个维度期望的index
        sen_input = sen_char_input.permute(1, 0, 2)
        
        # output : [batch_size, len_seq, n_hidden * 2] 双向lstm的输出
        output, (step_hidden_state, step_cell_state) = self.lstm_net(sen_input)
        output = output.permute(1, 0, 2)
        
        # step_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 每个时间步的隐藏状态值
        step_hidden_state = step_hidden_state.permute(1, 0, 2)
        
        # 带权重的attention层输出 [batch_size, n_hidden]
        atten_out = self.attention_net(output, step_hidden_state)
        return F.softmax(self.fc_out(atten_out))


### 训练测试

In [143]:

def train_func(sub_train_):
    running_loss = 0
    running_corrects = 0
    for epoch, batch in enumerate(sub_train_):
        optimizer.zero_grad()
        # text = batch.text.permute(1, 0)
        predicted = model(batch.content)

        loss = loss_funtion(predicted, batch.label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch.batch_size
        running_corrects += torch.sum(predicted.argmax(1) == batch.label).item()
        writer.add_scalar('Train/Loss', running_loss,epoch)
        writer.add_scalar('Train/Acc',running_corrects,epoch)

    train_size = sub_train_.batch_size * sub_train_.iterations
    epoch_loss = running_loss / train_size
    epoch_acc = running_corrects / train_size
    return epoch_loss, epoch_acc

def test_func(sub_test_):
    running_loss = 0
    running_corrects = 0
    for epoch, batch in enumerate(sub_test_):
        with torch.no_grad():
            # text = batch.text.permute(1, 0)
            predicted = model(batch.content)
            loss = loss_funtion(predicted, batch.label)

            running_loss += loss.item() * batch.batch_size
            running_corrects += torch.sum(predicted.argmax(1) == batch.label).item()
            writer.add_scalar('Test/Loss', running_loss, epoch)
            writer.add_scalar('Test/Acc',running_corrects,epoch)

    train_size = sub_test_.batch_size * sub_test_.iterations
    epoch_loss = running_loss / train_size
    epoch_acc = running_corrects / train_size
    return epoch_loss, epoch_acc

### 走起

In [144]:
model = TextBILSTM(TRNNConfig, char_size=len(CONTENT.vocab))
model.train()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=TRNNConfig().learning_rate)
loss_funtion = F.cross_entropy

for epoch in range(TRNNConfig().num_epochs):
    start_time = time.time()
    train_loss, train_acc = train_func(train_iter)
    valid_loss, valid_acc = test_func(val_iter)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

  "num_layers={}".format(dropout, num_layers))


Epoch: 1  | time in 0 minutes, 47 seconds
	Loss: 1.2363(train)	|	Acc: 49.8%(train)
	Loss: 1.0654(valid)	|	Acc: 70.5%(valid)
Epoch: 2  | time in 0 minutes, 49 seconds
	Loss: 0.9987(train)	|	Acc: 74.4%(train)
	Loss: 0.9925(valid)	|	Acc: 74.0%(valid)
Epoch: 3  | time in 0 minutes, 48 seconds
	Loss: 0.8978(train)	|	Acc: 84.4%(train)
	Loss: 0.9435(valid)	|	Acc: 80.0%(valid)
Epoch: 4  | time in 0 minutes, 48 seconds
	Loss: 0.8553(train)	|	Acc: 88.3%(train)
	Loss: 0.9059(valid)	|	Acc: 84.5%(valid)
Epoch: 5  | time in 0 minutes, 47 seconds
	Loss: 0.8406(train)	|	Acc: 89.9%(train)
	Loss: 0.9185(valid)	|	Acc: 83.0%(valid)


In [129]:
running_loss = 0
running_corrects = 0
for epoch, batch in enumerate(val_iter):
    with torch.no_grad():
        # text = batch.text.permute(1, 0)
        predicted = model(batch.content)
        loss = loss_funtion(predicted, batch.label)

        running_loss += loss.item() * batch.batch_size
        running_corrects += torch.sum(predicted.argmax(1) == batch.label).item()
        writer.add_scalar('Test/Loss', running_loss, epoch)
        writer.add_scalar('Test/Acc',running_corrects,epoch)

train_size = train_iter.batch_size * train_iter.iterations
epoch_loss = running_loss / train_size
epoch_acc = running_corrects / train_size



In [132]:
running_corrects

150