In [1]:
#TextCNN


# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class TextCNN_Config(object):

    """配置参数"""
    def __init__(self, embedding):
        self.model_name = 'TextCNN'
        self.train_path ='data/train.txt'                                # 训练集
        self.dev_path = 'data/dev.txt'                                    # 验证集
        self.test_path = 'data/test.txt'                                  # 测试集
        self.class_list = [x.strip() for x in open(
             'data/class.txt', encoding='utf-8').readlines()]           # 类别名单
        self.vocab_path = 'data/vocab.pkl'                                # 词表
        self.save_path = 'data/saved_dict/' + self.model_name + '.ckpt'        # 模型训练结果
        self.log_path = 'data/log/' + self.model_name
        self.embedding_pretrained = torch.tensor(
            np.load('data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                       # 预训练词向量
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 设备

        self.dropout = 0.5                                              # 随机失活
        self.require_improvement = 1000                                 # 若超过1000batch效果还没提升，则提前结束训练
        self.num_classes = len(self.class_list)                         # 类别数
        self.n_vocab = 0                                                # 词表大小，在运行时赋值
        self.num_epochs = 20                                            # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 300                                              # 每句话处理成的长度(短填长切)
        self.learning_rate = 1e-3                                       # 学习率
        self.embed = self.embedding_pretrained.size(1)\
            if self.embedding_pretrained is not None else 300           # 字向量维度
        self.filter_sizes = (2, 3, 4)                                   # 卷积核尺寸
        self.num_filters = 256                                          # 卷积核数量(channels数)

'''Convolutional Neural Networks for Sentence Classification'''


class TextCNN_Model(nn.Module):
    def __init__(self, config):
        super(TextCNN_Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed)) for k in config.filter_sizes])
        self.dropout = nn.Dropout(config.dropout)
        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes), config.num_classes)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        out = self.embedding(x[0])
        out = out.unsqueeze(1)
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out

In [2]:
#TextRCNN

# coding: UTF-8
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class TextRCNN_Config(object):

    """配置参数"""
    def __init__(self, embedding):
        self.model_name = 'TextRCNN'
        self.train_path = "data/train.txt"                                # 训练集
        self.dev_path ='data/dev.txt'                                    # 验证集
        self.test_path ='data/test.txt'                                  # 测试集
        self.class_list = [x.strip() for x in open(
             'data/class.txt', encoding='utf-8').readlines()]              # 类别名单
        self.vocab_path = 'data/vocab.pkl'                                # 词表
        self.save_path = 'data/saved_dict/' + self.model_name + '.ckpt'        # 模型训练结果
        self.log_path =  'data/log/' + self.model_name
        self.embedding_pretrained = torch.tensor(
            np.load('data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                       # 预训练词向量
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 设备

        self.dropout = 1.0                                              # 随机失活
        self.require_improvement = 1000                                 # 若超过1000batch效果还没提升，则提前结束训练
        self.num_classes = len(self.class_list)                         # 类别数
        self.n_vocab = 0                                                # 词表大小，在运行时赋值
        self.num_epochs = 10                                            # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 300                                              # 每句话处理成的长度(短填长切)
        self.learning_rate = 1e-3                                       # 学习率
        self.embed = self.embedding_pretrained.size(1)\
            if self.embedding_pretrained is not None else 300           # 字向量维度, 若使用了预训练词向量，则维度统一
        self.hidden_size = 256                                          # lstm隐藏层
        self.num_layers = 1                                             # lstm层数


'''Recurrent Convolutional Neural Networks for Text Classification'''


class TextRCNN_Model(nn.Module):
    def __init__(self, config):
        super(TextRCNN_Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
                            bidirectional=True, batch_first=True, dropout=config.dropout)
        self.maxpool = nn.MaxPool1d(config.pad_size)
        self.fc = nn.Linear(config.hidden_size * 2 + config.embed, config.num_classes)

    def forward(self, x):
        x, _ = x
        embed = self.embedding(x)  # [batch_size, seq_len, embeding]=[64, 32, 64]
        out, _ = self.lstm(embed)
        out = torch.cat((embed, out), 2)
        out = F.relu(out)
        out = out.permute(0, 2, 1)
        out = self.maxpool(out).squeeze()
        out = self.fc(out)
        return out

In [3]:
#引入配置文件
embedding = 'embedding_data.npz'
TCNNconfig = TextCNN_Config(embedding)
TRCNNconfig = TextRCNN_Config(embedding)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样

In [4]:
#引入TextCNN模型
CNN_model=TextCNN_Model(TCNNconfig)
CNN_model.load_state_dict(torch.load('data/saved_dict/TextCNN.ckpt')) 

<All keys matched successfully>

In [5]:
#引入TextRCNN模型
RCNN_model=TextRCNN_Model(TRCNNconfig)
RCNN_model.load_state_dict(torch.load('data/saved_dict/TextRCNN.ckpt')) 

  "num_layers={}".format(dropout, num_layers))


<All keys matched successfully>

In [6]:
#引入测试集
inputs=[]
labels=[]
with open("data/test.txt", "rb") as f:
    for line in f.readlines(): 
        lin = line.decode().strip()
        List=lin.split("\t")
        inputs.append(List[0])
        labels.append(List[1])

In [7]:
#数据处理

MAX_VOCAB_SIZE = 10000  # 词表长度限制
UNK, PAD = '<UNK>', '<PAD>'  # 未知字，padding符号


def build_dataset(config,words):
    tokenizer = lambda x: x.split(' ')  # 以空格隔开，word-level
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    words=words+'\t'+'0'
    contents = []
    lin = words.strip()
    content, label = lin.split('\t')
    words_line = []
    token = tokenizer(content)
    seq_len = len(token)
    if config.pad_size:
        if len(token) < config.pad_size:
            token.extend([PAD] * (config.pad_size - len(token)))
        else:
            token = token[:config.pad_size]
            seq_len = config.pad_size
    # word to id
    for word in token:
        words_line.append(vocab.get(word, vocab.get(UNK)))
    contents.append((words_line, int(label), seq_len))
    return contents

class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # 记录batch数量是否为整数
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # pad前的长度(超过pad_size的设为pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches

def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, 1, config.device)
    return iter

In [8]:
Type=['alt.atheism',
                'comp.graphics',
                'comp.os.ms-windows.misc',
                'comp.sys.ibm.pc.hardware',
                'comp.sys.mac.hardware',
                'comp.windows.x',
                'misc.forsale',
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey',
                'sci.crypt',
                'sci.electronics',
                'sci.med',
                'sci.space',
                'soc.religion.christian',
                'talk.politics.guns',
                'talk.politics.mideast',
                'talk.politics.misc',
                'talk.religion.misc']

In [9]:
import pickle as pkl

def TCNNpredict( model, words,config):
    data=build_dataset(config,words)
    iter=build_iterator(data,config)
    model.eval()
    with torch.no_grad():
        for texts, labels in iter:
            outputs = model(texts)
            List=outputs.tolist()
            List1=List[0]
            Result=List1.index(max(List1))
            return Result

def TRCNNpredict( model, words,config):
    data=build_dataset(config,words)
    iter=build_iterator(data,config)
    model.eval()
    with torch.no_grad():
        for texts, labels in iter:
            outputs = model(texts)
            List=outputs.tolist()
            Result=List.index(max(List))
            return Result

In [10]:
#对于测试集，我们来对每一个样本进行推理预测并计算其准确率，这里展示前十个
def  predict(inputs,labels):
    count=10
    countTcnn=0
    countTrcnn=0
    length=len(inputs)
    for i in range(length):
        res1=TCNNpredict(CNN_model,inputs[i],TCNNconfig)
        res2=TRCNNpredict(RCNN_model,inputs[i],TRCNNconfig)
        label=labels[i]
        if(i<=count):
            print("第%d个样本中，TextCNN预测结果为%d,TextRCNN预测结果为%d,标准答案为%d"%(i,res1,res2,int(label)))
        if(label!=res1):countTcnn+=1
        if(label!=res2):countTrcnn+=1

In [11]:
predict(inputs,labels)

第0个样本中，TextCNN预测结果为16,TextRCNN预测结果为16,标准答案为16
第1个样本中，TextCNN预测结果为17,TextRCNN预测结果为17,标准答案为17
第2个样本中，TextCNN预测结果为3,TextRCNN预测结果为3,标准答案为3
第3个样本中，TextCNN预测结果为16,TextRCNN预测结果为16,标准答案为16
第4个样本中，TextCNN预测结果为15,TextRCNN预测结果为15,标准答案为15
第5个样本中，TextCNN预测结果为10,TextRCNN预测结果为10,标准答案为10
第6个样本中，TextCNN预测结果为5,TextRCNN预测结果为5,标准答案为5
第7个样本中，TextCNN预测结果为18,TextRCNN预测结果为18,标准答案为18
第8个样本中，TextCNN预测结果为3,TextRCNN预测结果为3,标准答案为3
第9个样本中，TextCNN预测结果为15,TextRCNN预测结果为15,标准答案为15
第10个样本中，TextCNN预测结果为14,TextRCNN预测结果为18,标准答案为14
0.000
0.000
