In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
import numpy as np;
import os
import jieba
import gensim.models.word2vec as w2v
from sklearn.model_selection import train_test_split



In [2]:
# 用以识别并去除文本中的非法字符
def is_chinese(uchar):
    """判断一个unicode是否是汉字"""
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False

def is_number(uchar):
    """判断一个unicode是否是数字"""
    if uchar >= u'\u0030' and uchar <= u'\u0039':
        return True
    else:
        return False

def is_alphabet(uchar):
    """判断一个unicode是否是英文字母"""
    if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
        return True
    else:
        return False

def is_legal(uchar):
    """判断是否非汉字，数字和英文字符"""
    if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
        return False
    else:
        return True

def extract_chinese(line): # 对TXT中的每一行，提取汉字数字与英文与数字
    res = ""
    for word in line:
        if is_legal(word):
            res += word
    return res

def words2line(words): # 对每个words，拼成一个字符串
    line = ""
    for word in words:
        line += " " + word
    return line

In [3]:
#数据预处理函数，在dir文件夹下每个子文件是一类内容

def datahelper(dir):
#返回为文本，文本对应标签，标签及索引，索引及标签
    
    labels_index={}
    index_lables={}
    fs = os.listdir(dir)

    i = 0;
    for f in fs:
        labels_index[f] = i;
        index_lables[i] = f
        i = i + 1;
    print(labels_index)
    
    texts = []   # 每句话（jieba拆词后）的列表
    labels = []  # list of label ids
    
    for la in labels_index.keys():
        print(la + " " + str(labels_index[la]))
        la_dir = dir + "/" + la;  # 读入某一类别，例如“体育”的文件夹
        fs = os.listdir(la_dir) # 获取全部该类文件夹下的文件名
        for f in fs:
            file = open(la_dir + "/" + f, encoding='utf-8') # 打开该txt文件
            lines = file.readlines();
            text = ''
            num_recs=0
            for line in lines:
                if len(line) > 5: # 只提取大于5个字的语句
                    line = extract_chinese(line) # 提取中文有效信息，汉字 字母 数字
                    words = jieba.lcut(line, cut_all=False, HMM=True) # 对每句话拆分成词语的list
                    text = words
                    texts.append(text)
                    labels.append(labels_index[la])
                    num_recs = num_recs + 1
    return texts,labels,labels_index,index_lables

# train_dir = 'D:/THUCNewsTiny'
# train_dir = 'D:/THUCNews'
train_dir = 'D:/THUCNewsSmall'

texts,labels,labels_index,index_lables = datahelper(train_dir)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\jxjsj\AppData\Local\Temp\jieba.cache


{'体育': 0, '娱乐': 1, '家居': 2, '彩票': 3, '房产': 4, '教育': 5, '时尚': 6, '时政': 7, '星座': 8, '游戏': 9, '社会': 10, '科技': 11, '股票': 12, '财经': 13}
体育 0


Loading model cost 0.648 seconds.
Prefix dict has been built succesfully.


娱乐 1
家居 2
彩票 3
房产 4
教育 5
时尚 6
时政 7
星座 8
游戏 9
社会 10
科技 11
股票 12
财经 13


In [4]:
# 词表构造

word_all = [] # word_all 是词表未去重的形式
word_all.append('')

for text in texts: 
    for word in text:  # 拿出每句话中的每个词，作为词表组成元素
        word_all.append(word)

word_vocb=set(word_all) # 所有词语去重，构成词表
vocb_size=len(word_vocb) # 词表总长度

#设置词表大小
nb_words=40000 # 默认值，若词表长度大于40000，会更新为更大值
max_len=64; # 一句话最多64个词
word_dim=40; # 64x40 就是训练CNN时，第一个卷积层输入矩阵的大小，对应一句话
n_class=len(index_lables)

args={}
if nb_words<vocb_size:
    nb_words=vocb_size;

    #textCNN调用的参数
args['vocb_size']=nb_words
args['max_len']=max_len
args['n_class']=n_class
args['dim']=word_dim

texts_with_id=np.zeros([len(texts),max_len])

#词表与索引的map
word_to_idx={word:i for i,word in enumerate(word_vocb)}
idx_to_word={word_to_idx[word]:word for word in word_to_idx}

In [5]:
# 构造 TextCNN 特有的嵌入矩阵，用以将一句话（以词编号形式输入，向量）转化为矩阵（每个词变化转化为词向量）

#每个单词的对应的词向量  load word 2 vetc，加载词向量，【！！事先预训练！！】
embeddings_index = w2v.Word2Vec.load('D:/THUNewsAllWord/THUCNewsw2v.pkl')

embedding_matrix = np.zeros((nb_words, word_dim)) # 嵌入矩阵，行号是词编号，行内容是词向量

for word, i in word_to_idx.items():
#     if i >= nb_words:
#         continue
    if word in embeddings_index:
        embedding_vector = embeddings_index[word]
#         if int(sum(embedding_vector)) != 0:
#             words not found in embedding index will be all-zeros.
        embedding_matrix[i] += embedding_vector

args['embedding_matrix']=torch.Tensor(embedding_matrix)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [6]:
#生成训练数据，需要将训练数据的Word转换为word的索引，将texts中每句话转化为这句话中的每个词对应的词编号

for i in range(0,len(texts)):
    if len(texts[i])<max_len:
        for j in range(0,len(texts[i])):
            texts_with_id[i][j]=word_to_idx[texts[i][j]]
        for j in range(len(texts[i]),max_len):
            texts_with_id[i][j] = word_to_idx['']
    else:
        for j in range(0,max_len):
            texts_with_id[i][j]=word_to_idx[texts[i][j]]
            
# 生成的texts_with_id 每行是原文本的每句话，列宽限定在max_len，每行变为原每个词对应的词编号0~XXXX

In [8]:
# textCNN模型构造 - BN处理
class textCNN(nn.Module):
    def __init__(self,args):
        super(textCNN, self).__init__()
        vocb_size = args['vocb_size']
        dim = args['dim']
        n_class = args['n_class']
        max_len = args['max_len']
        embedding_matrix=args['embedding_matrix']
        #需要将事先训练好的词向量载入
        self.embeding = nn.Embedding(vocb_size, dim,_weight=embedding_matrix)
        self.conv1 = nn.Sequential(
                      nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5,stride=1, padding=2),
                      nn.BatchNorm2d(num_features=16, eps=1e-05, momentum=0.1, affine=True), # BN 处理
                      nn.ReLU(),
                      nn.MaxPool2d(kernel_size=2) # (16,64,64)
                     )
        self.conv2 = nn.Sequential(
                      nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
                      nn.BatchNorm2d(num_features=32, eps=1e-05, momentum=0.1, affine=True), # BN 处理
                      nn.ReLU(),
                      nn.MaxPool2d(2)
                     )
        self.conv3 = nn.Sequential(
                      nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
                      nn.BatchNorm2d(num_features=64, eps=1e-05, momentum=0.1, affine=True), # BN 处理
                      nn.ReLU(),
                      nn.MaxPool2d(2)
        )
        self.conv4 = nn.Sequential(  # (16,64,64)
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(num_features=128, eps=1e-05, momentum=0.1, affine=True), # BN 处理
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.out = nn.Linear(1024, n_class)

    def forward(self, x):
        x = self.embeding(x)
        x=x.view(x.size(0),1,max_len,word_dim)
        #print(x.size())
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1) # 将（batch，outchanel,w,h）展平为（batch，outchanel*w*h）
        #print(x.size())
        output = self.out(x)
        return output

In [None]:
# textCNN模型构造 - 无BN处理
class textCNN(nn.Module):
    def __init__(self,args):
        super(textCNN, self).__init__()
        vocb_size = args['vocb_size']
        dim = args['dim']
        n_class = args['n_class']
        max_len = args['max_len']
        embedding_matrix=args['embedding_matrix']
        #需要将事先训练好的词向量载入
        self.embeding = nn.Embedding(vocb_size, dim,_weight=embedding_matrix)
        self.conv1 = nn.Sequential(
                     nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5,
                               stride=1, padding=2),

                     nn.ReLU(),
                     nn.MaxPool2d(kernel_size=2) # (16,64,64)
                     )
        self.conv2 = nn.Sequential(
                     nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
                     nn.ReLU(),
                     nn.MaxPool2d(2)
                     )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.conv4 = nn.Sequential(  # (16,64,64)
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.out = nn.Linear(1024, n_class)

    def forward(self, x):
        x = self.embeding(x)
        x=x.view(x.size(0),1,max_len,word_dim)
        #print(x.size())
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1) # 将（batch，outchanel,w,h）展平为（batch，outchanel*w*h）
        #print(x.size())
        output = self.out(x)
        return output

In [9]:
# 读取空模型
# cnn=textCNN(args)

# 加载原有模型
# 有BN层
cnn = textCNN(args)
cnn.load_state_dict(torch.load('C:/Users/jxjsj/Desktop/JupyterHome/DLmodel/textCNN_news_BN.pkl'))
# 无BN层
# cnn = textCNN(args)
# cnn.load_state_dict(torch.load('C:/Users/jxjsj/Desktop/JupyterHome/DLmodel/textCNN_news.pkl'))

In [11]:
#构建textCNN模型超参数与数据封装入加载器 - LSC

LR = 0.001
optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)

#损失函数
loss_function = nn.CrossEntropyLoss()

#训练批次大小
texts_len=len(texts_with_id)
print(texts_len)

#划分训练数据和测试数据
x_train, x_test, y_train, y_test = train_test_split(texts_with_id, labels, test_size=0.2, random_state=42)

x_train = torch.LongTensor(x_train)
y_train = torch.LongTensor(y_train)
x_test = torch.LongTensor(x_test)
y_test = torch.LongTensor(y_test)

train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)

train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2000,shuffle=True)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=800,shuffle=True)

1351707


In [15]:
# 训练是一句话一句话训练的！！ - LSC

use_gpu = True

if use_gpu:
    cnn = cnn.cuda()

for epoch in range(2):
    print('epoch {}'.format(epoch + 1))
    # training-----------------------------
    cnn.train()
    train_acc = 0.
#     L_train_pred = []
#     L_train_real = []
    for step, (batch_x, batch_y) in enumerate(train_data_loader):
        batch_x, batch_y = Variable(batch_x), Variable(batch_y)
        
        if use_gpu:
            batch_x = batch_x.cuda()
            batch_y = batch_y.cuda()
            
        out = cnn(batch_x)
        loss = loss_function(out, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        pred = torch.max(out, 1)[1]
        num_correct = (pred == batch_y).sum()
        train_acc += num_correct.data
        
#         L_train_pred += pred.cpu().numpy().tolist()
#         L_train_real += batch_y.cpu().numpy().tolist()
        
        print('Step:',step+1,'Finished!')
    print('Train Acc: {:.6f}'.format(train_acc.cpu().numpy() / (len(train_dataset))))
#     print(classification_report(L_train_real,L_train_pred))

    # evaluation--------------------------------
    cnn.eval()
    with torch.no_grad():
        eval_acc = 0.
#         L_val_pred = []
#         L_val_real = []
        for batch_x, batch_y in test_data_loader:
            batch_x, batch_y = Variable(batch_x), Variable(batch_y)

            if use_gpu:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()

            out = cnn(batch_x)
            loss = loss_function(out, batch_y)
            
            pred = torch.max(out, 1)[1]
            num_correct = (pred == batch_y).sum()
            eval_acc += num_correct
            
#             L_val_pred += pred.cpu().numpy().tolist()
#             L_val_real += batch_y.cpu().numpy().tolist()
            
        print('Test Acc: {:.6f}'.format(eval_acc.cpu().numpy() / (len(test_dataset))))
#         print(classification_report(L_val_real,L_val_pred))

epoch 1
Step: 1 Finished!
Step: 2 Finished!
Step: 3 Finished!
Step: 4 Finished!
Step: 5 Finished!
Step: 6 Finished!
Step: 7 Finished!
Step: 8 Finished!
Step: 9 Finished!
Step: 10 Finished!
Step: 11 Finished!
Step: 12 Finished!
Step: 13 Finished!
Step: 14 Finished!
Step: 15 Finished!
Step: 16 Finished!
Step: 17 Finished!
Step: 18 Finished!
Step: 19 Finished!
Step: 20 Finished!
Step: 21 Finished!
Step: 22 Finished!
Step: 23 Finished!
Step: 24 Finished!
Step: 25 Finished!
Step: 26 Finished!
Step: 27 Finished!
Step: 28 Finished!
Step: 29 Finished!
Step: 30 Finished!
Step: 31 Finished!
Step: 32 Finished!
Step: 33 Finished!
Step: 34 Finished!
Step: 35 Finished!
Step: 36 Finished!
Step: 37 Finished!
Step: 38 Finished!
Step: 39 Finished!
Step: 40 Finished!
Step: 41 Finished!
Step: 42 Finished!
Step: 43 Finished!
Step: 44 Finished!
Step: 45 Finished!
Step: 46 Finished!
Step: 47 Finished!
Step: 48 Finished!
Step: 49 Finished!
Step: 50 Finished!
Step: 51 Finished!
Step: 52 Finished!
Step: 53 Fini

Step: 416 Finished!
Step: 417 Finished!
Step: 418 Finished!
Step: 419 Finished!
Step: 420 Finished!
Step: 421 Finished!
Step: 422 Finished!
Step: 423 Finished!
Step: 424 Finished!
Step: 425 Finished!
Step: 426 Finished!
Step: 427 Finished!
Step: 428 Finished!
Step: 429 Finished!
Step: 430 Finished!
Step: 431 Finished!
Step: 432 Finished!
Step: 433 Finished!
Step: 434 Finished!
Step: 435 Finished!
Step: 436 Finished!
Step: 437 Finished!
Step: 438 Finished!
Step: 439 Finished!
Step: 440 Finished!
Step: 441 Finished!
Step: 442 Finished!
Step: 443 Finished!
Step: 444 Finished!
Step: 445 Finished!
Step: 446 Finished!
Step: 447 Finished!
Step: 448 Finished!
Step: 449 Finished!
Step: 450 Finished!
Step: 451 Finished!
Step: 452 Finished!
Step: 453 Finished!
Step: 454 Finished!
Step: 455 Finished!
Step: 456 Finished!
Step: 457 Finished!
Step: 458 Finished!
Step: 459 Finished!
Step: 460 Finished!
Step: 461 Finished!
Step: 462 Finished!
Step: 463 Finished!
Step: 464 Finished!
Step: 465 Finished!


Step: 288 Finished!
Step: 289 Finished!
Step: 290 Finished!
Step: 291 Finished!
Step: 292 Finished!
Step: 293 Finished!
Step: 294 Finished!
Step: 295 Finished!
Step: 296 Finished!
Step: 297 Finished!
Step: 298 Finished!
Step: 299 Finished!
Step: 300 Finished!
Step: 301 Finished!
Step: 302 Finished!
Step: 303 Finished!
Step: 304 Finished!
Step: 305 Finished!
Step: 306 Finished!
Step: 307 Finished!
Step: 308 Finished!
Step: 309 Finished!
Step: 310 Finished!
Step: 311 Finished!
Step: 312 Finished!
Step: 313 Finished!
Step: 314 Finished!
Step: 315 Finished!
Step: 316 Finished!
Step: 317 Finished!
Step: 318 Finished!
Step: 319 Finished!
Step: 320 Finished!
Step: 321 Finished!
Step: 322 Finished!
Step: 323 Finished!
Step: 324 Finished!
Step: 325 Finished!
Step: 326 Finished!
Step: 327 Finished!
Step: 328 Finished!
Step: 329 Finished!
Step: 330 Finished!
Step: 331 Finished!
Step: 332 Finished!
Step: 333 Finished!
Step: 334 Finished!
Step: 335 Finished!
Step: 336 Finished!
Step: 337 Finished!


In [14]:
# 有BN层
torch.save(cnn.state_dict(),'C:/Users/jxjsj/Desktop/JupyterHome/DLmodel/textCNN_news_BN.pkl')

# 无BN层
# torch.save(cnn.state_dict(),'C:/Users/jxjsj/Desktop/JupyterHome/DLmodel/textCNN_news.pkl')

In [None]:
# 词向量的预训练 - 分批保存训练语料
cut_bin_len = int(len(texts)/100) #分成100份
txt_root = 'D:/THUNewsAllWord/'
n = 0
while n<100:
    f = open(txt_root+'THUNewsAllWord'+str(n)+'.txt','w+',encoding='utf-8')
    for sentences_txt in texts[n*cut_bin_len:(n+1)*cut_bin_len]: # 迭代方式要改变
        for word_txt in  sentences_txt:
            f.write(word_txt + ' ')
        f.write('\r')
    f.close()
    n += 1

In [None]:
# 词向量的预训练 - - 增量训练模式
import gensim.models.word2vec as w2v
from gensim.models.keyedvectors import KeyedVectors as KV

big_batch_num = int(len(texts)/cut_bin_len)

# 第一次训练！！！不要反复跑！！！
sentences = w2v.LineSentence(txt_root+'THUNewsAllWord'+'0'+'.txt')
model = w2v.Word2Vec(sentences, sg=1, size=40,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=4)
model.save('D:/THUNewsAllWord/THUCNewsw2v.pkl') 

# 增量训练！！！
for big_batch in range(1,big_batch_num):
    sentences = w2v.LineSentence(txt_root+'THUNewsAllWord'+str(big_batch)+'.txt')
    model = KV.load('D:/THUNewsAllWord/THUCNewsw2v.pkl')
    model.build_vocab(sentences, update=True)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.save('D:/THUNewsAllWord/THUCNewsw2v.pkl') 