In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
import numpy as np;
import os
import jieba
import gensim.models.word2vec as w2v
from sklearn.model_selection import train_test_split



In [2]:
# 用以识别并去除文本中的非法字符

def is_number(uchar):
    """Return True when ``uchar`` falls in the ASCII digit range '0'..'9'."""
    return u'\u0030' <= uchar <= u'\u0039'

def is_alphabet(uchar):
    """Return True when ``uchar`` is an ASCII letter (A-Z or a-z)."""
    in_upper_range = u'\u0041' <= uchar <= u'\u005a'
    in_lower_range = u'\u0061' <= uchar <= u'\u007a'
    return in_upper_range or in_lower_range
    
def is_space(uchar):
    """Return True when ``uchar`` is the plain ASCII space (U+0020)."""
    return uchar == u'\u0020'
    
def is_legal(uchar):
    """Return True when ``uchar`` is kept by the text cleaner.

    A character is legal only if it is an ASCII letter or a plain space;
    everything else (digits, punctuation, non-ASCII text) is illegal.
    """
    return bool(is_alphabet(uchar) or is_space(uchar))

def extract_english(line):
    """Blank out every illegal character of ``line``.

    Each character for which ``is_legal`` is false is replaced by a single
    space, so the result has the same length as the input.
    """
    return "".join(ch if is_legal(ch) else " " for ch in line)

def words2line(words):
    """Concatenate ``words`` into one string, each word preceded by a space.

    The result therefore starts with a space whenever ``words`` is non-empty.
    """
    return "".join(" " + word for word in words)

In [3]:
import pandas as pd
# Load the raw test and train splits from tab-separated files.
# NOTE(review): the paths are absolute and machine-specific; adjust them before
# running on another machine.
test_data_raw = pd.read_csv('C:/Users/jxjsj/Desktop/JupyterHome/Data/drugsComTest_raw.tsv',sep='\t')
train_data_raw = pd.read_csv('C:/Users/jxjsj/Desktop/JupyterHome/Data/drugsComTrain_raw.tsv',sep='\t')

In [4]:
# Training split as plain Python lists: review texts and integer ratings.
X_train_lst = train_data_raw['review'].tolist()
y_train_lst = [int(rating) for rating in train_data_raw['rating']]

In [5]:
# Test split as plain Python lists: review texts and integer ratings.
X_test_lst = test_data_raw['review'].tolist()
y_test_lst = [int(rating) for rating in test_data_raw['rating']]

In [6]:
#数据预处理函数

# X 训练语料list
# y 训练目标list

def datahelper_train(X ,y):
    """Tokenise the training reviews and build the label <-> index maps.

    Parameters
    ----------
    X : list
        Raw review texts.
    y : list
        Labels (ratings) aligned with ``X``.

    Returns
    -------
    texts : list of list of str
        One token list per kept review.
    labels : list of int
        Class index of each kept review.
    labels_index : dict
        Maps each distinct label in ``y`` to a class index.
    index_lables : dict
        Inverse of ``labels_index``.
    train_word_all : list of str
        Every token seen (not de-duplicated), starting with '' which is later
        used as the padding token.
    """
    labels_index = {}
    index_lables = {}
    # Sort the distinct labels so the class index of a label never depends on
    # set iteration order.
    for idx, label in enumerate(sorted(set(y))):
        labels_index[label] = idx
        index_lables[idx] = label
    print(labels_index)

    train_word_all = ['']  # vocabulary in its not-yet-de-duplicated form
    texts = []
    labels = []

    for cnt, line in enumerate(X):
        # Skip entries that are not text (e.g. NaN for a missing cell) and
        # reviews of 5 characters or fewer.
        if not isinstance(line, str) or len(line) <= 5:
            continue
        # extract_english keeps ASCII letters and spaces only, so digits and
        # punctuation act as token separators here.
        words = extract_english(line).split()
        train_word_all.extend(words)
        texts.append(words)
        labels.append(labels_index[y[cnt]])
    return texts,labels,labels_index,index_lables,train_word_all

# Tokenise the training split. This call also defines the module-level label maps
# (labels_index / index_lables) that later cells read.
train_texts,train_labels,labels_index,index_lables,train_word_all = datahelper_train(X_train_lst, y_train_lst)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9}


In [8]:
# X 验证语料list
# y 验证目标list

def datahelper_test(X ,y, label_to_index=None):
    """Tokenise the validation/test reviews using an existing label map.

    Parameters
    ----------
    X : list
        Raw review texts.
    y : list
        Labels (ratings) aligned with ``X``.
    label_to_index : dict, optional
        Maps a label to its class index. When omitted, the module-level
        ``labels_index`` is used, so that map must already exist.

    Returns
    -------
    texts : list of list of str
        One token list per kept review.
    labels : list of int
        Class index of each kept review.
    test_word_all : list of str
        Every token seen (not de-duplicated), starting with ''.

    Raises
    ------
    KeyError
        If a kept review carries a label that is missing from the label map.
    """
    if label_to_index is None:
        label_to_index = labels_index

    test_word_all = ['']  # vocabulary in its not-yet-de-duplicated form
    texts = []
    labels = []

    for cnt, line in enumerate(X):
        # Skip entries that are not text (e.g. NaN for a missing cell) and
        # reviews of 5 characters or fewer.
        if not isinstance(line, str) or len(line) <= 5:
            continue
        # extract_english keeps ASCII letters and spaces only, so digits and
        # punctuation act as token separators here.
        words = extract_english(line).split()
        test_word_all.extend(words)
        texts.append(words)
        labels.append(label_to_index[y[cnt]])
    return texts,labels,test_word_all

# Tokenise the test split; labels are mapped with the label map built from the training split.
test_texts,test_labels,test_word_all = datahelper_test(X_test_lst, y_test_lst)

In [9]:
# Vocabulary construction

# Pool every token of both splits; each list starts with '' (the padding token).
word_all = train_word_all + test_word_all

word_vocb = set(word_all)   # de-duplicated vocabulary
vocb_size = len(word_vocb)  # number of distinct tokens

# Model dimensions
nb_words = max(40000, vocb_size)  # embedding rows: at least 40000, more if the vocabulary is larger
max_len = 64                      # a review is cut / padded to 64 tokens
word_dim = 20                     # embedding width; each review becomes a 64x20 input matrix
n_class = len(index_lables)

# Arguments consumed by textCNN
args = {}
args['vocb_size'] = nb_words
args['max_len'] = max_len
args['n_class'] = n_class
args['dim'] = word_dim

# Token <-> index maps.
# The vocabulary is sorted before numbering: iterating a set of strings directly
# yields a different order in every Python process (string hashing is randomised),
# which would silently invalidate any saved embedding/model weights after a
# kernel restart. Sorting also places the padding token '' at index 0.
# NOTE(review): weights saved under the old, unordered numbering do not match
# this mapping.
word_to_idx = {word: i for i, word in enumerate(sorted(word_vocb))}
idx_to_word = {i: word for word, i in word_to_idx.items()}

In [10]:
# Build the embedding matrix used by textCNN: row i holds the word vector of the
# token whose index is i, so a review given as token ids becomes a matrix.

# Load the pre-trained Word2Vec model. It is written by the word-vector
# pre-training cells further down in this notebook (they save to this same path),
# so those cells must have been run at least once beforehand. The vectors are
# trained on the review texts of both splits (texts only, no labels).
# NOTE(review): Word2Vec.load unpickles the file - only load files you trust.
embeddings_index = w2v.Word2Vec.load('D:/DrugScore/DrugScore.pkl')
# Query the vectors through .wv; indexing / membership tests on the model object
# itself are deprecated in gensim 3.x and removed in 4.x.
word_vectors = embeddings_index.wv

embedding_matrix = np.zeros((nb_words, word_dim))  # rows of unknown tokens stay all-zero

for word, i in word_to_idx.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

args['embedding_matrix']=torch.Tensor(embedding_matrix)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [11]:
#生成训练与验证数据，需要将训练数据的Word转换为word的索引，将texts中每句话转化为这句话中的每个词对应的词编号

def texts_with_id_fun(texts, texts_with_id):
    """Write the token ids of ``texts`` into ``texts_with_id`` and return it.

    Row ``r`` of ``texts_with_id`` receives the ``word_to_idx`` ids of the first
    ``max_len`` tokens of ``texts[r]``; shorter rows are padded with the id of
    the empty token ''. The buffer is modified in place and the same object is
    returned. ``max_len`` and ``word_to_idx`` are read from module scope.
    """
    for row, tokens in enumerate(texts):
        n_copy = min(len(tokens), max_len)
        for col in range(n_copy):
            texts_with_id[row][col] = word_to_idx[tokens[col]]
        # Empty range when the text already fills the whole row.
        for col in range(n_copy, max_len):
            texts_with_id[row][col] = word_to_idx['']
    return texts_with_id

# One row per review, max_len columns; each buffer is filled with token ids in place.
train_texts_with_id = texts_with_id_fun(train_texts, np.zeros([len(train_texts), max_len]))
test_texts_with_id = texts_with_id_fun(test_texts, np.zeros([len(test_texts), max_len]))
  
# 生成的texts_with_id 每行是原文本的每句话，列宽限定在max_len，每行变为原每个词对应的词编号0~XXXX

In [12]:
# textCNN模型构造 - BN处理
class textCNN(nn.Module):
    """CNN text classifier over a (max_len x dim) matrix of word embeddings.

    ``args`` is a dict with the keys:

    * ``'vocb_size'``        - number of rows of the embedding table
    * ``'dim'``              - embedding width
    * ``'n_class'``          - number of output classes
    * ``'max_len'``          - number of tokens per input sequence
    * ``'embedding_matrix'`` - pre-trained weights, shape (vocb_size, dim)

    The input to ``forward`` is a LongTensor of token ids with ``max_len``
    ids per sample; the output has shape (batch, n_class) and holds raw scores
    (no softmax is applied).
    """

    def __init__(self,args):
        super(textCNN, self).__init__()
        vocb_size = args['vocb_size']
        dim = args['dim']
        n_class = args['n_class']
        max_len = args['max_len']
        embedding_matrix=args['embedding_matrix']
        # Three 2x2 poolings shrink both spatial sizes by a factor of 8.
        if max_len < 8 or dim < 8:
            raise ValueError("textCNN needs max_len >= 8 and dim >= 8 "
                             "(got max_len={}, dim={})".format(max_len, dim))
        # Keep the geometry on the instance so forward() does not depend on
        # module-level globals of the same name.
        self.max_len = max_len
        self.dim = dim
        # Start from the pre-trained word vectors.
        self.embeding = nn.Embedding(vocb_size, dim,_weight=embedding_matrix)
        self.conv1 = nn.Sequential(
                      nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5,stride=1, padding=2),
                      nn.BatchNorm2d(num_features=16, eps=1e-05, momentum=0.1, affine=True),
                      nn.ReLU(),
                      nn.MaxPool2d(kernel_size=2) # -> (16, max_len//2, dim//2)
                     )
        self.conv2 = nn.Sequential(
                      nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
                      nn.BatchNorm2d(num_features=32, eps=1e-05, momentum=0.1, affine=True),
                      nn.ReLU(),
                      nn.MaxPool2d(2) # -> (32, max_len//4, dim//4)
                     )
        self.conv3 = nn.Sequential(
                      nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
                      nn.BatchNorm2d(num_features=32, eps=1e-05, momentum=0.1, affine=True),
                      nn.ReLU(),
                      nn.MaxPool2d(2) # -> (32, max_len//8, dim//8)
        )
        # The 5x5 convolutions (padding 2, stride 1) keep the spatial size, so
        # only the poolings matter. For max_len=64, dim=20 this is 32*8*2 = 512.
        flat_features = 32 * (max_len // 8) * (dim // 8)
        self.out = nn.Linear(flat_features, n_class)

    def forward(self, x):
        x = self.embeding(x)
        # Add the single input channel: (batch, 1, max_len, dim).
        x = x.view(x.size(0), 1, self.max_len, self.dim)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.view(x.size(0), -1) # flatten (batch, channels, h, w) -> (batch, channels*h*w)
        output = self.out(x)
        return output

In [13]:
# Create an empty (untrained) model
# cnn=textCNN(args)

# Load the existing model
# (variant with BatchNorm layers). The load_state_dict line below is commented
# out, so as written this cell creates a new model and does not restore weights.
cnn = textCNN(args)
# cnn.load_state_dict(torch.load('C:/Users/jxjsj/Desktop/JupyterHome/DLmodel/textCNN_drug_BN.pkl'))

In [16]:
# Hyper-parameters of the textCNN run and data wrapped into loaders - LSC

LR = 0.0001

optimizer = torch.optim.Adam(cnn.parameters(), lr=LR)

# Loss function
loss_function = nn.CrossEntropyLoss()

# Token-id matrices and class indices as integer tensors.
x_train, y_train = torch.LongTensor(train_texts_with_id), torch.LongTensor(train_labels)
x_test, y_test = torch.LongTensor(test_texts_with_id), torch.LongTensor(test_labels)

for split in (x_train, x_test):
    print(len(split))

train_dataset = Data.TensorDataset(x_train, y_train)
test_dataset = Data.TensorDataset(x_test, y_test)

train_data_loader = Data.DataLoader(train_dataset, batch_size=5000, shuffle=True)
test_data_loader = Data.DataLoader(test_dataset, batch_size=1000, shuffle=True)

161282
53761


In [17]:
# Training - LSC

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')
cnn = cnn.to(device)

for epoch in range(10):
    print('epoch {}'.format(epoch + 1))
    # training-----------------------------
    cnn.train()
    train_acc = 0  # number of correctly classified training samples this epoch

    for step, (batch_x, batch_y) in enumerate(train_data_loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        out = cnn(batch_x)
        loss = loss_function(out, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pred = torch.max(out, 1)[1]  # index of the highest score per sample
        # .item() keeps the counter a plain Python int, so the accuracy line
        # below also works if the loader yields no batch.
        train_acc += (pred == batch_y).sum().item()

        print('Step:',step+1,'Finished! Loss:',loss.detach().cpu().numpy())
    print('Train Acc: {:.6f}'.format(train_acc / (len(train_dataset))))

    # evaluation--------------------------------
    cnn.eval()
    with torch.no_grad():
        eval_acc = 0  # number of correctly classified test samples

        for batch_x, batch_y in test_data_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            out = cnn(batch_x)

            pred = torch.max(out, 1)[1]
            eval_acc += (pred == batch_y).sum().item()

        print('Test Acc: {:.6f}'.format(eval_acc / (len(test_dataset))))

epoch 1
Step: 1 Finished! Loss: 0.4308753
Step: 2 Finished! Loss: 0.44606963
Step: 3 Finished! Loss: 0.4376505
Step: 4 Finished! Loss: 0.44891822
Step: 5 Finished! Loss: 0.4494121
Step: 6 Finished! Loss: 0.45698905
Step: 7 Finished! Loss: 0.45936725
Step: 8 Finished! Loss: 0.4401672
Step: 9 Finished! Loss: 0.44057217
Step: 10 Finished! Loss: 0.45605245
Step: 11 Finished! Loss: 0.45631644
Step: 12 Finished! Loss: 0.430511
Step: 13 Finished! Loss: 0.45226815
Step: 14 Finished! Loss: 0.43248707
Step: 15 Finished! Loss: 0.44581845
Step: 16 Finished! Loss: 0.44681367
Step: 17 Finished! Loss: 0.4356121
Step: 18 Finished! Loss: 0.4650413
Step: 19 Finished! Loss: 0.45955214
Step: 20 Finished! Loss: 0.43782505
Step: 21 Finished! Loss: 0.44351485
Step: 22 Finished! Loss: 0.4535625
Step: 23 Finished! Loss: 0.44704986
Step: 24 Finished! Loss: 0.4420253
Step: 25 Finished! Loss: 0.45730424
Step: 26 Finished! Loss: 0.42830226
Step: 27 Finished! Loss: 0.44211322
Step: 28 Finished! Loss: 0.43571562
Ste

Step: 27 Finished! Loss: 0.42163867
Step: 28 Finished! Loss: 0.42204052
Step: 29 Finished! Loss: 0.42284024
Step: 30 Finished! Loss: 0.4220058
Step: 31 Finished! Loss: 0.4377088
Step: 32 Finished! Loss: 0.42684844
Step: 33 Finished! Loss: 0.4030921
Train Acc: 0.891848
Test Acc: 0.604955
epoch 8
Step: 1 Finished! Loss: 0.42405254
Step: 2 Finished! Loss: 0.40624937
Step: 3 Finished! Loss: 0.43202084
Step: 4 Finished! Loss: 0.41717294
Step: 5 Finished! Loss: 0.42573702
Step: 6 Finished! Loss: 0.41761395
Step: 7 Finished! Loss: 0.43089512
Step: 8 Finished! Loss: 0.4151145
Step: 9 Finished! Loss: 0.40926874
Step: 10 Finished! Loss: 0.4154218
Step: 11 Finished! Loss: 0.39429516
Step: 12 Finished! Loss: 0.4165339
Step: 13 Finished! Loss: 0.43041444
Step: 14 Finished! Loss: 0.42326686
Step: 15 Finished! Loss: 0.4003168
Step: 16 Finished! Loss: 0.4330081
Step: 17 Finished! Loss: 0.43223047
Step: 18 Finished! Loss: 0.39946538
Step: 19 Finished! Loss: 0.43057647
Step: 20 Finished! Loss: 0.4127247

KeyboardInterrupt: 

In [18]:
# Variant with BatchNorm layers: save the model weights (state_dict only).
torch.save(cnn.state_dict(),'C:/Users/jxjsj/Desktop/JupyterHome/DLmodel/textCNN_drug_BN.pkl')

In [94]:
# Word-vector pre-training - save the training corpus in several shards
# NOTE: this cell and the Word2Vec training cells below produce the model file
# that the embedding-matrix cell loads, so they have to be run before it.

def _write_corpus_shards(texts, root, prefix, n_shards):
    """Write ``texts`` as ``n_shards`` text files ``<root><prefix><k>.txt``.

    Each tokenised text becomes one line of space-separated tokens. Every shard
    holds ``int(len(texts) / n_shards)`` texts, except the last one, which also
    takes the remainder so that no text is dropped.

    Returns the nominal shard length ``int(len(texts) / n_shards)``.
    """
    shard_len = int(len(texts) / n_shards)
    os.makedirs(root, exist_ok=True)
    for shard in range(n_shards):
        start = shard * shard_len
        stop = (shard + 1) * shard_len if shard < n_shards - 1 else len(texts)
        # 'with' closes the file even if a write fails.
        with open(root + prefix + str(shard) + '.txt', 'w', encoding='utf-8') as f:
            for sentence_tokens in texts[start:stop]:
                # End each text with '\n': LineSentence treats one line as one
                # sentence, and a bare '\r' is not a line break when the file
                # is read back in binary mode.
                f.write(' '.join(sentence_tokens) + '\n')
    return shard_len

# Training set: 10 shards
train_txt_root = 'D:/DrugScore/'
train_cut_bin_len = _write_corpus_shards(train_texts, train_txt_root, 'DrugScore', 10)

# Validation set: 5 shards
test_txt_root = 'D:/DrugScoreV/'
test_cut_bin_len = _write_corpus_shards(test_texts, test_txt_root, 'DrugScoreV', 5)

In [21]:
# Word-vector pre-training - incremental training mode
import gensim.models.word2vec as w2v
from gensim.models.keyedvectors import KeyedVectors as KV

w2v_model_path = 'D:/DrugScore/DrugScore.pkl'
train_big_batch_num = int(len(train_texts)/train_cut_bin_len)

# Initial training happens exactly once: when a saved model already exists it is
# loaded instead, so re-running this cell (or running it on a fresh kernel)
# neither retrains from scratch nor fails on an undefined `model`.
if os.path.exists(w2v_model_path):
    model = w2v.Word2Vec.load(w2v_model_path)
else:
    sentences = w2v.LineSentence(train_txt_root+'DrugScore'+'0'+'.txt')
    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
    model = w2v.Word2Vec(sentences, sg=1, size=word_dim,  window=8,  min_count=5,  negative=3, sample=0.001, hs=1, workers=4)
    model.save(w2v_model_path)

# Incremental training on the remaining shards (shard 0 was used for the
# initial training above).
for w2v_epoch_trainset in range(10):
    for big_batch in range(1,train_big_batch_num):
        sentences = w2v.LineSentence(train_txt_root+'DrugScore'+str(big_batch)+'.txt')
        model.build_vocab(sentences, update=True)  # add the shard's new words to the vocabulary
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
model.save(w2v_model_path)

In [22]:
# Word-vector pre-training - incremental training mode - add the validation set

test_big_batch_num = int(len(test_texts)/test_cut_bin_len)
# The file holds a full Word2Vec model (it is written with model.save), so load
# it with Word2Vec.load, as the embedding-matrix cell does, rather than through
# the KeyedVectors class.
model = w2v.Word2Vec.load('D:/DrugScore/DrugScore.pkl')

# Incremental training. Start at shard 0: unlike the training set, no shard of
# the validation set has been used for an initial training run.
for w2v_epoch_testset in range(10):
    for big_batch in range(test_big_batch_num):
        sentences = w2v.LineSentence(test_txt_root+'DrugScoreV'+str(big_batch)+'.txt')
        model.build_vocab(sentences, update=True)  # add the shard's new words to the vocabulary
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
model.save('D:/DrugScore/DrugScore.pkl')

In [30]:
# Sanity check of the trained vectors: words most similar to 'better'.
model.wv.most_similar('better')

  if np.issubdtype(vec.dtype, np.int):


[('much', 0.8834090232849121),
 ('more', 0.8462958335876465),
 ('feel', 0.8294618129730225),
 ('Feeling', 0.812267005443573),
 ('calmer', 0.7905117869377136),
 ('energized', 0.7852638363838196),
 ('less', 0.7836241126060486),
 ('energetic', 0.7783820629119873),
 ('than', 0.7749491930007935),
 ('Feel', 0.7744764089584351)]

In [71]:
# Experiment: continue pre-training on a corpus unrelated to the train/validation
# reviews - the results were very poor!
# NOTE(review): this cell saves over the main model file, replacing the vectors
# trained on the reviews; re-run the pre-training cells above to restore them.

# The file holds a full Word2Vec model, so load it with Word2Vec.load rather
# than through the KeyedVectors class.
model = w2v.Word2Vec.load('D:/DrugScore/DrugScore.pkl')

# Incremental training
sentences = w2v.Text8Corpus('D:/text8/text8')
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

model.save('D:/DrugScore/DrugScore.pkl')