In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
import pandas as pd
from config import *
import random
import re

## 数据预处理

In [2]:
# 样本乱序
def shuffles(inputs, outputs):
    contents = []
    f1 = open(inputs, 'r', encoding='utf-8') 
    for line in f1.readlines():
        contents.append(line)
    
    random.shuffle(contents)
    
    f2 = open(outputs, 'w', encoding='utf-8')
    for content in contents:
        f2.write(content)
    
    f1.close()
    f2.close()


shuffles(inputs,outputs)

In [3]:
data = pd.read_csv(outputs, header=None, sep=',',encoding='utf-8')
data.shape
data.head(10)
data.info()

(119988, 2)

Unnamed: 0,0,1
0,1,"死于1,5,6,7,10,转发，求免死，哈哈//@Stella盟:睡眠不足死、朝八晚无死，饮..."
1,0,才说完人家旁边福特车胎补两个，我补一个。结果，原来，我的也是两个！哭！一边一个！一个钉子，一...
2,1,#小编传送门#这个冬天，拥有一本温情的教养书，挺好的。[可爱]
3,0,[泪][衰][衰]
4,0,[泪][泪]一定会好起来的！
5,0,一家好的公司都有严格的薪酬体系，这个很正常，并非人情味，这是你本该得到的。 //@马蝎子:来...
6,0,和我一样无法接受的请举手……[衰]
7,0,朦朦胧胧[抓狂]
8,0,北广！[泪] //@自由的馨淇:我当年高考唯一的志愿就是北京广播学院，也就是现在的中国传媒大...
9,1,打开电视，各台主要剧种：抗日战争，宫廷争斗，婆媳关系，正室小三，外加寒暑假的还珠格格和西游记...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119988 entries, 0 to 119987
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       119988 non-null  object
 1   1       119988 non-null  object
dtypes: object(2)
memory usage: 1.8+ MB


In [4]:
def clean_text(text):
    text = re.sub("\//@[a-zA-Z\W+]+", "",text) # re.sub(pattern, repl, string, count=0, flags=0)
    text = re.sub("\@[a-zA-Z\w+]+", "",text)
    text = re.sub("[\-\#+\//@.\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、""【】~@#￥%……&*（）]+","",text)
    text = re.sub('[A-Za-z0-9\!\%\[\]\,\。\:\::\?\“\”\”“\～+\:?\;;\>>]','',text)
    text = re.sub('\：：?','',text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'#{1,}', '', text)
    return text

def get_train_data():
    texts = []
    labels = []
    with open(outputs,'r',encoding = 'utf-8') as f:
        for line in f.readlines():
            items = line.split(',')
            texts.append(items[1])

            text = items[1]
            text = clean_text(text)
            if len(text) > 1 and text!='\n':
                if items[0] == '1':
                    labels.append(1)
                else:
                    labels.append(0)  # labels 这里一定要存放数字，不能是字符串'1'或'0'
    return texts,labels

texts,labels = get_train_data()
texts[:5]
labels[:5]

['"死于1',
 '才说完人家旁边福特车胎补两个，我补一个。结果，原来，我的也是两个！哭！一边一个！一个钉子，一个刀片！[泪][泪][泪][晕][晕][汗][汗][怒]\n',
 '#小编传送门#这个冬天，拥有一本温情的教养书，挺好的。[可爱]\n',
 '[泪][衰][衰]\n',
 '[泪][泪]一定会好起来的！\n']

[1, 0, 1, 0, 0]

## 加载预训练词向量

In [5]:
from gensim.models import KeyedVectors

w2v_model = KeyedVectors.load_word2vec_format('./embeddings/sgns.zhihu.bigram',binary=False, unicode_errors="ignore")

## 构建训练数据

In [6]:
import jieba
import time

def get_train_tokens(texts, w2v_model):
    train_tokens = []
    for text in texts:
        text = clean_text(text)
        if len(text) > 1 and text!='\n' and text!='\t':
            words_list = [i for i in jieba.cut(text)]
            for i, word in enumerate(words_list):
                try:
                    words_list[i] = w2v_model.vocab[word].index
                except KeyError:
                    words_list[i] = 0
            train_tokens.append(words_list)
    return train_tokens

start = time.time()
train_tokens = get_train_tokens(texts, w2v_model) # [[],[],...]
print("耗时:%0.2f mins"%((time.time() - start)/60))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zhoubin\AppData\Local\Temp\jieba.cache
Loading model cost 1.040 seconds.
Prefix dict has been built successfully.


耗时:0.58 mins


In [7]:
train_tokens[:2]

[[267, 190],
 [126,
  3701,
  390,
  865,
  18549,
  44158,
  2840,
  0,
  0,
  7,
  299,
  781,
  6,
  1,
  18,
  4,
  0,
  768,
  506,
  7,
  7,
  17583,
  7,
  25743,
  0,
  0,
  5709,
  5327,
  5327,
  3222]]

In [8]:
import numpy as np

def max_token(train_tokens):
    num_tokens = [len(tokens) for tokens in train_tokens ]
    num_tokens = np.array(num_tokens)
    max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens) # 标准差
    max_tokens = int(max_tokens)
    return max_tokens

max_tokens = max_token(train_tokens)
max_tokens

56

In [9]:
import time

def vocab_size(train_tokens):
    start = time.time()
    vocab_tokens = []
    i = 0
    for tokens in train_tokens:
        for token in tokens:
            i+=1
            if token not in vocab_tokens:
                vocab_tokens.append(token)
            else:
                pass
    if i%1e6 == 0:
        print("处理前%d的词花费的时间是:%0.2f"%(i,(time.time() - start)/60),'mins')
    print("词汇表大小是：",len(vocab_tokens))
    return len(vocab_tokens)

num_words = vocab_size(train_tokens) # 这个计算比较花费时间

词汇表大小是： 76370


In [10]:
def get_embedding_matrix(embedding_dim, num_words, w2v_model):
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for i in range(num_words):
        embedding_matrix[i,:] = w2v_model[w2v_model.index2word[i]]
    embedding_matrix = embedding_matrix.astype('float32')
    return embedding_matrix


embedding_matrix = get_embedding_matrix(embedding_dim,num_words,w2v_model)
embedding_matrix

array([[-0.801784, -0.16534 ,  0.030508, ...,  0.106525,  0.553436,
         0.43665 ],
       [-0.651747,  0.53597 ,  0.340271, ...,  0.805399,  0.104593,
         0.193694],
       [-0.412321,  0.228261,  0.207114, ...,  0.808777,  0.056751,
         0.452374],
       ...,
       [ 0.127824,  0.697852, -0.43761 , ..., -0.107803, -0.033679,
        -0.517616],
       [-0.100228, -0.134558, -0.352162, ...,  0.210711,  0.088362,
        -0.357846],
       [ 0.015363, -0.057104,  0.571133, ..., -0.518441,  0.157153,
         0.328522]], dtype=float32)

In [11]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


def add_padding(train_tokens, labels, max_tokens, num_words):
    train_pad = pad_sequences(train_tokens, maxlen = max_tokens, padding='pre', truncating='pre')
    train_pad[train_pad >= num_words] = 0
    labels = np.array(labels)
    return train_pad,labels


train_pad, labels = add_padding(train_tokens, labels, max_tokens, num_words)
train_pad.shape
labels.shape

(119522, 56)

(119522,)

## 模型构建

In [12]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from tensorflow.python.keras.models import save_model

def model(epochs,batch_size):
    model = Sequential()
    model.add(Embedding(num_words,embedding_dim,
                        weights = [embedding_matrix],
                        input_length = max_tokens,
                        trainable = False))
    model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
    model.add(LSTM(units=16, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    model.summary()
    

    checkpoint = ModelCheckpoint(filepath = path_checkpoint, monitor='val_loss',
                                 verbose=1, save_weights_only=True,
                                 save_best_only=True)
    try:
        model.load_weights(path_checkpoint)
    except Exception as e:
        print(e)
        
    earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    # 当指标停止提升时，降低学习速率。
    lr_reduction = ReduceLROnPlateau(monitor='val_loss',factor=0.1, min_lr=1e-8, patience=0, verbose=1)
    callbacks = [earlystopping, checkpoint,lr_reduction]
    
    model.fit(X_train, y_train,validation_split=0.2, epochs=epochs,batch_size=batch_size, callbacks=callbacks)
    
    # 模型保存
    save_model(model,model_path)
    # 模型评估
    result = model.evaluate(X_test, y_test)
    print(result)
    print(' Accuracy is :{0:.2%}'.format(result[1]))
    return result[1]*100

## 模型训练

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_pad, # array
                                                    labels, # array
                                                    test_size=0.2,
                                                    random_state=660)


accuracy = model(epochs, batch_size)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 56, 300)           22911000  
_________________________________________________________________
bidirectional (Bidirectional (No

## 模型预测

In [16]:
from tensorflow.python.keras.models import load_model

def prediction(text, w2v_model, vocab_size, maxlen):
    text = clean_text(text)
    if len(text)>1 and text!='\n' and text!='\t':
        words_list = [i for i in jieba.cut(text)]
        for i, word in enumerate(words_list):
            try:
                words_list[i] = w2v_model.vocab[word].index
                if words_list[i]>= vocab_size:
                    words_list[i] = 0
            except KeyError:
                words_list[i] = 0
            
    test_pad = pad_sequences([words_list], maxlen=maxlen, padding='pre', truncating='pre')
    
    # loading model
    model = load_model(model_path)
    result = model.predict(x=test_pad)
    coefs = result[0][0]
    return coefs*100  # 返回判断的阈值

In [17]:
text_list= [
    '我今天为什么要穿白长裙？[泪]还有几站该下车了！雨依然哗哗的！[抓狂] //@败家de小妞子:裙子已经湿了[泪]',
    '糟糕透顶了，刚买的新手机就丢厕所里面了,想骂人呀，谁也别招惹我。',
    '今天是个好日子，天气特别美好，我的心情也很好',
    '新发的工资,还意外的领到了红包',
    '朋友出车祸了',
    '今天去出差，住豪华酒店',
    '今天第一次陪朋友去逛街',
    '这不科学……应该是一堆狗~//@蜘蛛3号: //@我的朋友是个呆B:QAQ//@进击的巨人官网:QAQ//@我的同事是个婊子: QAQ//@',
    '我刚问了老公这个问题，宝宝出生后他会说什么，他不加思索地来了一句：“八喜，欢迎来到地球！”[汗][晕]'
]

results = []
for text in text_list:
    res = prediction(text, w2v_model, num_words, max_tokens)
    results.append(res)

    
cte = ['正面情绪','负面情绪']
for i in range(len(text_list)):
    
    if results[i] > 50:
        cte_j = 0
    else:
        cte_j = 1
    print("第%d条微博是:%s"%(i+1,text_list[i]) )
    print("模型判断这是一条: ##%s##,预测阈值是:%0.2f"%(cte[cte_j],results[i]) )
    print('\n')

第1条微博是:我今天为什么要穿白长裙？[泪]还有几站该下车了！雨依然哗哗的！[抓狂] //@败家de小妞子:裙子已经湿了[泪]
模型判断这是一条: ##负面情绪##,预测阈值是:0.14


第2条微博是:糟糕透顶了，刚买的新手机就丢厕所里面了,想骂人呀，谁也别招惹我。
模型判断这是一条: ##负面情绪##,预测阈值是:1.45


第3条微博是:今天是个好日子，天气特别美好，我的心情也很好
模型判断这是一条: ##正面情绪##,预测阈值是:50.36


第4条微博是:新发的工资,还意外的领到了红包
模型判断这是一条: ##负面情绪##,预测阈值是:45.43


第5条微博是:朋友出车祸了
模型判断这是一条: ##负面情绪##,预测阈值是:1.79


第6条微博是:今天去出差，住豪华酒店
模型判断这是一条: ##负面情绪##,预测阈值是:3.55


第7条微博是:今天第一次陪朋友去逛街
模型判断这是一条: ##正面情绪##,预测阈值是:80.56


第8条微博是:这不科学……应该是一堆狗~//@蜘蛛3号: //@我的朋友是个呆B:QAQ//@进击的巨人官网:QAQ//@我的同事是个婊子: QAQ//@
模型判断这是一条: ##负面情绪##,预测阈值是:3.37


第9条微博是:我刚问了老公这个问题，宝宝出生后他会说什么，他不加思索地来了一句：“八喜，欢迎来到地球！”[汗][晕]
模型判断这是一条: ##负面情绪##,预测阈值是:0.17


