In [1]:
import os
import pandas as pd
import numpy as np

class market():
    def word_cut(self,documents):
        stopwords = self.stopwords
        import jieba
        texts = []
        for line in documents:
            words = ' '.join(jieba.cut(line)).split(' ') # 用空格去连接，连接后马上又拆分
            text = []
            for word in words:
                if (word not in stopwords) & (word != '')& (word != '\u3000')& (word != '\n')&(word != '\u200b'):
                    text.append(word)
            texts.append(text)
        self.docLength = len(documents)
        return(texts)
    def get_docLength(self):
        return(self.docLength)
    def frequency(self,texts,freq):
        from collections import defaultdict
        frequency = defaultdict(int) # value为int
        for text in texts:
            for word in text:
                frequency[word] += 1
        texts = [[word for word in text if frequency[word] > freq] for text in texts]
        return(texts)
    def regroup(self,texts):
        new_texts = []
        for i,sentence in enumerate(texts):
            new_texts.append(" ".join(sentence))
        return(new_texts)
    def add_stopwords(self,path):
        stopwords = set()
        with open(path,'r',encoding = 'cp936') as file:
            for line in file:
                stopwords.add(line.strip())
        self.stopwords = stopwords
        print("Load %s stopwords" %len(stopwords))
    def dictionary(self,docs):
        token_index ={}
        for sample in docs:
            for word in sample:
                if word not in token_index:
                    token_index[word] = len(token_index) + 1
        return(token_index)
    def count(self,docs):
        token_length ={}
        for sample in docs:
            for word in sample:
                if word not in token_length:
                    token_length[word] = 1
                else:
                    token_length[word] += 1
        return(token_length)
    def recoding(self,docs,token_index):
        for i,sample in enumerate(docs):
            for j,word in enumerate(sample):
                if word not in token_index:
                    sample[j] = -1
                else:
                    sample[j] = token_index[word]
            docs[i] = sample
        return(docs)
    def delete(self,docs):
        for index in range(len(docs)):
            for i in range(len(docs[index])-1,-1,-1):
                if docs[index][i] == -1:
                    docs[index].pop(i)
        return docs
    def random_pick(self,df,n):
        import random
        import numpy as np
        rand = np.arange(0,(len(df)-1),1)
        random.shuffle(rand)
        rand = list(rand[:n])
        df = df.loc[rand,]
        return(df)
    def read_vectors(self,path, topn):  # read top n word vectors, i.e. top is 10000
        lines_num, dim = 0, 0
        vectors = {}
        iw = []
        wi = {}
        with open(path, encoding='utf-8', errors='ignore') as f:
            first_line = True
            for line in f:
                if first_line:
                    first_line = False
                    dim = int(line.rstrip().split()[1]) # 删除向量末尾的空格，然后以空格拆分获得向量
                    continue
                lines_num += 1
                tokens = line.rstrip().split(' ')
                vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])# 当数据源是ndarray时，asarray不会占用新的内存；当数据源不是ndarray,asarray与array一样
                iw.append(tokens[0]) # iw储存了所有的tokons[0]，意思是index_word
                if topn != 0 and lines_num >= topn:
                    break
        for i, w in enumerate(iw):
            wi[w] = i # wi是iw的反转，意思是word_index,用w来储存字符，用一个integer去给字符编码
        self.dim = dim
        self.max_words = topn
        self.word_index = wi
        self.index_word = iw
        self.vectors = vectors
        print("Load %s word vectors." % len(vectors))
    def embedding_matrix(self):
        embedding_matrix = np.zeros((self.max_words,self.dim))
        for word,i in self.word_index.items():
            if i < self.max_words:
                embedding_vector = self.vectors.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        return embedding_matrix
    def navie_knn(self,dataSet, query, k):  
        # 计算出某一样本与所有样本的距离，选择最大(应该修改为最小？)的k个样本作为用于knn
        numSamples = dataSet.shape[0] # return row(sample) number of dataset

        ## step 1: calculate Euclidean distance  
        diff = np.tile(query, (numSamples, 1)) - dataSet #tile: 把query这个向量纵向复制，使得结果与dataset具有同样的行数
        squaredDiff = diff ** 2  
        squaredDist = np.sum(squaredDiff, axis = 1) # sum is performed by row  

        ## step 2: sort the distance  
        sortedDistIndices = np.argsort(squaredDist)   # numpy.argsort 返回的是数组值从小到大的索引值（注意是索引值，不是绝对值）
        if k > len(sortedDistIndices):  
            k = len(sortedDistIndices)  

        return sortedDistIndices[0:k]
    # build a big graph (normalized weight matrix)  
    def buildGraph(self,MatX, kernel_type, rbf_sigma = None, knn_num_neighbors = None):  
        num_samples = MatX.shape[0]  # return row(sample) number of MatX
        affinity_matrix = np.zeros((num_samples, num_samples), np.float32)  
        if kernel_type == 'rbf':  
            if rbf_sigma == None:  
                raise ValueError('You should input a sigma of rbf kernel!')  
            for i in range(num_samples):  
                row_sum = 0.0  
                for j in range(num_samples):  
                    diff = MatX[i, :] - MatX[j, :]  
                    affinity_matrix[i][j] = np.exp(sum(diff**2) / (-2.0 * rbf_sigma**2))  
                    row_sum += affinity_matrix[i][j]  
                affinity_matrix[i][:] /= row_sum  
        elif kernel_type == 'knn':  
            if knn_num_neighbors == None:  
                raise ValueError('You should input a k of knn kernel!')  
            for i in range(num_samples):  
                k_neighbors = self.navie_knn(MatX, MatX[i, :], knn_num_neighbors)  
                affinity_matrix[i][k_neighbors] = 1.0 / knn_num_neighbors  # 将节点i与附近的k个节点连接起来，每个边的权重是1/knn_num_neighbors
        else:  
            raise NameError('Not support kernel type! You can use knn or rbf!')  

        return affinity_matrix  
    # label propagation  
    def labelPropagation(self,Mat_Label, Mat_Unlabel, labels, kernel_type = 'rbf', rbf_sigma = 0.20, \
                        knn_num_neighbors = 10, max_iter = 500, tol = 1e-3):  
        # initialize  
        num_label_samples = Mat_Label.shape[0]  #已经标记的sample number
        num_unlabel_samples = Mat_Unlabel.shape[0]  #未标记的sample number
        num_samples = num_label_samples + num_unlabel_samples
        labels_list = np.unique(labels)  #有哪些label
        num_classes = len(labels_list)  #label的种类数

        MatX = np.vstack((Mat_Label, Mat_Unlabel))
        clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32)  
        for i in range(num_label_samples):  
            clamp_data_label[i][labels[i]] = 1.0   #标记出每一个labelled sample的具体label是什么

        label_function = np.zeros((num_samples, num_classes), np.float32)  
        label_function[0 : num_label_samples] = clamp_data_label  
        label_function[num_label_samples : num_samples] = -1  

        # graph construction  
        affinity_matrix = self.buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors)  

        # start to propagation  
        iter = 0; pre_label_function = np.zeros((num_samples, num_classes), np.float32)  
        changed = np.abs(pre_label_function - label_function).sum()  
        while iter < max_iter and changed > tol:  
            if iter % 1 == 0:  
                print ("---> Iteration %d/%d, changed: %f" % (iter, max_iter, changed))
            pre_label_function = label_function  
            iter += 1  

            # propagation  
            label_function = np.dot(affinity_matrix, label_function)  

            # clamp  
            label_function[0 : num_label_samples] = clamp_data_label  

            # check converge  
            changed = np.abs(pre_label_function - label_function).sum()  

        # get terminate label of unlabeled data  
        unlabel_data_labels = np.zeros(num_unlabel_samples)  
        for i in range(num_unlabel_samples):  
            unlabel_data_labels[i] = np.argmax(label_function[i+num_label_samples]) #取出参数中元素最大值所对应的索引 

        return unlabel_data_labels 

In [2]:
process = market()
process.add_stopwords("D:/Users/PYTHON/Precision-Marketing/stopwords.txt")
process.read_vectors("D:/NLP/sgns.target.word-word.dynwin5.thr10.neg5.dim300.txt",20000)

Load 2316 stopwords
Load 20000 word vectors.


In [3]:
embedding_matrix = process.embedding_matrix()
embedding_matrix.shape

(20000, 300)

In [4]:
os.chdir("D:/Users/PYTHON/Precision-Marketing")
df = pd.DataFrame()
for i in range(10):
    print("已经读取第{}个sheet".format(i))
    df_temp = pd.read_excel("关键词标签.xlsx",sheet_name = i)
    df = df.append(df_temp)
# PATIENT = PATIENT[["博文","是否担忧（1=担忧，0=不担忧）","标签1（担忧对象）","标签2（担忧什么）","标签3（什么保险）","标签4（症状）"]]
# PATIENT = PATIENT.loc[pd.notna(PATIENT["是否担忧（1=担忧，0=不担忧）"]),]
# PATIENT = PATIENT.fillna("无") # 可以尝试将是否担忧作为一个feature！
# PATIENT.head(2)
df

已经读取第0个sheet
已经读取第1个sheet
已经读取第2个sheet
已经读取第3个sheet
已经读取第4个sheet
已经读取第5个sheet
已经读取第6个sheet
已经读取第7个sheet
已经读取第8个sheet
已经读取第9个sheet


Unnamed: 0,用户名,博文,关键词,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）",标签1（担忧对象）,标签2（担忧什么）,标签3（什么保险）,标签4（症状）,转发数,评论数,点赞数,发文时间,来自,页面网址,博文链接
0,1杯冰牛奶,\n 一直在各个公开场合严肃说了要理智告诉粉丝应该怎样...,父母怎么办,-1.0,,,,,,,,\n 今天06:16\n ...,iPhone 11,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/6606022745/IwQhN1EAC?refer_f...
1,干杯老铁,\n 网友求助: “因为男友我感染了hpv，尿道炎，宫...,父母怎么办,0.0,,,,,,,,\n 今天06:02\n ...,即刻笔记,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/3394404550/IwQbUFIBq?refer_f...
2,分手挽回失恋复合前男友前任导师,\n #分手了怎么挽回前任##失恋后怎么挽回前任# 父...,父母怎么办,-1.0,,,,,,,,\n 今天03:39\n ...,微博 weibo.com,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/3984809099/IwPfZfAGM?refer_f...
3,全球奇葩事,\n 网友投稿：小孩子一岁多，打了媳妇几下，媳妇就教育...,父母怎么办,0.0,,,,,1,3,2,\n 今天02:11\n ...,微博 weibo.com,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/5629436483/IwOGmd1HG?refer_f...
4,肆意放纵野蛮生长,\n 抑郁症我有慢性疾病，怎么治都治不好，可是我才1...,父母怎么办,1.0,父母,经济,死亡保险,情绪问题,,8,,\n 今天01:18\n ...,抑郁症超话,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/6087910635/IwOkWv6iP?refer_f...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,永远玩不到王者的6公子,\n 第123件关于我们的回忆：接上条继续...有次带...,不舒服会不会,-1.0,,,,,,,,\n 2019年12月28日 22:49\n ...,iPhone客户端,https://s.weibo.com/weibo/%25E4%25B8%258D%25E8...,https://weibo.com/6397021518/ImUA07X1Z?refer_f...
196,泥销白骨,\n 有点get到这个教官了 虽然早上觉得呆板无趣但其...,不舒服会不会,-1.0,,,,,,,,\n 2019年12月28日 22:25\n ...,,https://s.weibo.com/weibo/%25E4%25B8%258D%25E8...,https://weibo.com/1837879215/ImUqrEiBB?refer_f...
197,很喜欢舔酸奶盖,\n 没有体验过，会不会不舒服啊 ...,不舒服会不会,-1.0,,,,,,,,2019年12月28日 19:34,Android,https://s.weibo.com/weibo/%25E4%25B8%258D%25E8...,https://weibo.com/2825646441/ImTiNCq49?refer_f...
198,魔鬼管理学,职场中，学习能力要比学历重要的多。\n\n每一条都是曾经打在自己身上的鞭子，条条带血。\n\...,不舒服会不会,-1.0,,,,,111,6,74,\n 2019年12月28日 21:40\n ...,荣耀 9X,https://s.weibo.com/weibo/%25E4%25B8%258D%25E8...,https://weibo.com/2406560131/ImU7Ue32T?refer_f...


In [5]:
df = df.loc[pd.notna(df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]),["博文","是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]]
# df = df.fillna("无") # 可以尝试将是否担忧作为一个feature！
df.head(2)

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
0,\n 一直在各个公开场合严肃说了要理智告诉粉丝应该怎样...,-1.0
1,\n 网友求助: “因为男友我感染了hpv，尿道炎，宫...,0.0


In [6]:
len(df)

1574

In [7]:
f = lambda v: "担忧" if v == 1 else ("中性" if v == 0 else ("不担忧" if v == -1 else "疑似抑郁症"))

df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] = df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"].astype("int") # float转化为int
df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] = df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"].apply(f)
df.head(2)

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
0,\n 一直在各个公开场合严肃说了要理智告诉粉丝应该怎样...,不担忧
1,\n 网友求助: “因为男友我感染了hpv，尿道炎，宫...,中性


In [8]:
# df = pd.concat([PATIENT]) # concat之后row index会重复
# df = pd.concat([PATIENT])
df.reset_index(drop = True,inplace = True)
print(len(df))
df = df.dropna()
print(len(df))
df.head(2)

1574
1574


Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
0,\n 一直在各个公开场合严肃说了要理智告诉粉丝应该怎样...,不担忧
1,\n 网友求助: “因为男友我感染了hpv，尿道炎，宫...,中性


In [9]:
df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"].value_counts()

不担忧      942
担忧       327
中性       296
疑似抑郁症      9
Name: 是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）, dtype: int64

In [10]:
df_worry = df[df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] == "担忧"]
df_worry.reset_index(drop = True,inplace = True)
df_non_worry = df[df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] == "不担忧"]
df_non_worry.reset_index(drop = True,inplace = True)
df_notso_worry = df[df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] == "中性"]
df_notso_worry.reset_index(drop = True,inplace = True)
df_non_worry = process.random_pick(df_non_worry,min(len(df_worry),len(df_non_worry)))
print(len(df_worry))
print(len(df_non_worry))

327
327


In [11]:
df_worry = df_worry.dropna()
df_non_worry = df_non_worry.dropna()
print(len(df_worry))
print(len(df_non_worry))

327
327


In [29]:
# df_use = pd.concat([df_worry,df_non_worry,df_notso_worry])
df_use = pd.concat([df_worry,df_non_worry])
df_use.reset_index(drop = True,inplace = True)
df_use = df_use.reindex(np.random.permutation(df_use.index))
df_use.head(2)

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
420,\n 微博最近总推送一些讲婆媳关系的，看了之后发现婆媳...,不担忧
448,\n #天气与心情# 2月14曰，这天大家应该都记得，...,不担忧


In [30]:
# labels = df_use[["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）","标签1（担忧对象）",
#                   "标签2（担忧什么）","标签3（什么保险）","标签4（症状）"]]
labels = df_use[["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]]
labels = np.array(labels)
labels = list(labels)
labels[:5]

[array(['不担忧'], dtype=object),
 array(['不担忧'], dtype=object),
 array(['不担忧'], dtype=object),
 array(['不担忧'], dtype=object),
 array(['担忧'], dtype=object)]

In [31]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit(labels)
labels = mlb.transform(labels)
labels

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [0, 1],
       [1, 0],
       [0, 1]])

In [32]:
print(mlb.classes_.shape)
mlb.classes_

(2,)


array(['不担忧', '担忧'], dtype=object)

In [33]:
x_train = process.word_cut(df_use["博文"])
x_train = process.frequency(x_train,5)

In [34]:
# 给字符编码。如果x_train中的词语不在传入的实参token_index中，那么就编码为-1
x_train = process.recoding(x_train,process.word_index)
# x_train[:5]

In [35]:
x_train = process.delete(x_train)
# x_train[:2]

In [36]:
import keras
import tensorflow
from keras import preprocessing

max_len = 50
x_train = preprocessing.sequence.pad_sequences(x_train,maxlen = max_len)

x_train.shape

(654, 50)

In [37]:
df_use.head(2)

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
420,\n 微博最近总推送一些讲婆媳关系的，看了之后发现婆媳...,不担忧
448,\n #天气与心情# 2月14曰，这天大家应该都记得，...,不担忧


In [58]:
# y_train = labels
y_train = df_use["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]
y_train = y_train.apply(lambda v: -1 if v == "不担忧" else 1)
y_train[:5]

420   -1
448   -1
588   -1
617   -1
183    1
Name: 是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）, dtype: int64

In [68]:
y_train = np.array(y_train)
y_train = y_train.reshape(len(y_train),1)
print(x_train.shape)
print(y_train.shape)

(654, 50)
(654, 1)


In [69]:
y_train

array([[-1],
       [-1],
       [-1],
       [-1],
       [ 1],
       [ 1],
       [-1],
       [-1],
       [ 1],
       [-1],
       [-1],
       [ 1],
       [-1],
       [ 1],
       [-1],
       [-1],
       [-1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [ 1],
       [-1],
       [ 1],
       [-1],
       [-1],
       [ 1],
       [-1],
       [-1],
       [-1],
       [ 1],
       [ 1],
       [-1],
       [-1],
       [-1],
       [ 1],
       [-1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [-1],
       [-1],
       [-1],
       [ 1],
       [ 1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [-1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [-1],
       [ 1],
       [ 1],
       [ 1],
       [-1],
       [-1],
       [ 1],
       [ 1],
       [ 1],
       [ 1],

In [72]:
df_use.head(6)

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
420,\n 微博最近总推送一些讲婆媳关系的，看了之后发现婆媳...,不担忧
448,\n #天气与心情# 2月14曰，这天大家应该都记得，...,不担忧
588,\n 555可以换新置顶惹十月底是114斤 但是我没有...,不担忧
617,\n 父母这样的爱情，我该怎么办？ ​​​ ...,不担忧
183,\n 每天心塞，会不会得病 2长春 ​ ...,担忧
269,刚刚收到公司行政部的短信调查，问我感冒咳嗽好了没。啊啊啊，好害怕回去上班！路上好危险，要搭地...,担忧


In [76]:
from keras.models import Sequential
from keras.layers import Flatten,Dense,Embedding,LSTM,Bidirectional,Dropout

max_features = 20000
max_len = 50

model = Sequential()
model.add(Embedding(max_features,300,input_length = max_len,mask_zero = True)) # 遇到0，就不会反向传播更新权重
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(20),merge_mode = 'concat'))
model.add(Dense(1,activation = 'sigmoid'))
# model.compile(optimizer = 'rmsprop',loss = 'categorical_crossentropy',metrics = ['accuracy'])
model.compile(optimizer = 'rmsprop',loss = 'binary_crossentropy',metrics = ['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 300)           6000000   
_________________________________________________________________
dropout_8 (Dropout)          (None, 50, 300)           0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 40)                51360     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 41        
Total params: 6,051,401
Trainable params: 6,051,401
Non-trainable params: 0
_________________________________________________________________


In [77]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [78]:
x_train

array([[ 4593, 12729,   198, ...,  1574,  7798,  1947],
       [    0,     0,     0, ...,  1052,  7798,  1947],
       [ 2678,   118,  2340, ...,    15,  7798,  1947],
       ...,
       [    0,     0,     0, ...,   875,    54,  4502],
       [    0,     0,     0, ...,  3202,  7798,  1947],
       [    0,     0,     0, ..., 17377,   947, 17377]])

In [79]:
history = model.fit(x_train,
                    y_train,
                    epochs = 10,
                    batch_size = 128, # batch_size越大越好，但是太大会影响计算效率
                    validation_split= 0.3)

  'Discrepancy between trainable weights and collected trainable'
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 457 samples, validate on 197 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [53]:
mlb.classes_

array(['不担忧', '担忧'], dtype=object)

In [50]:
df_use.head(5)

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
420,\n 微博最近总推送一些讲婆媳关系的，看了之后发现婆媳...,不担忧
448,\n #天气与心情# 2月14曰，这天大家应该都记得，...,不担忧
588,\n 555可以换新置顶惹十月底是114斤 但是我没有...,不担忧
617,\n 父母这样的爱情，我该怎么办？ ​​​ ...,不担忧
183,\n 每天心塞，会不会得病 2长春 ​ ...,担忧


In [47]:
model.predict(x_train)

array([[0.51409125, 0.48590872],
       [0.40319288, 0.59680706],
       [0.40424305, 0.595757  ],
       ...,
       [0.37255877, 0.6274412 ],
       [0.50576293, 0.49423707],
       [0.480347  , 0.519653  ]], dtype=float32)

In [40]:
# model.save("Bi-LSTM(加入预训练的词向量库,多标签问题).h5")

In [41]:
new_data = pd.read_csv("未标记的担忧咋办标签.csv")
new_data_preserved = new_data
new_data.head(2)

Unnamed: 0,用户名,博文,预测是否担忧（1=担忧，0=不担忧）,转发数,评论数,点赞数,发文时间,来自,页面网址,博文链接
0,玥玥的碎花小裙儿,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...,0,,,,\n 2019年12月31日 23:52\n ...,realme Q 四摄迅猛龙,https://s.weibo.com/weibo/%25E6%258B%2585%25E5...,https://weibo.com/6275719793/Innh3oI0n?refer_f...
1,海里星星16687,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021啊好...,0,,,,\n 2019年12月31日 23:35\n ...,OPPO超视野全面屏R15,https://s.weibo.com/weibo/%25E6%258B%2585%25E5...,https://weibo.com/6607786597/InnadlLNd?refer_f...


In [42]:
new_data = new_data[["博文"]]
new_data["博文"] = process.word_cut(new_data["博文"])
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,博文
0,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."
1,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."


In [43]:
new_data["博文"] = process.frequency(new_data["博文"],5)
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,博文
0,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."
1,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."


In [44]:
new_data["博文"] = process.recoding(new_data["博文"],process.word_index)
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,博文
0,"[-1, -1, -1, -1, -1, -1, 4344, -1, -1, 3076, -..."
1,"[-1, -1, -1, -1, -1, -1, 4344, -1, -1, 3076, -..."


In [45]:
new_data["博文"] = process.delete(new_data["博文"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
x_test = preprocessing.sequence.pad_sequences(new_data["博文"],maxlen = max_len)

In [47]:
x_test

array([[   0,    0,    0, ..., 3076,   15, 3624],
       [   0,    0,    0, ..., 4344, 3076, 3624],
       [   0,    0,    0, ..., 3076,   15, 3624],
       ...,
       [   0,    0,    0, ...,  647, 7798, 1947],
       [   0,    0,    0, ..., 3624,  365,  293],
       [   0,    0,    0, ..., 3676, 2966,  871]])

In [48]:
prediction = model.predict(x_test)
prediction.shape

(2014, 24)

In [49]:
prediction.reshape(prediction.shape[0],prediction.shape[1])

array([[0.45977056, 0.28872412, 0.02617878, ..., 0.030577  , 0.06931883,
        0.03858814],
       [0.4635804 , 0.28818887, 0.02732673, ..., 0.0326975 , 0.07673627,
        0.04089916],
       [0.45977056, 0.28872412, 0.02617878, ..., 0.030577  , 0.06931883,
        0.03858814],
       ...,
       [0.46569923, 0.4161151 , 0.0130426 , ..., 0.02122244, 0.09131664,
        0.03654969],
       [0.48502225, 0.3567925 , 0.0218946 , ..., 0.03336206, 0.05748242,
        0.04534882],
       [0.513146  , 0.3083522 , 0.01959834, ..., 0.03350639, 0.06523269,
        0.04500294]], dtype=float32)

In [50]:
list(mlb.classes_)

['不担忧',
 '中性',
 '亲人',
 '作息不规律',
 '健康',
 '健康保险',
 '孩子',
 '宠物',
 '宠物保险',
 '少儿保险',
 '工作问题',
 '情绪问题',
 '意外',
 '意外保险',
 '担忧',
 '无',
 '日常习惯问题',
 '死亡保险',
 '父母',
 '经济',
 '自己',
 '营业中断',
 '营业中断险',
 '身体问题']

In [53]:
Result = pd.DataFrame(list(prediction),new_data_preserved["博文"])
Result.columns = list(mlb.classes_)
Result = Result.reset_index() 
Result.to_excel("Bi-LSTM预测结果（多标签问题）.xlsx",header = True)