In [1]:
import os
import pandas as pd
import numpy as np

class market():
    def word_cut(self,documents):
        stopwords = self.stopwords
        import jieba
        texts = []
        for line in documents:
            words = ' '.join(jieba.cut(line)).split(' ') # 用空格去连接，连接后马上又拆分
            text = []
            for word in words:
                if (word not in stopwords) & (word != '')& (word != '\u3000')& (word != '\n')&(word != '\u200b'):
                    text.append(word)
            texts.append(text)
        self.docLength = len(documents)
        return(texts)
    def get_docLength(self):
        return(self.docLength)
    def frequency(self,texts,freq):
        from collections import defaultdict
        frequency = defaultdict(int) # value为int
        for text in texts:
            for word in text:
                frequency[word] += 1
        texts = [[word for word in text if frequency[word] > freq] for text in texts]
        return(texts)
    def regroup(self,texts):
        new_texts = []
        for i,sentence in enumerate(texts):
            new_texts.append(" ".join(sentence))
        return(new_texts)
    def add_stopwords(self,path):
        stopwords = set()
        with open(path,'r',encoding = 'cp936') as file:
            for line in file:
                stopwords.add(line.strip())
        self.stopwords = stopwords
        print("Load %s stopwords" %len(stopwords))
    def dictionary(self,docs):
        token_index ={}
        for sample in docs:
            for word in sample:
                if word not in token_index:
                    token_index[word] = len(token_index) + 1
        return(token_index)
    def count(self,docs):
        token_length ={}
        for sample in docs:
            for word in sample:
                if word not in token_length:
                    token_length[word] = 1
                else:
                    token_length[word] += 1
        return(token_length)
    def recoding(self,docs,token_index):
        for i,sample in enumerate(docs):
            for j,word in enumerate(sample):
                if word not in token_index:
                    sample[j] = -1
                else:
                    sample[j] = token_index[word]
            docs[i] = sample
        return(docs)
    def delete(self,docs):
        for index in range(len(docs)):
            for i in range(len(docs[index])-1,-1,-1):
                if docs[index][i] == -1:
                    docs[index].pop(i)
        return docs
    def random_pick(self,df,n):
        import random
        import numpy as np
        rand = np.arange(0,(len(df)-1),1)
        random.shuffle(rand)
        rand = list(rand[:n])
        df = df.loc[rand,]
        return(df)
    def read_vectors(self,path, topn):  # read top n word vectors, i.e. top is 10000
        lines_num, dim = 0, 0
        vectors = {}
        iw = []
        wi = {}
        with open(path, encoding='utf-8', errors='ignore') as f:
            first_line = True
            for line in f:
                if first_line:
                    first_line = False
                    dim = int(line.rstrip().split()[1]) # 删除向量末尾的空格，然后以空格拆分获得向量
                    continue
                lines_num += 1
                tokens = line.rstrip().split(' ')
                vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])# 当数据源是ndarray时，asarray不会占用新的内存；当数据源不是ndarray,asarray与array一样
                iw.append(tokens[0]) # iw储存了所有的tokons[0]，意思是index_word
                if topn != 0 and lines_num >= topn:
                    break
        for i, w in enumerate(iw):
            wi[w] = i # wi是iw的反转，意思是word_index,用w来储存字符，用一个integer去给字符编码
        self.dim = dim
        self.max_words = topn
        self.word_index = wi
        self.index_word = iw
        self.vectors = vectors
        print("Load %s word vectors." % len(vectors))
    def embedding_matrix(self):
        embedding_matrix = np.zeros((self.max_words,self.dim))
        for word,i in self.word_index.items():
            if i < self.max_words:
                embedding_vector = self.vectors.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        return embedding_matrix
    def navie_knn(self,dataSet, query, k):  
        # 计算出某一样本与所有样本的距离，选择最大(应该修改为最小？)的k个样本作为用于knn
        numSamples = dataSet.shape[0] # return row(sample) number of dataset

        ## step 1: calculate Euclidean distance  
        diff = np.tile(query, (numSamples, 1)) - dataSet #tile: 把query这个向量纵向复制，使得结果与dataset具有同样的行数
        squaredDiff = diff ** 2  
        squaredDist = np.sum(squaredDiff, axis = 1) # sum is performed by row  

        ## step 2: sort the distance  
        sortedDistIndices = np.argsort(squaredDist)   # numpy.argsort 返回的是数组值从小到大的索引值（注意是索引值，不是绝对值）
        if k > len(sortedDistIndices):  
            k = len(sortedDistIndices)  

        return sortedDistIndices[0:k]
    # build a big graph (normalized weight matrix)  
    def buildGraph(self,MatX, kernel_type, rbf_sigma = None, knn_num_neighbors = None):  
        num_samples = MatX.shape[0]  # return row(sample) number of MatX
        affinity_matrix = np.zeros((num_samples, num_samples), np.float32)  
        if kernel_type == 'rbf':  
            if rbf_sigma == None:  
                raise ValueError('You should input a sigma of rbf kernel!')  
            for i in range(num_samples):  
                row_sum = 0.0  
                for j in range(num_samples):  
                    diff = MatX[i, :] - MatX[j, :]  
                    affinity_matrix[i][j] = np.exp(sum(diff**2) / (-2.0 * rbf_sigma**2))  
                    row_sum += affinity_matrix[i][j]  
                affinity_matrix[i][:] /= row_sum  
        elif kernel_type == 'knn':  
            if knn_num_neighbors == None:  
                raise ValueError('You should input a k of knn kernel!')  
            for i in range(num_samples):  
                k_neighbors = self.navie_knn(MatX, MatX[i, :], knn_num_neighbors)  
                affinity_matrix[i][k_neighbors] = 1.0 / knn_num_neighbors  # 将节点i与附近的k个节点连接起来，每个边的权重是1/knn_num_neighbors
        else:  
            raise NameError('Not support kernel type! You can use knn or rbf!')  

        return affinity_matrix  
    # label propagation  
    def labelPropagation(self,Mat_Label, Mat_Unlabel, labels, kernel_type = 'rbf', rbf_sigma = 0.20, \
                        knn_num_neighbors = 10, max_iter = 500, tol = 1e-3):  
        # initialize  
        num_label_samples = Mat_Label.shape[0]  #已经标记的sample number
        num_unlabel_samples = Mat_Unlabel.shape[0]  #未标记的sample number
        num_samples = num_label_samples + num_unlabel_samples
        labels_list = np.unique(labels)  #有哪些label
        num_classes = len(labels_list)  #label的种类数

        MatX = np.vstack((Mat_Label, Mat_Unlabel))
        clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32)  
        for i in range(num_label_samples):  
            clamp_data_label[i][labels[i]] = 1.0   #标记出每一个labelled sample的具体label是什么

        label_function = np.zeros((num_samples, num_classes), np.float32)  
        label_function[0 : num_label_samples] = clamp_data_label  
        label_function[num_label_samples : num_samples] = -1  

        # graph construction  
        affinity_matrix = self.buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors)  

        # start to propagation  
        iter = 0; pre_label_function = np.zeros((num_samples, num_classes), np.float32)  
        changed = np.abs(pre_label_function - label_function).sum()  
        while iter < max_iter and changed > tol:  
            if iter % 1 == 0:  
                print ("---> Iteration %d/%d, changed: %f" % (iter, max_iter, changed))
            pre_label_function = label_function  
            iter += 1  

            # propagation  
            label_function = np.dot(affinity_matrix, label_function)  

            # clamp  
            label_function[0 : num_label_samples] = clamp_data_label  

            # check converge  
            changed = np.abs(pre_label_function - label_function).sum()  

        # get terminate label of unlabeled data  
        unlabel_data_labels = np.zeros(num_unlabel_samples)  
        for i in range(num_unlabel_samples):  
            unlabel_data_labels[i] = np.argmax(label_function[i+num_label_samples]) #取出参数中元素最大值所对应的索引 

        return unlabel_data_labels 

In [2]:
process = market()
process.add_stopwords("D:/Users/PYTHON/Precision-Marketing/stopwords.txt")
process.read_vectors("D:/NLP/sgns.target.word-word.dynwin5.thr10.neg5.dim300.txt",10000)

Load 2316 stopwords
Load 10000 word vectors.


In [3]:
embedding_matrix = process.embedding_matrix()
embedding_matrix.shape

(10000, 300)

In [21]:
os.chdir("D:/Users/PYTHON/Precision-Marketing")

WORRY = pd.read_csv("已标记的担忧咋办标签.csv")
WORRY = WORRY[["博文","是否担忧（1=担忧，0=不担忧）"]] 
PATIENT = pd.read_csv("已标记的会不会得病标签.csv")
PATIENT = PATIENT[["博文","是否担忧（1=担忧，0=不担忧）"]]
# PATIENT = PATIENT.fillna("无") 可以尝试将是否担忧作为一个feature！
print(WORRY.head(5))
print(PATIENT.head(5))

                                                  博文  是否担忧（1=担忧，0=不担忧）
0  今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...                 0
1  今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021啊好...                 0
2  今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...                 0
3  问：如何让猫黏上你？答：穿一条毛茸茸的裤子。————但是我现在有点担心你现在睡这么多晚上我睡...                 0
4  今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...                 0
                                                  博文  是否担忧（1=担忧，0=不担忧）
0  给好多人说了新年快乐说了新年祝福还没有给自己说过。新年快乐希望在2020年你可以学有所成重要...               0.0
1  2019开始倒数了哦原来小时候觉得很遥远的2020年就是明天了要说2019年我收获了什么呢？...               1.0
2  微信，QQ都是认识的人，没有办法发泄内心的不满，渐渐养成了不发朋友圈的习惯，但是自己变得越来...               1.0
3  现在已经对酒上瘾很严重每天这个点必须喝半杯而且一天喝水喝特别少大概就一瓶不到..我会不会得病...               1.0
4  这段时间每天睡得晚，起的晚，可恨没有好好吃早餐。会不会得病啊中午上一节男老师的尊巴舞，如果体...               1.0


In [5]:
df = pd.concat([WORRY,PATIENT]) # concat之后row index会重复
# df = pd.concat([PATIENT])
df.reset_index(drop = True,inplace = True)
print(len(df))
df = df.dropna()
print(len(df))
df.head(5)

1049497
1636


Unnamed: 0,博文,是否担忧（1=担忧，0=不担忧）
0,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...,0.0
1,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021啊好...,0.0
2,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...,0.0
3,问：如何让猫黏上你？答：穿一条毛茸茸的裤子。————但是我现在有点担心你现在睡这么多晚上我睡...,0.0
4,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...,0.0


In [6]:
df["是否担忧（1=担忧，0=不担忧）"].value_counts()

0.0    1370
1.0     266
Name: 是否担忧（1=担忧，0=不担忧）, dtype: int64

In [7]:
df_worry = df[df["是否担忧（1=担忧，0=不担忧）"] == 1]
df_worry.reset_index(drop = True,inplace = True)
df_non_worry = df[df["是否担忧（1=担忧，0=不担忧）"] == 0]
df_non_worry.reset_index(drop = True,inplace = True)
df_non_worry = process.random_pick(df_non_worry,min(len(df_worry),len(df_non_worry)))
print(len(df_worry))
print(len(df_non_worry))

266
266


In [8]:
df_worry

Unnamed: 0,博文,是否担忧（1=担忧，0=不担忧）
0,焦虑症我咋办啊!!我又开始怀疑自己心脏病了…每时每刻都在担心自己是不是会突然倒下去这几天都...,1.0
1,累…要做好工作还要当好妈妈，有多久没有凌晨两点前睡过觉了，刚收拾完宝贝明天去海洋馆要拿的东西...,1.0
2,我是真的很担心我的胳膊不好使了咋办2武汉·亿童大厦​,1.0
3,臭宝前天晚上去医院看了，大夫说是支原体感染，在网上查说支原体感染很麻烦，很容易复发，我又担心...,1.0
4,36个年头，第一次这么怕死，虽然问过医生，还是有点担心，怕真有什么，俩宝咋办，四老咋办，唉，...,1.0
...,...,...
261,刚刚收到公司行政部的短信调查，问我感冒咳嗽好了没。啊啊啊，好害怕回去上班！路上好危险，要搭地...,1.0
262,你们在家里呆有没有出现很闷 就是心慌然后呼吸急促 怎么办我好害怕😨 ​,1.0
263,妈妈不在家的时候我总是很焦虑，我害怕她出事，虽然她身体健康，我想和她每天每个时间都在一起，妈...,1.0
264,望着躺在重症监护室里的父亲，心里五味杂陈，我笑着告诉他一切都好，却害怕他看见我眼角的泪有些话...,1.0


In [9]:
df_worry

Unnamed: 0,博文,是否担忧（1=担忧，0=不担忧）
0,焦虑症我咋办啊!!我又开始怀疑自己心脏病了…每时每刻都在担心自己是不是会突然倒下去这几天都...,1.0
1,累…要做好工作还要当好妈妈，有多久没有凌晨两点前睡过觉了，刚收拾完宝贝明天去海洋馆要拿的东西...,1.0
2,我是真的很担心我的胳膊不好使了咋办2武汉·亿童大厦​,1.0
3,臭宝前天晚上去医院看了，大夫说是支原体感染，在网上查说支原体感染很麻烦，很容易复发，我又担心...,1.0
4,36个年头，第一次这么怕死，虽然问过医生，还是有点担心，怕真有什么，俩宝咋办，四老咋办，唉，...,1.0
...,...,...
261,刚刚收到公司行政部的短信调查，问我感冒咳嗽好了没。啊啊啊，好害怕回去上班！路上好危险，要搭地...,1.0
262,你们在家里呆有没有出现很闷 就是心慌然后呼吸急促 怎么办我好害怕😨 ​,1.0
263,妈妈不在家的时候我总是很焦虑，我害怕她出事，虽然她身体健康，我想和她每天每个时间都在一起，妈...,1.0
264,望着躺在重症监护室里的父亲，心里五味杂陈，我笑着告诉他一切都好，却害怕他看见我眼角的泪有些话...,1.0


In [10]:
df_non_worry

Unnamed: 0,博文,是否担忧（1=担忧，0=不担忧）
701,Lgl官司输了这可咋办我有点担心小朋友了​,0.0
73,🐯:有一位同学考上了北大，说大学比高中还辛苦。我们有的同学就开始担心了，我考上北大咋办家？你...,0.0
781,人的一生真的会有很多第一次哎～比如：今天下午第一次上空中瑜伽课这一小时我是在担心和兴奋中度过...,0.0
1025,#答辩##毕业论文#本科，去年十二月自己选题，被导师毙了几次，那几天做梦梦到自己差点没毕业。...,0.0
922,【恭喜#张怡宁二胎生子#，又诞生一个可以随便玩金牌的小魔王！】据报道，乒乓张怡宁10月30日...,0.0
...,...,...
173,那我以后呢如果和他结婚的话生了孩子谁来照顾呢我看到我妈去照顾她孙子那么高兴以后我的孩子呢也会...,0.0
696,陈正catgod咋办，我好难受，担心败者组​,0.0
966,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...,0.0
904,#给肖战讲笑话#一家三口在大草原旅游。女儿见草原上人烟稀少，担心地问：“妈妈，我们要是遇到狼...,0.0


In [11]:
df_worry = df_worry.dropna()
df_non_worry = df_non_worry.dropna()
print(len(df_worry))
print(len(df_non_worry))

df_use = pd.concat([df_worry,df_non_worry])
df_use.reset_index(drop = True,inplace = True)
df_use = df_use.reindex(np.random.permutation(df_use.index))
df_use.head()


266
266


Unnamed: 0,博文,是否担忧（1=担忧，0=不担忧）
369,我和他的故事真的不是什么好故事，不明白我的闺蜜们为什么非要我讲出来。和他认识是因为参加了同一...,0.0
8,今天被声讨了，心里总是憋着点歉意，到现在还没睡，睡不着，倒也饿起来了，越饿越睡不着，咋办，会...,1.0
489,安检员开始担心摸了会不会得病哈哈哈2333333,0.0
58,我会不会得病啊2邢台​,1.0
21,补作业补到这个点，还欠了一大堆，同时还有一堆工作白天必须交稿，马上面试了还没把该背的背完，每...,1.0


In [12]:
x_train = process.word_cut(df_use["博文"])
x_train = process.frequency(x_train,5)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 0.648 seconds.
Prefix dict has been built successfully.


In [13]:
token_index = process.dictionary(x_train)

In [14]:
token_length = process.count(x_train)
token_length = {key:token_length[key] for key in sorted(token_length,key = lambda x: token_length[x],reverse = True)[:round((2/5)*len(token_index))]}

In [15]:
# 给字符编码。如果x_train中的词语不在传入的实参token_index中，那么就编码为-1
x_train = process.recoding(x_train,process.word_index)
# x_train[:5]

In [16]:
x_train = process.delete(x_train)
# x_train[:2]

In [17]:
import keras
import tensorflow
from keras import preprocessing

max_len = 50
x_train = preprocessing.sequence.pad_sequences(x_train,maxlen = max_len)

x_train.shape

Using TensorFlow backend.


(532, 50)

In [18]:
df_use.head()

Unnamed: 0,博文,是否担忧（1=担忧，0=不担忧）
369,我和他的故事真的不是什么好故事，不明白我的闺蜜们为什么非要我讲出来。和他认识是因为参加了同一...,0.0
8,今天被声讨了，心里总是憋着点歉意，到现在还没睡，睡不着，倒也饿起来了，越饿越睡不着，咋办，会...,1.0
489,安检员开始担心摸了会不会得病哈哈哈2333333,0.0
58,我会不会得病啊2邢台​,1.0
21,补作业补到这个点，还欠了一大堆，同时还有一堆工作白天必须交稿，马上面试了还没把该背的背完，每...,1.0


In [73]:
y_train = df_use[["是否担忧（1=担忧，0=不担忧）"]]
# y_train["是否担忧（1=担忧，0=不担忧）"] = y_train["是否担忧（1=担忧，0=不担忧）"].astype("category")
y_train.head(5)

Unnamed: 0,是否担忧（1=担忧，0=不担忧）
348,0.0
124,1.0
237,1.0
469,0.0
416,0.0


In [74]:
y_in = len(y_train)
y_train = np.array(y_train)
y_train = y_train.reshape(y_in)
y_train[:5]

array([0., 1., 1., 0., 0.])

In [76]:
print(x_train.shape)
print(y_train.shape)

(532, 50)
(532,)


In [77]:
def get_values(token_index):
    values = []
    for key in token_index:
        values.append(token_index[key])
    return(values)
values = get_values(token_index)
values[:5]
print("最大的序号是：",max(values))

最大的序号是： 663


In [78]:
from keras.models import Sequential
from keras.layers import Flatten,Dense,Embedding,LSTM,Bidirectional,Dropout

max_features = 10000
max_len = 50

model = Sequential()
model.add(Embedding(max_features,300,input_length = max_len,mask_zero = True)) # 遇到0，就不会反向传播更新权重
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(20),merge_mode = 'concat'))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(optimizer = 'rmsprop',loss = 'binary_crossentropy',metrics = ['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 300)           3000000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 50, 300)           0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 40)                51360     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 41        
Total params: 3,051,401
Trainable params: 3,051,401
Non-trainable params: 0
_________________________________________________________________


In [79]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [80]:
history = model.fit(x_train,
                    y_train,
                    epochs = 10,
                    batch_size = 128, # batch_size越大越好，但是太大会影响计算效率
                    validation_split= 0)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [103]:
# model.save("Bi-LSTM(加入预训练的词向量库).h5")

In [94]:
new_data = pd.read_csv("未标记的担忧咋办标签.csv")
new_data_preserved = new_data
new_data.head(2)

Unnamed: 0,用户名,博文,预测是否担忧（1=担忧，0=不担忧）,转发数,评论数,点赞数,发文时间,来自,页面网址,博文链接
0,玥玥的碎花小裙儿,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...,0,,,,\n 2019年12月31日 23:52\n ...,realme Q 四摄迅猛龙,https://s.weibo.com/weibo/%25E6%258B%2585%25E5...,https://weibo.com/6275719793/Innh3oI0n?refer_f...
1,海里星星16687,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021啊好...,0,,,,\n 2019年12月31日 23:35\n ...,OPPO超视野全面屏R15,https://s.weibo.com/weibo/%25E6%258B%2585%25E5...,https://weibo.com/6607786597/InnadlLNd?refer_f...


In [95]:
new_data = new_data[["博文"]]
new_data["博文"] = process.word_cut(new_data["博文"])
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,博文
0,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."
1,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."


In [96]:
new_data["博文"] = process.frequency(new_data["博文"],5)
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,博文
0,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."
1,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."


In [97]:
new_data["博文"] = process.recoding(new_data["博文"],process.word_index)
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,博文
0,"[-1, -1, -1, -1, -1, -1, 4344, -1, -1, 3076, -..."
1,"[-1, -1, -1, -1, -1, -1, 4344, -1, -1, 3076, -..."


In [98]:
new_data["博文"] = process.delete(new_data["博文"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [99]:
x_test = preprocessing.sequence.pad_sequences(new_data["博文"],maxlen = max_len)

In [100]:
prediction = model.predict(x_test)
prediction = prediction.reshape(len(prediction))
prediction

array([0.03170225, 0.04756665, 0.03170225, ..., 0.02784744, 0.23346764,
       0.03607011], dtype=float32)

In [102]:
Result = pd.DataFrame({"预测结果":list(prediction),"博文":new_data_preserved["博文"]})
# Result.to_excel("Bi-LSTM预测结果（(加入预训练的词向量库)）.xlsx",header = True)