In [1]:
import os
import pandas as pd
import numpy as np

class market():
    def word_cut(self,documents):
        stopwords = self.stopwords
        import jieba
        texts = []
        for line in documents:
            words = ' '.join(jieba.cut(line)).split(' ') # 用空格去连接，连接后马上又拆分
            text = []
            for word in words:
                if (word not in stopwords) & (word != '')& (word != '\u3000')& (word != '\n')&(word != '\u200b'):
                    text.append(word)
            texts.append(text)
        self.docLength = len(documents)
        return(texts)
    def get_docLength(self):
        return(self.docLength)
    def frequency(self,texts,freq):
        from collections import defaultdict
        frequency = defaultdict(int) # value为int
        for text in texts:
            for word in text:
                frequency[word] += 1
        texts = [[word for word in text if frequency[word] > freq] for text in texts]
        return(texts)
    def regroup(self,texts):
        new_texts = []
        for i,sentence in enumerate(texts):
            new_texts.append(" ".join(sentence))
        return(new_texts)
    def add_stopwords(self,path):
        stopwords = set()
        with open(path,'r',encoding = 'cp936') as file:
            for line in file:
                stopwords.add(line.strip())
        self.stopwords = stopwords
        print("Load %s stopwords" %len(stopwords))
    def dictionary(self,docs):
        token_index ={}
        for sample in docs:
            for word in sample:
                if word not in token_index:
                    token_index[word] = len(token_index) + 1
        return(token_index)
    def count(self,docs):
        token_length ={}
        for sample in docs:
            for word in sample:
                if word not in token_length:
                    token_length[word] = 1
                else:
                    token_length[word] += 1
        return(token_length)
    def recoding(self,docs,token_index):
        for i,sample in enumerate(docs):
            for j,word in enumerate(sample):
                if word not in token_index:
                    sample[j] = -1
                else:
                    sample[j] = token_index[word]
            docs[i] = sample
        return(docs)
    def delete(self,docs):
        for index in range(len(docs)):
            for i in range(len(docs[index])-1,-1,-1):
                if docs[index][i] == -1:
                    docs[index].pop(i)
        return docs
    def random_pick(self,df,n):
        import random
        import numpy as np
        rand = np.arange(0,(len(df)-1),1)
        random.shuffle(rand)
        rand = list(rand[:n])
        df = df.loc[rand,]
        return(df)
    def read_vectors(self,path, topn):  # read top n word vectors, i.e. top is 10000
        lines_num, dim = 0, 0
        vectors = {}
        iw = []
        wi = {}
        with open(path, encoding='utf-8', errors='ignore') as f:
            first_line = True
            for line in f:
                if first_line:
                    first_line = False
                    dim = int(line.rstrip().split()[1]) # 删除向量末尾的空格，然后以空格拆分获得向量
                    continue
                lines_num += 1
                tokens = line.rstrip().split(' ')
                vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])# 当数据源是ndarray时，asarray不会占用新的内存；当数据源不是ndarray,asarray与array一样
                iw.append(tokens[0]) # iw储存了所有的tokons[0]，意思是index_word
                if topn != 0 and lines_num >= topn:
                    break
        for i, w in enumerate(iw):
            wi[w] = i # wi是iw的反转，意思是word_index,用w来储存字符，用一个integer去给字符编码
        self.dim = dim
        self.max_words = topn
        self.word_index = wi
        self.index_word = iw
        self.vectors = vectors
        print("Load %s word vectors." % len(vectors))
    def embedding_matrix(self):
        embedding_matrix = np.zeros((self.max_words,self.dim))
        for word,i in self.word_index.items():
            if i < self.max_words:
                embedding_vector = self.vectors.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        return embedding_matrix
    def navie_knn(self,dataSet, query, k):  
        # 计算出某一样本与所有样本的距离，选择最大(应该修改为最小？)的k个样本作为用于knn
        numSamples = dataSet.shape[0] # return row(sample) number of dataset

        ## step 1: calculate Euclidean distance  
        diff = np.tile(query, (numSamples, 1)) - dataSet #tile: 把query这个向量纵向复制，使得结果与dataset具有同样的行数
        squaredDiff = diff ** 2  
        squaredDist = np.sum(squaredDiff, axis = 1) # sum is performed by row  

        ## step 2: sort the distance  
        sortedDistIndices = np.argsort(squaredDist)   # numpy.argsort 返回的是数组值从小到大的索引值（注意是索引值，不是绝对值）
        if k > len(sortedDistIndices):  
            k = len(sortedDistIndices)  

        return sortedDistIndices[0:k]
    # build a big graph (normalized weight matrix)  
    def buildGraph(self,MatX, kernel_type, rbf_sigma = None, knn_num_neighbors = None):  
        num_samples = MatX.shape[0]  # return row(sample) number of MatX
        affinity_matrix = np.zeros((num_samples, num_samples), np.float32)  
        if kernel_type == 'rbf':  
            if rbf_sigma == None:  
                raise ValueError('You should input a sigma of rbf kernel!')  
            for i in range(num_samples):  
                row_sum = 0.0  
                for j in range(num_samples):  
                    diff = MatX[i, :] - MatX[j, :]  
                    affinity_matrix[i][j] = np.exp(sum(diff**2) / (-2.0 * rbf_sigma**2))  
                    row_sum += affinity_matrix[i][j]  
                affinity_matrix[i][:] /= row_sum  
        elif kernel_type == 'knn':  
            if knn_num_neighbors == None:  
                raise ValueError('You should input a k of knn kernel!')  
            for i in range(num_samples):  
                k_neighbors = self.navie_knn(MatX, MatX[i, :], knn_num_neighbors)  
                affinity_matrix[i][k_neighbors] = 1.0 / knn_num_neighbors  # 将节点i与附近的k个节点连接起来，每个边的权重是1/knn_num_neighbors
        else:  
            raise NameError('Not support kernel type! You can use knn or rbf!')  

        return affinity_matrix  
    # label propagation  
    def labelPropagation(self,Mat_Label, Mat_Unlabel, labels, kernel_type = 'rbf', rbf_sigma = 0.20, \
                        knn_num_neighbors = 10, max_iter = 500, tol = 1e-3):  
        # initialize  
        num_label_samples = Mat_Label.shape[0]  #已经标记的sample number
        num_unlabel_samples = Mat_Unlabel.shape[0]  #未标记的sample number
        num_samples = num_label_samples + num_unlabel_samples
        labels_list = np.unique(labels)  #有哪些label
        num_classes = len(labels_list)  #label的种类数

        MatX = np.vstack((Mat_Label, Mat_Unlabel))
        clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32)  
        for i in range(num_label_samples):  
            clamp_data_label[i][labels[i]] = 1.0   #标记出每一个labelled sample的具体label是什么

        label_function = np.zeros((num_samples, num_classes), np.float32)  
        label_function[0 : num_label_samples] = clamp_data_label  
        label_function[num_label_samples : num_samples] = -1  

        # graph construction  
        affinity_matrix = self.buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors)  

        # start to propagation  
        iter = 0; pre_label_function = np.zeros((num_samples, num_classes), np.float32)  
        changed = np.abs(pre_label_function - label_function).sum()  
        while iter < max_iter and changed > tol:  
            if iter % 1 == 0:  
                print ("---> Iteration %d/%d, changed: %f" % (iter, max_iter, changed))
            pre_label_function = label_function  
            iter += 1  

            # propagation  
            label_function = np.dot(affinity_matrix, label_function)  

            # clamp  
            label_function[0 : num_label_samples] = clamp_data_label  

            # check converge  
            changed = np.abs(pre_label_function - label_function).sum()  

        # get terminate label of unlabeled data  
        unlabel_data_labels = np.zeros(num_unlabel_samples)  
        for i in range(num_unlabel_samples):  
            unlabel_data_labels[i] = np.argmax(label_function[i+num_label_samples]) #取出参数中元素最大值所对应的索引 

        return unlabel_data_labels 

In [2]:
process = market()
process.add_stopwords("D:/Users/PYTHON/Precision-Marketing/stopwords.txt")
process.read_vectors("D:/NLP/sgns.target.word-word.dynwin5.thr10.neg5.dim300.txt",10000)

Load 2316 stopwords
Load 10000 word vectors.


In [3]:
embedding_matrix = process.embedding_matrix()
embedding_matrix.shape

(10000, 300)

In [4]:
os.chdir("D:/Users/PYTHON/Precision-Marketing")
df = pd.DataFrame()
num = 0
for i in range(10):
    df_temp = pd.read_excel("关键词标签.xlsx",sheet_name = i)
    df = df.append(df_temp)
    num += 1
print("一共读取了{}个sheet".format(num))
df[:2]

一共读取了10个sheet


Unnamed: 0,用户名,博文,关键词,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）",标签1（担忧对象）,标签2（担忧什么）,标签3（什么保险）,标签4（症状）,转发数,评论数,点赞数,发文时间,来自,页面网址,博文链接
0,1杯冰牛奶,\n 一直在各个公开场合严肃说了要理智告诉粉丝应该怎样...,父母怎么办,-1,,,,,,,,\n 今天06:16\n ...,iPhone 11,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/6606022745/IwQhN1EAC?refer_f...
1,干杯老铁,\n 网友求助: “因为男友我感染了hpv，尿道炎，宫...,父母怎么办,0,,,,,,,,\n 今天06:02\n ...,即刻笔记,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/3394404550/IwQbUFIBq?refer_f...


In [5]:
# df = pd.concat([WORRY,PATIENT]) # concat之后row index会重复
# df = pd.concat([PATIENT])
# df.reset_index(drop = True,inplace = True)
print(len(df))
df = df.loc[pd.notna(df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]),]
print(len(df))
df.head(2)

1576
1576


Unnamed: 0,用户名,博文,关键词,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）",标签1（担忧对象）,标签2（担忧什么）,标签3（什么保险）,标签4（症状）,转发数,评论数,点赞数,发文时间,来自,页面网址,博文链接
0,1杯冰牛奶,\n 一直在各个公开场合严肃说了要理智告诉粉丝应该怎样...,父母怎么办,-1,,,,,,,,\n 今天06:16\n ...,iPhone 11,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/6606022745/IwQhN1EAC?refer_f...
1,干杯老铁,\n 网友求助: “因为男友我感染了hpv，尿道炎，宫...,父母怎么办,0,,,,,,,,\n 今天06:02\n ...,即刻笔记,https://s.weibo.com/weibo/%25E7%2588%25B6%25E6...,https://weibo.com/3394404550/IwQbUFIBq?refer_f...


In [6]:
df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"].value_counts()

-1    942
 1    329
 0    296
 2      9
Name: 是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）, dtype: int64

In [7]:
df_worry = df[df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] == 1]
df_worry.reset_index(drop = True,inplace = True)
df_non_worry = df[df["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] == -1]
df_non_worry.reset_index(drop = True,inplace = True)
df_non_worry = process.random_pick(df_non_worry,min(len(df_worry),len(df_non_worry)))
print(len(df_worry))
print(len(df_non_worry))

329
329


In [8]:
df_worry = df_worry[["博文","是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]]
df_non_worry = df_non_worry[["博文","是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]]

In [9]:
df_worry

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
0,\n 抑郁症我有慢性疾病，怎么治都治不好，可是我才1...,1
1,\n 最近不好的事情真的太多了 想逃避所有情绪不稳定的...,1
2,\n 突然好怕自己有一天撑不住怎么办，父母怎么办，还有...,1
3,\n 疫情发生之后，觉悟到有个好身体的重要性，养成良好...,1
4,\n 皮肤病狗狗。喂了2天，看到我就屁颠屁颠跑过来…实...,1
...,...,...
324,\n 生产后一切安好，没掉发没腰痛没怕冷…直到最近腰不...,1
325,\n 坐着不知道该做什么，连续两天毫无食欲。节目完了之...,1
326,\n 当一天的24小时里，你只有4、5个小时的片段式睡...,1
327,\n #小九与小胡同学的日常# 76day小胡同学...,1


In [10]:
df_non_worry

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
256,\n 比莫干吃了一惊：“淳国知道大合萨的行程？” “他...,-1
161,\n 麻药过后是漫长的疼痛╯﹏╰冷漠的护士让我不是很开...,-1
561,#英语，学起来！##2月21日可以查四六级成绩# 【TED演讲：走出“舒适区”】“2015年...,-1
491,\n 梦见坐在教室里上课，实木的桌子好像有点豪华。梦见...,-1
365,\n 越大越觉得做一个普通人，有人疼有人爱，生病能正常...,-1
...,...,...
183,\n 一粒药12块钱 比我在学校吃一顿饭还贵(天坛新院...,-1
85,\n 【家庭婚姻情感专题】 问答(2020.03.01...,-1
869,\n 今天探索了维他的港式奶茶 还行8我真的每天吃完晚...,-1
840,\n 1、早孕期（12周之前）子宫还没有出盆腔，就是说...,-1


In [11]:
print(len(df_worry))
print(len(df_non_worry))
df_worry = df_worry.dropna()
df_non_worry = df_non_worry.dropna()
print(len(df_worry))
print(len(df_non_worry))

329
329
329
329


In [34]:
df_use = pd.concat([df_worry,df_non_worry])
df_use.reset_index(drop = True,inplace = True)
df_use = df_use.reindex(np.random.permutation(df_use.index))
df_use.head()

Unnamed: 0,博文,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
336,\n #孝感地震#08年汶川8级地震那次，正坐在上铺床...,-1
210,\n 最近又在关注暗网的信息，觉得好可怕。想想在国外独...,1
467,\n 没有太多感情基础的婚姻最终都是以吵架度日吗？当父...,-1
537,\n 我能理解孩子病了家长不愿意带孩子往医院跑的心情，...,-1
401,\n Q最近因为出不了门，大人小孩都很烦躁，我也经常失...,-1


In [35]:
x_train = process.word_cut(df_use["博文"])
x_train = process.frequency(x_train,5)

In [36]:
token_index = process.dictionary(x_train)

In [37]:
token_length = process.count(x_train)
token_length = {key:token_length[key] for key in sorted(token_length,key = lambda x: token_length[x],reverse = True)[:round((2/5)*len(token_index))]}

In [38]:
# 给字符编码。如果x_train中的词语不在传入的实参token_index中，那么就编码为-1
x_train = process.recoding(x_train,process.word_index)
# x_train[:5]

In [39]:
x_train = process.delete(x_train)
# x_train[:2]

In [40]:
import keras
import tensorflow
from keras import preprocessing

max_len = 50
x_train = preprocessing.sequence.pad_sequences(x_train,maxlen = max_len)

x_train.shape

(658, 50)

In [41]:
y_train = df_use[["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"]]
y_train["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"] = y_train["是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"].apply(lambda v: 0 if v == -1 else 1)
y_train.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,"是否担忧（1=担忧，-1=完全不担忧，0=中性，有些担忧但不用买保险,2=疑似抑郁症）"
336,0
210,1
467,0
537,0
401,0


In [42]:
y_in = len(y_train)
y_train = np.array(y_train)
y_train = y_train.reshape(y_in)
y_train[:5]

array([0, 1, 0, 0, 0], dtype=int64)

In [43]:
print(x_train.shape)
print(y_train.shape)

(658, 50)
(658,)


In [44]:
def get_values(token_index):
    values = []
    for key in token_index:
        values.append(token_index[key])
    return(values)
values = get_values(token_index)
values[:5]
print("最大的序号是：",max(values))

最大的序号是： 828


In [45]:
from keras.models import Sequential
from keras.layers import Flatten,Dense,Embedding,LSTM,Bidirectional,Dropout,Conv1D,MaxPooling1D

max_features = 10000
max_len = 50

model = Sequential()
model.add(Embedding(max_features,300,input_length = max_len,mask_zero = True)) # 遇到0，就不会反向传播更新权重
# model.add(Embedding(max_features,300,input_length = max_len))
# model.add(Conv1D(32, 5, activation='relu'))
# model.add(MaxPooling1D(5))
# model.add(Conv1D(64, 5, activation='relu'))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(20),merge_mode = 'concat'))
model.add(Dense(1,activation = 'sigmoid')) # softmax其实本质上扩展后的sigmoid
model.compile(optimizer = 'rmsprop',loss = 'binary_crossentropy',metrics = ['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 300)           3000000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 50, 300)           0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 40)                51360     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 41        
Total params: 3,051,401
Trainable params: 3,051,401
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [47]:
history = model.fit(x_train,
                    y_train,
                    epochs = 10,
                    batch_size = 128, # batch_size越大越好，但是太大会影响计算效率
                    validation_split= 0.3)

  'Discrepancy between trainable weights and collected trainable'
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 460 samples, validate on 198 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [103]:
# model.save("Bi-LSTM(加入预训练的词向量库).h5")

In [94]:
new_data = pd.read_csv("未标记的担忧咋办标签.csv")
new_data_preserved = new_data
new_data.head(2)

Unnamed: 0,用户名,博文,预测是否担忧（1=担忧，0=不担忧）,转发数,评论数,点赞数,发文时间,来自,页面网址,博文链接
0,玥玥的碎花小裙儿,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021年啊...,0,,,,\n 2019年12月31日 23:52\n ...,realme Q 四摄迅猛龙,https://s.weibo.com/weibo/%25E6%258B%2585%25E5...,https://weibo.com/6275719793/Innh3oI0n?refer_f...
1,海里星星16687,今晚就要跨年了咋办呢？先迈左脚还是右脚啊？跨不过去会不会摔一跤？腿太长会不会跨到2021啊好...,0,,,,\n 2019年12月31日 23:35\n ...,OPPO超视野全面屏R15,https://s.weibo.com/weibo/%25E6%258B%2585%25E5...,https://weibo.com/6607786597/InnadlLNd?refer_f...


In [95]:
new_data = new_data[["博文"]]
new_data["博文"] = process.word_cut(new_data["博文"])
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,博文
0,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."
1,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."


In [96]:
new_data["博文"] = process.frequency(new_data["博文"],5)
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,博文
0,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."
1,"[今晚, 跨年, 咋办, 先迈, 左脚, 右脚, 跨, 摔, 一跤, 腿, 太长会, 跨到,..."


In [97]:
new_data["博文"] = process.recoding(new_data["博文"],process.word_index)
new_data.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,博文
0,"[-1, -1, -1, -1, -1, -1, 4344, -1, -1, 3076, -..."
1,"[-1, -1, -1, -1, -1, -1, 4344, -1, -1, 3076, -..."


In [98]:
new_data["博文"] = process.delete(new_data["博文"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [99]:
x_test = preprocessing.sequence.pad_sequences(new_data["博文"],maxlen = max_len)

In [100]:
prediction = model.predict(x_test)
prediction = prediction.reshape(len(prediction))
prediction

array([0.03170225, 0.04756665, 0.03170225, ..., 0.02784744, 0.23346764,
       0.03607011], dtype=float32)

In [102]:
Result = pd.DataFrame({"预测结果":list(prediction),"博文":new_data_preserved["博文"]})
# Result.to_excel("Bi-LSTM预测结果（(加入预训练的词向量库)）.xlsx",header = True)