In [None]:
import os
import pandas as pd
import numpy as np

class market():
    """Helpers for Chinese text preprocessing and graph label propagation.

    The class groups three loosely related toolkits:

    * tokenising documents with ``jieba`` and filtering/encoding the tokens,
    * loading a text-format word-vector file and building an embedding matrix,
    * a small label-propagation implementation over an rbf or knn graph.

    State set by the methods: ``stopwords`` (``add_stopwords``),
    ``docLength`` (``word_cut``) and ``dim`` / ``max_words`` / ``word_index`` /
    ``index_word`` / ``vectors`` (``read_vectors``).
    """

    # Tokens that carry no content and are always dropped by ``word_cut``.
    _BLANK_TOKENS = frozenset({'', '\u3000', '\n', '\u200b'})

    def __init__(self):
        # Defaults so that word_cut / get_docLength work before
        # add_stopwords / word_cut have been called.
        self.stopwords = set()
        self.docLength = 0

    def word_cut(self, documents):
        """Segment every document with jieba and drop stopwords/blank tokens.

        Args:
            documents: iterable of strings.

        Returns:
            A list with one list of kept tokens per document. Also records
            the number of processed documents in ``self.docLength``.
        """
        import jieba  # local import: only this method needs the segmenter

        stopwords = self.stopwords
        blank = self._BLANK_TOKENS
        texts = []
        for line in documents:
            # Join with spaces and split again so that tokens which themselves
            # contain spaces are broken up (the empty pieces are filtered out).
            words = ' '.join(jieba.cut(line)).split(' ')
            texts.append([w for w in words if w not in stopwords and w not in blank])
        # Counting the outputs (instead of len(documents)) also supports
        # iterables that have no len().
        self.docLength = len(texts)
        return texts

    def get_docLength(self):
        """Return the number of documents seen by the last ``word_cut`` call."""
        return self.docLength

    def frequency(self, texts, freq):
        """Keep only tokens whose corpus-wide count is strictly above ``freq``.

        Args:
            texts: list of token lists.
            freq: count threshold; tokens occurring ``freq`` times or fewer
                are removed.

        Returns:
            A new list of token lists (the input is not modified).
        """
        from collections import Counter

        counts = Counter(word for text in texts for word in text)
        return [[word for word in text if counts[word] > freq] for text in texts]

    def regroup(self, texts):
        """Join each token list back into one space-separated string."""
        return [" ".join(sentence) for sentence in texts]

    def add_stopwords(self, path, encoding='cp936'):
        """Load a stopword file (one word per line) into ``self.stopwords``.

        Args:
            path: path of the stopword file.
            encoding: text encoding of the file; defaults to ``cp936`` (GBK),
                the encoding this method has always assumed.
        """
        with open(path, 'r', encoding=encoding) as file:
            stopwords = {line.strip() for line in file}
        self.stopwords = stopwords
        print("Load %s stopwords" % len(stopwords))

    def dictionary(self, docs):
        """Map every distinct token to a 1-based id in first-seen order."""
        token_index = {}
        for sample in docs:
            for word in sample:
                # The id is computed before insertion, so ids start at 1.
                token_index.setdefault(word, len(token_index) + 1)
        return token_index

    def count(self, docs):
        """Return a dict mapping each token to its number of occurrences."""
        from collections import Counter
        from itertools import chain

        return dict(Counter(chain.from_iterable(docs)))

    def recoding(self, docs, token_index):
        """Replace tokens by their ids, in place; unknown tokens become -1.

        Args:
            docs: collection of mutable token sequences (e.g. list of lists).
            token_index: mapping from token to integer id.

        Returns:
            The same ``docs`` object, with every inner sequence re-encoded.
        """
        for sample in docs:
            for j, word in enumerate(sample):
                sample[j] = token_index.get(word, -1)
        return docs

    def delete(self, docs):
        """Remove every -1 marker from each sequence of ``docs``, in place."""
        for doc in docs:
            # Slice assignment keeps the same list object, so callers holding
            # a reference to the inner lists see the cleaned content.
            doc[:] = [token for token in doc if token != -1]
        return docs

    def random_pick(self, df, n):
        """Return ``n`` rows of ``df`` drawn at random without replacement.

        Every row (including the last one) can be drawn, and rows are picked
        by position so the frame does not need a default integer index. If
        ``n`` exceeds the number of rows, all rows are returned in random
        order.
        """
        import random

        total = len(df)
        size = max(0, min(int(n), total))
        positions = random.sample(range(total), size)
        return df.iloc[positions]

    def read_vectors(self, path, topn):
        """Read the first ``topn`` word vectors from a text vector file.

        The first line is a header whose second field is the vector
        dimension. Every following line is ``word v1 v2 ...`` separated by
        single spaces.

        Args:
            path: path of the vector file (decoded as UTF-8, errors ignored).
            topn: number of vectors to read; ``0`` reads the whole file.

        Side effects:
            Sets ``dim``, ``max_words``, ``word_index`` (word -> row),
            ``index_word`` (row -> word) and ``vectors`` (word -> ndarray).
            ``max_words`` is ``topn``, or the number of words read when
            ``topn`` is not positive, so ``embedding_matrix`` is never
            accidentally empty.
        """
        dim = 0
        vectors = {}
        iw = []
        with open(path, encoding='utf-8', errors='ignore') as f:
            header = next(f, None)
            if header is not None:
                dim = int(header.rstrip().split()[1])
                for line in f:
                    if not line.strip():
                        # A blank line would register an empty-vector word
                        # that later breaks embedding_matrix.
                        continue
                    tokens = line.rstrip().split(' ')
                    vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
                    iw.append(tokens[0])
                    if topn != 0 and len(iw) >= topn:
                        break
        # word_index is the inverse of index_word.
        wi = {w: i for i, w in enumerate(iw)}
        self.dim = dim
        self.max_words = topn if topn > 0 else len(iw)
        self.word_index = wi
        self.index_word = iw
        self.vectors = vectors
        print("Load %s word vectors." % len(vectors))

    def embedding_matrix(self):
        """Build a ``(max_words, dim)`` matrix whose row i is word i's vector.

        Rows without a loaded vector stay zero. Requires ``read_vectors`` to
        have been called first.
        """
        matrix = np.zeros((self.max_words, self.dim))
        for word, i in self.word_index.items():
            if i >= self.max_words:
                continue
            vector = self.vectors.get(word)
            if vector is not None:
                matrix[i] = vector
        return matrix

    def navie_knn(self, dataSet, query, k):
        """Return the indices of the ``k`` rows of ``dataSet`` nearest to ``query``.

        Distance is squared Euclidean; indices are ordered from nearest to
        farthest. If ``k`` exceeds the number of rows, all indices are
        returned.
        """
        # Broadcasting subtracts the query from every row.
        squared_dist = np.sum((query - dataSet) ** 2, axis=1)
        nearest_first = np.argsort(squared_dist)  # ascending distance
        return nearest_first[0:min(k, len(nearest_first))]

    def buildGraph(self, MatX, kernel_type, rbf_sigma=None, knn_num_neighbors=None):
        """Build the row-normalised affinity matrix of the sample graph.

        Args:
            MatX: 2-D array with one sample per row.
            kernel_type: ``'rbf'`` or ``'knn'``.
            rbf_sigma: kernel width, required for ``'rbf'``.
            knn_num_neighbors: neighbour count, required for ``'knn'``.

        Returns:
            A float32 ``(n, n)`` matrix whose rows each sum to 1.

        Raises:
            ValueError: the parameter required by the chosen kernel is missing.
            NameError: ``kernel_type`` is neither ``'rbf'`` nor ``'knn'``.
        """
        num_samples = MatX.shape[0]
        affinity_matrix = np.zeros((num_samples, num_samples), np.float32)
        if kernel_type == 'rbf':
            if rbf_sigma is None:
                raise ValueError('You should input a sigma of rbf kernel!')
            scale = -2.0 * rbf_sigma ** 2
            for i in range(num_samples):
                # One vectorised pass per row instead of a Python inner loop.
                squared_dist = np.sum((MatX - MatX[i, :]) ** 2, axis=1)
                affinity_matrix[i] = np.exp(squared_dist / scale)
                affinity_matrix[i] /= affinity_matrix[i].sum(dtype=np.float64)
        elif kernel_type == 'knn':
            if knn_num_neighbors is None:
                raise ValueError('You should input a k of knn kernel!')
            for i in range(num_samples):
                k_neighbors = self.navie_knn(MatX, MatX[i, :], knn_num_neighbors)
                # Connect sample i to its neighbours with equal weights.
                # Dividing by the number of neighbours actually found keeps
                # the row normalised when there are fewer than k samples.
                affinity_matrix[i][k_neighbors] = 1.0 / len(k_neighbors)
        else:
            raise NameError('Not support kernel type! You can use knn or rbf!')

        return affinity_matrix

    def labelPropagation(self, Mat_Label, Mat_Unlabel, labels, kernel_type='rbf', rbf_sigma=0.20,
                         knn_num_neighbors=10, max_iter=500, tol=1e-3):
        """Predict labels for unlabeled samples by label propagation.

        Args:
            Mat_Label: 2-D array of labeled samples, one per row.
            Mat_Unlabel: 2-D array of unlabeled samples, one per row.
            labels: label of each labeled sample. Labels are used directly as
                column indices, so they must be integers in
                ``0 .. number_of_distinct_labels - 1``.
            kernel_type, rbf_sigma, knn_num_neighbors: forwarded to
                ``buildGraph``.
            max_iter: maximum number of propagation steps.
            tol: stop once the summed absolute change of the label matrix
                between two steps is at most this value.

        Returns:
            A float array with the predicted class index of every unlabeled
            sample. Progress is printed on every iteration.
        """
        num_label_samples = Mat_Label.shape[0]
        num_unlabel_samples = Mat_Unlabel.shape[0]
        num_samples = num_label_samples + num_unlabel_samples
        num_classes = len(np.unique(labels))

        MatX = np.vstack((Mat_Label, Mat_Unlabel))

        # One-hot rows for the labeled samples; re-imposed after every step.
        clamp_data_label = np.zeros((num_label_samples, num_classes), np.float32)
        for i in range(num_label_samples):
            clamp_data_label[i][labels[i]] = 1.0

        label_function = np.zeros((num_samples, num_classes), np.float32)
        label_function[0:num_label_samples] = clamp_data_label
        label_function[num_label_samples:num_samples] = -1  # unlabeled rows start at -1

        affinity_matrix = self.buildGraph(MatX, kernel_type, rbf_sigma, knn_num_neighbors)

        iteration = 0
        pre_label_function = np.zeros((num_samples, num_classes), np.float32)
        changed = np.abs(pre_label_function - label_function).sum()
        while iteration < max_iter and changed > tol:
            print("---> Iteration %d/%d, changed: %f" % (iteration, max_iter, changed))
            # np.dot returns a new array, so keeping a reference is enough.
            pre_label_function = label_function
            iteration += 1

            # Propagate, then clamp the known labels back in place.
            label_function = np.dot(affinity_matrix, label_function)
            label_function[0:num_label_samples] = clamp_data_label

            changed = np.abs(pre_label_function - label_function).sum()

        # The predicted class of each unlabeled sample is its strongest column.
        unlabel_data_labels = np.zeros(num_unlabel_samples)
        if num_unlabel_samples:
            unlabel_data_labels[:] = np.argmax(label_function[num_label_samples:], axis=1)
        return unlabel_data_labels

In [None]:
# Score the unlabeled posts with the trained model.
# Relies on `process`, `preprocessing`, `model` and `max_len` from earlier cells.
new_data = pd.read_csv("未标记的担忧咋办标签.csv")
# Keep the untouched frame so the raw post text can be shown next to the scores.
new_data_preserved = new_data
# Explicit copy: assigning into a column of a bare selection is chained
# assignment and triggers pandas' SettingWithCopyWarning.
new_data = new_data[["博文"]].copy()

# Run the token pipeline on plain lists instead of round-tripping every
# intermediate result through a DataFrame column.
encoded_docs = process.word_cut(new_data["博文"])
encoded_docs = process.frequency(encoded_docs, 5)
encoded_docs = process.recoding(encoded_docs, process.word_index)
encoded_docs = process.delete(encoded_docs)
new_data["博文"] = encoded_docs

x_test = preprocessing.sequence.pad_sequences(encoded_docs, maxlen=max_len)
prediction = model.predict(x_test)
# Flatten to one value per post (assumes the model emits one score per sample).
prediction = prediction.reshape(len(prediction))
prediction

In [None]:
# Put each prediction next to the original (unprocessed) post text.
Result = pd.DataFrame(
    {
        "预测结果": list(prediction),
        "博文": new_data_preserved["博文"],
    }
)
Result
# Optional export, currently disabled:
# Result.to_excel("Bi-LSTM预测结果（(加入预训练的词向量库)）.xlsx",header = True)