In [1]:
# -*- coding:utf-8 -*-
import pandas as pd
import jieba
import random
import re
from zhon.hanzi import punctuation
from gensim.models import word2vec
from gensim.models import doc2vec, ldamodel
from gensim import corpora
import numpy as np
from sklearn.decomposition import PCA
from numpy import *

In [79]:
# 数据预读取与预处理 _用于LDA模型
def pre_process(file):
    source = file
    df = pd.read_excel(source,encoding ="utf-8")
    df=df.dropna()                                                 #去掉空行
    scope=df['business_scope'].values.tolist()
    industry=df['industry'].values.tolist()
    
    #分句
    sentences = []
    labels = []
    for i,j in zip(scope,industry):
        #去除所有特殊符号
        sentence = []
        string = re.sub('\W+', '', i).replace("_", '')
        string = re.sub(r"[0-9]", "", string)
        string = re.sub(r"[a-zA-Z]", "", string)
        segs=jieba.lcut(str(string))                               #jieba分词
        for seg in segs:
            if len(seg) == 1:
                segs.remove(seg)
            else:
                sentence.append(seg)
        sentences.append(sentence)
        
        #industry集合
        labels.append(j)
        
    return sentences, labels

In [80]:
#载入训练数据集，进行数据预处理
sentences,labels = pre_process('./沪交所.xlsx')

# LDA部分

In [9]:
dictionary = corpora.Dictionary(sentences)  #词空间的生成，也就是将所有文章中取出来去重，剩下的词组成的列表。并进行编号 
corpus = [dictionary.doc2bow(sentence) for sentence in sentences]     #针对每个文本，将词汇转为id
#print(corpus[0])  # [(ID, frequence), (505, 1)...]

In [10]:
#训练模型
lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
#print(lda.print_topics(num_topics=100, num_words=5))

# 保存模型
lda.save('lda.model') 

In [11]:
#加载模型
lda = ldamodel.LdaModel.load('lda.model')

#测试数据
texts = pre_process('./上交所.xlsx')
for text in texts:
    bow = dictionary.doc2bow(text)
    #print(lda.get_document_topics(bow))

# Gensim部分

In [4]:
# 数据预处理，用pandas读取csv文件中的特定数据 _用于gensim模型
def pre_process2(file):
    source = file
    df = pd.read_excel(source,encoding ="utf-8")
    df=df.dropna()                                                 #去掉空行
    scope=df['business_scope'].values.tolist()
    industry=df['industry'].values.tolist()
    
    #分句
    sentences=[]
    for i,j in zip(scope,industry):
        #去除所有特殊符号
        string = re.sub('\W+', '', i).replace("_", '')
        segs=jieba.lcut(str(string))                               #jieba分词
        for seg in segs:
            if len(seg) < 2:
                segs.remove(seg)
        sentences.append(" ".join(segs))
    
    #写入为txt
    out=open('./train_data2.txt','w',encoding='utf-8')           
    for sentence in sentences:
        out.write(sentence+"\n")
    
    return sentences    
    
    
#载入训练数据集，进行数据预处理
sentences2 = pre_process2('./沪交所.xlsx')

In [5]:
#训练模型
def train(model_name, file_name):
    sent=word2vec.Text8Corpus(file_name)
    model=word2vec.Word2Vec(sent, size=50)
    model.save(model_name)

    #将字典中的词语添加到数组word_in_voc中
    word_in_voc = []
    for i, word in enumerate(model.wv.vocab):
        word_in_voc.append(word)
    #print(word_in_voc)
    
    return word_in_voc, model

In [6]:
word_in_voc, model = train('word2vec_model', 'train_data2.txt')

In [7]:
#筛选字典中的词
def combination(sentences):
    combine = []   #交集
    for sentence in sentences:
        c = list(set(sentence).intersection(set(word_in_voc)))      #sentence[[a,b,c],[a,s,d]]与字典中词选交集
        #print(c)
        combine.append(c)

    return combine
    
sentences = combination(sentences)

In [8]:
#形成词向量分段50维数组
def wv50(sentences):
    wv_arr = []
    for sentence in sentences:
        tmp = []
        for i in range(len(sentence)):                            #[ [[wv],[wv]...] , [[wv],[wv]...] ]
            tmp.append(model.wv[sentence[i]])                     #[[0,1,2,3],[2,3,4,5]..],[[0,1,2,3],[2,3,4,5]..]
        wv_arr.append(tmp)
    print(len(wv_arr))
    return wv_arr

wv_arr = wv50(sentences)

2125


In [9]:
# 基于2d PCA拟合数据
def minish_dimension(wv_arr):
    result = []
    for wv in wv_arr:
        X = np.array(wv)                               #序列词向量
        pca = PCA(n_components=2)             #使用pca将词向量降维到二维
        r = pca.fit_transform(X)                      #将序列词向量统一降为二维
        result.append(r)
    
    print(len(result))       #result :array( [ [wv1d],[wv1d]... ] , [[wv1d],[wv1d]...] , ...)     
    return result

result = minish_dimension(wv_arr)

2125


In [18]:
#段落平均词向量，存入wvList，完成KNN材料准备
def avg_wv(result):
    wvList = []
    for r in result:                       #[ [wv1d],[wv1d]... ]
        sum0 = 0
        sum1 = 0
        for sub in r:                      # [wv1d]
            sum0 += sub[0]
            sum1 += sub[1]
        wvList.append([ float(sum0/len(r)), float(sum1/len(r))])
    print(len(wvList))
    
    return wvList

wvList = avg_wv(result)

2125


# KNN部分

In [85]:
#两个准备数组 ： wvList[] = [ [wv1d],[wv1d]... ] ， labels[] = [asd,sf,fg,hj,...]

def kNNClassify(testData, wvList, labels, k):
    #求每点之间的最短距离
    #print(testData)
    #print(len(wvList))
    #print(len(labels))
    #print(k)
    diff = tile(testData, (len(wvList), 1)) - wvList       #测试点到每个数据集的点中[x,y]差值
    distance = sum(diff ** 2, axis = 1) ** 0.5             #distance = (diffx^2 + diffy^2)^0.5
    
    #求最短距离的数据点index排序集合（小-大）
    arg = list(argsort(distance))                       #[ 337 1470 1421 ...   96  452  735] 
    freq_dict = {}                                          #频率词典
    for i in range(k):
        label = labels [ arg[0] ]                    #最邻近k节点的分类
        freq_dict[label] = freq_dict.get(label, 0) + 1             #统计每个词汇出现次数
    #print(freq_dict)
    #求出现频率最高的label
    predict_label = max(freq_dict,key=freq_dict.get)
    #print(predict_label)                                           #频率最高的key🌟
    max_freq = freq_dict [max(freq_dict,key=freq_dict.get)]
    #print(max_freq)                                                #频率最高的value
    
    return predict_label


In [82]:
#测试数据
kNNClassify([1,2], wvList, labels, 30)

2125
2125


'全国地产'

# 测试

In [12]:
# 新数据
def pre_process3(file):
    source = file
    df = pd.read_excel(source,encoding ="utf-8")
    df=df.dropna()                                                 #去掉空行
    scope=df['business_scope'].values.tolist()
    industry=df['industry'].values.tolist()
    
    #分句
    sentences=[]
    for i,j in zip(scope,industry):
        #去除所有特殊符号
        string = re.sub('\W+', '', i).replace("_", '')
        segs=jieba.lcut(str(string))                               #jieba分词
        for seg in segs:
            if len(seg) < 2:
                segs.remove(seg)
        sentences.append(" ".join(segs))
    
    #写入为txt
    out=open('./train_data3.txt','w',encoding='utf-8')  
    for sentence in sentences:
        out.write(sentence+"\n")
    
    return sentences    
    
#载入训练数据集，进行数据预处理
sentences3 = pre_process3('./上交所.xlsx')                       #sentences3 : ['as as df','as r rg', ...]

In [21]:
#将测试集中的business scope （1）数据清洗 （2）分别转为二维向量 （3）验证其k取值准确率统计，选出正确的k
#（1）数据清洗
NEW_sentences,labels = pre_process('./上交所.xlsx')                 #NEW_sentences : [[as,as,as],[a,sg,fh],...]
#（2）新模型+二维向量
NEW_word_in_voc, NEW_model = train('NEW_word2vec_model', 'train_data3.txt')
NEW_sentences = combination(NEW_sentences)
NEW_wv_arr = wv50(NEW_sentences)

NEW_result = []
L = []
for wv in NEW_wv_arr:
    for i in wv:
        L.append(i.tolist())
    X = np.array(L)                               #序列词向量
    pca = PCA(n_components=2)             #使用pca将词向量降维到二维
    r = pca.fit_transform(X)                      #将序列词向量统一降为二维
    NEW_result.append(r)

NEW_wvList = avg_wv(NEW_result)

1484
1484


In [99]:
df = pd.read_excel('./上交所.xlsx',encoding ="utf-8")
df=df.dropna()                                                 #去掉空行
industry=df['industry'].values.tolist()
    
NEW_labels = []
for j in industry:
    NEW_labels.append(j)

In [107]:
#测试数据集中的数据
i = 0
accuracy = {}
for vector in NEW_wvList:            #1484
    for k in range(1,200):
        predict_label = kNNClassify(vector, wvList, labels, k)            #放回训练集的list中映射
        if predict_label == NEW_labels[i]:
            accuracy[k] = accuracy.get(k, 0) + 1
    i += 1                                                     #正确分类    
#求出现频率最高的label
Best_k = max(accuracy,key=accuracy.get)
print('Best k value:', Best_k, ', with accuracy of:', accuracy.get(k,0)/len(NEW_wvList)*100, '%')

Best k value: 1 , with accuracy of: 2.7628032345013476 %


In [None]:
kNNClassify(vector, wvList, labels, 1)

In [105]:
#测试数据集中的数据
i = 0
accuracy = {}
for vector in NEW_wvList:
    predict_label = kNNClassify(vector, wvList, labels, 1)            #放回训练集的list中映射
    if predict_label == labels[i]:
        accuracy[k] = accuracy.get(k, 0) + 1
            
i += 1                                                     #正确分类

print(accuracy)

{}
