In [None]:
import io
import pandas as pd
import numpy as np
import jieba
import gensim
from pyhanlp import *
from tqdm import tqdm


In [None]:
# Load the source data
df = pd.read_csv('./all_bk.csv')
# Quick sanity check: print the table dimensions and its column labels.
print('行，列:', df.shape)
print('字段:', df.columns)


In [None]:
# Stop-word list: one entry per line of ./stopwords.txt
with open('./stopwords.txt') as f:
    read = f.read()
    stop_words = read.splitlines()
# Supplementary stop words; keep adjusting these while inspecting the tokens.
stop_word = [' ', '', r'&#', r'x0D']
# extend(), not append(): append() would insert the whole list as ONE nested
# element, so none of the supplementary entries would ever match a token.
stop_words.extend(stop_word)
#print(stop_word)


In [None]:
# Use HanLP's information-entropy based phrase extraction to discover candidate
# new words. The original note says to save them as newword.txt; this cell only
# prints them.
# Phrase extraction is only effective on long text, so every title is joined
# into one string first.
# In the author's experiments this worked better than HanLP's two segmenters.
# Column 3 (positional) holds the title text. Missing titles are dropped and the
# rest coerced to str so the join cannot fail on NaN; join() also avoids the
# quadratic cost of growing a string inside a loop.
titles = df.iloc[:, 3].dropna().astype(str)
txt_str = ''.join(titles)
#print(txt_str)
for new_word in HanLP.extractPhrase(txt_str, 100):
    print(new_word)


In [None]:
# Document tokenization. Segmentation quality is critical: keep inspecting the
# result and adjusting the stop-word list and the new-word dictionary.

# Register the custom vocabulary ONCE, before the loop. The original re-added the
# words and re-parsed the user dictionary for every row, which is loop-invariant.
for custom_term in ('反恐法', '三股势力', '斯里兰卡', '去极端化', '伊斯兰国'):
    jieba.add_word(custom_term)  # hand-picked new words; keep adjusting
# New-word dictionary; domain-specific dictionaries can be added here as well.
jieba.load_userdict('./newwords.txt')

# Set lookup instead of scanning the list for every token. Non-string entries
# (if any) can never equal a token, so leaving them out does not change the filter.
stop_set = {w for w in stop_words if isinstance(w, str)}

docs_words = []
for title in df.iloc[:, 3]:  # change the column position to tokenize another field
    doc_words = []
    for a_word in jieba.lcut(str(title), cut_all=False):  # accurate mode
        a_word = str(a_word).strip()
        if a_word in stop_set:  # drop stop words
            continue
        # Keep Chinese text only (adjust as needed). This is a lexicographic
        # string comparison, so it is decided almost entirely by the word's
        # first character.
        if not (u'\u4e00' <= a_word <= u'\u9fa5'):
            continue
        if len(a_word) >= 2:  # drop single-character words (adjust as needed)
            doc_words.append(a_word)
    docs_words.append(doc_words)
#print(docs_words)


In [None]:
# Detect bigrams and trigrams; in the author's experiments this step noticeably
# improved segmentation quality.
# Build the bigram and trigram models
bigram = gensim.models.Phrases(docs_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[docs_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


In [None]:
# Final tokenization result: each document's tokens passed through the bigram
# model and then through the trigram model.
new_docs_words = [trigram_mod[bigram_mod[tokens]] for tokens in docs_words]


In [None]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel, LdaModel
from gensim import models
%matplotlib inline


In [None]:
# Map every token to a unique id and count its occurrences per document,
# producing the document-term frequency matrix (bag of words).
dictionary = Dictionary(new_docs_words)
# Build the corpus from the SAME token lists the dictionary was built from.
# The original used docs_words here, so merged bigram/trigram tokens that exist
# in the dictionary never appeared in the corpus.
corpus = [dictionary.doc2bow(text) for text in new_docs_words]
#print(corpus)


In [None]:
# TF-IDF: re-weight the raw counts so that frequent but unimportant words count less
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]
# Print the model and the transformed-corpus objects.
print(tfidf_model)
print(corpus_tfidf)


In [None]:
from gensim.models import Word2Vec

# Train 300-dimensional word vectors for 10 passes, ignoring words that occur fewer than 10 times
# NOTE(review): `size=` / `iter=` look like the gensim 3.x keyword names — confirm against the installed gensim version.
w2v_model = Word2Vec(new_docs_words, size=300, iter=10, min_count=10)

# Save the trained vectors in word2vec text format (binary=False)
w2v_model.wv.save_word2vec_format('./wz_w2v_model2', binary=False)


In [None]:
#方案一：利用tensor board进行全局词向量的降维、可视化、效果较好
import sys
import os
import pathlib
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

def visualize(model, output_path):
    """Export word vectors so TensorBoard's embedding projector can display them.

    Writes one vocabulary word per line to ``w2x_metadata.tsv`` under
    ``output_path``, copies the matching vectors into a non-trainable
    TensorFlow variable named ``w2x_metadata``, registers that variable with
    the projector plugin and saves it as ``w2x_metadata.ckpt``.

    Args:
        model: object exposing ``wv.index2word``, ``vector_size`` and
            ``model[word]`` vector lookup.
        output_path: directory receiving the metadata file, the projector
            config and the checkpoint. This function does not create it
            before opening the metadata file.
    """
    meta_file = "w2x_metadata.tsv"
    vocab = model.wv.index2word
    vectors = np.zeros((len(vocab), model.vector_size))

    with open(os.path.join(output_path, meta_file), 'wb') as meta_out:
        for row, token in enumerate(vocab):
            vectors[row] = model[token]
            label = token
            if token == '':
                # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094
                print("Emply Line, should replecaed by any thing else, or will cause a bug of tensorboard")
                label = '<Empty Line>'
            meta_out.write("{0}".format(label).encode('utf-8') + b'\n')

    # Build the graph: a single frozen variable, nothing is trained.
    sess = tf.InteractiveSession()

    embedding = tf.Variable(vectors, trainable=False, name='w2x_metadata')
    tf.global_variables_initializer().run()

    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(output_path, sess.graph)

    # Tell the projector which tensor to show and where its labels live.
    config = projector.ProjectorConfig()
    embed = config.embeddings.add()
    embed.tensor_name = 'w2x_metadata'
    embed.metadata_path = meta_file

    projector.visualize_embeddings(writer, config)
    saver.save(sess, os.path.join(output_path, 'w2x_metadata.ckpt'))
    print('Run `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path))

# Load the vectors saved by the Word2Vec cell above. The original line closed the
# string with a typographic quote (a SyntaxError) and pointed at './wz_w2v_model',
# which is not the './wz_w2v_model2' file that cell writes.
model = KeyedVectors.load_word2vec_format(r'./wz_w2v_model2')


In [None]:
# Option 2: extract entities and their attributes for a targeted social network analysis (SNA); NodeXL's Clauset-Newman-Moore clustering worked well
# Entity extraction
from jieba import posseg
from collections import Counter
import gensim
## Part-of-speech tagging: tag title + content of every row (accurate mode)
new_posseg = [
    jieba.posseg.lcut(str(df['标题'][i]) + str(df['内容'][i]))
    for i in tqdm(range(0, df.shape[0]))
]


In [None]:
# Collect every (word, tag) pair whose part-of-speech tag is one of the entity
# tags below, then count how often each pair occurs.
entity_tags = {'ns', 'nr', 'nrf', 'nsf', 'nt', 'nz'}
out = [
    [str(word), str(flag)]
    for doc_pairs in new_posseg
    for word, flag in doc_pairs
    if flag in entity_tags
]

cixing = pd.DataFrame(out, columns=['word', 'atr'])
cixing = cixing.groupby(['word', 'atr']).size().sort_values(ascending=False)
# header=False is explicit because the default for Series.to_csv differs between
# pandas versions, and the file is read back with header=None (columns 0/1/2 =
# word / tag / count). A header row would be parsed as a bogus first entity.
cixing.to_csv('./wz_实体_词性_词频.csv', header=False)


In [None]:
# Read the entity/tag/count table back (no header row: columns are 0, 1, 2).
cixing = pd.read_csv('./wz_实体_词性_词频.csv', header=None)
# Take at most the first 200 rows; the original hard-coded range(0, 200) raised
# IndexError whenever fewer than 200 entities were found.
top_n = min(200, len(cixing))
n_com = []
for i in range(top_n):
    row = cixing.iloc[i]
    # Stored as [word, frequency, tag]: column 0 = word, 2 = frequency, 1 = tag.
    n_com.append([row[0], row[2], row[1]])
print(n_com)  # inspect the most frequent entities (up to 200), then filter them further


In [None]:
# Put the selected entities into dicts shaped like
# { "name": "傅恒", "id": "0", "size": 12.77 }
node_res = [
    {"name": entry[0], "id": str(idx), "size": entry[1]}
    for idx, entry in enumerate(n_com)
]
print(node_res)  # usable as input data for an ECharts graph


In [None]:
# Save the first-level entities. The rows come from n_com, whose entries are
# [word, frequency, tag] and therefore match the three column names; the original
# passed `out`, whose rows only hold [word, tag], so DataFrame construction failed.
# The typographic quotes around the path (a SyntaxError) are replaced too.
Node = pd.DataFrame(n_com, columns=['word', 'frequency', 'atr'])
Node.to_csv('./node1.csv')
# First-level entity extraction finished


In [None]:
# Extract relations between first-level entities; each link looks like
# { "id": "1", "source": "0", "target": "7" }
out = []
count = 0
for i in range(0, len(node_res)):
    name1 = node_res[i]['name']
    # Entities missing from the Word2Vec vocabulary (e.g. rarer than min_count)
    # have no vector; similarity() would raise KeyError, so skip them.
    if name1 not in w2v_model.wv:
        continue
    for j in range(i + 1, len(node_res)):
        name2 = node_res[j]['name']
        if name2 not in w2v_model.wv:
            continue
        sim = w2v_model.wv.similarity(name1, name2)  # cosine similarity
        print(name1, name2, sim)
        if 0.3 < sim < 1:  # tune the threshold
            out.append([node_res[i]['id'], node_res[j]['id'], sim])
sim_matrix = pd.DataFrame(out, columns=['source', 'target', 'sim'])
new_sim_matrix = sim_matrix.sort_values('sim', ascending=False).reset_index()
# The original closed this path with a typographic quote, which is a SyntaxError.
new_sim_matrix.to_csv('./link1.csv')
# First-level relation extraction finished
# Same links in dict form
links = []
for i in range(0, new_sim_matrix.shape[0]):
    links.append({
        'id': str(i),
        'source': new_sim_matrix['source'][i],
        'target': new_sim_matrix['target'][i],
    })
print(links)  # usable as input data for an ECharts graph


In [None]:
# Second-level relations: words that describe each first-level entity
out = []
atr_id = len(node_res)  # attribute ids continue after the entity ids
for node in node_res:
    # most_similar() raises KeyError for words outside the Word2Vec vocabulary.
    if node['name'] not in w2v_model.wv:
        continue
    for neighbour, score in w2v_model.wv.most_similar(node['name']):
        out.append([node['name'], node['id'], neighbour, str(atr_id), score])
        atr_id = atr_id + 1

# 'source_id' was closed with a typographic quote in the original (SyntaxError).
atribution = pd.DataFrame(out, columns=['source', 'source_id', 'target', 'target_id', 'sim'])
atribution.to_csv('./link2.csv')
# Second-level relation extraction finished


In [None]:
# Attributes of the second-level nodes: how often each related word occurs in
# the tokenized corpus.
data_flat = [w for w_list in new_docs_words for w in w_list]
counter = Counter(data_flat)
node2 = atribution
# The original used typographic quotes in the column keys and in the output
# path, all of which are SyntaxErrors; plain ASCII quotes are used here.
out = [
    [target_id, target, counter[target]]
    for target_id, target in zip(node2['target_id'], node2['target'])
]
Node2 = pd.DataFrame(out, columns=['target_id', 'target', 'frequency'])
Node2.to_csv('./node2.csv')
# Second-level attribute extraction finished
# Next step: use NodeXL to compute SNA metrics, cluster and draw the graph
