In [1]:
import numpy as np
import gensim
import jieba
import codecs
from jieba import analyse
from _utils import u_constant

# Base directory of this chapter's resources, built on the project-level PATH_ROOT constant.
root_path = u_constant.PATH_ROOT + "for learn/Python/NLP_in_Action/chapter-7/"
# Saved gensim models (loaded further down with Word2Vec.load / Doc2Vec.load).
WV_MODEL_PATH = root_path + "word2vec/zhwiki.word2vec"
DV_MODEL_PATH = root_path + "doc2vec/zhwiki.doc2vec"
# The two text files whose similarity is computed.
P1_PATH = root_path + "data/p1.txt"
P2_PATH = root_path + "data/p2.txt"



In [2]:
def cal_sim(u, v):
    """Return the cosine similarity of two vectors.

    :param u: first vector (anything ``np.asarray`` accepts)
    :param v: second vector (anything ``np.asarray`` accepts)
    :return: ``u . v / (|u| * |v|)``, or ``0.0`` when either norm is zero
    """
    vec_a, vec_b = np.asarray(u), np.asarray(v)
    len_a = np.linalg.norm(vec_a, 2)
    len_b = np.linalg.norm(vec_b, 2)

    # A zero-length vector has no direction: report "no similarity"
    # instead of dividing by zero.
    if len_a == 0 or len_b == 0:
        return 0.0

    return vec_a.dot(vec_b) / (len_a * len_b)

## 利用Word2Vec计算网页相似度

In [5]:
class doc2vec_by_wv:
    """Build a document vector by summing the Word2Vec vectors of the document's keywords."""

    def __init__(self, model_path):
        """
        Load a trained Word2Vec model.
        :param model_path: path of a model file readable by gensim's ``Word2Vec.load``
        """
        self.model = gensim.models.Word2Vec.load(model_path)
        self.vector_size = self.model.vector_size

    def extract_keywords(self, data_path):
        """
        Parse the target file and extract keywords from each line.
        :param data_path: path of the target file (read as UTF-8)
        :return: generator over the keywords of the file, line by line
        """
        with codecs.open(data_path, "r", encoding="utf-8") as in_f:
            for data in in_f:
                for keyword in analyse.extract_tags(data.strip()):
                    yield keyword

    def transform(self, data_path):
        """
        Get the vector of the target corpus: the sum of the word vectors of
        its keywords. Keywords missing from the model vocabulary are skipped.
        :param data_path: path of the target file
        :return: numpy array of length ``self.vector_size`` (all zeros when
                 no keyword is known to the model)
        """
        # Look words up on the model's KeyedVectors (`model.wv`). Indexing the
        # Word2Vec object itself (`model[word]`) is deprecated in gensim 3.x
        # and removed in gensim 4.0, where it raises a TypeError that the
        # KeyError handler below would not catch.
        word_vectors = self.model.wv
        result = np.zeros(self.vector_size)

        for word in self.extract_keywords(data_path):
            try:
                result += word_vectors[word]
            except KeyError:
                # Out-of-vocabulary keyword: contributes nothing to the sum.
                continue
        return result
        

In [6]:
# Embed both pages with the Word2Vec keyword-sum model, then print their cosine similarity.
dv_by_wv = doc2vec_by_wv(WV_MODEL_PATH)
p1vec, p2vec = (dv_by_wv.transform(page_path) for page_path in (P1_PATH, P2_PATH))
print(cal_sim(p1vec, p2vec))

0.9473998005928956




## 利用Doc2Vec计算网页相似度

In [20]:
class doc2vec_by_dv:
    """Infer a document vector for a text file with a trained Doc2Vec model."""

    def __init__(self, model_path, start_alpha, infer_epoch):
        """
        Load a trained Doc2Vec model and remember the inference settings.
        :param model_path: path of a model file readable by gensim's ``Doc2Vec.load``
        :param start_alpha: value passed as ``alpha`` to ``infer_vector``
        :param infer_epoch: number of inference passes passed to ``infer_vector``
        """
        self.model = gensim.models.Doc2Vec.load(model_path)
        self.alpha = start_alpha
        self.steps = infer_epoch

    def get_doc(self, data_path):
        """
        Read every word of the target file.
        :param data_path: path of the target file (read as UTF-8)
        :return: generator over the tokens produced by ``jieba.cut``, line by line
        """
        with codecs.open(data_path, "r", encoding="utf-8") as f:
            for line in f:
                for word in jieba.cut(line.strip()):
                    yield word

    def transform(self, data_path):
        """
        Infer the document vector of the target file.
        :param data_path: path of the target file
        :return: whatever ``self.model.infer_vector`` returns for the file's tokens
        """
        words = list(self.get_doc(data_path))
        try:
            # gensim 4.0 renamed the `steps` keyword to `epochs`; late 3.x
            # releases accept `epochs` too, so try the current name first.
            return self.model.infer_vector(words, alpha=self.alpha, epochs=self.steps)
        except TypeError:
            # Older gensim releases only know `steps`. An unknown keyword is
            # rejected at call time, before any inference work is done.
            return self.model.infer_vector(words, alpha=self.alpha, steps=self.steps)
        

In [21]:
# Embed both pages with the Doc2Vec model (alpha 0.01, 1000 inference passes),
# then print their cosine similarity.
dv_by_dv = doc2vec_by_dv(DV_MODEL_PATH, start_alpha=0.01, infer_epoch=1000)
p1vec, p2vec = (dv_by_dv.transform(page_path) for page_path in (P1_PATH, P2_PATH))
print(cal_sim(p1vec, p2vec))



0.6411267
