# Simhash文档相似度计算（去重）

In [55]:
import jieba
import jieba.analyse
from collections import Counter

1、定义一个分词功能的函数.

In [56]:
def word_segmentation(strings):
    '''
        利用结巴工具对文本进行分词，并返回单词的权重.
    '''
    # 分词，返回一个单词列表
    tokens = jieba.lcut(strings)
    
    # 计算每个单词的权值(使用词频)
    weights_dcit = dict(Counter(tokens))
    
    return tokens, weights_dcit
    

2、定义一个Simhash，提供对文档的数值映射和文档间相似度计算的功能.

In [57]:
class Simhash(object):
    
    # 初始化函数
    def __init__(self, weights_dict, tokens='', hashbits=64):
        self.hashbits = hashbits
        self.hash = self.simhash_function(tokens, weights_dict)
    
    # toString函数
    def __str__(self):
        return str(self.hash)
    
    
    # 给每一个单词生成对应的hash值
    def _string_hash(self, source):
        if source == '':
            return 0
        else:
            x = ord(source[0]) << 7
            m = 1000003
            mask = 2 ** self.hashbits - 1
            for c in source:
                x = ((x * m) ^ ord(c)) & mask
            x ^= len(source)
            if x == -1:
                x = -2
            return x
        
    
    # 生成simhash值
    def simhash_function(self, tokens, weights_dict):
        v = [0] * self.hashbits
        for key, t in {x: self._string_hash(x) for x in tokens}.items():
            for i in range(self.hashbits):
                bitmask = 1 << i
                if t & bitmask:
                    v[i] += weights_dict[key]
                else:
                    v[i] -= weights_dict[key]
        
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint
    
    
    # 求文档间的海明距离
    def hamming_distance(self, other):
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1 )
        tot = 0
        while x :
            tot += 1
            x &= x - 1
        return tot
    
    
    #求相似度
    def similarity(self, other):
        a = float(self.hash)
        b = float(other.hash)
        if a > b:
            return b / a
        else: 
            return a / b
        

    
if __name__ == '__main__':
    
    s1 = '你好我是新来的，请多多关照。'
    tokens1, weights_dcit1 = word_segmentation(s1)
    hash1 = Simhash(weights_dict=weights_dcit1, tokens=tokens1)
    
    s2 = '为什么大家觉得你没有那么大的脾气呢.'
    tokens2, weights_dcit2 = word_segmentation(s2)
    hash2 = Simhash(weights_dict=weights_dcit2, tokens=tokens2)
    
    s3 = '你好我来自北京，请多多关照。'
    tokens3, weights_dcit3 = word_segmentation(s3)
    hash3 = Simhash(weights_dict=weights_dcit3, tokens=tokens3)
    
    print(hash1.hamming_distance(hash2) , "   " , hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3) , "   " , hash1.similarity(hash3))    

24     0.26642211074157
13     2.8808327266833885e-05
