In [4]:
class hanzi_simlar_enhance:
    """Score how similar two Chinese characters are.

    Four per-character lookup tables (stroke count, character structure,
    radical, four-corner code) are loaded from text files. Each table yields
    a partial score scaled by its own weight, and ``similar`` combines the
    four partial scores into a weighted average.
    """

    @classmethod
    def initDict(cls, path):
        """Load a space-separated ``key value`` text file into a dictionary.

        Each line is stripped of its newline and split on single spaces; the
        first field becomes the key and the second field the value (kept as a
        string). A key that appears more than once keeps its last value.

        Lines with fewer than two fields (for example blank lines) are
        skipped rather than raising ``IndexError``.

        Args:
            path: Path of the UTF-8 text file to read. Undecodable bytes are
                ignored.

        Returns:
            A ``{key: value}`` dictionary of strings.
        """
        mapping = {}
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            # Iterate the file lazily instead of materialising every line.
            for line in f:
                # Remove the newline, then split on single spaces.
                fields = line.strip('\n').split(' ')
                if len(fields) < 2:
                    continue
                mapping[fields[0]] = fields[1]
        return mapping

    def __init__(self, bihuashu_dict_path='./db/bihuashu_2w.txt',
                 hanzijiegou_dict_path='./db/hanzijiegou_2w.txt',
                 pianpangbushou_dict_path='./db/pianpangbushou_2w.txt',
                 sijiaobianma_dict_path='./db/sijiaobianma_2w.txt',
                 hanzijiegouRate=10,
                 sijiaobianmaRate=8,
                 pianpangbushouRate=6,
                 bihuashuRate=2
                 ) -> None:
        """Load the four lookup tables and store the per-feature weights.

        Args:
            bihuashu_dict_path: File mapping a character to its stroke count.
            hanzijiegou_dict_path: File mapping a character to its structure.
            pianpangbushou_dict_path: File mapping a character to its radical.
            sijiaobianma_dict_path: File mapping a character to its
                four-corner code.
            hanzijiegouRate: Weight of the structure score.
            sijiaobianmaRate: Weight of the four-corner-code score.
            pianpangbushouRate: Weight of the radical score.
            bihuashuRate: Weight of the stroke-count score.
        """
        # Stroke-count table.
        self.bihuashuDict = self.initDict(bihuashu_dict_path)
        # Character-structure table.
        self.hanzijiegouDict = self.initDict(hanzijiegou_dict_path)
        # Radical table.
        self.pianpangbushouDict = self.initDict(pianpangbushou_dict_path)
        # Four-corner-code table.
        self.sijiaobianmaDict = self.initDict(sijiaobianma_dict_path)

        self.hanzijiegouRate = hanzijiegouRate
        self.sijiaobianmaRate = sijiaobianmaRate
        self.pianpangbushouRate = pianpangbushouRate
        self.bihuashuRate = bihuashuRate

    @staticmethod
    def replace(sentence1, prob):
        """Placeholder for similar-character replacement; not implemented.

        Intended behaviour: replace the Chinese characters of ``sentence1``
        with similar characters at probability ``prob`` (imitating OCR
        recognition errors) and return the augmented sentence.

        Declared as a static method so that it can be called on an instance
        as well as on the class; without it, an instance call failed with
        ``TypeError`` because the signature has no ``self``.

        Returns:
            Always ``None`` for now.
        """
        return None

    # Core scoring methods.

    def bihuashuSimilar(self, charOne, charTwo):
        """Return the weighted stroke-count similarity of two characters.

        The stroke-count table maps a single character to its stroke count.
        The score is ``bihuashuRate * (1 - |a - b| / max(a, b))``.

        Raises:
            KeyError: If either character is missing from the table.
            ValueError: If a stored stroke count is not an integer.
        """
        numOne = int(self.bihuashuDict[charOne])
        numTwo = int(self.bihuashuDict[charTwo])

        # Equal counts are a perfect match; returning early also avoids the
        # 0 / 0 division when both counts are zero.
        if numOne == numTwo:
            return self.bihuashuRate * 1.0

        diffVal = 1 - abs((numOne - numTwo) / max(numOne, numTwo))
        return self.bihuashuRate * diffVal * 1.0

    def hanzijiegouSimilar(self, charOne, charTwo):
        """Return the weighted structure similarity of two characters.

        The full ``hanzijiegouRate`` is returned when both characters have
        the same structure value, otherwise ``0``.

        Raises:
            KeyError: If either character is missing from the table.
        """
        valueOne = self.hanzijiegouDict[charOne]
        valueTwo = self.hanzijiegouDict[charTwo]

        # Could later be refined to also reward "close" structures.
        if valueOne == valueTwo:
            return self.hanzijiegouRate
        return 0

    def sijiaobianmaSimilar(self, charOne, charTwo):
        """Return the weighted four-corner-code similarity of two characters.

        The two codes are compared position by position over the length of
        the shorter one; the fraction of matching positions is scaled by
        ``sijiaobianmaRate``. An empty code yields ``0.0`` instead of a
        division by zero.

        Raises:
            KeyError: If either character is missing from the table.
        """
        valueOne = self.sijiaobianmaDict[charOne]
        valueTwo = self.sijiaobianmaDict[charTwo]

        minLen = min(len(valueOne), len(valueTwo))
        if minLen == 0:
            return 0.0

        # zip stops at the shorter code, i.e. after minLen positions.
        matches = sum(1.0 for a, b in zip(valueOne, valueTwo) if a == b)
        return matches / minLen * self.sijiaobianmaRate

    def pianpangbushoutSimilar(self, charOne, charTwo):
        """Return the weighted radical similarity of two characters.

        The full ``pianpangbushouRate`` is returned when both characters map
        to the same radical value, otherwise ``0``.

        Raises:
            KeyError: If either character is missing from the table.
        """
        valueOne = self.pianpangbushouDict[charOne]
        valueTwo = self.pianpangbushouDict[charTwo]

        # Could later be refined to compare the decomposed components.
        if valueOne == valueTwo:
            return self.pianpangbushouRate
        return 0

    def similar(self, charOne, charTwo, verbose=True):
        """Return the overall similarity of two characters.

        Identical characters score ``1.0`` without any table lookup.
        Otherwise the four weighted partial scores are summed and divided by
        the sum of the four weights.

        Args:
            charOne: First character.
            charTwo: Second character.
            verbose: When true (the default), print the total and the four
                partial scores. Pass ``False`` to keep stdout quiet, e.g.
                when scoring many pairs.

        Returns:
            The weighted-average similarity as a float.

        Raises:
            KeyError: If the characters differ and either one is missing
                from any of the four tables.
        """
        if charOne == charTwo:
            return 1.0

        sijiaoScore = self.sijiaobianmaSimilar(charOne, charTwo)
        jiegouScore = self.hanzijiegouSimilar(charOne, charTwo)
        bushouScore = self.pianpangbushoutSimilar(charOne, charTwo)
        bihuashuScore = self.bihuashuSimilar(charOne, charTwo)

        # Keep this summation order: it determines the exact float printed.
        totalScore = sijiaoScore + jiegouScore + bushouScore + bihuashuScore
        totalRate = (self.hanzijiegouRate + self.sijiaobianmaRate
                     + self.pianpangbushouRate + self.bihuashuRate)

        result = totalScore * 1.0 / totalRate * 1.0
        if verbose:
            print('总分：' + str(totalScore) + ', 总权重: ' + str(totalRate) + ', 结果:' + str(result))
            print('四角编码：' + str(sijiaoScore))
            print('汉字结构：' + str(jiegouScore))
            print('偏旁部首：' + str(bushouScore))
            print('笔画数：' + str(bihuashuScore))
        return result
  
# Demo cell: build the scorer with its default dictionary paths, then score
# one pair of characters. The call prints its score breakdown and, as the
# last expression of the cell, its return value is echoed by the notebook.
model = hanzi_simlar_enhance()   
model.similar('末', '来') 

总分：25.428571428571427, 总权重: 26, 结果:0.978021978021978
四角编码：8.0
汉字结构：10
偏旁部首：6
笔画数：1.4285714285714286


0.978021978021978

In [5]:
# Inspect the loaded character-structure table (character -> structure value).
model.hanzijiegouDict

{'丨': '0',
 '亅': '0',
 '乀': '0',
 '乁': '0',
 '丿': '0',
 '一': '0',
 '乙': '0',
 '乛': '0',
 '乚': '0',
 '丶': '0',
 '丩': '0',
 '乃': '0',
 '乄': '0',
 '九': '0',
 '了': '0',
 '丁': '0',
 '七': '0',
 '乜': '0',
 '丷': '0',
 '八': '0',
 '勹': '0',
 '匕': '0',
 '冫': '0',
 '卜': '0',
 '厂': '0',
 '刀': '0',
 '刁': '0',
 '刂': '0',
 '儿': '0',
 '二': '0',
 '匚': '0',
 '阝': '0',
 '几': '0',
 '卩': '0',
 '冂': '0',
 '力': '0',
 '冖': '0',
 '凵': '0',
 '人': '0',
 '亻': '0',
 '入': '0',
 '十': '0',
 '厶': '0',
 '匸': '0',
 '讠': '0',
 '廴': '0',
 '又': '0',
 '丬': '0',
 '丫': '0',
 '久': '0',
 '么': '0',
 '丸': '0',
 '万': '0',
 '三': '0',
 '上': '0',
 '下': '0',
 '与': '0',
 '丈': '0',
 '乞': '0',
 '习': '0',
 '乡': '0',
 '也': '0',
 '之': '0',
 '义': '0',
 '勺': '0',
 '刃': '0',
 '亏': '0',
 '于': '0',
 '凡': '0',
 '卫': '0',
 '个': '0',
 '亡': '0',
 '叉': '0',
 '及': '0',
 '彳': '0',
 '川': '0',
 '辶': '0',
 '寸': '0',
 '大': '0',
 '飞': '0',
 '干': '0',
 '工': '0',
 '弓': '0',
 '广': '0',
 '己': '0',
 '已': '0',
 '彐': '0',
 '彑': '0',
 '巾': '0',
 '口': '0',
 '马': '0',