# Query 理解（一）

## 预处理

In [6]:
!pip install opencc

Collecting opencc
  Downloading OpenCC-1.1.3-cp36-cp36m-manylinux1_x86_64.whl (766 kB)
     |████████████████████████████████| 766 kB 996 kB/s            
[?25hInstalling collected packages: opencc
Successfully installed opencc-1.1.3


In [7]:
import re

import opencc


class QueryPre:
    def __init__(self):
        pass

    def run(self, ustring):
        ustring = self.filter_emoji(ustring)
        ustring = self.strQ2B(ustring)
        ustring = self.t2s_by_opencc(ustring)
        ustring = self.capital_to_lower(ustring)
        return ustring

    def strQ2B(self, ustring):
        """把字符串全角转半角"""
        rstring = ""
        for uchar in ustring:
            # ord返回对应的ascii数值，
            inside_code = ord(uchar)
            # 全角空格直接转换
            if inside_code == 12288:
                inside_code = 32
            # 全角字符（除空格）根据关系转化
            elif inside_code >= 65281 and inside_code <= 65374:
                inside_code -= 65248
            rstring += chr(inside_code)
        return rstring

    def strB2Q(self, ustring):
        """把字符串半角转全角"""
        rstring = ""
        for uchar in ustring:
            # ord返回对应的ascii数值，
            inside_code = ord(uchar)
            # 半角空格直接转换
            if inside_code == 32:
                inside_code = 12288
            elif inside_code >= 33 and inside_code <= 126:
                inside_code += 65248
                # chr输入一个整数返回其对应的ascii字符
            rstring += chr(inside_code)
        return rstring

    def capital_to_lower(self, ustring):
        """
         大写转小写
        :param ustring: 字符串
        :return: 所有字母都小写后的字符串
        """
        return ustring.lower()

    def t2s_by_opencc(self, ustring):
        """
        繁体转简体
        :param ustring: 繁体string
        :return: 简体string
        """
        return opencc.OpenCC("t2s.json").convert(ustring)

    def filter_emoji(self, desstr, restr=""):
        """
        清除表情
        :param desstr: 需要过滤的字符串
        :param restr: 被替换成什么字符
        :return: 返回处理结果
        """
        # 过滤表情
        try:
            co = re.compile("[\U00010000-\U0010ffff]")
        except re.error:
            co = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
        return co.sub(restr, desstr)

In [9]:
p = QueryPre()
print(p.run("众所周知，长时间以来，加入欧盟和北约，就是乌克兰政府的心愿。过去这段时间里，为了加入北约和欧盟，乌克兰政府曾多次向其喊话，要求其同意乌克兰的加入。而自俄乌冲突正式开始后，乌克兰总统泽连斯基喊话欧盟与北约的频率更是越发频繁，就在前几日，泽连斯基更是发表了视频讲话，并在视频讲话中呼吁欧盟启动特殊程序，立即同意乌克兰加入欧盟。"))

众所周知,长时间以来,加入欧盟和北约,就是乌克兰政府的心愿。过去这段时间里,为了加入北约和欧盟,乌克兰政府曾多次向其喊话,要求其同意乌克兰的加入。而自俄乌冲突正式开始后,乌克兰总统泽连斯基喊话欧盟与北约的频率更是越发频繁,就在前几日,泽连斯基更是发表了视频讲话,并在视频讲话中呼吁欧盟启动特殊程序,立即同意乌克兰加入欧盟。


## 分词

In [1]:
import hanlp


class Tokenization:
    def __init__(self):
        # 先加载模型
        self.HanLP = hanlp.load(
            hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH
        )
        
    def stopwords(self, stopwords):
        

    def hanlp_token_ner(self, query):
        """
        调用hanlp分词
        :param query: 一句话
        :return: 分词结果
        """
        # 这里如果是空下面的分词会报错
        if query.strip() == "":
            return [""]
        # 精分&实体识别
        results_document = self.HanLP(query, tasks="ner")
        return results_document["tok/fine"], results_document["ner/msra"]

In [19]:
tokenize = Tokenization()
sents = tokenize.hanlp_token_ner("众所周知，长时间以来，加入欧盟和北约，就是乌克兰政府的心愿。过去这段时间里，为了加入北约和欧盟，乌克兰政府曾多次向其喊话，要求其同意乌克兰的加入。而自俄乌冲突正式开始后，乌克兰总统泽连斯基喊话欧盟与北约的频率更是越发频繁，就在前几日，泽连斯基更是发表了视频讲话，并在视频讲话中呼吁欧盟启动特殊程序，立即同意乌克兰加入欧盟。")
print(sents)

                                             

(['众所周知', '，', '长', '时间', '以来', '，', '加入', '欧盟', '和', '北约', '，', '就', '是', '乌克兰', '政府', '的', '心愿', '。', '过去', '这', '段', '时间', '里', '，', '为了', '加入', '北约', '和', '欧盟', '，', '乌克兰', '政府', '曾', '多次', '向', '其', '喊话', '，', '要求', '其', '同意', '乌克兰', '的', '加入', '。', '而', '自', '俄', '乌', '冲突', '正式', '开始', '后', '，', '乌克兰', '总统', '泽连斯基', '喊话', '欧盟', '与', '北约', '的', '频率', '更是', '越发', '频繁', '，', '就', '在', '前', '几日', '，', '泽连斯基', '更', '是', '发表', '了', '视频', '讲话', '，', '并', '在', '视频', '讲话', '中', '呼吁', '欧盟', '启动', '特殊', '程序', '，', '立即', '同意', '乌克兰', '加入', '欧盟', '。'], [('欧盟', 'ORGANIZATION', 7, 8), ('北约', 'ORGANIZATION', 9, 10), ('乌克兰', 'LOCATION', 13, 14), ('北约', 'ORGANIZATION', 26, 27), ('欧盟', 'ORGANIZATION', 28, 29), ('乌克兰', 'LOCATION', 30, 31), ('多次', 'FREQUENCY', 33, 34), ('乌克兰', 'LOCATION', 41, 42), ('俄', 'LOCATION', 47, 48), ('乌', 'LOCATION', 48, 49), ('乌克兰', 'LOCATION', 54, 55), ('泽连斯基', 'PERSON', 56, 57), ('欧盟', 'ORGANIZATION', 58, 59), ('北约', 'ORGANIZATION', 60, 61), ('几日', 'DATE', 70, 71), ('泽连斯基'

## 拼音转汉字

In [47]:
from pyhanlp import *


def demo_pinyin_to_chinese():
    """ HanLP中的数据结构和接口是灵活的，组合这些接口，可以自己创造新功能
    >>> demo_pinyin_to_chinese()
    [renmenrenweiyalujiangbujian/null, lvse/[滤色, 绿色]]
    """
    AhoCorasickDoubleArrayTrie = JClass(
        "com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie")
    StringDictionary = JClass(
        "com.hankcs.hanlp.corpus.dictionary.StringDictionary")
    CommonAhoCorasickDoubleArrayTrieSegment = JClass(
        "com.hankcs.hanlp.seg.Other.CommonAhoCorasickDoubleArrayTrieSegment")
    CommonAhoCorasickSegmentUtil = JClass(
        "com.hankcs.hanlp.seg.Other.CommonAhoCorasickSegmentUtil")
    Config = JClass("com.hankcs.hanlp.HanLP$Config")

    TreeMap = JClass("java.util.TreeMap")
    TreeSet = JClass("java.util.TreeSet")

    dictionary = StringDictionary()
    dictionary.load(Config.PinyinDictionaryPath)
    entry = {}
    m_map = TreeMap()
    for entry in dictionary.entrySet():
        pinyins = entry.getValue().replace("[\\d,]", "")
        words = m_map.get(pinyins)
        if words is None:
            words = TreeSet()
            m_map.put(pinyins, words)
        words.add(entry.getKey())
    words = TreeSet()
    words.add("绿色")
    words.add("滤色")
    m_map.put("lvse", words)

    segment = CommonAhoCorasickDoubleArrayTrieSegment(m_map)
    print(segment.segment("ai hao"))


if __name__ == "__main__":
    demo_pinyin_to_chinese()

[ai hao/null]


## 去停用词

In [14]:
!git clone https://github.com/goto456/stopwords

Cloning into 'stopwords'...
remote: Enumerating objects: 22, done.[K
remote: Total 22 (delta 0), reused 0 (delta 0), pack-reused 22[K
Unpacking objects: 100% (22/22), done.


In [27]:
import json

import jieba


# 读取停用词列表
def get_stopword_list(file):
    with open(file, 'r', encoding='utf-8') as f:    # 
        stopword_list = [word.strip('\n') for word in f.readlines()]
    return stopword_list


# 分词 然后清除停用词语
def clean_stopword(str, stopword_list):
    result = ''
    tokenize = Tokenization()
    word_list = tokenize.hanlp_token_ner(str)[0]
    for w in word_list:
        if w not in stopword_list:
            result += w
    return result

In [28]:
stopword_file = 'stopwords/cn_stopwords.txt'
stopword_list = get_stopword_list(stopword_file) 
clean_stopword("就在前几日，泽连斯基更是发表了视频讲话，", stopword_list)

                                             

'前日泽连斯基更发表视频讲话'

In [37]:
!pip install pyhanlp

Collecting pyhanlp
  Downloading pyhanlp-0.1.84.tar.gz (136 kB)
     |████████████████████████████████| 136 kB 942 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jpype1==0.7.0
  Downloading JPype1-0.7.0-cp36-cp36m-manylinux2010_x86_64.whl (2.7 MB)
     |████████████████████████████████| 2.7 MB 6.2 MB/s            
Building wheels for collected packages: pyhanlp
  Building wheel for pyhanlp (setup.py) ... [?25ldone
[?25h  Created wheel for pyhanlp: filename=pyhanlp-0.1.84-py3-none-any.whl size=29819 sha256=40b65c709827d38f2356b5cbd42b605849c693c05545735a1a8409f8bb99d3b3
  Stored in directory: /root/.cache/pip/wheels/c5/45/6f/8e6f5fffdb1cb0b7c40d4e78fa41469102d7e324bf78fd7dc9
Successfully built pyhanlp
Installing collected packages: jpype1, pyhanlp
Successfully installed jpype1-0.7.0 pyhanlp-0.1.84


In [40]:
import os

from pyhanlp.static import HANLP_JAR_PATH, STATIC_ROOT

java_code_path = os.path.join(STATIC_ROOT, 'MyFilter.java')
with open(java_code_path, 'w') as out:
    java_code = """
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.dictionary.stopword.Filter;
import com.hankcs.hanlp.seg.common.Term;
public class MyFilter implements Filter
{
    public boolean shouldInclude(Term term)
    {
        if (term.nature.startsWith('m')) return true; // 数词保留
        return !CoreStopWordDictionary.contains(term.word); // 停用词过滤
    }
}
"""
    out.write(java_code)
os.system('javac -cp {} {} -d {}'.format(HANLP_JAR_PATH, java_code_path, STATIC_ROOT))
# 编译结束才可以启动hanlp
from pyhanlp import *


def demo_stopword():
    """
    >>> demo_stopword()
    [小区/n, 反对/v, 喂养/v, 流浪猫/nz, 赞成/v, 喂养/v, 小宝贝/nz]
    [小区/n, 居民/n, 反对/v, 喂养/v, 流浪猫/nz, 居民/n, 赞成/v, 喂养/v, 小宝贝/nz]
    [小区/n, 居民/n, 有/vyou, 的/ude1, 反对/v, 喂养/v, 流浪猫/nz, ，/w, 而/cc, 有的/rz, 居民/n, 却/d, 赞成/v, 喂养/v, 这些/rz, 小宝贝/nz]
    [小区/n, 居民/n, 反对/v, 喂养/v, 流浪猫/nz, 居民/n, 赞成/v, 喂养/v, 小宝贝/nz]
    [数字/n, 123/m, 保留/v]
    """
    CoreStopWordDictionary = JClass("com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary")
    Filter = JClass("com.hankcs.hanlp.dictionary.stopword.Filter")
    Term = JClass("com.hankcs.hanlp.seg.common.Term")
    BasicTokenizer = JClass("com.hankcs.hanlp.tokenizer.BasicTokenizer")
    NotionalTokenizer = JClass("com.hankcs.hanlp.tokenizer.NotionalTokenizer")

    text = "小区居民有的反对喂养流浪猫，而有的居民却赞成喂养这些小宝贝"
    # 可以动态修改停用词词典
    CoreStopWordDictionary.add("居民")
    print(NotionalTokenizer.segment(text))
    CoreStopWordDictionary.remove("居民")
    print(NotionalTokenizer.segment(text))

    # 可以对任意分词器的结果执行过滤
    term_list = BasicTokenizer.segment(text)
    print(term_list)
    CoreStopWordDictionary.apply(term_list)
    print(term_list)

    # 还可以自定义过滤逻辑
    MyFilter = JClass('MyFilter')
    CoreStopWordDictionary.FILTER = MyFilter()
    print(NotionalTokenizer.segment("数字123的保留"))  # “的”位于stopwords.txt所以被过滤，数字得到保留


if __name__ == "__main__":
    import doctest

    doctest.testmod(verbose=True)

Trying:
    text = "攻城狮逆袭单身狗，迎娶白富美，走上人生巅峰"  # 怎么可能噗哈哈！
Expecting nothing
ok
Trying:
    demo_custom_dictionary(text)
Expecting:
    [攻城/vi, 狮/ng, 逆袭/nz, 单身/n, 狗/n, ，/w, 迎娶/v, 白富美/nr, ，/w, 走上/v, 人生/n, 巅峰/n]
    [攻城狮/nz, 逆袭/nz, 单身狗/nz, ，/w, 迎娶/v, 白富美/nz, ，/w, 走上/v, 人生/n, 巅峰/n]
**********************************************************************
File "__main__", line 10, in __main__.demo_custom_dictionary
Failed example:
    demo_custom_dictionary(text)
Expected:
    [攻城/vi, 狮/ng, 逆袭/nz, 单身/n, 狗/n, ，/w, 迎娶/v, 白富美/nr, ，/w, 走上/v, 人生/n, 巅峰/n]
    [攻城狮/nz, 逆袭/nz, 单身狗/nz, ，/w, 迎娶/v, 白富美/nz, ，/w, 走上/v, 人生/n, 巅峰/n]
Got:
    [攻城狮/nz, 逆袭/nz, 单身狗/nz, ，/w, 迎娶/v, 白富美/nz, ，/w, 走上/v, 人生/n, 巅峰/n]
    [攻城狮/nz, 逆袭/nz, 单身狗/nz, ，/w, 迎娶/v, 白富美/nz, ，/w, 走上/v, 人生/n, 巅峰/n]
Trying:
    demo_stopword()
Expecting:
    [小区/n, 反对/v, 喂养/v, 流浪猫/nz, 赞成/v, 喂养/v, 小宝贝/nz]
    [小区/n, 居民/n, 反对/v, 喂养/v, 流浪猫/nz, 居民/n, 赞成/v, 喂养/v, 小宝贝/nz]
    [小区/n, 居民/n, 有/vyou, 的/ude1, 反对/v, 喂养/v, 流浪猫/nz, ，/w, 而/cc, 有的/rz, 居民/n, 却/d, 赞成/v,

## 新词发现

In [34]:
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple macropodus

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting macropodus
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/24/5c/95f458b3db3dee5afb5ef5344d880afa0985e0243f743340774c5844ffa9/Macropodus-0.0.7-py2.py3-none-any.whl (6.2 MB)
     |████████████████████████████████| 6.2 MB 3.8 MB/s            
[?25hCollecting passlib==1.7.1
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ee/a7/d6d238d927df355d4e4e000670342ca4705a72f0bf694027cf67d9bcf5af/passlib-1.7.1-py2.py3-none-any.whl (498 kB)
     |████████████████████████████████| 498 kB 3.8 MB/s            
[?25hCollecting networkx==2.4
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/41/8f/dd6a8e85946def36e4f2c69c84219af0fa5e832b018c970e92f2ad337e45/networkx-2.4-py3-none-any.whl (1.6 MB)
     |████████████████████████████████| 1.6 MB 3.9 MB/s            
[?25hCollecting gensim==3.7.1
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/d7/b9/6c93685bed0026b6a1cce55ab173f6b617f6db0d1325d25489c2fd

In [35]:
import math
import os
from collections import Counter, OrderedDict
from functools import reduce
from operator import mul

from macropodus.data.words_common.stop_words import stop_words
from macropodus.preprocess.tools_ml import cut_sentence, get_ngrams


class WordDiscovery:
    def __init__(self):
        from macropodus.segment import segs
        self.dict_words_freq = segs.dict_words_freq
        self.algorithm = "new-word-discovery"
        self.stop_words = stop_words
        self.total_words_len = {}
        self.total_words = 0
        self.freq_min = 3
        self.len_max = 7
        self.round = 6
        self.eps = 1e-9
        self.empty_words = [sw for sw in stop_words.values() if len(sw)==1] # 虚词

    def count_word(self, text, use_type="text"):
        """
            词频统计(句子/段落/文章)
        :param text: str, path or doc, like "大漠帝国。" or "/home/data/doc.txt"
        :param use_type: str,  "text" or "file", file of "utf-8" of "txt"
        :return: class<Counter>, word-freq
        """
        import macropodus
        self.words_count = Counter()
        if use_type=="text": # 输入为文本形式
            text = macropodus.han2zh(text)
            texts = cut_sentence(use_type=self.algorithm,
                                 text=text)  # 切句子, 如中英文的逗号/句号/感叹号
            for text in texts:
                n_grams = get_ngrams(use_type=self.algorithm,
                                     len_max=self.len_max,
                                     text=text) # 获取一个句子的所有n-gram
                self.words_count.update(n_grams)
        elif use_type=="file": # 输入为文件形式
            if not os.path.exists(text):
                raise RuntimeError("path of text must exist!")
            fr8 = open(text, "r", encoding="utf-8")
            for text in fr8:
                if text.strip():
                    text = macropodus.han2zh(text)
                    texts = cut_sentence(use_type=self.algorithm,
                                         text=text) # 切句子, 如中英文的逗号/句号/感叹号
                    for text in texts:
                        n_grams = get_ngrams(use_type=self.algorithm,
                                             len_max=self.len_max,
                                             text=text)  # 获取一个句子的所有n-gram
                        self.words_count.update(n_grams)
            fr8.close()
        else:
            raise RuntimeError("use_type must be 'text' or 'file'")
        self.total_words = sum(self.words_count.values())

    def calculate_entropy(self, boundary_type="left"):
        """
            计算左熵和右熵
        :param boundary_type: str, like "left" or "right"
        :return: None
        """
        # 获取成词的最左边和最右边的一个字
        one_collect = {}
        self.total_words_len = {}
        for k, v in self.words_count.items():
            len_k = len(k)
            if len_k >= 2:  # 词长度大于3
                if boundary_type == "right":
                    k_boundary = k[:-1]
                else:
                    k_boundary = k[1:]
                # 左右边, 保存为dict, 左右丰度
                if k_boundary in self.words_count:
                    if k_boundary not in one_collect:
                        one_collect[k_boundary] = [v]
                    else:
                        one_collect[k_boundary] = one_collect[k_boundary] + [v]
            # 计算n-gram的长度
            if len_k not in self.total_words_len:
                self.total_words_len[len_k] = [v]
            else:
                self.total_words_len[len_k] += [v]
        self.total_words_len = dict([(k, sum(v)) for k,v in self.total_words_len.items()])

        # 计算左右熵
        for k, v in self.words_select.items():
            # 从字典获取
            boundary_v = one_collect.get(k, None)
            # 计算候选词的左右凝固度, 取最小的那个
            if boundary_v:
                # 求和
                sum_boundary = sum(boundary_v)
                # 计算信息熵
                entroy_boundary = sum([-(enum_bo / sum_boundary) * math.log(enum_bo / sum_boundary, 2)
                                       for enum_bo in boundary_v])
            else:
                entroy_boundary = 0.0
            # 惩罚虚词开头或者结尾
            if (k[0] in self.empty_words or k[-1] in self.empty_words):
                entroy_boundary = entroy_boundary / len(k)
            if boundary_type == "right":
                self.right_entropy[k] = round(entroy_boundary, self.round)
            else:
                self.left_entropy[k] = round(entroy_boundary, self.round)

    def compute_entropys(self):
        """
            计算左右熵
        :param words_count:dict, like {"我":32, "你们":12} 
        :param len_max: int, like 6
        :param freq_min: int, like 32
        :return: dict
        """
        # 提取大于最大频率的词语, 以及长度在3-len_max的词语
        self.words_select = {word: count for word, count in self.words_count.items()
                             if count >= self.freq_min and " " not in word
                             and 1 < len(word) <= self.len_max
                             }
        # 计算凝固度, 左右两边
        self.right_entropy = {}
        self.left_entropy = {}
        self.calculate_entropy(boundary_type="left")
        self.calculate_entropy(boundary_type="right")

    def compute_aggregation(self):
        """
            计算凝固度
        :return: None
        """
        twl_1 = self.total_words_len[1] # ngram=1 的所有词频
        self.aggregation = {}
        for word, value in self.words_select.items():
            len_word = len(word)
            twl_n = self.total_words_len[len_word] # ngram=n 的所有词频
            words_freq = [self.words_count.get(wd, 1) for wd in word]
            probability_word = value / twl_n
            probability_chars = reduce(mul,([wf for wf in words_freq])) / (twl_1**(len(word)))
            pmi = math.log(probability_word / probability_chars, 2)
            # AMI=PMI/length_word. 惩罚虚词(避免"的", "得", "了"开头结尾的情况)
            word_aggregation = pmi/(len_word**len_word) if (word[0] in self.empty_words or word[-1] in self.empty_words) \
                                                        else pmi/len_word # pmi / len_word / len_word
            self.aggregation[word] = round(word_aggregation, self.round)

    def compute_score(self, word, value, a, r, l, rl, lambda_0, lambda_3):
        """
            计算最终得分
        :param word: str, word with prepare
        :param value: float, word freq
        :param a: float, aggregation of word
        :param r: float, right entropy of word
        :param l: float, left entropy of word
        :param rl: float, right_entropy * left_entropy
        :param lambda_0: lambda 0
        :param lambda_3: lambda 3
        :return: 
        """
        self.new_words[word] = {}
        # math.log10(self.aggregation[word]) - math.log10(self.total_words)
        self.new_words[word]["a"] = a
        self.new_words[word]["r"] = r
        self.new_words[word]["l"] = l
        self.new_words[word]["f"] = value
        # word-liberalization
        m1 = lambda_0(r)
        m2 = lambda_0(l)
        m3 = lambda_0(a)
        score_ns = lambda_0((lambda_3(m1, m2) + lambda_3(m1, m3) + lambda_3(m2, m3)) / 3)
        self.new_words[word]["ns"] = round(score_ns, self.round)
        # 乘以词频word-freq, 连乘是为了防止出现较小项
        score_s = value * a * rl * score_ns
        self.new_words[word]["s"] = round(score_s, self.round)

    def find_word(self, text, use_type="text", freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2,
                        use_output=True, use_avg=False, use_filter=False):
        """
            新词发现与策略
        :param text: str, path or doc, like "大漠帝国。" or "/home/data/doc.txt"
        :param use_type: str,  输入格式, 即文件输入还是文本输入, "text" or "file", file of "utf-8" of "txt"
        :param use_output: bool,  输出模式, 即最后结果是否全部输出
        :param use_filter: bool,  新词过滤, 即是否过滤macropodus词典和停用词
        :param freq_min: int, 最小词频, 大于1
        :param len_max: int, 最大成词长度, 一般为5, 6, 7
        :param entropy_min: int, 左右熵阈值, 低于则过滤
        :param aggregation_min: int, PMI(凝固度)-阈值, 低于则过滤
        :return: 
        """
        self.aggregation_min = aggregation_min
        self.entropy_min = entropy_min
        self.freq_min = freq_min
        self.len_max = len_max
        self.count_word(text=text, use_type=use_type)
        self.compute_entropys()
        self.compute_aggregation()
        self.new_words = {}
        lambda_3 = lambda m1, m2: math.log((m1 * math.e ** m2 + m2 * math.e ** m1 + self.eps) / (abs(m1 - m2) + 1), 10)
        lambda_0 = lambda x: -self.eps * x + self.eps if x <= 0 else x
        # 输出
        for word, value in self.words_select.items():
            # 过滤通用词
            if use_filter and word in self.dict_words_freq:
                continue
            # 过滤停用词
            if word in self.stop_words:
                continue
            # {"aggregation":"a", "right_entropy":"r", "left_entropy":"l", "frequency":"f",
            #  "word-liberalization":"ns", "score":"s"}
            a = self.aggregation[word]
            r = self.right_entropy[word]
            l = self.left_entropy[word]
            rl = (r+l) / 2 if use_avg else r * l
            if use_output or (use_avg and a > self.aggregation_min and rl > self.entropy_min) or \
                             (not use_avg and a > self.aggregation_min and r > self.entropy_min and l > self.entropy_min):
                self.compute_score(word, value, a, r, l, rl, lambda_0, lambda_3)

        # 排序
        self.new_words = sorted(self.new_words.items(), key=lambda x:x[1]["s"], reverse=True)
        self.new_words = OrderedDict(self.new_words)
        return self.new_words


if __name__ == '__main__':

    from macropodus.preprocess.tools_common import (
        load_json,
        save_json,
        txt_read,
        txt_write,
    )

    summary = "四川发文取缔全部不合规p2p。字节跳动与今日头条。成都日报，成都市，李太白与杜甫" \
              "PageRank算法简介。" \
              "是上世纪90年代末提出的一种计算网页权重的算法! " \
              "当时，互联网技术突飞猛进，各种网页网站爆炸式增长。 " \
              "业界急需一种相对比较准确的网页重要性计算方法。 " \
              "是人们能够从海量互联网世界中找出自己需要的信息。 " \
              "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
              "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
              "Google根据投票来源甚至来源的来源，即链接到A页面的页面。 " \
              "和投票目标的等级来决定新的等级。简单的说， " \
              "一个高等级的页面可以使其他低等级页面的等级提升。 " \
              "具体说来就是，PageRank有两个基本思想，也可以说是假设。 " \
              "即数量假设：一个网页被越多的其他页面链接，就越重）。 " \
              "质量假设：一个网页越是被高质量的网页链接，就越重要。 " \
              "总的来说就是一句话，从全局角度考虑，获取重要的信。 "

    # 新词发现-文本
    wd = WordDiscovery()
    res = wd.find_word(text=summary, use_type="text", use_avg=False, use_filter=False, use_output=True,
                       freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2)
    for k, v in res.items():
        print(k, v)
    print("\n#################\n")

    while True:
        print("请输入:")
        ques = input()
        res = wd.find_word(text=ques, use_type="text", use_avg=False, use_filter=False, use_output=True,
                           freq_min=2, len_max=5, entropy_min=2.0, aggregation_min=3.2)
        for k, v in res.items():
            print(k, v)
    # ms = 0

2022-05-06 09:44:19,249 - seg_basic.py[line:19] - INFO: path of dict cache is /usr/local/lib/python3.6/dist-packages/macropodus/data/cache/macropodus.cache!
2022-05-06 09:44:19,669 - textcleaner.py[line:37] - INFO: 'pattern' package not found; tag filters are not available for English
2022-05-06 09:44:19,674 - word2vec.py[line:19] - INFO: path of w2v cache is /usr/local/lib/python3.6/dist-packages/macropodus/data/cache/word2vec_char.cache!


页面 {'a': 2.361718, 'r': 2.281036, 'l': 2.446439, 'f': 10, 'ns': 1.655677, 's': 218.207762}
网页 {'a': 1.993236, 'r': 2.584963, 'l': 1.918296, 'f': 6, 'ns': 1.425885, 's': 84.559919}
链接 {'a': 3.200754, 'r': 1.584963, 'l': 2.321928, 'f': 5, 'ns': 1.398996, 's': 82.396116}
投票 {'a': 3.569237, 'r': 1.0, 'l': 1.584963, 'f': 3, 'ns': 1.086026, 's': 18.431306}
B页面 {'a': 3.378601, 'r': 1.0, 'l': 1.0, 'f': 2, 'ns': 0.949694, 's': 6.417272}
计算 {'a': 3.361718, 'r': 1.0, 'l': 1.0, 'f': 2, 'ns': 0.947079, 's': 6.367628}
质量 {'a': 3.361718, 'r': 1.0, 'l': 1.0, 'f': 2, 'ns': 0.947079, 's': 6.367628}
等级 {'a': 1.534619, 'r': 1.0, 'l': 0.625815, 'f': 6, 'ns': 0.551465, 's': 3.17772}
一个 {'a': 1.398538, 'r': 0.75, 'l': 0.5, 'f': 4, 'ns': 0.394292, 's': 0.827148}
重要 {'a': 1.496618, 'r': 0.5, 'l': 0.792481, 'f': 3, 'ns': 0.415517, 's': 0.739231}
来源 {'a': 1.47902, 'r': 0.5, 'l': 0.792481, 'f': 3, 'ns': 0.414049, 's': 0.727958}
可以 {'a': 1.930859, 'r': 0.5, 'l': 0.5, 'f': 2, 'ns': 0.362936, 's': 0.350389}
其他 {'a':

 一个很好的故事


请输入:


KeyboardInterrupt: 

## 领域词库

In [37]:
!pip install pyhanlp

Collecting pyhanlp
  Downloading pyhanlp-0.1.84.tar.gz (136 kB)
     |████████████████████████████████| 136 kB 942 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jpype1==0.7.0
  Downloading JPype1-0.7.0-cp36-cp36m-manylinux2010_x86_64.whl (2.7 MB)
     |████████████████████████████████| 2.7 MB 6.2 MB/s            
Building wheels for collected packages: pyhanlp
  Building wheel for pyhanlp (setup.py) ... [?25ldone
[?25h  Created wheel for pyhanlp: filename=pyhanlp-0.1.84-py3-none-any.whl size=29819 sha256=40b65c709827d38f2356b5cbd42b605849c693c05545735a1a8409f8bb99d3b3
  Stored in directory: /root/.cache/pip/wheels/c5/45/6f/8e6f5fffdb1cb0b7c40d4e78fa41469102d7e324bf78fd7dc9
Successfully built pyhanlp
Installing collected packages: jpype1, pyhanlp
Successfully installed jpype1-0.7.0 pyhanlp-0.1.84


In [39]:
from pyhanlp import *


def demo_custom_dictionary(text):
    """ 演示用户词典的动态增删
    TO-DO:
    DoubleArrayTrie分词
    首字哈希之后二分的trie树分词
    >>> text = "攻城狮逆袭单身狗，迎娶白富美，走上人生巅峰"  # 怎么可能噗哈哈！
    >>> demo_custom_dictionary(text)
    [攻城/vi, 狮/ng, 逆袭/nz, 单身/n, 狗/n, ，/w, 迎娶/v, 白富美/nr, ，/w, 走上/v, 人生/n, 巅峰/n]
    [攻城狮/nz, 逆袭/nz, 单身狗/nz, ，/w, 迎娶/v, 白富美/nz, ，/w, 走上/v, 人生/n, 巅峰/n]
    """
    print(HanLP.segment(text))

    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    CustomDictionary.add("攻城狮")  # 动态增加
    CustomDictionary.insert("白富美", "nz 1024")  # 强行插入
    #CustomDictionary.remove("攻城狮"); # 删除词语（注释掉试试）
    CustomDictionary.add("单身狗", "nz 1024 n 1")
    #print(CustomDictionary.get("单身狗"))

    print(HanLP.segment(text))


if __name__ == "__main__":
    import doctest
    doctest.testmod(verbose=True)

Trying:
    text = "攻城狮逆袭单身狗，迎娶白富美，走上人生巅峰"  # 怎么可能噗哈哈！
Expecting nothing
ok
Trying:
    demo_custom_dictionary(text)
Expecting:
    [攻城/vi, 狮/ng, 逆袭/nz, 单身/n, 狗/n, ，/w, 迎娶/v, 白富美/nr, ，/w, 走上/v, 人生/n, 巅峰/n]
    [攻城狮/nz, 逆袭/nz, 单身狗/nz, ，/w, 迎娶/v, 白富美/nz, ，/w, 走上/v, 人生/n, 巅峰/n]
ok
35 items had no tests:
    __main__
    __main__.QueryPre
    __main__.QueryPre.__init__
    __main__.QueryPre.capital_to_lower
    __main__.QueryPre.filter_emoji
    __main__.QueryPre.run
    __main__.QueryPre.strB2Q
    __main__.QueryPre.strQ2B
    __main__.QueryPre.t2s_by_opencc
    __main__.Tokenization
    __main__.Tokenization.__init__
    __main__.Tokenization.hanlp_token_ner
    __main__.WordDiscovery
    __main__.WordDiscovery.__init__
    __main__.WordDiscovery.calculate_entropy
    __main__.WordDiscovery.compute_aggregation
    __main__.WordDiscovery.compute_entropys
    __main__.WordDiscovery.compute_score
    __main__.WordDiscovery.count_word
    __main__.WordDiscovery.find_word
    __main__._check_