Add missed textmulclassify missed by .gitignore
dchentech committed May 24, 2015
1 parent d926359 commit 1811eb5
Showing 6 changed files with 498 additions and 0 deletions.
6 changes: 6 additions & 0 deletions textmulclassify/lib/__init__.py
@@ -0,0 +1,6 @@
__all__ = ["ReadManualKps", "FeaturesWeight", "Similarity", "Evaluate"]

from .read_manual_kps import ReadManualKps
from .features_weight import FeaturesWeight
from .similarity import Similarity
from .evaluate import Evaluate
66 changes: 66 additions & 0 deletions textmulclassify/lib/entropy.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

"""
计算 特征值的熵,看看特征值在知识点下的分布是否混乱。
每个特征都只有一个熵值。
/ tag1
Feature(i) - tag2
\ tag3
\ ...
\ tagj
P(ij) = Feature(ij)/Feature(i)
H(i) = - (连加 P(ij) * log( P(ij) ))
"""

from etl_utils import cpickle_cache, process_notifier # , calculate_entropy
from collections import defaultdict

try:
    from scipy.stats import entropy as scipy_entropy
except ImportError:
    import math

    def scipy_entropy(values):
        # Pure-Python fallback: zero counts contribute nothing, matching scipy.
        feature_count_sum = float(sum(values))

        entropy = 0.0
        for c1 in values:
            if c1 == 0:
                continue
            p_ij = c1 / feature_count_sum
            entropy += p_ij * math.log(p_ij)
        return - entropy


class EntropyFunc(object):

@classmethod
def process(cls, d1, cache_dir):
""" d1 is {"feature1":count1, "feature2":count2, ... } """

def func():
# 1. fetch all features
            uniq_keys = set()
            for item_id1, item1 in process_notifier(d1):
                uniq_keys.update(item1.iterkeys())
uniq_keys = list(uniq_keys)

# 2. feature1 => {doc1: count1, doc2: count2, ...}
value_cache = defaultdict(dict)
for item_id1, item1 in process_notifier(d1):
for k1, c1 in item1.iteritems():
value_cache[k1][item_id1] = c1

            # 3. calculate each feature's entropy
entropy_cache = dict()
total_len = len(d1)
for k1 in process_notifier(uniq_keys):
exist_values = value_cache[k1].values()
                # pad with zeros for entries that do not contain this feature
                total_values = exist_values + [0] * (total_len - len(exist_values))

entropy_cache[k1] = scipy_entropy(total_values)

return entropy_cache

return cpickle_cache(cache_dir + '/entropy.cPickle', func)
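A minimal usage sketch with made-up data and cache directory (neither appears in this commit; cpickle_cache persists the result under cache_dir):

docs = {
    "doc1": {u"feature_a": 2, u"feature_b": 1},
    "doc2": {u"feature_a": 3},
    "doc3": {u"feature_b": 4},
}
entropies = EntropyFunc.process(docs, "/tmp/entropy_cache")
# entropies maps each feature to the entropy of its count distribution over
# the entries of docs (padded with zeros for entries lacking the feature);
# evenly spread features score high, concentrated ones score low.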
121 changes: 121 additions & 0 deletions textmulclassify/lib/evaluate.py
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-

from ..data_structures.tree import TMCTree
from etl_utils import uprint


class Evaluate(object):
""" 计算多级知识点下的 exact|peer|child|parent 召回率+正确率 """

def __init__(self, tags_tree, items):
self.items = items
for i1 in self.items:
i1['eval_result'] = []

        # validate the input data structures
assert isinstance(tags_tree, TMCTree) # name TODO
assert isinstance(self.items, list)
assert 'original_tags' in self.items[0]
assert 'recommend_tags' in self.items[0]

self.process(tags_tree, self.items)

def process(self, tags_tree, items, verbose=False):
from bunch import Bunch
total_counts = Bunch({'original': 0, 'recommend': 0})
        # For recall, each tag in original_tags may be matched by more than one
        # recommend_tag, but we only count the de-duplicated matches over original_tags.
recall_counts = Bunch({'exact': 0, 'peer': 0, 'child': 0, 'parent': 0, 'unmatch': 0})
        # For precision, multiple matches are also possible; the rate is simply divided by the total number of recommend_tags.
precision_counts = Bunch({'exact': 0, 'peer': 0, 'child': 0, 'parent': 0, 'unmatch': 0})

def update(obj, method, num):
setattr(obj, method, (getattr(obj, method) + num))

for idx1, item1 in enumerate(items):
if verbose:
print "\n", "#" * 50, "[process] #", idx1 + 1
if item1['original_tags']:
assert isinstance(list(item1['original_tags'])[0], unicode)
if item1['recommend_tags']:
assert isinstance(list(item1['recommend_tags'])[0], unicode)

original_tags = set(filter(lambda i1: i1 in tags_tree.name_to_nodes,
item1['original_tags'])) # check valid data
recommend_tags = set(item1['recommend_tags'])

total_counts.original += len(original_tags)
total_counts.recommend += len(recommend_tags)

            # processed_* only exists to handle the epcp (exact/peer/child/parent) ordering: once a tag is matched at an earlier level it gets no chance later.
processed_original_tags = set([])
processed_recommend_tags = set([])

def func(counts, is_precision=False):
if not is_precision:
for1, for2 = original_tags, recommend_tags
else:
for2, for1 = original_tags, recommend_tags

processed = set([])

for method in ["exact", "peer", "child", "parent"]:
match_count = 0

                    for t1 in (for1 - processed):  # the core loop: walk the tags at this level that are still unmatched
matched_t1 = None
                        for t2 in for2:  # no set subtraction here: the remaining recommend_tags still need to be checked for a relation
n_t1, n_t2 = t2, t1
if verbose:
print method, "[n_t1]", n_t1, "[n_t2]", n_t2
if getattr(tags_tree, "is_" + method)(n_t1, n_t2):
if verbose:
print "√"
matched_t1 = n_t2
break
if matched_t1:
                            # mark it processed so the outer loop cannot count this tag again
processed.add(n_t2)
match_count += 1
update(counts, method, match_count)
if verbose:
uprint("[processed]", processed)
counts.unmatch += len(for1 - processed)

            # record whether there was no recall / no precision at all
if len(processed) == 0:
text = "no_precision" if is_precision else "no_recall"
item1['eval_result'].append(text)

if verbose:
print " " * 10, "[recall] ..."
func(recall_counts)

if verbose:
print " " * 10, "[precision] ..."
func(precision_counts, is_precision=True)

print "#" * 100
print "#" * 100
print "#" * 100

        def calculate_detail_rates(denominator, numerators):
            def calculate_percent(numerator, denominator):
                return round((numerator / float(denominator)) * 100, 2)
            rates = [calculate_percent(n1, denominator) for n1 in numerators]
            rates.append(sum(rates[0:-1]))  # combined matched rate: exact + child + parent + peer
            return rates

print "total_counts", repr(total_counts)
print "recall_counts", repr(recall_counts)
print "precision_counts", repr(precision_counts)

self.recall_rates = calculate_detail_rates(total_counts.original,
[recall_counts.exact, recall_counts.child, recall_counts.parent, recall_counts.peer, recall_counts.unmatch])

self.precision_rates = calculate_detail_rates(total_counts.recommend,
[precision_counts.exact, precision_counts.child, precision_counts.parent, precision_counts.peer, precision_counts.unmatch])

print "exact | child | parent | peer | unmatch | [total]"
print "召回率", self.recall_rates
print "正确率", self.precision_rates
155 changes: 155 additions & 0 deletions textmulclassify/lib/features_weight.py
@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-

# Original authors: @junchen, @liheng
# Refactored by @mvj3

import os
import sys
import json
from etl_utils import process_notifier, UnicodeUtils, uprint, cached_property, singleton
from collections import defaultdict, Counter
import jieba.posseg as posseg


@singleton()
class JIEBA_CLASS(object):

def __init__(self):
self.jieba = posseg.jieba

def load_dictionaries(self, userdicts=[]):
for file1 in userdicts:
self.jieba.load_userdict(file1)

def normal_cut(self, unicode1):
return self.jieba.cut(unicode1)

def posseg_cut(self, unicode1):
return posseg.cut(unicode1)
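
# A usage sketch (hypothetical dictionary path, not part of this commit):
#   jieba_wrapper = JIEBA_CLASS()
#   jieba_wrapper.load_dictionaries(["dict/math_terms.txt"])
#   segs = list(jieba_wrapper.posseg_cut(u"some text"))  # items expose .word and .flag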


class FeaturesWeight(object):

def __init__(self, classify):
self.classify = classify

self.JIEBA = JIEBA_CLASS()
self.JIEBA.load_dictionaries(classify.jieba_userdict_files)

self.key_tag = set(['mk', 'mw'])
self.obsolete_tag = set(['x', 'm', 'eng'])

self.short_seg_threshold = 1
self.long_key_seg_threshold = 2
self.key_word_threshold = 0.005

def load_data_from_input(self, input1):
""" return data is a dict. """
def wrap(data):
avg = sum(data.values()) / float(len(data))
return defaultdict(lambda: avg, data)

if isinstance(input1, dict):
return wrap(input1)

if not os.path.exists(input1):
return defaultdict(float)

content = UnicodeUtils.read(input1).strip()
try:
data = json.loads(content)
        except ValueError:  # not valid JSON; fall back to "word,weight" lines
data = dict()
for line in content.split("\n"):
result = line.split(',')
data[result[0]] = float(result[1].strip())

return wrap(data)

@cached_property
def H_i_dict(self):
d1 = self.load_data_from_input(self.classify.entropy_file)
avg = sum(d1.values()) / float(len(d1))
for k2 in d1.keys():
if d1[k2] == 0.0:
d1[k2] = avg # tmp fix
return d1

@cached_property
def idf_dict(self):
return self.load_data_from_input(self.classify.idf_file)

@cached_property
def stop_words_set(self):
return set([w1.strip() for file1 in self.classify.stop_words_files
for w1 in UnicodeUtils.read(file1).split("\n")])

#@profile
def extract_feature_words(self, in_text):
""" 专业词汇抽取 + 对长词(3)再做分词 """
assert isinstance(in_text, unicode), in_text

        seg_list = list(self.JIEBA.posseg_cut(in_text))  # NOTE: this is the slowest step
lv_1_list = []
lv_2_list = []

        def _continue(seg):
            if len(seg.word) == self.short_seg_threshold and (seg.flag not in self.key_tag):
                # drop short segments that are not key words
                return True
            if seg.word in self.stop_words_set:
                # drop stop words, punctuation and other unimportant words
                return True
            if seg.flag in self.obsolete_tag:
                # drop parts of speech that are always discarded
                return True
            return False

        def cut2(seg, lv_2_list):
            # key words longer than long_key_seg_threshold characters (i.e. 3+ chars)
            if seg.flag in self.key_tag and len(seg.word) > self.long_key_seg_threshold:
                # cut them again to improve recall
                second_cut = list(self.JIEBA.normal_cut(seg.word))
                for sec_word in second_cut:
                    if (sec_word != seg.word) and (sec_word not in self.stop_words_set):
                        # the cut produced a new word; add it to the word list
                        lv_2_list.append(sec_word)
                    # a cut word equal to the original word means the word was not actually split

def process(seg, lv_1_list, lv_2_list):
if _continue(seg):
return False
            # everything that survives the filters is a word we want to keep
            # add it to the level-1 list
lv_1_list.append(seg)

cut2(seg, lv_2_list)

        map(lambda seg: process(seg, lv_1_list, lv_2_list), seg_list)  # this step takes only ~1.6% of the runtime

# list 1 contains both flag and word
# list 2 contains only word
# directly return the list of words
all_list = [item.word for item in lv_1_list]
all_list.extend(lv_2_list)

return all_list
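
    # A purely illustrative walk-through (actual output depends on the loaded
    # dictionaries): if posseg_cut yields a key segment such as u"三角函数" with
    # flag 'mk', its length (4) exceeds long_key_seg_threshold, so cut2 re-cuts it
    # with normal_cut; every piece that differs from the original word and is not
    # a stop word (e.g. u"三角", u"函数") lands in lv_2_list, on top of the segment
    # itself already collected in lv_1_list.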

def calculate_features_weight(self, words_counter):
""" 用占用个数 + idf + H 计算weight """
words_len = float(sum(words_counter.values()))
assert words_len != 0

weight_dict = dict()
for word1, count1 in words_counter.iteritems():
weight_dict[word1] = (count1 / words_len) * self.idf_dict[word1] / self.H_i_dict[word1]

for word1 in weight_dict.keys():
if weight_dict[word1] <= self.key_word_threshold:
del weight_dict[word1]
return weight_dict

def compute_words_and_weights(self, text1):
words = self.extract_feature_words(text1)
keywords_weight = self.calculate_features_weight(Counter(words))
return words, keywords_weight
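As a rough, standalone illustration of the weighting formula in calculate_features_weight — weight = term frequency × idf ÷ entropy — with made-up idf and entropy values (the real class loads these dictionaries from files):

from collections import Counter

def sketch_weight(words_counter, idf_dict, h_dict, threshold=0.005):
    # weight(word) = (count / total words) * idf / H; drop weights at or below the threshold
    total = float(sum(words_counter.values()))
    weights = {}
    for word, count in words_counter.items():
        w = (count / total) * idf_dict[word] / h_dict[word]
        if w > threshold:
            weights[word] = w
    return weights

print sketch_weight(Counter({u"triangle": 3, u"the": 5}),
                    {u"triangle": 4.0, u"the": 0.01},
                    {u"triangle": 1.0, u"the": 3.0})
# => {u"triangle": 1.5}   ("the" is dropped: (5/8) * 0.01 / 3.0 ≈ 0.002, below the threshold)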
