Add textmulclassify files missed by .gitignore
Showing 6 changed files with 498 additions and 0 deletions.
@@ -0,0 +1,6 @@
__all__ = ["ReadManualKps", "FeaturesWeight", "Similarity", "Evaluate"]

from .read_manual_kps import ReadManualKps
from .features_weight import FeaturesWeight
from .similarity import Similarity
from .evaluate import Evaluate
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

"""
Compute each feature's entropy, to see whether the feature's distribution
across knowledge-point tags is chaotic.
Each feature has exactly one entropy value.

              / tag1
Feature(i) ---- tag2
              \ tag3
              \ ...
              \ tagj

P(ij) = Feature(ij) / Feature(i)
H(i)  = - sum_j( P(ij) * log(P(ij)) )
"""

from etl_utils import cpickle_cache, process_notifier  # , calculate_entropy
from collections import defaultdict

try:
    from scipy.stats import entropy as scipy_entropy
except ImportError:
    import math

    def scipy_entropy(values):
        # pure-Python fallback: plain Shannon entropy over raw counts
        feature_count_sum = float(sum(values))

        entropy = 0.0
        for c1 in values:
            if c1 == 0:
                continue  # zero counts contribute nothing, and log(0) is undefined
            p_ij = c1 / feature_count_sum
            entropy += p_ij * math.log(p_ij)
        return -entropy


class EntropyFunc(object):

    @classmethod
    def process(cls, d1, cache_dir):
        """ d1 is {doc_id1: {"feature1": count1, "feature2": count2, ...}, ...} """

        def func():
            # 1. fetch all features
            uniq_keys = set()
            for item_id1, item1 in process_notifier(d1):
                uniq_keys.update(item1.iterkeys())
            uniq_keys = list(uniq_keys)

            # 2. feature1 => {doc1: count1, doc2: count2, ...}
            value_cache = defaultdict(dict)
            for item_id1, item1 in process_notifier(d1):
                for k1, c1 in item1.iteritems():
                    value_cache[k1][item_id1] = c1

            # 3. calculate each feature's entropy
            entropy_cache = dict()
            total_len = len(d1)
            for k1 in process_notifier(uniq_keys):
                exist_values = value_cache[k1].values()
                # pad with zeros for the documents that lack this feature
                total_values = exist_values + [0] * (total_len - len(exist_values))

                entropy_cache[k1] = scipy_entropy(total_values)

            return entropy_cache

        return cpickle_cache(cache_dir + '/entropy.cPickle', func)
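
A minimal usage sketch with made-up document ids and features; it assumes
`etl_utils` is installed and the cache directory exists (results are memoized
to <cache_dir>/entropy.cPickle):

docs = {
    "doc1": {u"function": 3, u"derivative": 1},
    "doc2": {u"function": 2},
}
entropies = EntropyFunc.process(docs, "/tmp/entropy_cache")
# "function" has padded counts [3, 2] -> positive entropy (spread out);
# "derivative" has counts [1, 0]     -> entropy 0 (concentrated in one doc).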
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-

from ..data_structures.tree import TMCTree
from etl_utils import uprint


class Evaluate(object):
    """ Compute exact|peer|child|parent recall and precision rates over
        multi-level knowledge-point tags. """

    def __init__(self, tags_tree, items):
        self.items = items
        for i1 in self.items:
            i1['eval_result'] = []

        # validate the data structures
        assert isinstance(tags_tree, TMCTree)  # name TODO
        assert isinstance(self.items, list)
        assert 'original_tags' in self.items[0]
        assert 'recommend_tags' in self.items[0]

        self.process(tags_tree, self.items)

    def process(self, tags_tree, items, verbose=False):
        from bunch import Bunch
        total_counts = Bunch({'original': 0, 'recommend': 0})
        # For recall, recommend_tags may match each tag in original_tags more
        # than once, but matches are still counted against the deduplicated
        # original_tags.
        recall_counts = Bunch({'exact': 0, 'peer': 0, 'child': 0, 'parent': 0, 'unmatch': 0})
        # For precision, multiple matches are allowed, and the rate is simply
        # divided by the total number of recommended tags.
        precision_counts = Bunch({'exact': 0, 'peer': 0, 'child': 0, 'parent': 0, 'unmatch': 0})

        def update(obj, method, num):
            setattr(obj, method, (getattr(obj, method) + num))

        for idx1, item1 in enumerate(items):
            if verbose:
                print "\n", "#" * 50, "[process] #", idx1 + 1
            if item1['original_tags']:
                assert isinstance(list(item1['original_tags'])[0], unicode)
            if item1['recommend_tags']:
                assert isinstance(list(item1['recommend_tags'])[0], unicode)

            original_tags = set(filter(lambda i1: i1 in tags_tree.name_to_nodes,
                                       item1['original_tags']))  # keep only valid tags
            recommend_tags = set(item1['recommend_tags'])

            total_counts.original += len(original_tags)
            total_counts.recommend += len(recommend_tags)

            def func(counts, is_precision=False):
                if not is_precision:
                    for1, for2 = original_tags, recommend_tags
                else:
                    for2, for1 = original_tags, recommend_tags

                # `processed` enforces the exact > peer > child > parent order:
                # a tag matched by an earlier relation gets no further chance.
                processed = set()

                for method in ["exact", "peer", "child", "parent"]:
                    match_count = 0

                    for t1 in (for1 - processed):  # the core loop: walk this side's unmatched tags
                        matched_t1 = None
                        for t2 in for2:  # no subtraction: these tags may still match other relations
                            n_t1, n_t2 = t2, t1  # predicates take (other side, this side)
                            if verbose:
                                print method, "[n_t1]", n_t1, "[n_t2]", n_t2
                            if getattr(tags_tree, "is_" + method)(n_t1, n_t2):
                                if verbose:
                                    print "√"
                                matched_t1 = n_t2
                                break
                        if matched_t1:
                            # so the outer loops cannot count t1 again
                            processed.add(n_t2)
                            match_count += 1
                    update(counts, method, match_count)
                    if verbose:
                        uprint("[processed]", processed)
                counts.unmatch += len(for1 - processed)

                # record items with no recall / no precision at all
                if len(processed) == 0:
                    text = "no_precision" if is_precision else "no_recall"
                    item1['eval_result'].append(text)

            if verbose:
                print " " * 10, "[recall] ..."
            func(recall_counts)

            if verbose:
                print " " * 10, "[precision] ..."
            func(precision_counts, is_precision=True)

        print "#" * 100
        print "#" * 100
        print "#" * 100

        def calculate_detail_rates(denominator, numerators):
            def calculate_percent(numerator, denominator):
                return round((numerator / float(denominator)) * 100, 2)
            rates = [calculate_percent(m1, denominator) for m1 in numerators]
            rates.append(sum(rates[0:-1]))  # total matched = everything except unmatch
            return rates

        print "total_counts", repr(total_counts)
        print "recall_counts", repr(recall_counts)
        print "precision_counts", repr(precision_counts)

        self.recall_rates = calculate_detail_rates(total_counts.original,
            [recall_counts.exact, recall_counts.child, recall_counts.parent, recall_counts.peer, recall_counts.unmatch])

        self.precision_rates = calculate_detail_rates(total_counts.recommend,
            [precision_counts.exact, precision_counts.child, precision_counts.parent, precision_counts.peer, precision_counts.unmatch])

        print "exact | child | parent | peer | unmatch | [total]"
        print "recall rates   ", self.recall_rates
        print "precision rates", self.precision_rates
@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-

# original authors are @junchen, @liheng
# refactored by @mvj3

import os
import sys
import json
from etl_utils import process_notifier, UnicodeUtils, uprint, cached_property, singleton
from collections import defaultdict, Counter
import jieba.posseg as posseg


@singleton()
class JIEBA_CLASS(object):

    def __init__(self):
        self.jieba = posseg.jieba

    def load_dictionaries(self, userdicts=[]):
        for file1 in userdicts:
            self.jieba.load_userdict(file1)

    def normal_cut(self, unicode1):
        return self.jieba.cut(unicode1)

    def posseg_cut(self, unicode1):
        return posseg.cut(unicode1)


class FeaturesWeight(object):

    def __init__(self, classify):
        self.classify = classify

        self.JIEBA = JIEBA_CLASS()
        self.JIEBA.load_dictionaries(classify.jieba_userdict_files)

        self.key_tag = set(['mk', 'mw'])
        self.obsolete_tag = set(['x', 'm', 'eng'])

        self.short_seg_threshold = 1
        self.long_key_seg_threshold = 2
        self.key_word_threshold = 0.005

    def load_data_from_input(self, input1):
        """ Return a dict whose missing keys default to the average value. """
        def wrap(data):
            avg = sum(data.values()) / float(len(data))
            return defaultdict(lambda: avg, data)

        if isinstance(input1, dict):
            return wrap(input1)

        if not os.path.exists(input1):
            return defaultdict(float)

        content = UnicodeUtils.read(input1).strip()
        try:
            data = json.loads(content)
        except ValueError:
            # not JSON: fall back to "word,value" lines
            data = dict()
            for line in content.split("\n"):
                result = line.split(',')
                data[result[0]] = float(result[1].strip())

        return wrap(data)

    @cached_property
    def H_i_dict(self):
        d1 = self.load_data_from_input(self.classify.entropy_file)
        avg = sum(d1.values()) / float(len(d1))
        for k2 in d1.keys():
            if d1[k2] == 0.0:
                d1[k2] = avg  # tmp fix: avoid dividing by a zero entropy later
        return d1

    @cached_property
    def idf_dict(self):
        return self.load_data_from_input(self.classify.idf_file)

    @cached_property
    def stop_words_set(self):
        return set([w1.strip() for file1 in self.classify.stop_words_files
                    for w1 in UnicodeUtils.read(file1).split("\n")])

    # @profile
    def extract_feature_words(self, in_text):
        """ Extract domain keywords, then re-segment the long (3+ char) ones. """
        assert isinstance(in_text, unicode), in_text

        seg_list = list(self.JIEBA.posseg_cut(in_text))  # NOTE: the slowest step
        lv_1_list = []
        lv_2_list = []

        def _continue(seg):
            if len(seg.word) == self.short_seg_threshold and (seg.flag not in self.key_tag):
                # drop short non-keyword segments
                return True
            if seg.word in self.stop_words_set:
                # drop stop words, punctuation and other unimportant words
                return True
            if seg.flag in self.obsolete_tag:
                # part-of-speech tags that are explicitly discarded
                return True
            return False

        def cut2(seg, lv_2_list):
            # a keyword longer than long_key_seg_threshold characters
            if seg.flag in self.key_tag and len(seg.word) > self.long_key_seg_threshold:
                # is segmented once more to improve recall
                second_cut = list(self.JIEBA.normal_cut(seg.word))
                for sec_word in second_cut:
                    if (sec_word != seg.word) and (sec_word not in self.stop_words_set):
                        # a genuine sub-word: add it to the word list
                        lv_2_list.append(sec_word)
                    # a sub-word equal to the original word means nothing was split

        def process(seg, lv_1_list, lv_2_list):
            if _continue(seg):
                return False
            # everything left is a word we want to keep
            lv_1_list.append(seg)

            cut2(seg, lv_2_list)

        map(lambda seg: process(seg, lv_1_list, lv_2_list), seg_list)  # only ~1.6% of the runtime

        # lv_1_list holds (word, flag) pairs and lv_2_list bare words;
        # return a flat list of words
        all_list = [item.word for item in lv_1_list]
        all_list.extend(lv_2_list)

        return all_list

    def calculate_features_weight(self, words_counter):
        """ weight = term frequency * idf / entropy """
        words_len = float(sum(words_counter.values()))
        assert words_len != 0

        weight_dict = dict()
        for word1, count1 in words_counter.iteritems():
            weight_dict[word1] = (count1 / words_len) * self.idf_dict[word1] / self.H_i_dict[word1]

        # drop words whose weight falls below the keyword threshold
        for word1 in weight_dict.keys():
            if weight_dict[word1] <= self.key_word_threshold:
                del weight_dict[word1]
        return weight_dict
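
    # Worked example: a word seen 2 times out of 40 tokens, with idf 3.0 and
    # entropy 1.5, gets weight (2 / 40.0) * 3.0 / 1.5 = 0.1, well above the
    # 0.005 cutoff; frequent low-idf or high-entropy words fall below it.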

    def compute_words_and_weights(self, text1):
        words = self.extract_feature_words(text1)
        keywords_weight = self.calculate_features_weight(Counter(words))
        return words, keywords_weight
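
A minimal usage sketch. The real `classify` object comes from the surrounding
TextMulClassify pipeline and is not shown in this commit; the stand-in below is
hypothetical and only carries the four attributes FeaturesWeight actually reads:

class FakeClassify(object):
    jieba_userdict_files = []            # extra jieba user dictionaries
    stop_words_files = []                # stop-word list files
    entropy_file = {u"function": 0.5}    # dicts are accepted as well as file paths
    idf_file = {u"function": 3.2}

fw = FeaturesWeight(FakeClassify())
words, weights = fw.compute_words_and_weights(u"some unicode text")
# `words` is the flat feature-word list; `weights` keeps only the words whose
# (tf * idf / entropy) weight exceeds key_word_threshold (0.005).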