Add missed textmulclassify missed by .gitignore
dchentech committed May 24, 2015
1 parent d926359 commit 1811eb5
Showing 6 changed files with 498 additions and 0 deletions.
6 changes: 6 additions & 0 deletions textmulclassify/lib/__init__.py
@@ -0,0 +1,6 @@
__all__ = ["ReadManualKps", "FeaturesWeight", "Similarity", "Evaluate"]

from .read_manual_kps import ReadManualKps
from .features_weight import FeaturesWeight
from .similarity import Similarity
from .evaluate import Evaluate
66 changes: 66 additions & 0 deletions textmulclassify/lib/entropy.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

"""
计算 特征值的熵,看看特征值在知识点下的分布是否混乱。
每个特征都只有一个熵值。
/ tag1
Feature(i) - tag2
\ tag3
\ ...
\ tagj
P(ij) = Feature(ij)/Feature(i)
H(i) = - (连加 P(ij) * log( P(ij) ))
"""

from etl_utils import cpickle_cache, process_notifier # , calculate_entropy
from collections import defaultdict

try:
    from scipy.stats import entropy as scipy_entropy
except ImportError:
    import math

    def scipy_entropy(values):
        # Pure-Python fallback: zero counts contribute nothing, matching scipy.
        feature_count_sum = float(sum(values))

        entropy = 0.0
        for c1 in values:
            if c1 == 0:
                continue
            p_ij = c1 / feature_count_sum
            entropy += p_ij * math.log(p_ij)
        return - entropy


class EntropyFunc(object):

@classmethod
def process(cls, d1, cache_dir):
""" d1 is {"feature1":count1, "feature2":count2, ... } """

def func():
# 1. fetch all features
            uniq_keys = set()
            for item_id1, item1 in process_notifier(d1):
                uniq_keys.update(item1.iterkeys())
uniq_keys = list(uniq_keys)

# 2. feature1 => {doc1: count1, doc2: count2, ...}
value_cache = defaultdict(dict)
for item_id1, item1 in process_notifier(d1):
for k1, c1 in item1.iteritems():
value_cache[k1][item_id1] = c1

            # 3. calculate each feature's entropy
entropy_cache = dict()
total_len = len(d1)
for k1 in process_notifier(uniq_keys):
exist_values = value_cache[k1].values()
                # pad with zeros for entries that do not contain this feature
                total_values = exist_values + [0] * (total_len - len(exist_values))

entropy_cache[k1] = scipy_entropy(total_values)

return entropy_cache

return cpickle_cache(cache_dir + '/entropy.cPickle', func)
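A minimal usage sketch with made-up data and cache directory (neither appears in this commit; cpickle_cache persists the result under cache_dir):

docs = {
    "doc1": {u"feature_a": 2, u"feature_b": 1},
    "doc2": {u"feature_a": 3},
    "doc3": {u"feature_b": 4},
}
entropies = EntropyFunc.process(docs, "/tmp/entropy_cache")
# entropies maps each feature to the entropy of its count distribution over
# the entries of docs (padded with zeros for entries lacking the feature);
# evenly spread features score high, concentrated ones score low.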
121 changes: 121 additions & 0 deletions textmulclassify/lib/evaluate.py
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-

from ..data_structures.tree import TMCTree
from etl_utils import uprint


class Evaluate(object):
""" 计算多级知识点下的 exact|peer|child|parent 召回率+正确率 """

def __init__(self, tags_tree, items):
self.items = items
for i1 in self.items:
i1['eval_result'] = []

        # validate the input data structures
assert isinstance(tags_tree, TMCTree) # name TODO
assert isinstance(self.items, list)
assert 'original_tags' in self.items[0]
assert 'recommend_tags' in self.items[0]

self.process(tags_tree, self.items)

def process(self, tags_tree, items, verbose=False):
from bunch import Bunch
total_counts = Bunch({'original': 0, 'recommend': 0})
        # For recall, each tag in original_tags may be matched by more than one
        # recommend_tag, but we only count the de-duplicated matches over original_tags.
recall_counts = Bunch({'exact': 0, 'peer': 0, 'child': 0, 'parent': 0, 'unmatch': 0})
        # For precision, multiple matches are also possible; the rate is simply divided by the total number of recommend_tags.
precision_counts = Bunch({'exact': 0, 'peer': 0, 'child': 0, 'parent': 0, 'unmatch': 0})

def update(obj, method, num):
setattr(obj, method, (getattr(obj, method) + num))

for idx1, item1 in enumerate(items):
if verbose:
print "\n", "#" * 50, "[process] #", idx1 + 1
if item1['original_tags']:
assert isinstance(list(item1['original_tags'])[0], unicode)
if item1['recommend_tags']:
assert isinstance(list(item1['recommend_tags'])[0], unicode)

original_tags = set(filter(lambda i1: i1 in tags_tree.name_to_nodes,
item1['original_tags'])) # check valid data
recommend_tags = set(item1['recommend_tags'])

total_counts.original += len(original_tags)
total_counts.recommend += len(recommend_tags)

            # processed_* only exists to handle the epcp (exact/peer/child/parent) ordering: once a tag is matched at an earlier level it gets no chance later.
processed_original_tags = set([])
processed_recommend_tags = set([])

def func(counts, is_precision=False):
if not is_precision:
for1, for2 = original_tags, recommend_tags
else:
for2, for1 = original_tags, recommend_tags

processed = set([])

for method in ["exact", "peer", "child", "parent"]:
match_count = 0

                    for t1 in (for1 - processed):  # the core loop: walk the tags at this level that are still unmatched
matched_t1 = None
                        for t2 in for2:  # no set subtraction here: the remaining recommend_tags still need to be checked for a relation
n_t1, n_t2 = t2, t1
if verbose:
print method, "[n_t1]", n_t1, "[n_t2]", n_t2
if getattr(tags_tree, "is_" + method)(n_t1, n_t2):
if verbose:
print "√"
matched_t1 = n_t2
break
if matched_t1:
                            # mark it processed so the outer loop cannot count this tag again
processed.add(n_t2)
match_count += 1
update(counts, method, match_count)
if verbose:
uprint("[processed]", processed)
counts.unmatch += len(for1 - processed)

            # record whether there was no recall / no precision at all
if len(processed) == 0:
text = "no_precision" if is_precision else "no_recall"
item1['eval_result'].append(text)

if verbose:
print " " * 10, "[recall] ..."
func(recall_counts)

if verbose:
print " " * 10, "[precision] ..."
func(precision_counts, is_precision=True)

print "#" * 100
print "#" * 100
print "#" * 100

        def calculate_detail_rates(denominator, numerators):
            def calculate_percent(numerator, denominator):
                return round((numerator / float(denominator)) * 100, 2)
            rates = [calculate_percent(n1, denominator) for n1 in numerators]
            rates.append(sum(rates[0:-1]))  # combined matched rate: exact + child + parent + peer
            return rates

print "total_counts", repr(total_counts)
print "recall_counts", repr(recall_counts)
print "precision_counts", repr(precision_counts)

self.recall_rates = calculate_detail_rates(total_counts.original,
[recall_counts.exact, recall_counts.child, recall_counts.parent, recall_counts.peer, recall_counts.unmatch])

self.precision_rates = calculate_detail_rates(total_counts.recommend,
[precision_counts.exact, precision_counts.child, precision_counts.parent, precision_counts.peer, precision_counts.unmatch])

print "exact | child | parent | peer | unmatch | [total]"
print "召回率", self.recall_rates
print "正确率", self.precision_rates
155 changes: 155 additions & 0 deletions textmulclassify/lib/features_weight.py
@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-

# Original authors: @junchen, @liheng
# Refactored by @mvj3

import os
import sys
import json
from etl_utils import process_notifier, UnicodeUtils, uprint, cached_property, singleton
from collections import defaultdict, Counter
import jieba.posseg as posseg


@singleton()
class JIEBA_CLASS(object):

def __init__(self):
self.jieba = posseg.jieba

def load_dictionaries(self, userdicts=[]):
for file1 in userdicts:
self.jieba.load_userdict(file1)

def normal_cut(self, unicode1):
return self.jieba.cut(unicode1)

def posseg_cut(self, unicode1):
return posseg.cut(unicode1)
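
# A usage sketch (hypothetical dictionary path, not part of this commit):
#   jieba_wrapper = JIEBA_CLASS()
#   jieba_wrapper.load_dictionaries(["dict/math_terms.txt"])
#   segs = list(jieba_wrapper.posseg_cut(u"some text"))  # items expose .word and .flag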


class FeaturesWeight(object):

def __init__(self, classify):
self.classify = classify

self.JIEBA = JIEBA_CLASS()
self.JIEBA.load_dictionaries(classify.jieba_userdict_files)

self.key_tag = set(['mk', 'mw'])
self.obsolete_tag = set(['x', 'm', 'eng'])

self.short_seg_threshold = 1
self.long_key_seg_threshold = 2
self.key_word_threshold = 0.005

def load_data_from_input(self, input1):
""" return data is a dict. """
def wrap(data):
avg = sum(data.values()) / float(len(data))
return defaultdict(lambda: avg, data)

if isinstance(input1, dict):
return wrap(input1)

if not os.path.exists(input1):
return defaultdict(float)

content = UnicodeUtils.read(input1).strip()
try:
data = json.loads(content)
        except ValueError:  # not valid JSON; fall back to "word,weight" lines
data = dict()
for line in content.split("\n"):
result = line.split(',')
data[result[0]] = float(result[1].strip())

return wrap(data)

@cached_property
def H_i_dict(self):
d1 = self.load_data_from_input(self.classify.entropy_file)
avg = sum(d1.values()) / float(len(d1))
for k2 in d1.keys():
if d1[k2] == 0.0:
d1[k2] = avg # tmp fix
return d1

@cached_property
def idf_dict(self):
return self.load_data_from_input(self.classify.idf_file)

@cached_property
def stop_words_set(self):
return set([w1.strip() for file1 in self.classify.stop_words_files
for w1 in UnicodeUtils.read(file1).split("\n")])

#@profile
def extract_feature_words(self, in_text):
""" 专业词汇抽取 + 对长词(3)再做分词 """
assert isinstance(in_text, unicode), in_text

        seg_list = list(self.JIEBA.posseg_cut(in_text))  # NOTE: this is the slowest step
lv_1_list = []
lv_2_list = []

        def _continue(seg):
            if len(seg.word) == self.short_seg_threshold and (seg.flag not in self.key_tag):
                # drop short segments that are not key words
                return True
            if seg.word in self.stop_words_set:
                # drop stop words, punctuation and other unimportant words
                return True
            if seg.flag in self.obsolete_tag:
                # drop parts of speech that are always discarded
                return True
            return False

        def cut2(seg, lv_2_list):
            # key words longer than long_key_seg_threshold characters (i.e. 3+ chars)
            if seg.flag in self.key_tag and len(seg.word) > self.long_key_seg_threshold:
                # cut them again to improve recall
                second_cut = list(self.JIEBA.normal_cut(seg.word))
                for sec_word in second_cut:
                    if (sec_word != seg.word) and (sec_word not in self.stop_words_set):
                        # the cut produced a new word; add it to the word list
                        lv_2_list.append(sec_word)
                    # a cut word equal to the original word means the word was not actually split

def process(seg, lv_1_list, lv_2_list):
if _continue(seg):
return False
            # everything that survives the filters is a word we want to keep
            # add it to the level-1 list
lv_1_list.append(seg)

cut2(seg, lv_2_list)

        map(lambda seg: process(seg, lv_1_list, lv_2_list), seg_list)  # this step takes only ~1.6% of the runtime

# list 1 contains both flag and word
# list 2 contains only word
# directly return the list of words
all_list = [item.word for item in lv_1_list]
all_list.extend(lv_2_list)

return all_list
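
    # A purely illustrative walk-through (actual output depends on the loaded
    # dictionaries): if posseg_cut yields a key segment such as u"三角函数" with
    # flag 'mk', its length (4) exceeds long_key_seg_threshold, so cut2 re-cuts it
    # with normal_cut; every piece that differs from the original word and is not
    # a stop word (e.g. u"三角", u"函数") lands in lv_2_list, on top of the segment
    # itself already collected in lv_1_list.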

def calculate_features_weight(self, words_counter):
""" 用占用个数 + idf + H 计算weight """
words_len = float(sum(words_counter.values()))
assert words_len != 0

weight_dict = dict()
for word1, count1 in words_counter.iteritems():
weight_dict[word1] = (count1 / words_len) * self.idf_dict[word1] / self.H_i_dict[word1]

for word1 in weight_dict.keys():
if weight_dict[word1] <= self.key_word_threshold:
del weight_dict[word1]
return weight_dict

def compute_words_and_weights(self, text1):
words = self.extract_feature_words(text1)
keywords_weight = self.calculate_features_weight(Counter(words))
return words, keywords_weight
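As a rough, standalone illustration of the weighting formula in calculate_features_weight — weight = term frequency × idf ÷ entropy — with made-up idf and entropy values (the real class loads these dictionaries from files):

from collections import Counter

def sketch_weight(words_counter, idf_dict, h_dict, threshold=0.005):
    # weight(word) = (count / total words) * idf / H; drop weights at or below the threshold
    total = float(sum(words_counter.values()))
    weights = {}
    for word, count in words_counter.items():
        w = (count / total) * idf_dict[word] / h_dict[word]
        if w > threshold:
            weights[word] = w
    return weights

print sketch_weight(Counter({u"triangle": 3, u"the": 5}),
                    {u"triangle": 4.0, u"the": 0.01},
                    {u"triangle": 1.0, u"the": 3.0})
# => {u"triangle": 1.5}   ("the" is dropped: (5/8) * 0.01 / 3.0 ≈ 0.002, below the threshold)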
