Commit f85b560: clean code with test passed

dchentech committed May 24, 2015
1 parent fe7dfd3 commit f85b560
Showing 6 changed files with 17 additions and 14 deletions.
2 changes: 2 additions & 0 deletions textmulclassify/data_structures/__init__.py
@@ -1,2 +1,4 @@
+__all__ = ["Distribution", "TMCModel", "TMCTree"]
+
from .model import Distribution, TMCModel
from .tree import TMCTree
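The two added lines define the package's public surface: with __all__ in place, a wildcard import of textmulclassify.data_structures binds exactly the three listed classes. A minimal sketch in Python, assuming the package is importable on sys.path:

    # Relies only on the names the diff itself exports.
    from textmulclassify.data_structures import *

    # __all__ = ["Distribution", "TMCModel", "TMCTree"] limits the wildcard
    # import above to exactly these three classes.
    print(Distribution)
    print(TMCModel)
    print(TMCTree)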
4 changes: 2 additions & 2 deletions textmulclassify/data_structures/model.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

from etl_utils import UnicodeUtils, process_notifier, cpickle_cache
-from collections import Counter, defaultdict
+from collections import Counter
from urwid import is_wide_char
import random

@@ -51,7 +51,7 @@ def tags_model__append_more_features_when_recommend(cls, item, sorted_features):

@classmethod
def is_valid_tag(cls, tag1):
-return self.tags_tree.has_node(tag1)
+return cls.tags_tree.has_node(tag1)

def __repr__(self):
print "item_id", self.item_id, "\n"
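The is_valid_tag change replaces self with cls: inside a @classmethod the first argument is the class object, and self is simply an unbound name, so the old line would raise NameError at call time. A minimal sketch of the distinction; the class and the tags_tree stand-in below are illustrative, not the real TMCModel:

    class FakeTree(object):
        def __init__(self, nodes):
            self.nodes = set(nodes)

        def has_node(self, node):
            return node in self.nodes


    class TagsModel(object):
        # Class-level attribute, playing the role of TMCModel.tags_tree.
        tags_tree = FakeTree(["mechanics", "optics"])

        @classmethod
        def is_valid_tag(cls, tag1):
            # cls is bound to TagsModel; writing self here would raise
            # NameError because a classmethod receives no instance.
            return cls.tags_tree.has_node(tag1)


    print(TagsModel.is_valid_tag("mechanics"))  # True
    print(TagsModel.is_valid_tag("history"))    # False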
10 changes: 6 additions & 4 deletions textmulclassify/data_structures/tree.py
@@ -2,10 +2,9 @@

from etl_utils import UnicodeUtils, process_notifier, uprint, cached_property, slots_with_pickle
from collections import defaultdict, Counter
from ..lib.read_manual_kps import ReadManualKps


from bunch import Bunch

# Use the __slots__ attribute to reduce memory usage
# Optimization example: memory for the junior/senior high school physics and math corpora dropped from 5.6G to 5.0G.
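The comment above is why the tree nodes are declared with __slots__ (via etl_utils.slots_with_pickle): dropping the per-instance __dict__ is what, according to that note, brought the physics/math corpora from 5.6G down to 5.0G. A minimal sketch of the idea; the hand-written pickle hooks below are an assumption about what slots_with_pickle generates, not the library's actual code:

    import pickle


    class SlottedNode(object):
        # __slots__ replaces the per-instance __dict__ with fixed storage,
        # which is where the memory saving comes from when millions of
        # nodes are alive at once.
        __slots__ = ["name", "children"]

        def __init__(self, name, children=()):
            self.name = name
            self.children = list(children)

        # A slotted class has no __dict__, so pickling needs explicit state
        # hooks; slots_with_pickle presumably supplies the equivalent.
        def __getstate__(self):
            return dict((slot, getattr(self, slot)) for slot in self.__slots__)

        def __setstate__(self, state):
            for slot, value in state.items():
                setattr(self, slot, value)


    node = SlottedNode("force", ["gravity", "friction"])
    copy = pickle.loads(pickle.dumps(node))
    print(copy.children)  # ['gravity', 'friction']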

@@ -88,7 +87,7 @@ def add_to_current_tree(current_tree, parent):
add_to_current_tree(self[TMCTree.root_node], TMCTree.root_node)

def import_from_file(file1):
-# import_from_file does not support depth yet
+# NOTE import_from_file does not support depth yet
for line in UnicodeUtils.read(file1).strip().split(line_split):
line = line.strip()
if TMCTree.root_node not in self:
@@ -257,11 +256,14 @@ def filter_valid_tags(self, tags):

@cached_property
def total_nodes(self):
-return set([node1 for f1, nodes in feature_to_nodes.iteritems() for node1 in nodes])
+return set([node1 for f1, nodes in self.feature_to_nodes.iteritems() for node1 in nodes])

def rich_train_data_by_editor(self, files=[]):
""" 通过人工编辑规则增强Train Data """
# 20140910_1427 没效果,反而有一两个百分点下降。
import jieba
dict_dir = None

for file1 in files:
parsed = ReadManualKps.process(dict_dir + file1)
for node_name1, node_features in parsed.iteritems():
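The total_nodes fix qualifies feature_to_nodes with self., since the bare name does not exist in the method's scope; the @cached_property wrapper then makes sure the flattening runs only once per instance. A minimal sketch of that decorator pattern, assuming etl_utils.cached_property behaves like the common memoizing descriptor (this reimplementation is hypothetical, not the library's code):

    class cached_property(object):
        """Run the wrapped method once, then store the result on the
        instance so later lookups skip the computation entirely."""

        def __init__(self, func):
            self.func = func

        def __get__(self, obj, objtype=None):
            if obj is None:
                return self
            value = self.func(obj)
            obj.__dict__[self.func.__name__] = value  # shadows the descriptor
            return value


    class Index(object):
        def __init__(self, feature_to_nodes):
            self.feature_to_nodes = feature_to_nodes

        @cached_property
        def total_nodes(self):
            # Same flattening as tree.py: every node reachable from any feature.
            return set(node for nodes in self.feature_to_nodes.values()
                       for node in nodes)


    idx = Index({"velocity": ["kinematics"], "lens": ["optics", "kinematics"]})
    print(idx.total_nodes)  # computed on first access, cached afterwards

Since TMCTree itself is declared with __slots__, the real etl_utils decorator presumably caches somewhere other than an instance __dict__; the sketch only shows the shape of the idea.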
6 changes: 3 additions & 3 deletions textmulclassify/lib/evaluate.py
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-

-from ..data_structures.tree import TMCTree
from etl_utils import uprint


@@ -13,6 +12,7 @@ def __init__(self, tags_tree, items):
i1['eval_result'] = []

# Validate the data structures
+from ..data_structures.tree import TMCTree # load lazily
assert isinstance(tags_tree, TMCTree) # name TODO
assert isinstance(self.items, list)
assert 'original_tags' in self.items[0]
@@ -48,8 +48,8 @@ def update(obj, method, num):
total_counts.recommend += len(recommend_tags)

# processed_* exists only to handle the ordering that epcp depends on: a tag handled earlier gets no further chance later
-processed_original_tags = set([])
-processed_recommend_tags = set([])
+# processed_original_tags = set([])
+# processed_recommend_tags = set([])

def func(counts, is_precision=False):
if not is_precision:
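Dropping the module-level TMCTree import in favour of the in-method one (marked # load lazily above) is the usual deferred-import pattern, typically used to break an import cycle between packages or to postpone a dependency until it is needed. A minimal, self-contained illustration with a stdlib module standing in for TMCTree:

    import sys


    def evaluate(items):
        # Deferred import: the module is resolved on the first call, not when
        # this file is imported, the same move evaluate.py makes with TMCTree.
        import json
        return json.dumps(items)


    print("json" in sys.modules)   # usually False in a fresh interpreter
    evaluate({"original_tags": ["mechanics"]})
    print("json" in sys.modules)   # True: loaded on first use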
5 changes: 2 additions & 3 deletions textmulclassify/lib/features_weight.py
@@ -4,9 +4,8 @@
# refactored by @mvj3

import os
import sys
import json
-from etl_utils import process_notifier, UnicodeUtils, uprint, cached_property, singleton
+from etl_utils import UnicodeUtils, cached_property, singleton
from collections import defaultdict, Counter
import jieba.posseg as posseg

@@ -84,7 +83,7 @@ def stop_words_set(self):
return set([w1.strip() for file1 in self.classify.stop_words_files
for w1 in UnicodeUtils.read(file1).split("\n")])

-#@profile
+# @profile
def extract_feature_words(self, in_text):
""" 专业词汇抽取 + 对长词(3)再做分词 """
assert isinstance(in_text, unicode), in_text
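features_weight.py keeps its jieba.posseg import, the part-of-speech tokenizer behind extract_feature_words. A minimal sketch of that pipeline; the noun-only filter and the tiny stop-word set are illustrative assumptions, not the repository's actual rules:

    # -*- coding: utf-8 -*-
    import jieba.posseg as posseg

    stop_words = set([u"的", u"了"])


    def candidate_terms(text):
        # posseg.cut yields pair objects carrying the token and its POS flag.
        for pair in posseg.cut(text):
            if pair.word in stop_words:
                continue
            if pair.flag.startswith(u"n"):  # keep noun-like tokens only
                yield pair.word


    print(list(candidate_terms(u"牛顿第二定律描述了力和加速度的关系")))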
4 changes: 2 additions & 2 deletions textmulclassify/lib/similarity.py
@@ -3,8 +3,8 @@
# original author is @junchen and @LiHeng
# refactored by @mvj3

-#from scipy.spatial.distance import cosine as scipy_cosine
-#from collections import OrderedDict
+# from scipy.spatial.distance import cosine as scipy_cosine
+# from collections import OrderedDict
from math import sqrt


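similarity.py leaves the scipy cosine import commented out and relies on math.sqrt instead. The hunk does not show the distance function itself, so the following is only a plain-Python guess at the kind of cosine similarity over sparse {feature: weight} dicts that sqrt is imported for:

    from math import sqrt


    def cosine_similarity(vec_a, vec_b):
        # Dependency-free cosine similarity for sparse dict vectors.
        dot = sum(w * vec_b.get(f, 0.0) for f, w in vec_a.items())
        norm_a = sqrt(sum(w * w for w in vec_a.values()))
        norm_b = sqrt(sum(w * w for w in vec_b.values()))
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
        return dot / (norm_a * norm_b)


    print(cosine_similarity({"force": 1.0, "mass": 2.0},
                            {"force": 1.0, "mass": 2.0}))  # 1.0
    print(cosine_similarity({"force": 1.0}, {"optics": 3.0}))  # 0.0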
