init

17zuoye · Jul 29, 2014 · 8d26bcc · 8d26bcc
commit 8d26bcc
Show file tree

Hide file tree

Showing 7 changed files with 229 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,49 @@
+static/bower_components
+static/node_modules
+node_modules
+static/compressed
+static/.webassets-cache/
+data
+submodules_git
+
+.vagrant
+
+*.pyc
+*.log
+*.pickle
+build
+*.txt
+*.db
+*.json
+*.yml
+*.iso
+*.box
+*.DS_Store
+*.out
+*.zip
+*.cPickle
+*.db-journal
+*.idnamecache
+*.list
+*.jpg
+*.csv
+
+env
+# virtualenv
+lib
+bin
+include
+man
+share
+txt
+
+########################
+# misc / editors
+########################
+*~
+.netbeans
+nbproject
+.idea
+*.mchg
+
+dist
+*.egg-info
diff --git a/README.markdown b/README.markdown
@@ -0,0 +1,8 @@
+article_segment
+================================
+Fixes articles in which some words have been broken apart by random spaces. Built on [wordsegment](https://github.com/mvj3/wordsegment) and [split_block](https://github.com/mvj3/split_block).
+
+
+License
+-------------------------------
+MIT. David Chen at 17zuoye.
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1 @@
+from .article_segment import article_segment
diff --git a/article_segment/__init__.py b/article_segment/__init__.py
@@ -0,0 +1 @@
+from .article_segment import article_segment
diff --git a/article_segment/article_segment.py b/article_segment/article_segment.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+
+# TODO: segment character groups that are longer than 300 characters.
+# TODO: roll the change back when segmentation goes wrong.
+
+# load unigrams and bigrams texts.
+import re
+import os
+import sys
+
+from etl_utils import cached_property, singleton
+from split_block import SplitBlockGroup, SplitBlock
+
+@singleton()
+class ArticleSegment(object):
+    def load_grams(self):
+        cache_dir = None
+        try:
+            from wordsegment import segment
+        except IOError, e:
+            cache_dir = os.path.dirname(e.filename)
+
+        if cache_dir: # if there's none grams texts
+            grams_urls = ["https://github.com/mvj3/wordsegment/blob/master/unigrams.txt",
+                          "https://github.com/mvj3/wordsegment/blob/master/bigrams.txt",]
+
+            for url1 in grams_urls:
+                grams_path = os.path.join(cache_dir, url1.split("/")[-1])
+                os.system("wget %s %s/" % (url1, cache_dir))
+
+    @cached_property
+    def segment(self):
+        self.load_grams()
+        from wordsegment import segment
+        return segment
+
+    def isupper(self, str1, idx1=0):
+        if len(str1) < (idx1+1): return False
+        return str1[idx1].isupper()
+
+    def fix_blanks(self, split_block_group, item1, idx1, offset):
+        if offset not in [1, -1]: raise NotImplemented
+        if (offset == -1) and (idx1 == 0): return False
+
+        item2 = split_block_group[idx1 + offset]
+        if (isinstance(item2, SplitBlock) and (not item2.is_blank)):
+            item1 = (item1 + ' ') if offset is 1 else (' ' + item1)
+            split_block_group[idx1] = item1
+
+
+    def article_segment(self, sentence, inspect=False):
+        sentence = re.sub("\xc2\xa0", " ", sentence)
+
+        split_block_group = SplitBlockGroup.extract(sentence)
+        index_block__to__fixed_words = dict()
+
+        # Generate fixed words and their indexes.
+        for chapped_group1 in split_block_group.maybe_chapped_groups():
+            chapped_group1 = SplitBlockGroup(chapped_group1)
+
+            # Reject upper words
+            # Iterate to remove continuous upper items
+            rejected_items = set([])
+            letters = chapped_group1.letters()
+            for idx1, letter1 in enumerate(letters):
+                if (idx1 + 1) == len(letters): break
+                if inspect: print letters
+                if self.isupper(letter1.string) and self.isupper(letters[idx1+1].string, 1):
+                    rejected_items.add(letter1)
+                    rejected_items.add(letters[idx1+1])
+            for rejected_item1 in rejected_items:
+                chapped_group1.remove(rejected_item1)
+
+            chapped_strs   = "".join(chapped_group1.concat_items().split(" "))
+            fixed_words    = " ".join(self.segment(chapped_strs))
+            if inspect: print fixed_words
+
+            index_block__to__fixed_words[(chapped_group1[0].pos_begin, chapped_group1[-1].pos_end,)] = fixed_words
+        if inspect:
+            print
+            print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups()
+            print "[index_block__to__fixed_words]", index_block__to__fixed_words
+            print "\n"*5
+
+        # Fill fixed words by their indexes.
+        for begin_end_pos in index_block__to__fixed_words:
+            begin_idx1, end_idx1 = None, None
+            for idx2, sb2 in enumerate(split_block_group):
+                if isinstance(sb2, str): continue
+                if begin_end_pos[0] == sb2.pos_begin: begin_idx1 = idx2
+                if begin_end_pos[1] == sb2.pos_end:   end_idx1   = idx2
+            split_block_group[begin_idx1:end_idx1+1] = index_block__to__fixed_words[begin_end_pos]
+            if inspect: print split_block_group; print
+
+        # Fix blanks
+        for idx1, item1 in enumerate(split_block_group[:]):
+            if not isinstance(item1, str): continue
+            if (idx1 + 1) == len(split_block_group) - 1: break
+
+            self.fix_blanks(split_block_group, item1, idx1, 1)
+            self.fix_blanks(split_block_group, item1, idx1, -1)
+
+        if inspect: print split_block_group.concat_items(); print
+        return split_block_group.concat_items()
+
+article_segment = ArticleSegment().article_segment
+
+"""
+(Pdb) segment("sunB")
+['sunB']
+(Pdb) segment("sunb")
+['sun', 'b']
+"""
diff --git a/setup.py b/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup
+
+setup(
+    name='article_segment',
+    version='0.0.1',
+    url='http://github.com/17zuoye/article_segment/',
+    license='MIT',
+    author='David Chen',
+    author_email=''.join(reversed("moc.liamg@emojvm")),
+    description='article_segment',
+    long_description='article_segment',
+    packages=['article_segment'],
+    include_package_data=True,
+    zip_safe=False,
+    platforms='any',
+    install_requires=[
+        'wordsegment',
+        'split_block',
+    ],
+    classifiers=[
+        'Intended Audience :: Developers',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Software Development :: Libraries :: Python Modules'
+    ],
+)
diff --git a/tests/test.py b/tests/test.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+import os, sys
+root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, root_dir)
+
+import unittest
+from article_segment import article_segment
+
+class TestFillBrokenWords(unittest.TestCase):
+
+    def test_(self):
+        assert article_segment("       u se    c u te  ") == "       use cute"
+        assert article_segment("fl y       k i te  ") == "fly kite"
+        assert article_segment("k i te   l i ve") == "kite live"
+        assert article_segment("ea r   h ea d") == "ear head"
+        assert article_segment(" f ir st   th ir d         ") == " first third"
+        assert article_segment("s o     o n ") == "soon"
+
+        assert article_segment("A. s un  B.no s e C.fa c e  D.ri c e", True) == "A. sun B. nose C. face D. rice"
+        assert article_segment(" A.j u mp       B.st u dy      C.J u ly", True) == " A. jump B. study C. July"
+
+        # copied from http://en.wikipedia.org/wiki/Peter_Norvig
+        long_txt = article_segment("He is a Fellow and Councilor of the Association for the Advancement of Artificial Intelligence and co-author, with Stuart Russell, of Artificial Intelligence: A Modern Approach, now the leading college text in the field[citation needed]. He previously was head of the Computational Sciences Division (now the Intelligent Systems Division) at NASA Ames Research Center, where he oversaw a staff of 200 scientists performing NASA's research and development in autonomy and robotics, automated software engineering and data analysis, neuroengineering, collaborative systems research, and simulation-based decision-making. Before that he was Chief Scientist at Junglee, where he helped develop one of the first Internet comparison shopping services; Chief designer at Harlequin Inc.; and Senior Scientist at Sun Microsystems Laboratories. " + \
+                "A.j u mp       B.st u dy      C.J u ly " + \
+                "Norvig received a Bachelor of Science in Applied Mathematics from Brown University[6] and a Ph.D. in Computer Science from the University of California, Berkeley.",)
+
+        assert " A. jump B. study C. July" in long_txt
+
+
+if __name__ == '__main__': unittest.main()