Commit 8d26bcc: init
dchentech committed Jul 29, 2014
Showing 7 changed files with 229 additions and 0 deletions.
49 changes: 49 additions & 0 deletions .gitignore
@@ -0,0 +1,49 @@
static/bower_components
static/node_modules
node_modules
static/compressed
static/.webassets-cache/
data
submodules_git

.vagrant

*.pyc
*.log
*.pickle
build
*.txt
*.db
*.json
*.yml
*.iso
*.box
*.DS_Store
*.out
*.zip
*.cPickle
*.db-journal
*.idnamecache
*.list
*.jpg
*.csv

env
# virtualenv
lib
bin
include
man
share
txt

########################
# misc / editors
########################
*~
.netbeans
nbproject
.idea
*.mchg

dist
*.egg-info
8 changes: 8 additions & 0 deletions README.markdown
@@ -0,0 +1,8 @@
article_segment
================================
Fixes articles in which words have been broken apart by stray spaces, based on [wordsegment](https://github.com/mvj3/wordsegment) and [split_block](https://github.com/mvj3/split_block).


License
-------------------------------
MIT. David Chen at 17zuoye.
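
A minimal usage sketch, with input/output pairs taken from tests/test.py further down (it assumes the package is importable, e.g. after running setup.py install from a checkout):

    from article_segment import article_segment

    article_segment("k i te l i ve")        # => "kite live"
    article_segment("fl y      k i te  ")   # => "fly kite"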
1 change: 1 addition & 0 deletions __init__.py
@@ -0,0 +1 @@
from .article_segment import article_segment
1 change: 1 addition & 0 deletions article_segment/__init__.py
@@ -0,0 +1 @@
from .article_segment import article_segment
113 changes: 113 additions & 0 deletions article_segment/article_segment.py
@@ -0,0 +1,113 @@
# -*- coding: utf-8 -*-

# TODO: split character groups longer than 300 characters.
# TODO: undo a split when the segmentation turns out wrong.

# load unigrams and bigrams texts.
import re
import os
import sys

from etl_utils import cached_property, singleton
from split_block import SplitBlockGroup, SplitBlock

@singleton()
class ArticleSegment(object):
    def load_grams(self):
        cache_dir = None
        try:
            from wordsegment import segment
        except IOError, e:
            # wordsegment raises IOError when its grams text files are
            # missing; the error's filename points into its data directory.
            cache_dir = os.path.dirname(e.filename)

        if cache_dir:  # the grams texts are missing, so download them
            # NOTE: raw URLs are used here; the original blob URLs serve an
            # HTML page rather than the plain-text grams files.
            grams_urls = ["https://raw.githubusercontent.com/mvj3/wordsegment/master/unigrams.txt",
                          "https://raw.githubusercontent.com/mvj3/wordsegment/master/bigrams.txt", ]

            for url1 in grams_urls:
                grams_path = os.path.join(cache_dir, url1.split("/")[-1])
                os.system("wget -O %s %s" % (grams_path, url1))

    @cached_property
    def segment(self):
        # Lazily ensure the grams files exist, then expose wordsegment.segment.
        self.load_grams()
        from wordsegment import segment
        return segment

def isupper(self, str1, idx1=0):
if len(str1) < (idx1+1): return False
return str1[idx1].isupper()

    def fix_blanks(self, split_block_group, item1, idx1, offset):
        # Re-attach the separating blank between a fixed string and the
        # adjacent non-blank SplitBlock on the given side (offset +1 / -1).
        if offset not in [1, -1]: raise NotImplementedError
        if (offset == -1) and (idx1 == 0): return False

        item2 = split_block_group[idx1 + offset]
        if (isinstance(item2, SplitBlock) and (not item2.is_blank)):
            item1 = (item1 + ' ') if offset == 1 else (' ' + item1)
            split_block_group[idx1] = item1


    def article_segment(self, sentence, inspect=False):
        sentence = re.sub("\xc2\xa0", " ", sentence)  # normalize UTF-8 non-breaking spaces

split_block_group = SplitBlockGroup.extract(sentence)
index_block__to__fixed_words = dict()

# Generate fixed words and their indexes.
for chapped_group1 in split_block_group.maybe_chapped_groups():
chapped_group1 = SplitBlockGroup(chapped_group1)

            # Reject runs of consecutive uppercase items (e.g. multiple-choice
            # markers) so that segmentation leaves them untouched.
            rejected_items = set()
letters = chapped_group1.letters()
for idx1, letter1 in enumerate(letters):
if (idx1 + 1) == len(letters): break
if inspect: print letters
if self.isupper(letter1.string) and self.isupper(letters[idx1+1].string, 1):
rejected_items.add(letter1)
rejected_items.add(letters[idx1+1])
for rejected_item1 in rejected_items:
chapped_group1.remove(rejected_item1)

            # Drop every space, then let wordsegment re-insert the word boundaries.
            chapped_strs = "".join(chapped_group1.concat_items().split(" "))
            fixed_words = " ".join(self.segment(chapped_strs))
if inspect: print fixed_words

index_block__to__fixed_words[(chapped_group1[0].pos_begin, chapped_group1[-1].pos_end,)] = fixed_words
if inspect:
print
print "[split_block_group.maybe_chapped_groups()]", split_block_group.maybe_chapped_groups()
print "[index_block__to__fixed_words]", index_block__to__fixed_words
print "\n"*5

# Fill fixed words by their indexes.
for begin_end_pos in index_block__to__fixed_words:
begin_idx1, end_idx1 = None, None
for idx2, sb2 in enumerate(split_block_group):
if isinstance(sb2, str): continue
if begin_end_pos[0] == sb2.pos_begin: begin_idx1 = idx2
if begin_end_pos[1] == sb2.pos_end: end_idx1 = idx2
split_block_group[begin_idx1:end_idx1+1] = index_block__to__fixed_words[begin_end_pos]
if inspect: print split_block_group; print

# Fix blanks
for idx1, item1 in enumerate(split_block_group[:]):
if not isinstance(item1, str): continue
if (idx1 + 1) == len(split_block_group) - 1: break

self.fix_blanks(split_block_group, item1, idx1, 1)
self.fix_blanks(split_block_group, item1, idx1, -1)

if inspect: print split_block_group.concat_items(); print
return split_block_group.concat_items()

article_segment = ArticleSegment().article_segment

"""
(Pdb) segment("sunB")
['sunB']
(Pdb) segment("sunb")
['sun', 'b']
"""
26 changes: 26 additions & 0 deletions setup.py
@@ -0,0 +1,26 @@
from setuptools import setup

setup(
name='article_segment',
version='0.0.1',
url='http://github.com/17zuoye/article_segment/',
license='MIT',
author='David Chen',
author_email=''.join(reversed("moc.liamg@emojvm")),
    description='Fix articles whose words have been broken apart by stray spaces.',
    long_description='Fix articles whose words have been broken apart by stray spaces, based on wordsegment and split_block.',
packages=['article_segment'],
include_package_data=True,
zip_safe=False,
platforms='any',
install_requires=[
'wordsegment',
'split_block',
],
classifiers=[
'Intended Audience :: Developers',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Topic :: Software Development :: Libraries :: Python Modules'
],
)
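
Installing from a checkout should pull in both dependencies via install_requires above (this assumes wordsegment and split_block are published on PyPI):

    python setup.py install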
31 changes: 31 additions & 0 deletions tests/test.py
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

import os, sys
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, root_dir)

import unittest
from article_segment import article_segment

class TestFillBrokenWords(unittest.TestCase):

def test_(self):
assert article_segment(" u se c u te ") == " use cute"
assert article_segment("fl y      k i te  ") == "fly kite"
assert article_segment("k i te l i ve") == "kite live"
assert article_segment("ea r h ea d") == "ear head"
assert article_segment(" f ir st   th ir d        ") == " first third"
assert article_segment("s o o n ") == "soon"

assert article_segment("A. s un B.no s e C.fa c e D.ri c e", True) == "A. sun B. nose C. face D. rice"
assert article_segment(" A.j u mp B.st u dy C.J u ly", True) == " A. jump B. study C. July"

# copied from http://en.wikipedia.org/wiki/Peter_Norvig
long_txt = article_segment("He is a Fellow and Councilor of the Association for the Advancement of Artificial Intelligence and co-author, with Stuart Russell, of Artificial Intelligence: A Modern Approach, now the leading college text in the field[citation needed]. He previously was head of the Computational Sciences Division (now the Intelligent Systems Division) at NASA Ames Research Center, where he oversaw a staff of 200 scientists performing NASA's research and development in autonomy and robotics, automated software engineering and data analysis, neuroengineering, collaborative systems research, and simulation-based decision-making. Before that he was Chief Scientist at Junglee, where he helped develop one of the first Internet comparison shopping services; Chief designer at Harlequin Inc.; and Senior Scientist at Sun Microsystems Laboratories. " + \
"A.j u mp B.st u dy C.J u ly " + \
"Norvig received a Bachelor of Science in Applied Mathematics from Brown University[6] and a Ph.D. in Computer Science from the University of California, Berkeley.",)

assert " A. jump B. study C. July" in long_txt


if __name__ == '__main__': unittest.main()
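
The sys.path insertion at the top makes the in-repo package importable without installation, so the suite runs directly from a checkout:

    python tests/test.py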
