Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 8d26bcc
Showing
7 changed files
with
229 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
static/bower_components | ||
static/node_modules | ||
node_modules | ||
static/compressed | ||
static/.webassets-cache/ | ||
data | ||
submodules_git | ||
|
||
.vagrant | ||
|
||
*.pyc | ||
*.log | ||
*.pickle | ||
build | ||
*.txt | ||
*.db | ||
*.json | ||
*.yml | ||
*.iso | ||
*.box | ||
*.DS_Store | ||
*.out | ||
*.zip | ||
*.cPickle | ||
*.db-journal | ||
*.idnamecache | ||
*.list | ||
*.jpg | ||
*.csv | ||
|
||
env | ||
# virtualenv | ||
lib | ||
bin | ||
include | ||
man | ||
share | ||
txt | ||
|
||
######################## | ||
# misc / editors | ||
######################## | ||
*~ | ||
.netbeans | ||
nbproject | ||
.idea | ||
*.mchg | ||
|
||
dist | ||
*.egg-info |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
article_segment | ||
================================ | ||
Repairs articles in which words have been broken apart by stray spaces (for example, `fl y k i te` becomes `fly kite`), built on [wordsegment](https://github.com/mvj3/wordsegment) and [split_block](https://github.com/mvj3/split_block). | ||
|
||
|
||
License | ||
------------------------------- | ||
MIT. David Chen at 17zuoye. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .article_segment import article_segment |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .article_segment import article_segment |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# TODO: segment character groups that are 300 or more characters long. | ||
# TODO: roll back the change when segmentation goes wrong. | ||
|
||
# load unigrams and bigrams texts. | ||
import re | ||
import os | ||
import sys | ||
|
||
from etl_utils import cached_property, singleton | ||
from split_block import SplitBlockGroup, SplitBlock | ||
|
||
@singleton()
class ArticleSegment(object):
    """Repair text whose words were broken apart by stray spaces.

    Candidate runs of broken text are located with ``split_block``
    (``SplitBlockGroup.maybe_chapped_groups``), re-segmented with
    ``wordsegment.segment`` and written back into the sentence.
    """

    def load_grams(self):
        """Try to fetch the wordsegment gram files when importing it fails.

        ``wordsegment`` is imported only as a probe: when the import raises
        ``IOError`` the directory of the file it could not open is used as
        the download target for ``unigrams.txt`` and ``bigrams.txt``.
        """
        cache_dir = None
        try:
            from wordsegment import segment  # noqa: F401 -- import is only a probe
        except IOError as e:
            cache_dir = os.path.dirname(e.filename)

        if not cache_dir:  # the gram texts are already available
            return

        # Imported locally so the block does not depend on a new file-level import.
        import subprocess

        # NOTE(review): these are GitHub "blob" page URLs, which normally serve
        # an HTML page rather than the raw text file -- confirm whether the
        # corresponding "raw" URLs were intended.
        grams_urls = ["https://github.com/mvj3/wordsegment/blob/master/unigrams.txt",
                      "https://github.com/mvj3/wordsegment/blob/master/bigrams.txt"]

        for url1 in grams_urls:
            grams_path = os.path.join(cache_dir, url1.split("/")[-1])
            try:
                # An argument list avoids shell interpolation, and "-O" makes
                # wget write to the expected path; the previous command passed
                # the directory as a second URL, so nothing landed there.
                subprocess.call(["wget", "-O", grams_path, url1])
            except OSError:
                # wget is not installed. Stay best-effort like the original
                # os.system call; the later wordsegment import will report
                # the missing data file.
                pass

    @cached_property
    def segment(self):
        """Return ``wordsegment.segment`` after making sure its data is present."""
        self.load_grams()
        from wordsegment import segment
        return segment

    def isupper(self, str1, idx1=0):
        """Return True when ``str1`` has an upper-case character at ``idx1``.

        A string too short to have a character at ``idx1`` yields False.
        """
        return len(str1) > idx1 and str1[idx1].isupper()

    def fix_blanks(self, split_block_group, item1, idx1, offset):
        """Pad the string at ``idx1`` with a space facing a non-blank neighbour.

        ``offset`` selects the neighbour: ``1`` looks at the following item
        and appends a space, ``-1`` looks at the preceding item and prepends
        one. The padded string is stored back into ``split_block_group``.

        Returns False when there is no neighbour on the requested side,
        otherwise None.

        Raises NotImplementedError for any other ``offset``.
        """
        if offset not in (1, -1):
            raise NotImplementedError("offset must be 1 or -1, got %r" % (offset,))
        if (offset == -1) and (idx1 == 0):
            return False
        # Nothing follows the last item, so there is no neighbour to inspect.
        if (idx1 + offset) >= len(split_block_group):
            return False

        item2 = split_block_group[idx1 + offset]
        if isinstance(item2, SplitBlock) and (not item2.is_blank):
            # NOTE(review): the padded value is built from the ``item1``
            # argument, not from the group's current entry, so a later call
            # for the same index replaces padding added by an earlier call
            # -- confirm this is intended.
            item1 = (item1 + ' ') if offset == 1 else (' ' + item1)
            split_block_group[idx1] = item1

    def article_segment(self, sentence, inspect=False):
        """Return ``sentence`` with its space-broken words re-joined.

        ``inspect`` turns on debug printing of the intermediate state.
        """
        # Replace the (UTF-8 encoded) non-breaking space with a plain space.
        sentence = re.sub("\xc2\xa0", " ", sentence)

        split_block_group = SplitBlockGroup.extract(sentence)
        # Maps (pos_begin, pos_end) of a broken run to its repaired text.
        index_block__to__fixed_words = {}

        # Generate fixed words and their indexes.
        for chapped_group1 in split_block_group.maybe_chapped_groups():
            chapped_group1 = SplitBlockGroup(chapped_group1)

            # Reject upper words: collect adjacent letter items where the
            # first starts upper-case and the next has an upper-case second
            # character, then drop them from the group.
            rejected_items = set()
            letters = chapped_group1.letters()
            for idx1, letter1 in enumerate(letters):
                if (idx1 + 1) == len(letters):
                    break
                if inspect:
                    print(letters)
                next_letter = letters[idx1 + 1]
                if self.isupper(letter1.string) and self.isupper(next_letter.string, 1):
                    rejected_items.add(letter1)
                    rejected_items.add(next_letter)
            for rejected_item1 in rejected_items:
                chapped_group1.remove(rejected_item1)

            # Strip every space, then let wordsegment find the word boundaries.
            chapped_strs = "".join(chapped_group1.concat_items().split(" "))
            fixed_words = " ".join(self.segment(chapped_strs))
            if inspect:
                print(fixed_words)

            span = (chapped_group1[0].pos_begin, chapped_group1[-1].pos_end)
            index_block__to__fixed_words[span] = fixed_words

        if inspect:
            print("%s %s" % ("[split_block_group.maybe_chapped_groups()]",
                             split_block_group.maybe_chapped_groups()))
            print("%s %s" % ("[index_block__to__fixed_words]",
                             index_block__to__fixed_words))
            print("\n" * 5)

        # Fill fixed words by their indexes. Indexes are looked up again for
        # every span because each replacement changes the group's layout.
        for begin_end_pos in index_block__to__fixed_words:
            begin_idx1, end_idx1 = None, None
            for idx2, sb2 in enumerate(split_block_group):
                if isinstance(sb2, str):
                    continue  # already-replaced text carries no positions
                if begin_end_pos[0] == sb2.pos_begin:
                    begin_idx1 = idx2
                if begin_end_pos[1] == sb2.pos_end:
                    end_idx1 = idx2
            split_block_group[begin_idx1:end_idx1 + 1] = index_block__to__fixed_words[begin_end_pos]
            if inspect:
                print(split_block_group)
                print("")

        # Fix blanks: iterate over a copy because entries are rewritten in place.
        for idx1, item1 in enumerate(split_block_group[:]):
            if not isinstance(item1, str):
                continue
            if (idx1 + 1) == len(split_block_group) - 1:
                break

            self.fix_blanks(split_block_group, item1, idx1, 1)
            self.fix_blanks(split_block_group, item1, idx1, -1)

        if inspect:
            print(split_block_group.concat_items())
            print("")
        return split_block_group.concat_items()
|
||
# Module-level shortcut: exposes the bound method under the name that the
# package's `from .article_segment import article_segment` line imports.
article_segment = ArticleSegment().article_segment
|
||
""" | ||
(Pdb) segment("sunB") | ||
['sunB'] | ||
(Pdb) segment("sunb") | ||
['sun', 'b'] | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from setuptools import setup | ||
|
||
# Package metadata for article_segment.
setup(
    name='article_segment',
    version='0.0.1',
    url='http://github.com/17zuoye/article_segment/',
    license='MIT',
    author='David Chen',
    # Stored reversed so the address does not appear verbatim in the source.
    author_email=''.join(reversed("moc.liamg@emojvm")),
    description='article_segment',
    long_description='article_segment',
    packages=['article_segment'],
    include_package_data=True,
    zip_safe=False,
    platforms='any',
    install_requires=[
        # article_segment.py imports cached_property and singleton from
        # etl_utils, so it has to be installed alongside the package.
        # NOTE(review): assumes the distribution is published as `etl_utils`.
        'etl_utils',
        'wordsegment',
        'split_block',
    ],
    classifiers=[
        'Intended Audience :: Developers',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import os, sys | ||
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||
sys.path.insert(0, root_dir) | ||
|
||
import unittest | ||
from article_segment import article_segment | ||
|
||
class TestFillBrokenWords(unittest.TestCase):
    """Checks that article_segment re-joins words broken by stray spaces."""

    def test_(self):
        # unittest assertions are used instead of bare `assert` so the checks
        # still run under `python -O` and failures show both values.
        self.assertEqual(article_segment(" u se c u te "), " use cute")
        self.assertEqual(article_segment("fl y k i te "), "fly kite")
        self.assertEqual(article_segment("k i te l i ve"), "kite live")
        self.assertEqual(article_segment("ea r h ea d"), "ear head")
        self.assertEqual(article_segment(" f ir st th ir d "), " first third")
        self.assertEqual(article_segment("s o o n "), "soon")

        # Option-list style input; the second argument enables debug output.
        self.assertEqual(
            article_segment("A. s un B.no s e C.fa c e D.ri c e", True),
            "A. sun B. nose C. face D. rice")
        self.assertEqual(
            article_segment(" A.j u mp B.st u dy C.J u ly", True),
            " A. jump B. study C. July")

        # A broken fragment embedded in a long, otherwise well-formed text.
        # copied from http://en.wikipedia.org/wiki/Peter_Norvig
        long_txt = article_segment(
            "He is a Fellow and Councilor of the Association for the Advancement of Artificial Intelligence and co-author, with Stuart Russell, of Artificial Intelligence: A Modern Approach, now the leading college text in the field[citation needed]. He previously was head of the Computational Sciences Division (now the Intelligent Systems Division) at NASA Ames Research Center, where he oversaw a staff of 200 scientists performing NASA's research and development in autonomy and robotics, automated software engineering and data analysis, neuroengineering, collaborative systems research, and simulation-based decision-making. Before that he was Chief Scientist at Junglee, where he helped develop one of the first Internet comparison shopping services; Chief designer at Harlequin Inc.; and Senior Scientist at Sun Microsystems Laboratories. " +
            "A.j u mp B.st u dy C.J u ly " +
            "Norvig received a Bachelor of Science in Applied Mathematics from Brown University[6] and a Ph.D. in Computer Science from the University of California, Berkeley.")

        self.assertIn(" A. jump B. study C. July", long_txt)


if __name__ == '__main__':
    unittest.main()