Skip to content

Commit

Permalink
Review rebase function
Browse files Browse the repository at this point in the history
  • Loading branch information
Fantomas42 committed Feb 3, 2015
1 parent bd586dc commit e4fa3c6
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 34 deletions.
21 changes: 11 additions & 10 deletions mots_vides/stop_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,15 @@ def __iter__(self):
"""
return self.collection.__iter__()

def _compile_regex(self, word):
return re.compile(
r'((^| ){0})|({0} )|{0}'.format('((?<!\w)'+word+'(?!\w))'),
flags=re.IGNORECASE
)
def rebase(self, text, char='X'):
    """
    Rebases text with stop words masked.

    Every whole-word, case-insensitive occurrence of a stop word from
    ``self.collection`` is replaced by ``char`` repeated once per
    character of the matched word. With a single-character ``char``
    the text keeps its length; with an empty ``char`` the stop words
    are removed and the surrounding whitespace is left untouched.

    :param text: the text to process.
    :param char: the replacement string used for each masked character.
    :returns: the text with stop words masked.
    """
    # Escape each word so regex metacharacters are matched literally,
    # and skip empty entries: an empty alternative would match at every
    # word boundary and prevent the real stop words from matching.
    words = [re.escape(word) for word in self.collection if word]
    if not words:
        return text

    regexp = re.compile(r'\b(%s)\b' % '|'.join(words),
                        re.IGNORECASE | re.UNICODE)

    def replace(m):
        return char * len(m.group(1))

    return regexp.sub(replace, text)
70 changes: 46 additions & 24 deletions mots_vides/tests/stop_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,63 +91,85 @@ def test_sub(self):

class StopWordRebaseTestCase(TestCase):
    """
    Tests for ``StopWord.rebase``, which masks stop words in a text.
    """

    def check_stop_word_rebase(self, inpout, outpout, sept, char=None):
        """
        Assert that rebasing ``inpout`` with the stop words ``sept``
        gives ``outpout``.

        When ``char`` is None, ``rebase`` is called without it so the
        method's own default replacement character is exercised.
        """
        sw = StopWord('test', sept)
        if char is None:
            rebased = sw.rebase(inpout)
        else:
            rebased = sw.rebase(inpout, char)
        self.assertEqual(rebased, outpout)

    def test_stopword_rebase(self):
        """
        Basic rebasing
        """
        self.check_stop_word_rebase(
            'Comme je viens de te le dire',
            'XXXXX je viens de te le dire',
            ['comme'])
        self.check_stop_word_rebase(
            'Comme je viens de te le dire',
            'Comme je XXXXX de te le dire',
            ['viens'])
        self.check_stop_word_rebase(
            'Comme je viens de te le dire',
            'Comme je viens de te le XXXX',
            ['dire'])
        self.check_stop_word_rebase(
            'Comme je viens de te le dire',
            'Comme je viens de te le @@@@',
            ['dire'], '@')

    def test_stopword_rebase_newline(self):
        """
        Test with newline between two words
        """
        self.check_stop_word_rebase(
            'Comme je\nviens de te le dire',
            'Comme XX\nXXXXX de te le dire',
            ['viens', 'je'])
        self.check_stop_word_rebase(
            'Comme je\nviens de te le dire',
            'Comme XX\nviens de te le dire',
            ['je'])
        self.check_stop_word_rebase(
            'Comme je\nviens de te le dire',
            'Comme je\nXXXXX de te le dire',
            ['viens'])

    def test_stopword_rebase_two_escape_code(self):
        """
        Test with newline and tab before word
        """
        self.check_stop_word_rebase(
            'Comme je\n\tviens de te le dire',
            'Comme je\n\tXXXXX de te le dire',
            ['viens'])

    def test_stopword_dont_rebase(self):
        """
        Test that words merely containing a stop word are left untouched
        """
        self.check_stop_word_rebase(
            'Comme je viensbhgfds de te le dire',
            'Comme je viensbhgfds de te le dire',
            ['viens'])
        self.check_stop_word_rebase(
            'Comme je gfgviens de te le dire',
            'Comme je gfgviens de te le dire',
            ['viens'])
        self.check_stop_word_rebase(
            'Comme je gfgviensbhgfds de te le dire',
            'Comme je gfgviensbhgfds de te le dire',
            ['viens'])

    def test_stopword_empty(self):
        """
        Test rebasing with an empty replacement character
        """
        self.check_stop_word_rebase(
            'Comme je viens de te le dire',
            ' je viens de te le dire',
            ['comme'], '')
        # Only the word itself is removed, so the spaces that surrounded
        # it both remain (two consecutive spaces).
        self.check_stop_word_rebase(
            'Comme je viens de te le dire',
            'Comme je  de te le dire',
            ['viens'], '')
        self.check_stop_word_rebase(
            'Comme je\n\tviens de te le dire',
            'Comme je\n\t de te le dire',
            ['viens'], '')

0 comments on commit e4fa3c6

Please sign in to comment.