Skip to content

Commit

Permalink
Merge branch 'feature/lazy-punctuation' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
Fantomas42 committed Mar 9, 2016
2 parents ccefc32 + d008181 commit e55a8a9
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 8 deletions.
1 change: 1 addition & 0 deletions buildout.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ zinnia = beautifulsoup4
django-xmlrpc
mots-vides
pyparsing
regex
markups = docutils
markdown
textile
Expand Down
2 changes: 2 additions & 0 deletions docs/getting-started/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Make sure to install these packages prior to installation :
* `django-tagging`_ >= 0.3.6
* `beautifulsoup4`_ >= 4.1.3
* `mots-vides`_ >= 2015.2.6
* `regex`_ >= 2016.3.2
* `django-contrib-comments`_ >= 1.6

The packages below are optional but needed to run the full test suite or
Expand Down Expand Up @@ -193,6 +194,7 @@ project directory to sync the models with the database. ::
.. _`django-tagging`: https://code.google.com/p/django-tagging/
.. _`django-contrib-comments`: https://github.com/django/django-contrib-comments
.. _`mots-vides`: https://github.com/Fantomas42/mots-vides
.. _`regex`: https://pypi.python.org/pypi/regex
.. _`beautifulsoup4`: http://www.crummy.com/software/BeautifulSoup/
.. _`pytz`: http://pytz.sourceforge.net/
.. _`pyparsing`: http://pyparsing.wikispaces.com/
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ django-xmlrpc==0.1.5
mots-vides==2015.5.11
pillow==2.9.0
pyparsing==2.0.3
regex==2016.3.2
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,6 @@
'mots-vides>=2015.5.11',
'pillow>=2.0.0',
'pyparsing>=2.0.3',
'pytz>=2014.10']
'pytz>=2014.10',
'regex>=2016.3.2']
)
11 changes: 4 additions & 7 deletions zinnia/comparison.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""Comparison tools for Zinnia"""
import sys
import unicodedata
from math import sqrt

from django.utils import six
Expand All @@ -10,15 +8,14 @@
from django.utils.functional import cached_property
from django.core.cache import InvalidCacheBackendError

import regex as re

from zinnia.models.entry import Entry
from zinnia.settings import STOP_WORDS
from zinnia.settings import COMPARISON_FIELDS


PUNCTUATION = dict.fromkeys(
i for i in range(sys.maxunicode)
if unicodedata.category(six.unichr(i)).startswith('P')
)
PUNCTUATION = re.compile(r'\p{P}+')


def pearson_score(list1, list2):
Expand Down Expand Up @@ -108,7 +105,7 @@ def raw_clean(self, datas):
"""
datas = strip_tags(datas) # Remove HTML
datas = STOP_WORDS.rebase(datas, '') # Remove STOP WORDS
datas = datas.translate(PUNCTUATION) # Remove punctuation
datas = PUNCTUATION.sub('', datas) # Remove punctuation
datas = datas.lower()
return [d for d in datas.split() if len(d) > 1]

Expand Down
11 changes: 11 additions & 0 deletions zinnia/tests/test_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,14 @@ def test_cached_vector_builder(self):
self.assertEquals(len(v.columns), 3)
with self.assertNumQueries(0):
self.assertEquals(len(v.get_related(e1, 5)), 2)

def test_raw_clean(self):
    """Check that ``raw_clean`` reduces noisy HTML input to bare tokens.

    Every sample below is expected to come back as the lowercase
    token list ``['html', 'content']``: the ``<p>`` tags are removed,
    the leading 'An' is dropped (presumably as a stop word), the
    single-character token '2' is discarded, and the '!' and '?'
    punctuation marks are stripped.
    """
    # An empty queryset is enough: raw_clean only processes the string
    # it is given.
    v = ModelVectorBuilder(queryset=Entry.objects.none(), fields=['title'])
    expected = ['html', 'content']
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which was removed from unittest in Python 3.12.
    self.assertEqual(v.raw_clean('<p>HTML Content</p>'), expected)
    self.assertEqual(v.raw_clean('<p>An HTML Content</p>'), expected)
    self.assertEqual(v.raw_clean('<p>An HTML Content 2</p>'), expected)
    self.assertEqual(v.raw_clean('<p>!HTML Content ?</p>'), expected)

0 comments on commit e55a8a9

Please sign in to comment.