Skip to content

Commit

Permalink
Remove punctuation when building the datas
Browse files Browse the repository at this point in the history
  • Loading branch information
Fantomas42 committed May 4, 2015
1 parent 47e1048 commit 19f3bb8
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 8 deletions.
24 changes: 21 additions & 3 deletions zinnia/comparison.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
"""Comparison tools for Zinnia"""
import sys
import unicodedata
from math import sqrt

from django.utils import six
from django.core.cache import caches
from django.utils.html import strip_tags
from django.core.cache import InvalidCacheBackendError

from math import sqrt

from zinnia.search import STOP_WORDS


# ``unichr``/``xrange`` only exist on Python 2; fall back to their
# Python 3 equivalents so this module can be imported on both.
try:
    _unichr, _range = unichr, xrange
except NameError:  # Python 3
    _unichr, _range = chr, range

# Translation table usable with ``translate()`` on text strings: every code
# point whose Unicode general category starts with 'P' (punctuation) is
# mapped to ``None``, which deletes that character from the result.
PUNCTUATION = dict.fromkeys(
    i for i in _range(sys.maxunicode + 1)  # +1 so the last code point is seen
    if unicodedata.category(_unichr(i)).startswith('P')
)


def pearson_score(list1, list2):
"""
Compute the Pearson' score between 2 lists of vectors.
Expand Down Expand Up @@ -48,9 +56,19 @@ def dataset(self):
item = list(item)
item_pk = item.pop(0)
datas = ' '.join(map(six.text_type, item))
dataset[item_pk] = STOP_WORDS.rebase(strip_tags(datas).lower(), '')
dataset[item_pk] = self.clean(datas)
return dataset

def clean(self, datas):
    """
    Return the datas normalized for comparison.

    The steps are applied in this order: HTML tags are stripped,
    punctuation characters are deleted, stop words are removed and
    the remaining text is lowercased.
    """
    without_markup = strip_tags(datas)
    without_punctuation = without_markup.translate(PUNCTUATION)
    without_stop_words = STOP_WORDS.rebase(without_punctuation, '')
    return without_stop_words.lower()


class VectorBuilder(object):
"""
Expand Down
10 changes: 5 additions & 5 deletions zinnia/tests/test_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,20 @@ def test_pearson_score(self):
-1)

def test_clustered_model(self):
params = {'title': 'My entry 1', 'content': 'My content 1',
params = {'title': 'My entry 1', 'content': 'My content 1.',
'tags': 'zinnia, test', 'slug': 'my-entry-1'}
entry_1 = Entry.objects.create(**params)
params = {'title': 'My entry 2', 'content': 'My content 2',
params = {'title': 'My entry 2', 'content': 'My content 2.',
'tags': 'zinnia, test', 'slug': 'my-entry-2'}
entry_2 = Entry.objects.create(**params)
cm = ClusteredModel(Entry.objects.all(), ['id'])
self.assertEqual(sorted(cm.dataset().values()),
sorted([str(entry_1.pk), str(entry_2.pk)]))
cm = ClusteredModel(Entry.objects.all(),
['title', 'excerpt', 'content'])
['title', 'content', 'tags'])
self.assertEqual(sorted(cm.dataset().values()),
sorted([' entry 1 content 1',
' entry 2 content 2']))
sorted([' entry 1 content 1 zinnia test',
' entry 2 content 2 zinnia test']))

def test_vector_builder(self):
vectors = VectorBuilder(Entry.objects.all(),
Expand Down

0 comments on commit 19f3bb8

Please sign in to comment.