Remove punctuation when building the datas

Fantomas42 · May 4, 2015 · 19f3bb8 · 19f3bb8
1 parent 47e1048
commit 19f3bb8
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 8 deletions.
diff --git a/zinnia/comparison.py b/zinnia/comparison.py
@@ -1,14 +1,22 @@
 """Comparison tools for Zinnia"""
+import sys
+import unicodedata
+from math import sqrt
+
 from django.utils import six
 from django.core.cache import caches
 from django.utils.html import strip_tags
 from django.core.cache import InvalidCacheBackendError
 
-from math import sqrt
-
 from zinnia.search import STOP_WORDS
 
 
+PUNCTUATION = dict.fromkeys(  # code point -> None, so translate() deletes it
+    i for i in six.moves.range(sys.maxunicode)
+    if unicodedata.category(six.unichr(i)).startswith('P')
+)
+
+
 def pearson_score(list1, list2):
     """
     Compute the Pearson' score between 2 lists of vectors.
@@ -48,9 +56,19 @@ def dataset(self):
             item = list(item)
             item_pk = item.pop(0)
             datas = ' '.join(map(six.text_type, item))
-            dataset[item_pk] = STOP_WORDS.rebase(strip_tags(datas).lower(), '')
+            dataset[item_pk] = self.clean(datas)
         return dataset
 
+    def clean(self, datas):
+        """
+        Return the datas without HTML, punctuation and stop words,
+        in lower case.
+        """
+        no_markup = strip_tags(datas)
+        no_punctuation = no_markup.translate(PUNCTUATION)
+        no_stop_words = STOP_WORDS.rebase(no_punctuation, '')
+        return no_stop_words.lower()
+
 
 class VectorBuilder(object):
     """

diff --git a/zinnia/tests/test_comparison.py b/zinnia/tests/test_comparison.py
@@ -29,20 +29,20 @@ def test_pearson_score(self):
                          -1)
 
     def test_clustered_model(self):
-        params = {'title': 'My entry 1', 'content': 'My content 1',
+        params = {'title': 'My entry 1', 'content': 'My content 1.',
                   'tags': 'zinnia, test', 'slug': 'my-entry-1'}
         entry_1 = Entry.objects.create(**params)
-        params = {'title': 'My entry 2', 'content': 'My content 2',
+        params = {'title': 'My entry 2', 'content': 'My content 2.',
                   'tags': 'zinnia, test', 'slug': 'my-entry-2'}
         entry_2 = Entry.objects.create(**params)
         cm = ClusteredModel(Entry.objects.all(), ['id'])
         self.assertEqual(sorted(cm.dataset().values()),
                          sorted([str(entry_1.pk), str(entry_2.pk)]))
         cm = ClusteredModel(Entry.objects.all(),
-                            ['title', 'excerpt', 'content'])
+                            ['title', 'content', 'tags'])
         self.assertEqual(sorted(cm.dataset().values()),
-                         sorted([' entry 1   content 1',
-                                 ' entry 2   content 2']))
+                         sorted([' entry 1  content 1 zinnia test',
+                                 ' entry 2  content 2 zinnia test']))
 
     def test_vector_builder(self):
         vectors = VectorBuilder(Entry.objects.all(),