From fbbcab48fe9ea71d2945f3ec71b3987f79a2ab3b Mon Sep 17 00:00:00 2001 From: Paul Craciunoiu Date: Fri, 26 Feb 2010 18:00:19 -0800 Subject: [PATCH 1/6] Adding a few regexes for cleanup --- apps/search/clients.py | 61 ++++++++++++++++++++++++++++++++++- apps/search/views.py | 7 +++- apps/sumo/models.py | 4 --- templates/search/results.html | 6 ++-- 4 files changed, 69 insertions(+), 9 deletions(-) diff --git a/apps/search/clients.py b/apps/search/clients.py index 0820fff22e6..651960aad2a 100644 --- a/apps/search/clients.py +++ b/apps/search/clients.py @@ -2,6 +2,7 @@ from .sphinxapi import SphinxClient +import re class SearchClient(object): """ @@ -15,6 +16,18 @@ def __init__(self): def query(self, query, filters): abstract + def excerpt(self, results, query): + """ + Returns a list of Sphinx excerpts for the passed-in list of results. + + Takes in a list of strings + """ + documents = [] + for result in results: + documents.append(results.data) + + return self.sphinx.BuildExcerpts(documents, self.index, query) + class ForumClient(SearchClient): """ @@ -55,6 +68,31 @@ class WikiClient(SearchClient): """ Search the knowledge base """ + index = 'wiki_pages' + patterns = ( + (r'^!+',), + (r'^;:',), + (r'\n|\r',), + (r'\{maketoc\}',), + ('#\{ANAME.*?ANAME\}#',), +""" + ('/__/'), + ('/\^(.*?)\^/',), + (r'{[a-zA-Z]+.*?}',), + ('#~/?np~#',), + ('/~(h|t)c~.*\~\/\1c~/U',), + ('/\(\((.*)(?:\|(.*))?\)\)/Ue',), + ('#\[.+\|(.+)\]#U','$1',), + ('#\'\'#',), + (r'%{2,}',), +""" + ) + compiled_patterns = [] + + def __init__(self): + SearchClient.__init__(self) + for pattern in self.patterns: + self.compiled_patterns.append(re.compile(pattern[0], re.MULTILINE)) def query(self, query, filters=None): """ @@ -77,9 +115,30 @@ def query(self, query, filters=None): sc.SetFilter(f['filter'], f['value'], f.get('exclude', False)) - result = sc.Query(query, 'wiki_pages') + result = sc.Query(query, self.index) if result: return result['matches'] else: return [] + + def excerpt(self, results, query): + """ + Returns a list of wiki page excerpts for the passed-in list of results. + + Takes in a list of strings + """ + documents = [] + for result in results: + documents.append(result.data) + + raw_excerpts = self.sphinx.BuildExcerpts(documents, self.index, query) + excerpts = [] + for raw_excerpt in raw_excerpts: + excerpt = raw_excerpt + for p in self.compiled_patterns: + excerpt = p.sub(' ', excerpt) + + excerpts.append(excerpt) + + return excerpts diff --git a/apps/search/views.py b/apps/search/views.py index 7b110158ff5..0521b950225 100644 --- a/apps/search/views.py +++ b/apps/search/views.py @@ -91,7 +91,12 @@ def search(request): for i in range(offset, offset + settings.SEARCH_RESULTS_PER_PAGE): try: if documents[i]['attrs'].get('category', False): - results.append(WikiPage.objects.get(pk=documents[i]['id'])) + wiki_document = WikiPage.objects.get(pk=documents[i]['id']) + wd = {'search_summary': wc.excerpt((wiki_document,), q)[0], + 'url': wiki_document.get_url(), + 'title': wiki_document.name, + } + results.append(wd) else: results.append(ForumThread.objects.get(pk=documents[i]['id'])) except IndexError: diff --git a/apps/sumo/models.py b/apps/sumo/models.py index 9ee48cd1ad6..d22190d366f 100644 --- a/apps/sumo/models.py +++ b/apps/sumo/models.py @@ -101,10 +101,6 @@ def __unicode__(self): def name(self): return self.pageName - @property - def search_summary(self): - return self.description - def get_url(self): """ TODO: Once we can use reverse(), use reverse() diff --git a/templates/search/results.html b/templates/search/results.html index 9fc63355cb8..e84ac4a7676 100644 --- a/templates/search/results.html +++ b/templates/search/results.html @@ -37,10 +37,10 @@ {% for doc in results %}
- {{ doc.name }} + {{ doc.title }}

- - {{ doc.search_summary }} + + {{ doc.search_summary|safe }} ...

From efca425c65f1f2685c2c8f88b08d348c5e1f3bd4 Mon Sep 17 00:00:00 2001 From: Paul Craciunoiu Date: Mon, 1 Mar 2010 11:45:01 -0800 Subject: [PATCH 2/6] search summary length and a few more regexes --- apps/search/clients.py | 27 ++++++++++++++++----------- settings.py | 1 + templates/search/results.html | 1 - 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/apps/search/clients.py b/apps/search/clients.py index 651960aad2a..7da25d65851 100644 --- a/apps/search/clients.py +++ b/apps/search/clients.py @@ -72,19 +72,24 @@ class WikiClient(SearchClient): patterns = ( (r'^!+',), (r'^;:',), + (r'^#',), (r'\n|\r',), (r'\{maketoc\}',), - ('#\{ANAME.*?ANAME\}#',), + (r'\{ANAME.*?ANAME\}',), + (r'\{[a-zA-Z]+.*?\}',), + (r'\{.*?$',), + (r'__',), + (r'\'\'',), + (r'%{2,}',), + (r'\*|\^|; \}|/\}',), + (r'~/?np~',), + (r'~/?(h|t)c~',), + (r'\(spans.*?\)',), + (r'\}',), """ - ('/__/'), - ('/\^(.*?)\^/',), - (r'{[a-zA-Z]+.*?}',), - ('#~/?np~#',), - ('/~(h|t)c~.*\~\/\1c~/U',), - ('/\(\((.*)(?:\|(.*))?\)\)/Ue',), - ('#\[.+\|(.+)\]#U','$1',), + $text = preg_replace('/\(\((.*)(?:\|(.*))?\)\)/Ue','("$2")?"$2":"$1"', $text); + $text = preg_replace('#\[.+\|(.+)\]#U','$1',$text); ('#\'\'#',), - (r'%{2,}',), """ ) compiled_patterns = [] @@ -131,8 +136,8 @@ def excerpt(self, results, query): documents = [] for result in results: documents.append(result.data) - - raw_excerpts = self.sphinx.BuildExcerpts(documents, self.index, query) + raw_excerpts = self.sphinx.BuildExcerpts(documents, self.index, query, + {'limit': settings.SEARCH_SUMMARY_LENGTH}) excerpts = [] for raw_excerpt in raw_excerpts: excerpt = raw_excerpt diff --git a/settings.py b/settings.py index 5c9be9bed56..c5b40a7ea14 100644 --- a/settings.py +++ b/settings.py @@ -125,3 +125,4 @@ # Search default settings SEARCH_DEFAULT_CATEGORIES = '1,17,18' # comma-separated string of category IDs SEARCH_DEFAULT_FORUM = '1' # default forum ID (eg: 1 on sumo, 5 on mosumo) +SEARCH_SUMMARY_LENGTH = 350 \ No newline at end of file diff --git a/templates/search/results.html b/templates/search/results.html index e84ac4a7676..bd8d420c4c7 100644 --- a/templates/search/results.html +++ b/templates/search/results.html @@ -41,7 +41,6 @@

{{ doc.search_summary|safe }} - ...

From 08982c64f230dff9c623f6dd87bf8e62d2e21855 Mon Sep 17 00:00:00 2001 From: Paul Craciunoiu Date: Mon, 1 Mar 2010 14:40:50 -0800 Subject: [PATCH 3/6] Done regexes for wiki summary --- apps/search/clients.py | 24 ++++++++++++++++-------- apps/search/templates/results.html | 9 ++++----- media/css/search.css | 12 ++++++++++++ 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/apps/search/clients.py b/apps/search/clients.py index 7da25d65851..620c31d7923 100644 --- a/apps/search/clients.py +++ b/apps/search/clients.py @@ -81,23 +81,31 @@ class WikiClient(SearchClient): (r'__',), (r'\'\'',), (r'%{2,}',), - (r'\*|\^|; \}|/\}',), + (r'\*|\^|;|/\}',), (r'~/?np~',), (r'~/?(h|t)c~',), (r'\(spans.*?\)',), (r'\}',), -""" - $text = preg_replace('/\(\((.*)(?:\|(.*))?\)\)/Ue','("$2")?"$2":"$1"', $text); - $text = preg_replace('#\[.+\|(.+)\]#U','$1',$text); - ('#\'\'#',), -""" + (r'\(\(.*?\|(?P.*?)\)\)', '\g'), + (r'\(\((?P.*?)\)\)', '\g'), + (r'\(\(',), + (r'\)\)',), + (r'\[.+?\|(?P.+?)\]', '\g'), + (r'/wiki_up.*? ',), + (r'"',), ) compiled_patterns = [] def __init__(self): SearchClient.__init__(self) for pattern in self.patterns: - self.compiled_patterns.append(re.compile(pattern[0], re.MULTILINE)) + p = [re.compile(pattern[0], re.MULTILINE)] + if len(pattern) > 1: + p.append(pattern[1]) + else: + p.append(' ') + + self.compiled_patterns.append(p) def query(self, query, filters=None): """ @@ -142,7 +150,7 @@ def excerpt(self, results, query): for raw_excerpt in raw_excerpts: excerpt = raw_excerpt for p in self.compiled_patterns: - excerpt = p.sub(' ', excerpt) + excerpt = p[0].sub(p[1], excerpt) excerpts.append(excerpt) diff --git a/apps/search/templates/results.html b/apps/search/templates/results.html index bd0c7315ad4..661288bb6de 100644 --- a/apps/search/templates/results.html +++ b/apps/search/templates/results.html @@ -22,15 +22,14 @@ {% for doc in results %} - -

Can't find what you're looking for? Ask a support question instead!

+

Can't find what you're looking for? Ask a support question instead!

{% endblock %} diff --git a/media/css/search.css b/media/css/search.css index baf6a54afe8..0e34b849ea2 100644 --- a/media/css/search.css +++ b/media/css/search.css @@ -2,6 +2,18 @@ font-size: 110%; } +.search-results .title { + font-size: 130%; +} + +.search-results p a { + color: #000; +} + +.search-results .result { + margin-bottom: 15px; +} + .search-query { border: 1px solid #E16601; width: 270px; From 2e0512e83cb3c32252f8f18d491fdecab99551f7 Mon Sep 17 00:00:00 2001 From: Paul Craciunoiu Date: Mon, 1 Mar 2010 15:55:48 -0800 Subject: [PATCH 4/6] Forum thread summaries, common excerpt function, and styling --- apps/search/clients.py | 86 ++++++++++++++++++++++++++++-------------- apps/search/views.py | 17 ++++++--- media/css/search.css | 1 + settings.py | 2 +- 4 files changed, 71 insertions(+), 35 deletions(-) diff --git a/apps/search/clients.py b/apps/search/clients.py index 620c31d7923..47ba38cd75c 100644 --- a/apps/search/clients.py +++ b/apps/search/clients.py @@ -16,23 +16,71 @@ def __init__(self): def query(self, query, filters): abstract - def excerpt(self, results, query): + def excerpt(self, result, query): """ - Returns a list of Sphinx excerpts for the passed-in list of results. + Returns an excerpt for the passed-in string - Takes in a list of strings + Takes in a string """ - documents = [] - for result in results: - documents.append(results.data) + documents = [result] - return self.sphinx.BuildExcerpts(documents, self.index, query) + # build excerpts that are 1.3 times as long and truncate + raw_excerpt = self.sphinx.BuildExcerpts(documents, self.index, query, + {'limit': settings.SEARCH_SUMMARY_LENGTH * 1.3})[0] + + excerpt = raw_excerpt + for p in self.compiled_patterns: + excerpt = p[0].sub(p[1], excerpt) + + # truncate long excerpts + if len(excerpt) > settings.SEARCH_SUMMARY_LENGTH: + excerpt = excerpt[:settings.SEARCH_SUMMARY_LENGTH] \ + + self.truncate_pattern.sub('', excerpt[settings.SEARCH_SUMMARY_LENGTH:]) + if excerpt[len(excerpt)-1] != '.': + excerpt += '...' + + return excerpt class ForumClient(SearchClient): """ Search the forum """ + index = 'forum_threads' + patterns = ( + (r'^!+',), + (r'^;:',), + (r'^#',), + (r'\n|\r',), + (r'__',), + (r'\'\'',), + (r'%{2,}',), + (r'\*|\^|;|/\}',), + (r'\}',), + (r'\(\(.*?\|(?P.*?)\)\)', '\g'), + (r'\(\((?P.*?)\)\)', '\g'), + (r'\(\(',), + (r'\)\)',), + (r'\[.+?\|(?P.+?)\]', '\g'), + (r'\[(?P.+?)\]', '\g'), + (r'"',), + (r'\*+',), + (r'^!! Issue.+!! Description',), + (r'\s+',), + ) + compiled_patterns = [] + truncate_pattern = re.compile(r'\s.*', re.MULTILINE) + + def __init__(self): + SearchClient.__init__(self) + for pattern in self.patterns: + p = [re.compile(pattern[0], re.MULTILINE)] + if len(pattern) > 1: + p.append(pattern[1]) + else: + p.append(' ') + + self.compiled_patterns.append(p) def query(self, query, filters=None): """ @@ -91,10 +139,13 @@ class WikiClient(SearchClient): (r'\(\(',), (r'\)\)',), (r'\[.+?\|(?P.+?)\]', '\g'), + (r'\[(?P.+?)\]', '\g'), (r'/wiki_up.*? ',), (r'"',), + (r'\s+',), ) compiled_patterns = [] + truncate_pattern = re.compile(r'\s.*', re.MULTILINE) def __init__(self): SearchClient.__init__(self) @@ -134,24 +185,3 @@ def query(self, query, filters=None): return result['matches'] else: return [] - - def excerpt(self, results, query): - """ - Returns a list of wiki page excerpts for the passed-in list of results. - - Takes in a list of strings - """ - documents = [] - for result in results: - documents.append(result.data) - raw_excerpts = self.sphinx.BuildExcerpts(documents, self.index, query, - {'limit': settings.SEARCH_SUMMARY_LENGTH}) - excerpts = [] - for raw_excerpt in raw_excerpts: - excerpt = raw_excerpt - for p in self.compiled_patterns: - excerpt = p[0].sub(p[1], excerpt) - - excerpts.append(excerpt) - - return excerpts diff --git a/apps/search/views.py b/apps/search/views.py index 957f4b03b6f..5d2ea88ec96 100644 --- a/apps/search/views.py +++ b/apps/search/views.py @@ -96,14 +96,19 @@ def search(request): for i in range(offset, offset + settings.SEARCH_RESULTS_PER_PAGE): try: if documents[i]['attrs'].get('category', False): - wiki_document = WikiPage.objects.get(pk=documents[i]['id']) - wd = {'search_summary': wc.excerpt((wiki_document,), q)[0], - 'url': wiki_document.get_url(), - 'title': wiki_document.name, + wiki_page = WikiPage.objects.get(pk=documents[i]['id']) + result = {'search_summary': wc.excerpt(wiki_page.data, q), + 'url': wiki_page.get_url(), + 'title': wiki_page.name, } - results.append(wd) + results.append(result) else: - results.append(ForumThread.objects.get(pk=documents[i]['id'])) + forum_thread = ForumThread.objects.get(pk=documents[i]['id']) + result = {'search_summary': fc.excerpt(forum_thread.data, q), + 'url': forum_thread.get_url(), + 'title': forum_thread.name, + } + results.append(result) except IndexError: break except (WikiPage.DoesNotExist, ForumThread.DoesNotExist): diff --git a/media/css/search.css b/media/css/search.css index 0e34b849ea2..9eaf4a5d965 100644 --- a/media/css/search.css +++ b/media/css/search.css @@ -8,6 +8,7 @@ .search-results p a { color: #000; + line-height: 140%; } .search-results .result { diff --git a/settings.py b/settings.py index c5b40a7ea14..6d305b69b11 100644 --- a/settings.py +++ b/settings.py @@ -125,4 +125,4 @@ # Search default settings SEARCH_DEFAULT_CATEGORIES = '1,17,18' # comma-separated string of category IDs SEARCH_DEFAULT_FORUM = '1' # default forum ID (eg: 1 on sumo, 5 on mosumo) -SEARCH_SUMMARY_LENGTH = 350 \ No newline at end of file +SEARCH_SUMMARY_LENGTH = 275 \ No newline at end of file From 33fbc03e961868683f85e0b9e85b74259a80ad02 Mon Sep 17 00:00:00 2001 From: Paul Craciunoiu Date: Mon, 1 Mar 2010 16:56:30 -0800 Subject: [PATCH 5/6] move __init__ to common SearchClient and define constant for multiplier --- apps/search/clients.py | 124 ++++++++++++++++------------------------- settings.py | 5 +- 2 files changed, 52 insertions(+), 77 deletions(-) diff --git a/apps/search/clients.py b/apps/search/clients.py index 47ba38cd75c..5b243edc46e 100644 --- a/apps/search/clients.py +++ b/apps/search/clients.py @@ -4,6 +4,36 @@ import re +MARKUP_PATTERNS = ( + (r'^!+',), + (r'^;:',), + (r'^#',), + (r'\n|\r',), + (r'\{maketoc\}',), + (r'\{ANAME.*?ANAME\}',), + (r'\{[a-zA-Z]+.*?\}',), + (r'\{.*?$',), + (r'__',), + (r'\'\'',), + (r'%{2,}',), + (r'\*|\^|;|/\}',), + (r'~/?np~',), + (r'~/?(h|t)c~',), + (r'\(spans.*?\)',), + (r'\}',), + (r'\(\(.*?\|(?P.*?)\)\)', '\g'), + (r'\(\((?P.*?)\)\)', '\g'), + (r'\(\(',), + (r'\)\)',), + (r'\[.+?\|(?P.+?)\]', '\g'), + (r'\[(?P.+?)\]', '\g'), + (r'/wiki_up.*? ',), + (r'"',), + (r'^!! Issue.+!! Description',), + (r'\s+',), +) + + class SearchClient(object): """ Base-class for search clients @@ -14,6 +44,20 @@ def __init__(self): self.sphinx.SetServer(settings.SPHINX_HOST, settings.SPHINX_PORT) self.sphinx.SetLimits(0, settings.SEARCH_MAX_RESULTS) + # initialize regexes for markup cleaning + self.truncate_pattern = re.compile(r'\s.*', re.MULTILINE) + self.compiled_patterns = [] + + if MARKUP_PATTERNS: + for pattern in MARKUP_PATTERNS: + p = [re.compile(pattern[0], re.MULTILINE)] + if len(pattern) > 1: + p.append(pattern[1]) + else: + p.append(' ') + + self.compiled_patterns.append(p) + def query(self, query, filters): abstract def excerpt(self, result, query): @@ -24,9 +68,11 @@ def excerpt(self, result, query): """ documents = [result] - # build excerpts that are 1.3 times as long and truncate + # build excerpts that are longer and truncate + # see multiplier constant definition for details raw_excerpt = self.sphinx.BuildExcerpts(documents, self.index, query, - {'limit': settings.SEARCH_SUMMARY_LENGTH * 1.3})[0] + {'limit': settings.SEARCH_SUMMARY_LENGTH + * settings.SEARCH_SUMMARY_LENGTH_MULTIPLIER})[0] excerpt = raw_excerpt for p in self.compiled_patterns: @@ -47,40 +93,6 @@ class ForumClient(SearchClient): Search the forum """ index = 'forum_threads' - patterns = ( - (r'^!+',), - (r'^;:',), - (r'^#',), - (r'\n|\r',), - (r'__',), - (r'\'\'',), - (r'%{2,}',), - (r'\*|\^|;|/\}',), - (r'\}',), - (r'\(\(.*?\|(?P.*?)\)\)', '\g'), - (r'\(\((?P.*?)\)\)', '\g'), - (r'\(\(',), - (r'\)\)',), - (r'\[.+?\|(?P.+?)\]', '\g'), - (r'\[(?P.+?)\]', '\g'), - (r'"',), - (r'\*+',), - (r'^!! Issue.+!! Description',), - (r'\s+',), - ) - compiled_patterns = [] - truncate_pattern = re.compile(r'\s.*', re.MULTILINE) - - def __init__(self): - SearchClient.__init__(self) - for pattern in self.patterns: - p = [re.compile(pattern[0], re.MULTILINE)] - if len(pattern) > 1: - p.append(pattern[1]) - else: - p.append(' ') - - self.compiled_patterns.append(p) def query(self, query, filters=None): """ @@ -117,46 +129,6 @@ class WikiClient(SearchClient): Search the knowledge base """ index = 'wiki_pages' - patterns = ( - (r'^!+',), - (r'^;:',), - (r'^#',), - (r'\n|\r',), - (r'\{maketoc\}',), - (r'\{ANAME.*?ANAME\}',), - (r'\{[a-zA-Z]+.*?\}',), - (r'\{.*?$',), - (r'__',), - (r'\'\'',), - (r'%{2,}',), - (r'\*|\^|;|/\}',), - (r'~/?np~',), - (r'~/?(h|t)c~',), - (r'\(spans.*?\)',), - (r'\}',), - (r'\(\(.*?\|(?P.*?)\)\)', '\g'), - (r'\(\((?P.*?)\)\)', '\g'), - (r'\(\(',), - (r'\)\)',), - (r'\[.+?\|(?P.+?)\]', '\g'), - (r'\[(?P.+?)\]', '\g'), - (r'/wiki_up.*? ',), - (r'"',), - (r'\s+',), - ) - compiled_patterns = [] - truncate_pattern = re.compile(r'\s.*', re.MULTILINE) - - def __init__(self): - SearchClient.__init__(self) - for pattern in self.patterns: - p = [re.compile(pattern[0], re.MULTILINE)] - if len(pattern) > 1: - p.append(pattern[1]) - else: - p.append(' ') - - self.compiled_patterns.append(p) def query(self, query, filters=None): """ diff --git a/settings.py b/settings.py index 6d305b69b11..fdb55d3bdbd 100644 --- a/settings.py +++ b/settings.py @@ -125,4 +125,7 @@ # Search default settings SEARCH_DEFAULT_CATEGORIES = '1,17,18' # comma-separated string of category IDs SEARCH_DEFAULT_FORUM = '1' # default forum ID (eg: 1 on sumo, 5 on mosumo) -SEARCH_SUMMARY_LENGTH = 275 \ No newline at end of file +SEARCH_SUMMARY_LENGTH = 275 +# because of markup cleanup, search summaries lengths vary quite a bit +# so we extract longer excerpts and perform truncation to the length above +SEARCH_SUMMARY_LENGTH_MULTIPLIER = 1.3 From 8fe12c6f2e42fd62b0c17b1f1caf712145e80695 Mon Sep 17 00:00:00 2001 From: Paul Craciunoiu Date: Mon, 1 Mar 2010 17:04:40 -0800 Subject: [PATCH 6/6] Accessing last character of string doesn't need to call len() --- apps/search/clients.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/search/clients.py b/apps/search/clients.py index 5b243edc46e..255d1626ab5 100644 --- a/apps/search/clients.py +++ b/apps/search/clients.py @@ -82,7 +82,7 @@ def excerpt(self, result, query): if len(excerpt) > settings.SEARCH_SUMMARY_LENGTH: excerpt = excerpt[:settings.SEARCH_SUMMARY_LENGTH] \ + self.truncate_pattern.sub('', excerpt[settings.SEARCH_SUMMARY_LENGTH:]) - if excerpt[len(excerpt)-1] != '.': + if excerpt[-1] != '.': excerpt += '...' return excerpt