Permalink
Switch branches/tags
Find file
Fetching contributors…
Cannot retrieve contributors at this time
547 lines (449 sloc) 15.3 KB
/*
* Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
* Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
#include "config.h"
#include <stdio.h>
#include <string.h>
/* libunistring versions prior to 9.1.2 need this hack */
#define _UNUSED_PARAMETER_
#include <unistr.h>
#include <uniwbrk.h>
#include <unictype.h>
#include <unicase.h>
#include "tracker-parser.h"
#include "tracker-parser-utils.h"
/* Type of words detected. The type is decided in get_word_info() and selects
 * which normalization steps process_word_utf8() applies to the word. */
typedef enum {
/* Only single-byte (ASCII) characters were seen: the word is just lowercased */
TRACKER_PARSER_WORD_TYPE_ASCII,
/* Non-ASCII word not starting with a CJK character: casefolded, NFKD
 * normalized and, if unaccenting is enabled, stripped of combining marks */
TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
/* Non-ASCII word starting with a CJK character: casefolded and NFKD
 * normalized, but never unaccented */
TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
} TrackerParserWordType;
/* Size, in bytes, of the on-stack buffer used by process_word_utf8() while
 * normalizing a word. parser_next() truncates the input of longer words to
 * WORD_BUFFER_LENGTH - 1 bytes before processing (just a safety limit). */
#define WORD_BUFFER_LENGTH 512
/* State of a word parser over one input text. Configured by
 * tracker_parser_reset() and advanced by tracker_parser_next(). */
struct TrackerParser {
/* Input text being parsed; the pointer is stored as given by
 * tracker_parser_reset() and is neither copied nor freed by the parser */
const gchar *txt;
/* Size of the input text, in bytes */
gint txt_size;
/* Language object (a reference is held) used for stop-word detection
 * and stemming */
TrackerLanguage *language;
/* Words whose byte length is greater than or equal to this are skipped */
guint max_word_length;
/* Whether words are passed through the language stemmer */
gboolean enable_stemmer;
/* Whether combining diacritical marks are stripped from non-ASCII,
 * non-CJK words */
gboolean enable_unaccent;
/* Whether words are checked against the stop-word list; matches are
 * reported through the stop_word output, not dropped here */
gboolean ignore_stop_words;
/* Whether reserved words are skipped */
gboolean ignore_reserved_words;
/* When FALSE, characters of the number category are also allowed as
 * the start of a word */
gboolean ignore_numbers;
/* Whether forced wordbreak characters split words; always set to TRUE
 * by tracker_parser_reset() */
gboolean enable_forced_wordbreaks;
/* Private members */
/* Last processed word; owned by the parser and freed on the next call
 * to tracker_parser_next(), tracker_parser_reset() or tracker_parser_free() */
gchar *word;
/* Byte length of the last processed word */
gint word_length;
/* Word position counter, updated by tracker_parser_next() */
guint word_position;
/* Cursor, as index of the input array of bytes */
gsize cursor;
/* libunistring flags array: one entry per input byte, filled in by
 * u8_wordbreaks(); a non-zero entry marks a word break at that byte */
gchar *word_break_flags;
/* general category of the start character in words */
uc_general_category_t allowed_start;
};
/* Inspects the word that starts at the parser's cursor.
 *
 * On success, @p_word_length receives the byte length of the word and
 * @p_is_allowed_word_start tells whether its first character may start a
 * word (an underscore, or a character in the parser->allowed_start category).
 * @p_word_type is only written when the word start is allowed.
 *
 * Returns FALSE, without touching @p_word_length or @p_word_type, when
 * u8_strmbtouc() cannot decode a character at the cursor; TRUE otherwise.
 */
static gboolean
get_word_info (TrackerParser         *parser,
               gsize                 *p_word_length,
               gboolean              *p_is_allowed_word_start,
               TrackerParserWordType *p_word_type)
{
    ucs4_t lead_char;
    gint lead_len;
    gboolean only_ascii;
    gboolean start_ok;
    gsize end;

    /* Assume a valid word start until proven otherwise */
    *p_is_allowed_word_start = TRUE;

    /* Decode the first character of the word as UCS4 */
    lead_len = u8_strmbtouc (&lead_char,
                             &(parser->txt[parser->cursor]));
    if (lead_len <= 0) {
        /* Nothing decodable here (e.g. a NIL byte): force the
         * caller to stop */
        return FALSE;
    }

    /* A single-byte first character is ASCII-7 */
    only_ascii = (lead_len == 1);

    /* The word covers at least its first character */
    end = parser->cursor + lead_len;

    /* A forced wordbreak character is a word of its own; otherwise
     * extend the word byte by byte up to the next break, tracking in
     * the same pass whether only ASCII bytes are seen */
    if (!(parser->enable_forced_wordbreaks &&
          IS_FORCED_WORDBREAK_UCS4 ((guint32)lead_char))) {
        for (; end < parser->txt_size; end++) {
            guint32 current = (guint32)parser->txt[end];

            /* Proper unicode word break detected? */
            if (parser->word_break_flags[end]) {
                break;
            }

            /* Forced word break detected? */
            if (parser->enable_forced_wordbreaks &&
                IS_FORCED_WORDBREAK_UCS4 (current)) {
                break;
            }

            if (only_ascii && !IS_ASCII_UCS4 (current)) {
                only_ascii = FALSE;
            }
        }
    }

    /* 'end' is the first byte after the word: either the start of the
     * next word or the end of the text */
    *p_word_length = end - parser->cursor;

    /* Only words whose first character is an underscore or belongs to
     * the allowed category are wanted, because the word break algorithm
     * also reports breaks after commas and other punctuation marks.
     * Looking at the first character only should be compatible with all
     * Unicode normalization methods. */
    start_ok = (IS_UNDERSCORE_UCS4 ((guint32)lead_char) ||
                uc_is_general_category (lead_char,
                                        parser->allowed_start));
    if (!start_ok) {
        *p_is_allowed_word_start = FALSE;
        return TRUE;
    }

    /* Decide word type */
    if (only_ascii) {
        *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
        return TRUE;
    }

    if (IS_CJK_UCS4 (lead_char)) {
        *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
    } else {
        *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
    }

    return TRUE;
}
/* Strips combining diacritical marks, in place, from a UTF-8 string.
 *
 * The input MUST already be normalized in NFKD form. On input, @str_length
 * holds the byte length of @str; on output it holds the byte length of the
 * stripped string, and a NIL byte is written right after the last kept byte.
 *
 * Characters are decoded with u8_strmbtouc(); processing stops at the first
 * position it cannot decode, keeping whatever was accepted up to there.
 *
 * Returns FALSE only when the argument checks fail, TRUE otherwise.
 */
gboolean
tracker_parser_unaccent_nfkd_string (gpointer  str,
                                     gsize    *str_length)
{
    gchar *buffer;
    gsize total;
    gsize src = 0;
    gsize dst = 0;

    g_return_val_if_fail (str != NULL, FALSE);
    g_return_val_if_fail (str_length != NULL, FALSE);
    g_return_val_if_fail (*str_length > 0, FALSE);

    buffer = (gchar *)str;
    total = *str_length;

    while (src < total) {
        ucs4_t code_point;
        gint n_bytes;

        /* Decode the character at the read index */
        n_bytes = u8_strmbtouc (&code_point, &buffer[src]);

        /* Invalid UTF-8 character or end of original string. */
        if (n_bytes <= 0) {
            break;
        }

        /* Combining diacritical marks are dropped: only the read
         * index moves past them. Any other character is kept. */
        if (!IS_CDM_UCS4 ((guint32) code_point)) {
            /* Once a mark has been dropped the indexes differ and
             * the character must be shifted down. Source and
             * destination may overlap, hence memmove. */
            if (src != dst) {
                memmove (&buffer[dst], &buffer[src], n_bytes);
            }
            dst += n_bytes;
        }

        src += n_bytes;
    }

    /* Terminate the stripped string and report its new length */
    buffer[dst] = '\0';
    *str_length = dst;

    return TRUE;
}
/* Normalizes one word of the input text.
 *
 * @parser:    parser holding the processing options and the language.
 * @word:      start of the word, in UTF-8.
 * @length:    byte length of the word, or a negative value if @word is
 *             NIL-terminated.
 * @type:      word type, as decided by get_word_info().
 * @stop_word: set to whether the normalized word is a stop word; only
 *             written when parser->ignore_stop_words is set.
 *
 * ASCII words are lowercased; other words are casefolded and NFKD
 * normalized. Words of type TRACKER_PARSER_WORD_TYPE_OTHER_UNAC are also
 * unaccented when enabled, and the result is stemmed when enabled.
 *
 * Returns a newly-allocated, NIL-terminated string to be released with
 * g_free(), or NULL if the word could not be processed.
 */
static gchar *
process_word_utf8 (TrackerParser         *parser,
                   const gchar           *word,
                   gint                   length,
                   TrackerParserWordType  type,
                   gboolean              *stop_word)
{
    gchar word_buffer [WORD_BUFFER_LENGTH];
    gchar *normalized = NULL;
    gchar *stemmed = NULL;
    size_t new_word_length;

    g_return_val_if_fail (parser != NULL, NULL);
    g_return_val_if_fail (word != NULL, NULL);

    /* If length is negative, the input word MUST be NIL-terminated.
     * Otherwise, this restriction is not needed as the length to process
     * is given as input argument */
    if (length < 0) {
        length = strlen (word);
    }

    /* Log original word */
    tracker_parser_message_hex ("ORIGINAL word",
                                word, length);

    /* Normalization and case-folding ONLY for non-ASCII */
    if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
        /* Leave space for last NIL */
        new_word_length = WORD_BUFFER_LENGTH - 1;

        /* Casefold and NFKD normalization in output.
         * NOTE: if the output buffer is not big enough, u8_casefold will
         * return a newly-allocated buffer. */
        normalized = (gchar *) u8_casefold ((const uint8_t *)word,
                                            length,
                                            uc_locale_language (),
                                            UNINORM_NFKD,
                                            (uint8_t *)word_buffer,
                                            &new_word_length);

        /* Case folding + Normalization failed, ignore this word */
        g_return_val_if_fail (normalized != NULL, NULL);

        /* If output buffer is not the same as the one passed to
         * u8_casefold, we know it was newly-allocated, so need
         * to resize it in 1 byte to add last NIL */
        if (normalized != word_buffer) {
            normalized = g_realloc (normalized, new_word_length + 1);
        }

        /* Log after Normalization */
        tracker_parser_message_hex (" After Casefolding and NFKD normalization",
                                    normalized, new_word_length);
    } else {
        /* For ASCII-only, just tolower() each character */
        gsize i;

        /* The stack buffer must hold the word plus the trailing NIL, so a
         * word of exactly WORD_BUFFER_LENGTH bytes needs the heap too */
        normalized = length >= WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;

        for (i = 0; i < (gsize) length; i++) {
            normalized[i] = g_ascii_tolower (word[i]);
        }

        new_word_length = length;

        /* Log after tolower */
        tracker_parser_message_hex (" After Lowercasing",
                                    normalized, new_word_length);
    }

    /* Set output NIL */
    normalized[new_word_length] = '\0';

    /* UNAC stripping needed? (for non-CJK and non-ASCII) */
    if (parser->enable_unaccent &&
        type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
        tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
        /* Log after UNAC stripping */
        tracker_parser_message_hex (" After UNAC stripping",
                                    normalized, new_word_length);
    }

    /* Check if stop word */
    if (parser->ignore_stop_words) {
        *stop_word = tracker_language_is_stop_word (parser->language,
                                                    normalized);
    }

    /* Stemming needed? */
    if (parser->enable_stemmer) {
        stemmed = tracker_language_stem_word (parser->language,
                                              normalized,
                                              new_word_length);

        /* Log after stemming; the stemmer result is checked for NULL
         * below, so it must not be dereferenced here unconditionally */
        if (stemmed) {
            tracker_parser_message_hex (" After stemming",
                                        stemmed, strlen (stemmed));
        }
    }

    /* If stemmed wanted and succeeded, free previous and return it */
    if (stemmed) {
        if (normalized != word_buffer) {
            g_free (normalized);
        }
        return stemmed;
    }

    /* It may be the case that no stripping and no stemming was needed, and
     * that the output buffer in stack was enough for case-folding and
     * normalization. In this case, need to strdup() the string to return it */
    return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
}
/* Advances the parser to the next acceptable word.
 *
 * Words are skipped when their start character is not allowed, when their
 * byte length is greater than or equal to parser->max_word_length, when they
 * are reserved words (if parser->ignore_reserved_words is set) or when
 * processing them fails.
 *
 * On success the processed word is stored in parser->word (and its length
 * in parser->word_length), @byte_offset_start / @byte_offset_end receive the
 * byte range of the original word in the input text, the cursor is moved
 * past the word and TRUE is returned. @stop_word is forwarded to
 * process_word_utf8(). When there are no more words, both offsets are 0 and
 * FALSE is returned.
 */
static gboolean
parser_next (TrackerParser *parser,
             gint          *byte_offset_start,
             gint          *byte_offset_end,
             gboolean      *stop_word)
{
    gsize word_length = 0;

    *byte_offset_start = 0;
    *byte_offset_end = 0;

    g_return_val_if_fail (parser, FALSE);

    /* Walk candidate words until one is accepted or the text ends */
    while (parser->cursor < parser->txt_size) {
        TrackerParserWordType type;
        gboolean is_allowed;
        gboolean skip;

        if (!get_word_info (parser,
                            &word_length,
                            &is_allowed,
                            &type)) {
            /* Nothing decodable at the cursor: consume the rest of
             * the text so that later calls stop right away */
            parser->cursor = parser->txt_size;
            break;
        }

        /* Reject words with a disallowed start, words that reach the
         * maximum length, and (optionally) reserved words */
        skip = (!is_allowed ||
                word_length >= parser->max_word_length ||
                (parser->ignore_reserved_words &&
                 tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
                                                       word_length)));

        if (!skip) {
            gsize truncated_length;
            gchar *candidate;

            /* Cap the amount of input processed, to avoid extremely
             * long words */
            truncated_length = word_length;
            if (truncated_length >= WORD_BUFFER_LENGTH) {
                truncated_length = WORD_BUFFER_LENGTH - 1;
            }

            /* Returns a newly allocated string, or NULL on failure;
             * a failure just means trying the next word */
            candidate = process_word_utf8 (parser,
                                           &(parser->txt[parser->cursor]),
                                           truncated_length,
                                           type,
                                           stop_word);

            if (candidate) {
                /* Report the byte range of the original word */
                *byte_offset_start = parser->cursor;
                *byte_offset_end = parser->cursor + word_length;

                /* Move past the word and keep the processed result */
                parser->cursor += word_length;
                parser->word_length = strlen (candidate);
                parser->word = candidate;

                return TRUE;
            }
        }

        /* Ignore this word and keep on looping */
        parser->cursor += word_length;
    }

    /* No more words... */
    return FALSE;
}
/* Creates a new, zero-initialized parser bound to @language.
 *
 * The parser takes its own reference on @language. tracker_parser_reset()
 * is what sets the text and options. Release the parser with
 * tracker_parser_free().
 *
 * Returns NULL if @language is not a TrackerLanguage.
 */
TrackerParser *
tracker_parser_new (TrackerLanguage *language)
{
    TrackerParser *self;

    g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);

    /* All options and private members start out as zero/NULL */
    self = g_new0 (TrackerParser, 1);
    self->language = g_object_ref (language);

    return self;
}
/* Releases @parser together with everything it owns: the last processed
 * word, the word break flags array and its reference on the language.
 * The input text is not touched. */
void
tracker_parser_free (TrackerParser *parser)
{
    g_return_if_fail (parser != NULL);

    /* Buffers owned by the parser */
    g_free (parser->word);
    g_free (parser->word_break_flags);

    /* Drop the reference taken in tracker_parser_new() */
    if (parser->language != NULL) {
        g_object_unref (parser->language);
    }

    g_free (parser);
}
/* Prepares @parser to extract words from a new text.
 *
 * @parser:                parser to reset.
 * @txt:                   UTF-8 text to parse. The pointer is stored, not
 *                         copied, so the text must outlive the parsing.
 * @txt_size:              size of @txt in bytes, or a negative value if
 *                         @txt is NIL-terminated.
 * @max_word_length:       words whose byte length is greater than or equal
 *                         to this value are skipped.
 * @enable_stemmer:        whether words are stemmed.
 * @enable_unaccent:       whether combining diacritical marks are stripped.
 * @ignore_stop_words:     whether words are checked against the stop-word
 *                         list.
 * @ignore_reserved_words: whether reserved words are skipped.
 * @ignore_numbers:        when FALSE, words may also start with a number.
 *
 * Any previously processed word is released, the cursor and word position
 * go back to the start, and the word break flags are recomputed for the
 * whole new text.
 */
void
tracker_parser_reset (TrackerParser *parser,
                      const gchar   *txt,
                      gint           txt_size,
                      guint          max_word_length,
                      gboolean       enable_stemmer,
                      gboolean       enable_unaccent,
                      gboolean       ignore_stop_words,
                      gboolean       ignore_reserved_words,
                      gboolean       ignore_numbers)
{
    g_return_if_fail (parser != NULL);
    g_return_if_fail (txt != NULL);

    /* A negative size would be turned into a huge unsigned size by the
     * allocation and the word break computation below. Treat it as
     * "NIL-terminated" instead, clamping to what txt_size can hold. */
    if (txt_size < 0) {
        gsize len = strlen (txt);

        txt_size = len > (gsize) G_MAXINT ? G_MAXINT : (gint) len;
    }

    parser->max_word_length = max_word_length;
    parser->enable_stemmer = enable_stemmer;
    parser->enable_unaccent = enable_unaccent;
    parser->ignore_stop_words = ignore_stop_words;
    parser->ignore_reserved_words = ignore_reserved_words;
    parser->ignore_numbers = ignore_numbers;

    /* Note: We're forcing some unicode characters to behave
     * as wordbreakers: e.g, the '.' The main reason for this
     * is to enable FTS searches matching file extension. */
    parser->enable_forced_wordbreaks = TRUE;

    parser->txt_size = txt_size;
    parser->txt = txt;

    /* Forget the word of the previous text, including its length */
    g_free (parser->word);
    parser->word = NULL;
    parser->word_length = 0;

    parser->word_position = 0;
    parser->cursor = 0;

    g_free (parser->word_break_flags);

    /* Create array of flags, same size as original text. */
    parser->word_break_flags = g_malloc (txt_size);

    /* Get wordbreak flags in the whole string */
    u8_wordbreaks ((const uint8_t *)txt,
                   (size_t) txt_size,
                   (char *)parser->word_break_flags);

    /* Prepare a custom category which is a combination of the
     * desired ones */
    parser->allowed_start = UC_LETTER;
    if (!parser->ignore_numbers) {
        parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
    }
}
/* Extracts the next word from the text set with tracker_parser_reset().
 *
 * @parser:            parser to advance.
 * @position:          receives the parser's word position counter.
 * @byte_offset_start: receives the byte offset, in the input text, where
 *                     the original word starts (0 if no word was found).
 * @byte_offset_end:   receives the byte offset right after the original
 *                     word (0 if no word was found).
 * @stop_word:         receives whether the word was flagged as a stop word.
 * @word_length:       receives the byte length of the returned word (0 if
 *                     no word was found).
 *
 * The position counter is incremented on every call in which the word was
 * not flagged as a stop word, including the call that finds no more words.
 *
 * Returns the processed word, or NULL when there are no more words or an
 * argument is NULL. The string is owned by the parser and stays valid only
 * until the next call to tracker_parser_next(), tracker_parser_reset() or
 * tracker_parser_free().
 */
const gchar *
tracker_parser_next (TrackerParser *parser,
                     gint          *position,
                     gint          *byte_offset_start,
                     gint          *byte_offset_end,
                     gboolean      *stop_word,
                     gint          *word_length)
{
    const gchar *str = NULL;
    gint byte_start = 0, byte_end = 0;

    g_return_val_if_fail (parser != NULL, NULL);
    g_return_val_if_fail (position != NULL, NULL);
    g_return_val_if_fail (byte_offset_start != NULL, NULL);
    g_return_val_if_fail (byte_offset_end != NULL, NULL);
    g_return_val_if_fail (stop_word != NULL, NULL);
    g_return_val_if_fail (word_length != NULL, NULL);

    /* Release the previous word; its length must go with it, otherwise
     * a stale length would be reported when no further word is found */
    g_free (parser->word);
    parser->word = NULL;
    parser->word_length = 0;

    *stop_word = FALSE;

    if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
        str = parser->word;
    }

    if (!*stop_word) {
        parser->word_position++;
    }

    *word_length = parser->word_length;
    *position = parser->word_position;
    *byte_offset_start = byte_start;
    *byte_offset_end = byte_end;

    return str;
}