# sphinx config
# Main data source: reads rows from the MySQL `pages` table in id ranges and
# records indexing progress in the `indexer_status` table.
source pages
{
type = mysql
# whether to strip HTML
# values can be 0 (don't strip) or 1 (do strip)
# WARNING, only works with mysql source for now
# WARNING, should work ok for PERFECTLY formed XHTML for now
# WARNING, may misbehave on malformed everyday HTML
# optional, default is 0
strip_html = 1
# what HTML attributes to index if stripping HTML
# format is a list of tag=attr[,attr...] entries, each terminated by ';'
# (here: alt and title on <img>, title on <a>)
index_html_attrs = img=alt,title; a=title;
# database connection settings, filled in by the ERB template
sql_host = <%= sphinx_db_host %>
sql_user = <%= sphinx_db_user %>
sql_pass = <%= sphinx_db_pass %>
sql_db = <%= sphinx_db_name %>
sql_port = <%= sphinx_db_port %> # optional, default is 3306
# pre-queries run in the order listed, before the main fetch query
sql_query_pre = SET NAMES UTF8
sql_query_pre = SET SESSION query_cache_type=OFF
# upsert this index's status row: mark it 'indexing' and stamp started_at
# (USER() is stored in the hostname column)
sql_query_pre = INSERT INTO indexer_status (started_at, status, index_name, hostname) VALUES (NOW(), 'indexing', 'pages', USER()) \
ON DUPLICATE KEY UPDATE started_at = NOW(), status = 'indexing'
# main fetch query; $start and $end are the bounds of the current id range
# NOTE(review): sql_query_range below restricts MIN/MAX to type='Article',
# but this query has no type filter, so every page whose id falls inside
# that span is fetched regardless of type -- confirm this is intended.
sql_query = SELECT id, user_id, language, UNIX_TIMESTAMP(created_at) AS created_at, UNIX_TIMESTAMP(updated_at) AS updated_at, body, title FROM pages WHERE id>=$start AND id<=$end
# overall id span to walk, and how many ids each step covers
sql_query_range = SELECT MIN(id),MAX(id) FROM pages where type='Article'
sql_range_step = 1000
# after fetching: mark the status row 'updated' and stamp updated_at
sql_query_post = UPDATE indexer_status SET updated_at = NOW(), status = 'updated' WHERE index_name = 'pages' and hostname = USER()
# columns from sql_query that are declared as attributes
sql_attr_uint = user_id
sql_attr_timestamp = created_at
sql_attr_timestamp = updated_at
}
# Delta data source: derived from `pages`, it re-fetches only rows whose
# updated_at is at or after the timestamp stored for 'pages_delta' in
# indexer_status.
source pages_delta : pages
{
# Clear and reset sql_query_pre
sql_query_pre =
sql_query_pre = SET NAMES UTF8
sql_query_pre = SET SESSION query_cache_type=OFF
# upsert this index's status row: mark it 'indexing' and stamp started_at.
# The column list must name exactly the four values supplied; listing `id`
# as well makes MySQL reject the statement with a column-count mismatch.
sql_query_pre = INSERT INTO indexer_status (started_at, status, index_name, hostname) VALUES (NOW(), 'indexing', 'pages_delta', USER()) \
ON DUPLICATE KEY UPDATE started_at = NOW(), status = 'indexing'
# fetch only rows touched since the timestamp recorded for this delta index
# NOTE(review): the threshold is the delta's own previous run, not the last
# full `pages` run -- confirm that matches the intended merge/rotation scheme.
sql_query = SELECT id, user_id, language, UNIX_TIMESTAMP(created_at) AS created_at, UNIX_TIMESTAMP(updated_at) AS updated_at, body, title \
FROM pages \
WHERE updated_at >= (SELECT updated_at FROM indexer_status WHERE index_name = 'pages_delta' and hostname = USER())
# Clear and reset sql_query_post so the status row for 'pages_delta'
# (not 'pages') is marked 'updated'
sql_query_post =
sql_query_post = UPDATE indexer_status SET updated_at = NOW(), status = 'updated' WHERE index_name = 'pages_delta' and hostname = USER()
# blank out the range settings set in `pages`; the delta query above does
# not use $start/$end (presumably this disables ranged fetching -- verify
# against the Sphinx version in use)
sql_query_range =
sql_range_step =
}
# Main full-text index, built from the `pages` source.
index pages
{
source = pages
# index files are written under the templated index root
path = <%= sphinx_index_root %>/pages
docinfo = extern
# stem_en morphology (English stemming)
morphology = stem_en
# stopword list shipped alongside this config
stopwords = <%= sphinx_conf_path %>/stopwords.txt
# index words of any length (minimum 1 character)
min_word_len = 1
charset_type = utf-8
# both set to 0 (per the Sphinx docs, 0 turns prefix/infix indexing off)
min_prefix_len = 0
min_infix_len = 0
}
# Delta index: derived from the `pages` index, overriding only the data
# source and the on-disk path.
index pages_delta : pages
{
source = pages_delta
path = <%= sphinx_index_root %>/pages_delta
}
#############################################################################
## indexer settings
#############################################################################
# Settings for the indexer program.
indexer
{
# memory limit
#
# may be specified in bytes (no postfix), kilobytes (mem_limit=1000K)
# or megabytes (mem_limit=10M)
#
# will grow if set unacceptably low
# will warn if set too low and potentially hurting performance
#
# optional, default is 32M
mem_limit = 64M
}
#############################################################################
## searchd settings
#############################################################################
# Settings for the searchd daemon: network binding, log files, limits.
searchd
{
# IP address on which search daemon will bind and accept
# incoming network requests
#
# optional, default is to listen on all addresses,
# i.e. address = 0.0.0.0
#
address = <%= sphinx_host %>
# example: address = 192.168.0.1
# port on which search daemon will listen
port = <%= sphinx_port %>
# log file
# searchd run info is logged here
log = <%= sphinx_log_root %>/searchd.log
# query log file
# all the search queries are logged here
query_log = <%= sphinx_log_root %>/query.log
# client read timeout, seconds
read_timeout = 5
# maximum number of children to fork
# useful to control server load
max_children = 30
# a file which will contain searchd process ID
# used for different external automation scripts
# MUST be present
pid_file = <%= sphinx_pid_path %>
# maximum number of matches this daemon would ever retrieve
# from each index and serve to client
#
# this parameter affects per-client memory and CPU usage
# (16+ bytes per match) in match sorting phase; so blindly raising
# it to 1 million is definitely NOT recommended
#
# starting from 0.9.7, it can be decreased on the fly through
# the corresponding API call; increasing is prohibited to protect
# against malicious and/or malformed requests
#
# default is 1000 (just like with Google)
max_matches = 1000
}
# --eof--