In [8]:
import os
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.query import *

# 定义索引schema,确定索引字段

In [3]:
# Index schema: `title` and `path` are stored, so they are returned with each
# search hit; `content` is indexed for full-text search but not stored.
schema = Schema(
    title=TEXT(stored=True),
    path=ID(stored=True),
    content=TEXT,
)

# 创建索引对象

In [4]:
# Make sure the on-disk index directory exists. makedirs(exist_ok=True) avoids
# the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs("index", exist_ok=True)

# create_in() starts a brand-new index in the directory; an index that already
# lives there under the same name is cleared.
ix = create_in("index", schema)

# To reopen an index built earlier instead of recreating it:
#   from whoosh.index import open_dir
#   ix = open_dir("index")

# 添加文档到索引中

In [5]:
# Open an IndexWriter, add the sample documents, then commit to persist them.
writer = ix.writer()

# Field values are passed as unicode strings.
documents = [
    {
        "title": u"First document",        # value that gets indexed
        "_stored_title": u"Stored title",  # value that gets stored for `title`
        "path": u"/a",
        "content": u"This is the first document we've added!",
    },
    {
        "title": u"Second document",
        "path": u"/b",
        "content": u"The second one is even more interesting!",
    },
]
for fields in documents:
    writer.add_document(**fields)

# Save the added documents to the index.
writer.commit()

# 通过关键词搜索

In [14]:
# Parse a single keyword against the `content` field, then print the top hit.
query = QueryParser("content", ix.schema).parse("one")

with ix.searcher() as searcher:
    results = searcher.search(query)
    print(results[0])

<Hit {'path': '/b', 'title': 'Second document'}>


# 同时匹配多个关键词

默认的 QueryParser 支持在查询语句中使用 AND、OR 和 NOT；也可以像下面这样，直接用 And、Term 等查询对象来构造查询。

In [18]:
# Build a query that requires BOTH terms to appear in `content`.
# Term() matches the indexed tokens literally (no analysis is applied), so the
# terms must be words that survive indexing: the default TEXT analyzer
# lower-cases tokens and drops stop words such as "is", which therefore can
# never match. "even" and "more" are both indexed for the second document.
myquery = And([Term("content", u"even"), Term("content", u"more")])

with ix.searcher() as searcher:
    # Search with the And-query built above (not a query left over from an
    # earlier cell).
    results = searcher.search(myquery)
    print(results[0])

<Hit {'path': '/b', 'title': 'Second document'}>


# 检索中文

## 基于 jieba 的 ChineseAnalyzer 进行分词

In [20]:
from jieba.analyse import ChineseAnalyzer

# Use jieba's analyzer so Chinese text in `content` is split into words.
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True, analyzer=analyzer))

# Do not rely on an earlier cell having created the directory.
os.makedirs("index", exist_ok=True)
# Give this index its own name: calling create_in() with the default name in a
# directory that already holds an index would clear that index and leave any
# handle to it (e.g. `ix`) pointing at stale files.
idx = create_in("index", schema, indexname="chinese")

writer = idx.writer()
writer.add_document(title=u"第一篇文档", content=u"这是第一篇文档，成功添加")
writer.add_document(title=u"第二篇文档", content=u"第二篇文档更有趣")
writer.commit()  # saves the added documents to the index

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\wcy70\AppData\Local\Temp\jieba.cache
Loading model cost 0.860 seconds.
Prefix dict has been built succesfully.


In [29]:
# Look up each keyword in the `content` field of the Chinese index and print
# the stored fields of every hit, followed by a separator line.
keywords = ("文档", "成功", "有趣")
separator = "-" * 30

with idx.searcher() as searcher:
    parser = QueryParser("content", schema=idx.schema)
    for keyword in keywords:
        print("result of ", keyword)
        q = parser.parse(keyword)
        results = searcher.search(q)
        for stored_fields in map(dict, results):
            print(stored_fields)
        print(separator)

result of  文档
{'content': '第二篇文档更有趣', 'title': '第二篇文档'}
{'content': '这是第一篇文档，成功添加', 'title': '第一篇文档'}
------------------------------
result of  成功
{'content': '这是第一篇文档，成功添加', 'title': '第一篇文档'}
------------------------------
result of  有趣
{'content': '第二篇文档更有趣', 'title': '第二篇文档'}
------------------------------
