In [2]:
import json
from typing import Dict, List, Optional, Sequence

from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import *
from whoosh.filedb.filestore import RamStorage
from whoosh.index import create_in
from whoosh.qparser import MultifieldParser


In [3]:

class SearchEngine:
    """In-memory Whoosh full-text index over JSON-serialisable documents.

    Every document is indexed under the stored fields of the schema and is
    additionally serialised whole into a stored ``raw`` field, so that
    ``query`` can return the original document for each hit.
    """

    def __init__(self, schema):
        """Create an empty RAM-backed index for ``schema``.

        Args:
            schema: Whoosh ``Schema`` describing the indexed fields. A stored
                ``raw`` TEXT field is added to it in place when it does not
                already have one.
        """
        self.schema = schema
        # Schema.add() rejects a field name that already exists, so guard the
        # call: the same schema object can then back several engines, and the
        # notebook cell that builds the engine can be re-run.
        if 'raw' not in self.schema.names():
            self.schema.add('raw', TEXT(stored=True))
        # NOTE: the index always starts empty in RAM; loading a persisted
        # index from disk is not implemented.
        self.ix = RamStorage().create_index(self.schema)

    def index_documents(self, docs: Sequence[Dict]):
        """Add ``docs`` to the index and commit them in a single write.

        Only keys that match a stored schema field are indexed; the full
        document is kept as JSON in the ``raw`` field.

        Args:
            docs: Mappings that ``json.dumps`` can serialise.

        Raises:
            Exception: Whatever serialisation or ``add_document`` raises; the
                pending write is cancelled before the error propagates.
        """
        # Loop-invariant: look the stored field names up once, as a set.
        stored = set(self.schema.stored_names())
        writer = self.ix.writer()
        try:
            for doc in docs:
                d = {k: v for k, v in doc.items() if k in stored}
                d['raw'] = json.dumps(doc)  # raw version of all of doc
                writer.add_document(**d)
        except Exception:
            # Abandon the partial batch so the writer is not left open.
            writer.cancel()
            raise
        writer.commit(optimize=True)

    def get_index_size(self) -> int:
        """Return the index's ``doc_count_all()`` value."""
        return self.ix.doc_count_all()

    def query(self, q: str, fields: Sequence, highlight: bool=True,
              limit: Optional[int]=10) -> List[Dict]:
        """Search ``fields`` for ``q`` and return the matching documents.

        Args:
            q: Query string, parsed with ``MultifieldParser``.
            fields: Names of the schema fields to search.
            highlight: When true, each searched field that holds a non-empty
                string is replaced by its highlighted excerpt (falling back to
                the stored value when no excerpt is produced).
            limit: Maximum number of hits, forwarded to ``Searcher.search``;
                ``None`` asks for every hit. Defaults to 10, Whoosh's own
                default, so existing callers see the same results.

        Returns:
            One dict per hit, decoded from the hit's ``raw`` field.
        """
        parsed = MultifieldParser(fields, schema=self.schema).parse(q)
        search_results = []
        with self.ix.searcher() as searcher:
            for r in searcher.search(parsed, limit=limit):
                d = json.loads(r['raw'])
                if highlight:
                    # Read through the stored-fields dict: a searched field
                    # that is not stored (or absent on this hit) is skipped
                    # instead of raising KeyError.
                    stored_values = r.fields()
                    for f in fields:
                        value = stored_values.get(f)
                        if value and isinstance(value, str):
                            d[f] = r.highlights(f) or value

                search_results.append(d)

        return search_results

In [4]:
# Index layout: an identifier, the stemmed sentence text, and two numeric
# fields named start/end. Every field is stored so hits can return its value.
schema = Schema(**{
    "id": ID(stored=True),
    "sentence": TEXT(analyzer=StemmingAnalyzer(), stored=True),
    "start": NUMERIC(stored=True),
    "end": NUMERIC(stored=True),
})

In [5]:
# Load the records to index from the JSON file. The context manager closes
# the handle even if reading or parsing fails, and the explicit encoding keeps
# the read independent of the platform default.
with open("225.json", encoding="utf-8") as json_file:
    docs = json.load(json_file)
docs

[{'sentence': 'Yeah.', 'start': 3, 'end': 3},
 {'sentence': 'Okay.', 'start': 3, 'end': 25},
 {'sentence': 'Welcome back to CS 225 and perhaps that overly Randy intake on forest development.',
  'start': 25,
  'end': 33},
 {'sentence': "With that, we're going to move into trees today.",
  'start': 32,
  'end': 36},
 {'sentence': "So we assume that you have a background we're sort of reviewing, but in part I want to call out some of the differences in how in this class we're going to talk about tree is theoretically from what you've seen before.",
  'start': 35,
  'end': 52},
 {'sentence': "And it's because we care about moving from just a theoretical description of a tree to something we're actually going to implement.",
  'start': 52,
  'end': 62},
 {'sentence': 'It changes a couple of things.', 'start': 61, 'end': 65},
 {'sentence': 'So first of all, how do we define a binary tree?',
  'start': 64,
  'end': 70},
 {'sentence': 'Sort of what is our formal definition?',
  'start': 70,
 

In [6]:
# Build the in-memory search engine for the schema and index the loaded records.
# NOTE(review): the 'id' values are only assigned to docs in a later cell
# (In [9]), so the documents indexed here carry no 'id' — confirm whether the
# id assignment should run before this cell.
engine = SearchEngine(schema)
engine.index_documents(docs)


In [7]:
# Schema fields the sample queries below are matched against.
fields_to_search = ["sentence"]



In [8]:
# Run each sample query against the index and print its highlighted hits,
# followed by a separator line.
for q in ("left child", "node"):
    print(f"Query:: {q}")
    hits = engine.query(q, fields_to_search, highlight=True)
    print("\t", hits)
    print("-" * 70)

Query:: left child
	 [{'sentence': 'So there\'s a <b class="match term0">left</b> <b class="match term1">child</b> of the root and a <b class="match term0">left</b> <b class="match term1">child</b> of the <b class="match term0">left</b> <b class="match term1">child</b>', 'start': 2444, 'end': 2456}, {'sentence': 'A <b class="match term0">left</b> <b class="match term1">child</b> by a right <b class="match term1">child</b>', 'start': 2482, 'end': 2484}, {'sentence': 'so, I just have one <b class="match term0">left</b>-<b class="match term1">child</b> of the second node...the <b class="match term0">left</b> subtree', 'start': 1293, 'end': 1300}, {'sentence': 'well, too here has a <b class="match term0">left</b> <b class="match term1">child</b>', 'start': 1187, 'end': 1197}]
----------------------------------------------------------------------
Query:: node
	 [{'sentence': 'A single <b class="match term0">node</b> is a tree of height 0...contains one <b class="match term0">node</b>', 'sta

In [9]:
# Tag every record, in place, with its list position as a string 'id'.
# NOTE(review): this runs after the records were indexed in In [6], so the
# existing index does not contain these ids.
for idx, item in enumerate(docs):
    item.update(id=str(idx))

In [10]:
# Display the records again; each one now also carries the 'id' added above.
docs

[{'sentence': 'Yeah.', 'start': 3, 'end': 3, 'id': '0'},
 {'sentence': 'Okay.', 'start': 3, 'end': 25, 'id': '1'},
 {'sentence': 'Welcome back to CS 225 and perhaps that overly Randy intake on forest development.',
  'start': 25,
  'end': 33,
  'id': '2'},
 {'sentence': "With that, we're going to move into trees today.",
  'start': 32,
  'end': 36,
  'id': '3'},
 {'sentence': "So we assume that you have a background we're sort of reviewing, but in part I want to call out some of the differences in how in this class we're going to talk about tree is theoretically from what you've seen before.",
  'start': 35,
  'end': 52,
  'id': '4'},
 {'sentence': "And it's because we care about moving from just a theoretical description of a tree to something we're actually going to implement.",
  'start': 52,
  'end': 62,
  'id': '5'},
 {'sentence': 'It changes a couple of things.',
  'start': 61,
  'end': 65,
  'id': '6'},
 {'sentence': 'So first of all, how do we define a binary tree?',
  'start':