# Search Lib

# Why this library?

Because we need to inspect the individual components of our search pipeline at a finer granularity, iterate on them faster, and share the results.

## Downloading the AnKing Deck

In [1]:
from dotenv import load_dotenv
# Read variables from a local .env file into the process environment
# (presumably the API credentials needed by later cells -- TODO confirm).
# The `True` displayed below is load_dotenv's return value.
load_dotenv()

True

In [2]:
from search_lib.load_data import download_sqlite_file, get_notes
import pickle

In [3]:
# Download the deck's SQLite file; per the output, the call logs which deck
# it fetches and returns the local Path of the downloaded file.
db_path = download_sqlite_file()
db_path

Downloading SQLite file for deck: AnKing Step Deck


Path('AnKing Step Deck.sqlite')

In [4]:
# Load the notes from the SQLite file. Per the sample shown below, each note
# is a dict with at least an 'id' and a 'content' key, where 'content' is raw
# HTML that still contains Anki cloze markers such as {{c1::...}}.
notes = get_notes(db_path)
notes[0]

Extracted 28664 notes with content


{'id': '10d7cd51-565b-44d6-9858-30cb6ebf51c4',
 'content': 'For instructions on <b>using this deck</b>&nbsp;and&nbsp;<b>using/customizing the note type</b>&nbsp;go to <a href="https://community.ankihub.net/t/wiki-anking-overhaul-for-step-1-2-by-ankingmed/114092?_gl=1*15fqpnu*_ga*MTA1OTEyNjc5Ni4xNjkxODA1MTI5*_ga_T2ZF93TKF6*MTcwNzQxMzQwOS45OTcuMS4xNzA3NDE5NTEwLjAuMC4w">the wiki guide</a><br><br>{{c1::Please note that all organization for this deck is in the <b>tags</b>! There are <u><a href="https://community.ankihub.net/t/why-are-there-no-subdecks-in-the-anking-v12-deck/129271">no subdecks</a></u>}}<br><br>If you are wondering why there are <u>0 cards showing up</u>, see this <a href="https://community.ankihub.net/t/why-does-my-anking-step-deck-have-0-cards/131517">post for the answer</a> Huge thank you to all the&nbsp;<a href="https://community.ankihub.net/docs?topic=134040">contributors &amp; maintainers</a>!&nbsp;<br>To see what is being updated, check the&nbsp;<a href="https://commu

## Pre Search

This module holds anything that happens prior to the search across notes, such as query pre-processing, tag enrichment, etc.

You can use a search function from the search module

In [5]:
from search_lib.pre_search import decompose_query

In [6]:
# Sample clinical passage; it is reused as the query for the search demos below.
qry = '''Acute myocardial infarction (AMI) is characterized by sudden chest pain, often described as pressure or tightness, that may radiate to the left arm or jaw. The condition occurs when blood flow to a part of the heart is blocked, typically due to a ruptured atherosclerotic plaque, leading to myocardial cell death. Diagnosis is confirmed through ECG changes, elevated cardiac biomarkers like troponin, and clinical presentation.'''
# Break the passage into standalone sub-questions; `.queries` holds them as a
# list of strings (see the output below).
decompose_query(qry, verbose=False).queries

['What are the symptoms of acute myocardial infarction (AMI)?',
 'What causes acute myocardial infarction (AMI)?',
 'How is the diagnosis of acute myocardial infarction (AMI) confirmed?']

## Search

This module holds everything that performs the main search across all notes.

### Dense

In [7]:
from search_lib.search import dense_search, embed_cohere
import os

In [8]:
# Re-inspect the first note: its 'content' is raw HTML, and that raw string is
# what gets collected into `notes_content` and embedded in the cells below.
notes[0]

{'id': '10d7cd51-565b-44d6-9858-30cb6ebf51c4',
 'content': 'For instructions on <b>using this deck</b>&nbsp;and&nbsp;<b>using/customizing the note type</b>&nbsp;go to <a href="https://community.ankihub.net/t/wiki-anking-overhaul-for-step-1-2-by-ankingmed/114092?_gl=1*15fqpnu*_ga*MTA1OTEyNjc5Ni4xNjkxODA1MTI5*_ga_T2ZF93TKF6*MTcwNzQxMzQwOS45OTcuMS4xNzA3NDE5NTEwLjAuMC4w">the wiki guide</a><br><br>{{c1::Please note that all organization for this deck is in the <b>tags</b>! There are <u><a href="https://community.ankihub.net/t/why-are-there-no-subdecks-in-the-anking-v12-deck/129271">no subdecks</a></u>}}<br><br>If you are wondering why there are <u>0 cards showing up</u>, see this <a href="https://community.ankihub.net/t/why-does-my-anking-step-deck-have-0-cards/131517">post for the answer</a> Huge thank you to all the&nbsp;<a href="https://community.ankihub.net/docs?topic=134040">contributors &amp; maintainers</a>!&nbsp;<br>To see what is being updated, check the&nbsp;<a href="https://commu

In [9]:
# Collect just the text payload of every note, in the same order as `notes`.
notes_content = [note['content'] for note in notes]

In [10]:
# Cache the document embeddings on disk so the (slow) embedding step is not
# repeated on every run of the notebook.
embeddings_fpath = 'cohere_embeddings.pkl'
embeddings = None
if os.path.exists(embeddings_fpath):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself; never one obtained from elsewhere.
    with open(embeddings_fpath, 'rb') as f:
        embeddings = pickle.load(f)
    # A cache built from a different version of the deck no longer has one
    # embedding per note, so search results would point at the wrong notes
    # (presumably rows are matched to notes by position -- TODO confirm).
    # Discard such a stale cache and rebuild it below.
    if len(embeddings) != len(notes_content):
        embeddings = None
if embeddings is None:
    embeddings = embed_cohere(notes_content, verbose=True, input_type='search_document')
    with open(embeddings_fpath, 'wb') as f:
        pickle.dump(embeddings, f)

In [11]:
# Sanity check: the first dimension should equal the number of notes.
embeddings.shape

(28664, 1024)

In [12]:
# Run the dense search for the sample query against the precomputed note
# embeddings. With verbose=True the log below shows an embedding step for the
# query followed by the search step.
res = dense_search(qry, notes, embeddings=embeddings, verbose=True)

2025-05-08 14:50:36.094046 : Embedding 1 documents with embed-english-v3.0...


100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.13it/s]

2025-05-08 14:50:36.258734 : Searching...





In [13]:
# Copy the hits in res[0] (presumably the results for the single query passed
# to dense_search -- TODO confirm) and order them from most to least similar.
sorted_results = list(res[0])
sorted_results.sort(key=lambda hit: hit["similarity"], reverse=True)

# Show the five best-matching notes, each followed by a short divider line.
for hit in sorted_results[:5]:
    print(hit['content'], '-'*5, sep='\n')

<div><b>Myocardial infarction</b> often presents with severe <i>crushing </i>chest pain <u>lasting {{c1::&gt; 20}} minutes</u> that radiates to the {{c2::<b>left arm</b>}} or {{c2::<b>jaw</b>}} </div> Other symptoms include <b>diaphoresis</b>, <b>nausea</b>, vomiting, severe retrosternal pain, <b>shortness of breath</b>, and fatigue     <div><i><img src="paste-296039210812552.jpg"></i></div><br> <img src="bde56c5d21fe142edba03359678155de.webp"><img src="1342fb2a84d994e8e5ae58fee322ea02.webp"><img src="1bc6e745a3cbf3b2a79dd3ba4d184c41.webp"><br><a href="https://dashboard.sketchy.com/study/medical/courses/medical-pathophysiology/units/medical-pathophysiology-cardiac/videos/medical-pathophysiology-cardiac-ischemic-heart-disease-acute-myocardial-infarction-and-post-mi-timeline?utm_source=anki&amp;utm_medium=partnership&amp;utm_campaign=february_update&amp;utm_content=medical">Watch Acute Myocardial Infarction &amp; Post MI Timeline</a> <img src="65402f0d7d58bdf6c07550e6fc9a4101.webp"><br><

### Sparse

In [14]:
from search_lib.search import sparse_search, tokenize_notes

In [15]:
# Tokenize all notes; verbose=True prints a timestamped progress log.
tokens = tokenize_notes(notes, verbose=True)

2025-05-08 14:50:37.636270 : Tokenizing 28664 notes...


100%|████████████████████████████████████████████| 28664/28664 [00:01<00:00, 16505.89it/s]


In [27]:
# Inspect the first element of `tokens`; the output shows a Tokenized record
# with `ids` and `vocab` fields.
tokens[0]

Tokenized(ids=[[0, 1, 2, 3, 3, 1, 4, 5, 6, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 13, 25, 26, 26, 27, 28, 5, 29, 30, 2, 31, 8, 9, 10, 11, 12, 32, 33, 14, 34, 2, 35, 33, 26, 26, 36, 37, 32, 38, 39, 40, 41, 8, 9, 10, 11, 12, 32, 42, 43, 14, 16, 2, 44, 38, 45, 46, 47, 48, 49, 36, 29, 3, 8, 9, 10, 11, 12, 50, 51, 52, 53, 54, 55, 3, 26, 41, 56, 57, 58, 59, 3, 8, 9, 10, 11, 12, 14, 16, 2, 60, 61, 62, 60, 61, 26, 63, 64, 65, 8, 9, 10, 11, 12, 66, 64, 67, 10, 3, 26, 26, 36, 68, 69, 70, 28, 59, 3, 8, 9, 10, 11, 12, 14, 16, 2, 71, 72, 73, 72, 3, 74, 29, 75, 76, 77, 72, 78, 79, 26, 26, 80, 36, 81, 82, 83, 14, 5, 84, 2, 85, 86, 87, 88, 89, 3, 90, 91, 92, 93, 94, 95, 90, 3, 3, 90, 91, 92, 93, 94, 95, 90, 3, 29, 96, 97, 85, 98, 98, 99, 36, 88, 100, 3, 90, 91, 92, 93, 94, 95, 90, 3, 101, 102, 103, 3, 90, 91, 92, 93, 94, 95, 90, 3, 101, 29, 104, 96, 36, 88, 97, 100, 3, 8, 9, 105, 12, 106, 107, 108, 14, 5, 84, 109, 3, 110, 111, 112, 113, 87, 114, 115, 87, 116]], vocab={

In [19]:
# FIXME: this call currently fails with `TypeError: unhashable type: 'list'`
# while BM25S builds its vocabulary (see the output below), so the notebook
# does not survive Restart & Run All.
# NOTE(review): `tokens` from tokenize_notes above is never passed in. Check
# whether sparse_search expects pre-tokenized input or a list of content
# strings rather than the note dicts.
# NOTE(review): this rebinds `res`, which previously held the dense results.
res = sparse_search(qry, notes)

BM25S Create Vocab:   0%|          | 0/28664 [00:00<?, ?it/s]

TypeError: unhashable type: 'list'

You can use the components to create your own.

## Post Search

This holds anything that happens after the primary search, such as reranking.

In [None]:
from search_lib.post_search import cohere_reranker, llm_reranker

ModuleNotFoundError: No module named 'search_lib.query_processor'