In [1]:
import os
import sys

# Put the parent directory on the import path so that the `elastic` package
# (located at ../elastic) can be imported from this notebook.
parent_dir = os.path.join(os.curdir, os.pardir)
sys.path.append(os.path.abspath(parent_dir))

See [`elastic/vectorizer.py`](../elastic/vectorizer.py) for the implementation of the `TFIDFVectorizer` class.

In [2]:
from elastic.vectorizer import TFIDFVectorizer
from elasticsearch import Elasticsearch

# Smoke test: build a vectorizer against the Elasticsearch instance at
# localhost:9200 and compute the TF-IDF weights of a short piece of text.
client = Elasticsearch(
    "http://localhost:9200",
    request_timeout=1000,
)
vectorizer = TFIDFVectorizer(
    client,
    index_name="arxiv",
    analyzer_name="custom",
)
# Keep the call as the last expression so the notebook displays the resulting weights.
vectorizer.tf_idf_text("machine learning")

{'learning': 3.1174318645927173, 'machine': 4.130636908016368}

In [3]:
# Search parameters shared by the experiments in the cells below.
index_name, analyzer_name = "arxiv", "custom"
query_string = "machine learning"
r = 10  # Number of results to return

## Naive implementation

In [4]:
from elasticsearch.helpers import scan
from pprint import pprint
from tqdm import tqdm
import math

from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200", request_timeout=1000)
print(f"Executing search of query string '{query_string}' with over documents in index '{index_name}'")
vectorizer = TFIDFVectorizer(client, index_name=index_name, analyzer_name=analyzer_name)

# Compute the query vector and scale it to unit length, so that the scores below are true
# cosine similarities (the relative ranking would be the same without this step).
query_vector = vectorizer.tf_idf_text(query_string)
query_norm = math.sqrt(sum(weight**2 for weight in query_vector.values()))
if query_norm == 0:
    # Fail fast with a clear message instead of a ZeroDivisionError (or a long, pointless scan).
    raise ValueError(f"Query string '{query_string}' has no weighted terms; cannot compute cosine similarities")
query_vector_normalized = {token: weight / query_norm for token, weight in query_vector.items()}

# Naive implementation:
# Scan through docs, compute cosine sim between query and each doc
similarities = {}
for doc in tqdm(scan(client, index=index_name, query={"_source": False}), total=vectorizer.doc_count()):
    docid = doc["_id"]
    doc_tf_idf = vectorizer.tf_idf_document(docid)
    doc_tf_idf_norm = math.sqrt(sum(weight**2 for weight in doc_tf_idf.values()))
    # Dot product over the query terms only: terms missing from the document contribute 0.
    dot_product = 0.0
    for token, weight in query_vector_normalized.items():
        dot_product += weight * doc_tf_idf.get(token, 0)
    # A document without any weighted term has a zero norm; score it 0.0 rather than
    # aborting the whole scan with a ZeroDivisionError.
    similarities[docid] = dot_product / doc_tf_idf_norm if doc_tf_idf_norm else 0.0

# now sort by cosine similarity
sorted_answer = sorted(similarities.items(), key=lambda kv: kv[1], reverse=True)
pprint(sorted_answer[:r])

Executing search of query string 'machine learning' with over documents in index 'arxiv'


100%|██████████| 58102/58102 [42:42<00:00, 22.67it/s]

[('30749', 0.499339916162954),
 ('29106', 0.4550250906242585),
 ('25230', 0.45238527762709785),
 ('30639', 0.4255856152643423),
 ('18989', 0.3944888658727178),
 ('53877', 0.3944888658727178),
 ('37272', 0.3767314656563681),
 ('55842', 0.3767314656563681),
 ('26955', 0.35511371168748196),
 ('35642', 0.3518888496131795)]





## Query Term-First Implementation

We have to first create the inverted index for the documents. We'll store, for each term, a list of documents that contain
that term, along with the TF-IDF weight of that term in the document.

We'll ignore any compression optimizations for simplicity. This won't cause any issues, as it will still fit in memory.

Another advantage of the inverted-index approach is that we only need to build the index once; after that, we can store it on disk
and reuse it for future queries.

To ease the computation, we'll use async calls. See [`elastic/vectorizer.py`](../elastic/vectorizer.py) for the implementation of the `AsyncTFIDFVectorizer` class.

#### Without caching of TF-IDF norms

This would be the typical implementation. However, cosine similarity also needs the norm of each document vector, and computing those norms requires querying Elasticsearch once per matching document, which makes this approach very inefficient.

In [5]:
from typing import Tuple
from collections import defaultdict
from elasticsearch.helpers import scan
import pickle
import os
import asyncio
from typing import List, Dict
from elasticsearch import AsyncElasticsearch
from elastic.vectorizer import AsyncTFIDFVectorizer
from tqdm import tqdm


async_client = AsyncElasticsearch("http://localhost:9200", request_timeout=1000)
vectorizer = AsyncTFIDFVectorizer(async_client, index_name=index_name, analyzer_name=analyzer_name)
inverted_index: Dict[str, List[Tuple[int, float]]] = defaultdict(list)  # Lists are amortized O(1) for appends
doc_ids: List[str]  # Storing strings in the inverted index would be too memory intensive


client = Elasticsearch("http://localhost:9200", request_timeout=1000)  # We need a sync client for the scan helper
doc_ids = [doc["_id"] for doc in scan(client, index=index_name, query={"_source": False})]


async def process_document(docid: str, doc_index: int):
    doc_tf_idf = await vectorizer.tf_idf_document(docid)
    for term, weight in doc_tf_idf.items():
        inverted_index[term].append((doc_index, weight))
    bar.update(1)


# We don't care about the order of the docIDs (in boolean models we would care, to implement fast merging algorithms)
# so we can just process them in parallel, and append them as they complete.
bar = tqdm(total=len(doc_ids))
tasks = [process_document(docid, doc_index) for doc_index, docid in enumerate(doc_ids)]
await asyncio.gather(*tasks)
bar.close()

inverted_index = dict(inverted_index)  # Convert back to normal dict for pickling

os.makedirs("../data/cache/", exist_ok=True)

# Save the inverted index to disk
with open("../data/cache/inverted_index.pkl", "wb") as f:
    pickle.dump(inverted_index, f)

# Save the document IDs to disk
with open("../data/cache/doc_ids.pkl", "wb") as f:
    pickle.dump(doc_ids, f)

100%|██████████| 58102/58102 [00:29<00:00, 1964.64it/s]


A small pat on my own back: This process would've taken roughly 43 minutes if it weren't for the async calls (that is how long the sequential scan above took). With them, it took 29 seconds.

In [6]:
from pprint import pprint
from collections import defaultdict
import time
import math
from elastic.vectorizer import TFIDFVectorizer


print(f"Executing quick search of query string '{query_string}' with over documents in index '{index_name}'")
start = time.time()
vectorizer = TFIDFVectorizer(client, index_name=index_name, analyzer_name=analyzer_name)
vectorizer_loaded_time = time.time()
print(f"Vectorizer loaded in {vectorizer_loaded_time-start:.2f} seconds")

# Build the query vector and scale it to unit length, so the final scores are cosine similarities.
query_vector = vectorizer.tf_idf_text(query_string)
query_norm = math.sqrt(sum(value**2 for value in query_vector.values()))
query_vector_normalized = {}
for token, value in query_vector.items():
    query_vector_normalized[token] = value / query_norm


query_vector_computed_time = time.time()
print(f"Query vector computed in {query_vector_computed_time-vectorizer_loaded_time:.2f} seconds")

# Term-first strategy: only the postings lists of the query terms are visited,
# and every posting adds its partial product to the score of its document.

# The inverted index and the document ID table come from the on-disk cache.
with open("../data/cache/inverted_index.pkl", "rb") as f:
    inverted_index = pickle.load(f)
with open("../data/cache/doc_ids.pkl", "rb") as f:
    doc_ids = pickle.load(f)

inverted_index_loaded_time = time.time()
print(f"Inverted index loaded in {inverted_index_loaded_time-query_vector_computed_time:.2f} seconds")

similarities = defaultdict(float)
for term, weight in query_vector_normalized.items():
    # Terms absent from the index simply have no postings to walk.
    for doc_index, doc_weight in inverted_index.get(term, ()):
        docid = doc_ids[doc_index]
        similarities[docid] += weight * doc_weight

similarities_aggregated_time = time.time()
print(
    f"Accumulated similarities for {len(similarities)} documents in {similarities_aggregated_time-inverted_index_loaded_time:.2f} seconds"
)

# Divide each accumulated dot product by the norm of its document vector.
# The document vector is fetched through the vectorizer for every matching document.
for docid, dot_product in similarities.items():
    doc_tf_idf = vectorizer.tf_idf_document(docid)
    doc_tf_idf_norm = math.sqrt(sum(value**2 for value in doc_tf_idf.values()))
    similarities[docid] = dot_product / doc_tf_idf_norm

similarities_normalized_time = time.time()
print(f"Normalized similarities in {similarities_normalized_time-similarities_aggregated_time:.2f} seconds")

# Rank the documents from most to least similar.
sorted_answer = list(similarities.items())
sorted_answer.sort(key=lambda kv: kv[1], reverse=True)
response_time = time.time()
print(f"Sorted top {r} results in {response_time-similarities_normalized_time:.2f} seconds")
print(f"Total query time: {response_time-start:.2f} seconds")
pprint(sorted_answer[:r])

Executing quick search of query string 'machine learning' with over documents in index 'arxiv'
Vectorizer loaded in 0.00 seconds
Query vector computed in 0.04 seconds
Inverted index loaded in 2.07 seconds
Accumulated similarities for 7392 documents in 0.00 seconds
Normalized similarities in 325.33 seconds
Sorted top 10 results in 0.01 seconds
Total query time: 327.44 seconds
[('30749', 0.499339916162954),
 ('29106', 0.4550250906242585),
 ('25230', 0.45238527762709785),
 ('30639', 0.4255856152643423),
 ('18989', 0.3944888658727178),
 ('53877', 0.3944888658727178),
 ('37272', 0.3767314656563681),
 ('55842', 0.3767314656563681),
 ('26955', 0.35511371168748196),
 ('35642', 0.3518888496131795)]


Clearly, querying Elasticsearch at query time for the norm of every matching document's TF-IDF vector is very inefficient (it accounted for about 325 of the 327 seconds above), so we'll cache this data as well.

#### With caching of TF-IDF norms

In [7]:
import asyncio
import pickle
from tqdm import tqdm
from typing import List
from elastic.vectorizer import AsyncTFIDFVectorizer
from elasticsearch import AsyncElasticsearch

async_client = AsyncElasticsearch("http://localhost:9200", request_timeout=1000)
vectorizer = AsyncTFIDFVectorizer(async_client, index_name=index_name, analyzer_name=analyzer_name)
vector_norms: List[float] = []  # Again, we'll store floats in a list for memory and access efficiency


with open("../data/cache/doc_ids.pkl", "rb") as f:
    doc_ids: List[str] = pickle.load(f)

bar = tqdm(total=len(doc_ids))


async def compute_norm(docid: str) -> float:
    doc_tf_idf = await vectorizer.tf_idf_document(docid)
    doc_tf_idf_norm = math.sqrt(sum(weight**2 for weight in doc_tf_idf.values()))
    bar.update(1)
    return doc_tf_idf_norm


async def main():
    tasks = [compute_norm(docid) for docid in doc_ids]
    return await asyncio.gather(*tasks)


vector_norms = await main()
bar.close()

with open("../data/cache/vector_norms.pkl", "wb") as f:
    pickle.dump(vector_norms, f)

  0%|          | 0/58102 [00:00<?, ?it/s]Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x00000106BBB1A900>
100%|██████████| 58102/58102 [00:28<00:00, 2023.41it/s]


In [8]:
from collections import defaultdict
import time
import pickle

print(f"Executing quick search of query string '{query_string}' with over documents in index '{index_name}'")
start = time.time()
vectorizer = TFIDFVectorizer(client, index_name=index_name, analyzer_name=analyzer_name)
vectorizer_loaded_time = time.time()
print(f"Vectorizer loaded in {vectorizer_loaded_time-start:.2f} seconds")

# Build the query vector and scale it to unit length, so the final scores are cosine similarities.
query_vector = vectorizer.tf_idf_text(query_string)
query_norm = math.sqrt(sum(value**2 for value in query_vector.values()))
query_vector_normalized = {}
for token, value in query_vector.items():
    query_vector_normalized[token] = value / query_norm


query_vector_computed_time = time.time()
print(f"Query vector computed in {query_vector_computed_time-vectorizer_loaded_time:.2f} seconds")

# Term-first strategy: only the postings lists of the query terms are visited,
# and every posting adds its partial product to the score of its document.

# Everything needed at query time comes from the on-disk cache:
# the inverted index, the document ID table and the precomputed document norms.
with open("../data/cache/inverted_index.pkl", "rb") as f:
    inverted_index = pickle.load(f)
with open("../data/cache/doc_ids.pkl", "rb") as f:
    doc_ids = pickle.load(f)
with open("../data/cache/vector_norms.pkl", "rb") as f:
    vector_norms = pickle.load(f)

inverted_index_loaded_time = time.time()
print(f"Inverted index loaded in {inverted_index_loaded_time-query_vector_computed_time:.2f} seconds")

# Scores are keyed by the document's position in `doc_ids`; the string ID is only
# looked up for the few results that are printed.
similarities = defaultdict(float)
for term, weight in query_vector_normalized.items():
    # Terms absent from the index simply have no postings to walk.
    for doc_index, doc_weight in inverted_index.get(term, ()):
        similarities[doc_index] += weight * doc_weight

similarities_aggregated_time = time.time()
print(
    f"Accumulated similarities for {len(similarities)} documents in {similarities_aggregated_time-inverted_index_loaded_time:.2f} seconds"
)

# Divide each accumulated dot product by the cached norm stored at the same position.
for doc_index, dot_product in similarities.items():
    similarities[doc_index] = dot_product / vector_norms[doc_index]

similarities_normalized_time = time.time()
print(f"Normalized similarities in {similarities_normalized_time-similarities_aggregated_time:.2f} seconds")

# Rank the documents from most to least similar.
sorted_answer = list(similarities.items())
sorted_answer.sort(key=lambda kv: kv[1], reverse=True)
response_time = time.time()
print(f"Sorted top {r} results in {response_time-similarities_normalized_time:.2f} seconds")
print(f"Total query time: {response_time-start:.2f} seconds")
for doc_index, score in sorted_answer[:r]:
    docid = doc_ids[doc_index]
    print(f"({docid}, {score})")

Executing quick search of query string 'machine learning' with over documents in index 'arxiv'
Vectorizer loaded in 0.00 seconds
Query vector computed in 0.05 seconds
Inverted index loaded in 1.27 seconds
Accumulated similarities for 7392 documents in 0.00 seconds
Normalized similarities in 0.00 seconds
Sorted top 10 results in 0.00 seconds
Total query time: 1.32 seconds
(30749, 0.499339916162954)
(29106, 0.4550250906242585)
(25230, 0.45238527762709785)
(30639, 0.4255856152643423)
(18989, 0.3944888658727178)
(53877, 0.3944888658727178)
(37272, 0.3767314656563681)
(55842, 0.3767314656563681)
(26955, 0.35511371168748196)
(35642, 0.3518888496131795)


And voilà, what took about 43 minutes the first time was reduced to 1.32 seconds!

## Directly using Elasticsearch's built-in search

In [9]:
import time

client = Elasticsearch("http://localhost:9200", request_timeout=1000)

# Baseline: let Elasticsearch rank the documents itself with a `match` query on the
# `text` field, timing only the search call.
search_start = time.time()
results = client.search(
    index=index_name,
    body={
        "query": {"match": {"text": query_string}},
        "size": r,
    },
)
search_end = time.time()
print(f"Search finished in {search_end-search_start:.4f} seconds")
print("Results:")
matching_hits = results["hits"]["hits"]
for hit in matching_hits:
    print(f"({hit['_id']}, {hit['_score']})")

Search finished in 0.0216 seconds
Results:
(30749, 9.954108)
(25230, 9.918955)
(29106, 9.918955)
(18989, 9.913799)
(53877, 9.913799)
(32144, 9.854071)
(37272, 9.836382)
(55842, 9.836382)
(30639, 9.817394)
(35642, 9.597559)
