# Reverse search for embeddings

Prerequisites:
- Base filtered dataset is generated (./create-filtered-dataset-1m.ipynb)
- Weaviate and embedding servers are up (./infra/docker-compose.yml)
- A `.env` file is created with the Weaviate credentials (the client cell below reads `WEAVIATE_ENDPOINT` from it)

## Embed the queries

In [2]:
%pip install datasets weaviate-client python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Load the keyphrase/query table; later cells read its 'clean_keyphrase'
# and 'avg_score' columns.
df = pd.read_parquet('queries_80_20.parquet/')

In [3]:
import requests
import numpy as np

# Endpoint of the embedding server; the helpers below POST {'input': ...} to it.
EMB_URL = "http://localhost:8080/openai"

def create_embedding(url, input_text):
    """Embed a single text and return its vector as a NumPy array.

    POSTs ``{'input': input_text}`` as JSON to ``url`` and reads the
    ``embedding`` field of the first entry in the reply's ``data`` list.

    Args:
        url: Address of the embedding endpoint.
        input_text: The text to embed.

    Returns:
        ``np.ndarray`` built from the returned embedding list.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(url, json={'input': input_text})
    # Fail loudly on HTTP errors; otherwise a failed request only shows up
    # as a confusing KeyError / JSON decoding error on the next line.
    response.raise_for_status()
    list_embedding = response.json()["data"][0]["embedding"]
    return np.array(list_embedding)

def create_batch_embedding(url, input_texts):
    """Embed several texts in one request and return their vectors.

    POSTs ``{'input': input_texts}`` as JSON to ``url`` and collects the
    ``embedding`` field of every entry in the reply's ``data`` list, in the
    order the server returned them.

    Args:
        url: Address of the embedding endpoint.
        input_texts: List of texts to embed.

    Returns:
        ``np.ndarray`` built from the list of returned embeddings.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(url, json={'input': input_texts})
    # Fail loudly on HTTP errors; otherwise a failed request only shows up
    # as a confusing KeyError / JSON decoding error on the next line.
    response.raise_for_status()
    list_embedding = [item["embedding"] for item in response.json()["data"]]
    return np.array(list_embedding)

In [4]:
from tqdm import tqdm

def batch_indices(iterable, batch_size):
    """Yield consecutive slices of ``iterable`` holding up to ``batch_size`` items.

    ``iterable`` must support ``len()`` and slicing. The last slice is
    shorter when the length is not a multiple of ``batch_size``.
    """
    total = len(iterable)
    for start in range(0, total, batch_size):
        # Clamp the end so the final slice never runs past the data.
        stop = min(start + batch_size, total)
        yield iterable[start:stop]

def batch_apply(df_column, batch_size, func, url):
    """Apply ``func`` to ``df_column`` batch by batch and gather the results.

    Args:
        df_column: Sliceable column whose slices expose ``.tolist()``
            (it is called with a DataFrame column in this notebook).
        batch_size: Maximum number of items passed to ``func`` per call.
        func: Callable invoked as ``func(url, list_of_items)``; its result
            is iterated and appended item by item.
        url: Forwarded unchanged as the first argument of ``func``.

    Returns:
        A flat list with one entry per item yielded by the ``func`` results.
    """
    results = []
    # Ceiling division: a trailing partial batch is still a batch. Floor
    # division here made the progress bar overrun its total (e.g. "16it"
    # against a total of 15).
    total_batches = -(-len(df_column) // batch_size)
    for batch in tqdm(batch_indices(df_column, batch_size), total=total_batches):
        results.extend(func(url, batch.tolist()))
    return results

## Reverse documents

Let's test this by embedding [Project Gutenberg's 1000 most popular works](https://www.gutenberg.org/browse/scores/top):

In [5]:
from datasets import load_dataset

# Load the cleaned CSV from the dataset repo and convert its 'train' split
# to a pandas DataFrame.
dataset = load_dataset("jkeisling/project-gutenberg-top-books-oct-2023", data_files="project-gutenberg-top-1k-fixed-cleaned.csv")
books_df = dataset['train'].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Reconstruct the "<title> by <author>" string; this 'document' column is the
# text that gets embedded in the next cell.
books_df['document'] = books_df['Title'] + ' by ' + books_df['Author']

In [6]:
# Embed every 'document' string in batches of 64 via batch_apply and store one
# vector per row in 'embedding'. This is admittedly contrived, since the
# dataset is already embedded upstream, but this is the source of those embeddings.
batch_size = 64
books_df['embedding'] = batch_apply(books_df['document'], batch_size, create_batch_embedding, EMB_URL)

16it [00:00, 46.66it/s]                        


Let's persist the embeddings. _Fiat lux!_

In [7]:
%load_ext dotenv
%dotenv
import weaviate
import os

# Create the Weaviate client. The endpoint comes from the WEAVIATE_ENDPOINT
# environment variable (loaded from .env by the %dotenv magic above).
# NOTE(review): timeout_config=(2, 15) is presumably (connect, read) timeouts
# in seconds -- confirm against the weaviate-client documentation.
client = weaviate.Client(
    url = os.environ.get('WEAVIATE_ENDPOINT'),
    timeout_config=(2, 15)
)

In [8]:
# Create the "Document" class on first run only. "vectorizer": "none" because
# the vectors are supplied explicitly when the objects are uploaded.
document_class_exists = client.schema.exists("Document")
if not document_class_exists:
    class_obj = {"class": "Document", "vectorizer": "none"}
    client.schema.create_class(class_obj)

In [9]:
from tqdm import tqdm

client.batch.configure(batch_size=1000)

# Walk the four needed columns in lockstep instead of building a Series per row.
book_fields = zip(
    books_df['Title'],
    books_df['Author'],
    books_df['document'],
    books_df['embedding'],
)
with client.batch as batch:
    for title, author, document, embedding in tqdm(book_fields, total=len(books_df)):
        batch.add_data_object(
            data_object={
                "title": title,
                "author": author,
                "document": document,
            },
            class_name="Document",
            # Each stored embedding is converted to a plain list for upload.
            vector=embedding.tolist(),
        )

  0%|          | 0/1000 [00:00<?, ?it/s]


KeyError: 'embedding'

In [10]:
def search_documents(client, query, top_k=10, schema="Document"):
    """Print the ``top_k`` objects of class ``schema`` nearest to ``query``.

    The query is embedded with the retrieval-instruction prefix, then sent
    to Weaviate as a near-vector search that returns ``title`` and ``author``.

    Args:
        client: Weaviate client used to run the query.
        query: Free-text search string.
        top_k: Maximum number of results to request.
        schema: Name of the Weaviate class to search.
    """
    # Python f-strings interpolate with "{query}"; "${query}" (JavaScript
    # template syntax) would embed a literal "$" in front of every query.
    embedding = create_embedding(EMB_URL, f"Represent this sentence for searching relevant passages: {query}")
    response = (
        client.query
        .get(schema, ["title", "author"])
        .with_near_vector({
            "vector": embedding.tolist(),
        })
        .with_limit(top_k)
        .do()
    )
    print(response)

def search_keyphrases(client, query, top_k=10, is_query=True):
    """Return the raw Weaviate response for the ``top_k`` nearest keyphrases.

    When ``is_query`` is true the retrieval-instruction prefix is prepended
    to ``query`` before embedding; otherwise ``query`` is embedded as given.
    The search requests the ``keyphrase`` and ``avg_score`` properties of
    the "Keyphrase" class.
    """
    prefix = ""
    if is_query:
        prefix = "Represent this sentence for searching relevant passages: "

    near_vector = {"vector": create_embedding(EMB_URL, prefix + query).tolist()}

    keyphrase_query = client.query.get("Keyphrase", ["keyphrase", "avg_score"])
    keyphrase_query = keyphrase_query.with_near_vector(near_vector)
    keyphrase_query = keyphrase_query.with_limit(top_k)
    return keyphrase_query.do()

In [11]:
# Sanity check: print the nearest books for a one-word topical query.
search_documents(client, "religion")

{'data': {'Get': {'Document': [{'author': 'William James', 'title': 'The Varieties of Religious Experience: A Study in Human Nature'}, {'author': 'Nietzsche', 'title': 'The Twilight of the Idols; or, How to Philosophize with the Hammer. The Antichrist'}, {'author': 'Émile Durkheim', 'title': 'Les formes élémentaires de la vie religieuse. English'}, {'author': 'Anonymous', 'title': 'Doctrina Christiana'}, {'author': 'G. K. Chesterton', 'title': 'Orthodoxy'}, {'author': 'Anonymous', 'title': 'The King James Version of the Bible'}, {'author': 'Albert Gallatin Mackey', 'title': 'The Symbolism of Freemasonry'}, {'author': 'David Hume', 'title': 'Dialogues Concerning Natural Religion'}, {'author': 'Thomas Inman and M.R.C.S.E. John Newton', 'title': 'Ancient Pagan and Modern Christian Symbolism'}, {'author': 'M. E. Billings', 'title': 'Crimes of Preachers in the United States and Canada'}]}}}


In [11]:
# Second sanity check with a different one-word query.
search_documents(client, "Evil")

{'data': {'Get': {'Document': [{'author': 'Friedrich Wilhelm Nietzsche', 'title': 'The Antichrist'}, {'author': 'Ambrose Bierce', 'title': "The Devil's Dictionary"}, {'author': 'Nietzsche', 'title': 'The Twilight of the Idols; or, How to Philosophize with the Hammer. The Antichrist'}, {'author': 'Friedrich Wilhelm Nietzsche', 'title': 'Beyond Good and Evil'}, {'author': 'Chester S. Geier', 'title': 'The Venus Evil'}, {'author': 'Moncure Daniel Conway', 'title': 'Demonology and Devil-lore'}, {'author': 'Kurt Vonnegut', 'title': '2 B R 0 2 B'}, {'author': 'Herman Melville', 'title': 'Bartleby, the Scrivener: A Story of Wall-Street'}, {'author': 'Washington Irving', 'title': 'Rip Van Winkle'}, {'author': 'Anonymous', 'title': 'The King James Version of the Bible'}]}}}


In [23]:
 # Persist the books (including the 'embedding' column) to parquet, then
# preview the first ten rows.
books_df.to_parquet('books.parquet')
books_df.head(10)

Unnamed: 0,Title,Author,Downloads,document,embedding
0,A Christmas Carol in Prose; Being a Ghost Stor...,Charles Dickens,70650,A Christmas Carol in Prose; Being a Ghost Stor...,"[0.0016174316, -0.0793457, 0.036315918, -0.024..."
1,Pride and Prejudice,Jane Austen,59636,Pride and Prejudice by Jane Austen,"[-0.025238037, -0.020614624, 0.0023059845, -0...."
2,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,56171,"Frankenstein; Or, The Modern Prometheus by Mar...","[-0.01550293, -0.038726807, 0.018920898, -0.02..."
3,Alice's Adventures in Wonderland,Lewis Carroll,28040,Alice's Adventures in Wonderland by Lewis Carroll,"[-0.017807007, -0.033721924, 0.03161621, -0.04..."
4,The Adventures of Sherlock Holmes,Arthur Conan Doyle,23345,The Adventures of Sherlock Holmes by Arthur Co...,"[0.004436493, -0.021759033, 0.014953613, -0.05..."
5,A Tale of Two Cities,Charles Dickens,23253,A Tale of Two Cities by Charles Dickens,"[-0.03189087, -0.058746338, 0.017105103, -0.01..."
6,The Scarlet Letter,Nathaniel Hawthorne,22293,The Scarlet Letter by Nathaniel Hawthorne,"[0.01763916, -0.018432617, 0.03918457, 0.00870..."
7,A Modest Proposal,Jonathan Swift,22171,A Modest Proposal by Jonathan Swift,"[0.012031555, -0.029891968, 0.041748047, -0.01..."
8,"Moby Dick; Or, The Whale",Herman Melville,22024,"Moby Dick; Or, The Whale by Herman Melville","[0.015167236, -0.04067993, 0.022979736, -0.018..."
9,St. Benedict’s Rule for Monasteries,Anonymous,21228,St. Benedict’s Rule for Monasteries by Anonymous,"[-0.048553467, 0.0017318726, 0.035614014, -0.0..."


## Full embed upload

In [16]:
# Set the Weaviate batch size to 128 for the uploads that follow. This cell
# only configures the batcher; the keyphrase embedding and upload happen in a
# later cell.
from tqdm import tqdm

client.batch.configure(batch_size=128)

<weaviate.batch.crud_batch.Batch at 0x7fb8dfd8b7d0>

Now let's try reverse search.

In [20]:
# Start from an empty "Keyphrase" class: drop any previous copy (and whatever
# it holds), then recreate it with no server-side vectorizer.
keyphrase_class = "Keyphrase"
if client.schema.exists(keyphrase_class):
    client.schema.delete_class(keyphrase_class)

class_obj = {"class": keyphrase_class, "vectorizer": "none"}
client.schema.create_class(class_obj)


In [21]:
import math

# Embed every keyphrase in `df` (the whole table, not a sample) in chunks of
# 64 and upload each one, with its vector, to the "Keyphrase" class.
KEYPHRASE_CHUNK_SIZE = 64
n_chunks = math.ceil(len(df) / KEYPHRASE_CHUNK_SIZE)

with client.batch as batch:
    for start in tqdm(range(0, len(df), KEYPHRASE_CHUNK_SIZE), total=n_chunks):
        # Positional slice; unlike np.array_split on a DataFrame this
        # triggers no deprecation warning and needs no up-front list of chunks.
        chunk = df.iloc[start:start + KEYPHRASE_CHUNK_SIZE]

        # Keyphrases are embedded with the retrieval-instruction prefix.
        embeddings = create_batch_embedding(EMB_URL, ("Represent this sentence for searching relevant passages: " + chunk['clean_keyphrase']).tolist())

        # Pair each row with its vector directly. Nothing is written back
        # onto the chunk, so there is no temporary 'embedding' column to
        # delete afterwards to keep memory in check.
        rows = zip(
            chunk['clean_keyphrase'].tolist(),
            chunk['avg_score'].tolist(),
            embeddings.tolist(),
        )
        for keyphrase, avg_score, vector in rows:
            batch.add_data_object(
                data_object={
                    "keyphrase": keyphrase,
                    "avg_score": avg_score,
                },
                class_name="Keyphrase",
                vector=vector,
            )

  return bound(*args, **kwds)
100%|██████████| 15629/15629 [29:59<00:00,  8.69it/s]


In [28]:
# Reverse search: embed a document-style passage without the query prefix
# (is_query=False) and fetch the 50 nearest stored keyphrases.
search_keyphrases(client, 'For over 25 years, Intesys has helped medium and large businesses digitally transform their processes through design, development, and implementation with open architecture enterprise applications. Intesys, a Digital Transformation Partner, based in Italy, offers specialized solutions for their unique clients in all different industries.', 50, is_query=False)

{'data': {'Get': {'Keyphrase': [{'avg_score': 470,
     'keyphrase': 'digital transformation solutions'},
    {'avg_score': 2479.1666666666665, 'keyphrase': 'digital transformation'},
    {'avg_score': 481.6666666666667, 'keyphrase': 'business transformation'},
    {'avg_score': 1.25, 'keyphrase': 'esys company'},
    {'avg_score': 622.5, 'keyphrase': 'cloud data integration solutions'},
    {'avg_score': 867.5, 'keyphrase': 'small business software systems'},
    {'avg_score': 36.666666666666664, 'keyphrase': 'digital office systems'},
    {'avg_score': 689.5833333333334, 'keyphrase': 'enterprise software'},
    {'avg_score': 0.4166666666666667,
     'keyphrase': 'systems development company'},
    {'avg_score': 1776.6666666666667, 'keyphrase': 'esolutions'},
    {'avg_score': 32147.083333333332,
     'keyphrase': 'business cloud integration service'},
    {'avg_score': 1264.5833333333333,
     'keyphrase': 'enterprise data integration'},
    {'avg_score': 0.4166666666666667, 'keyphra

## Addendum: Book clustering

Let's cluster them all, for funsies.

In [42]:
%pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting numba>=0.51.2 (from umap-learn)
  Downloading numba-0.58.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tbb>=2019.0 (from umap-learn)
  Downloading tbb-2021.10.0-py2.py3-none-manylinux1_x86_64

In [9]:
import sklearn

# Let's cluster the books
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Create a dataframe with just the embeddings: one row per book, one column
# per embedding component.
X = pd.DataFrame(books_df['embedding'].tolist())

# Reduce dimensionality with UMAP down to 32 components before clustering.
# NOTE(review): no random_state is passed to UMAP, while the KMeans cells below
# use random_state=10 -- the reduced coordinates may differ between runs;
# confirm whether that is intended.
import umap
reducer = umap.UMAP(n_components=32)
X = reducer.fit_transform(X)

NameError: name 'books_df' is not defined

In [50]:
# Create a range of cluster sizes to try: 2, 4, 8, 16, 32
cluster_range = [2**x for x in range(1, 6)]

# For each cluster size, fit a KMeans model on the reduced embeddings and
# print the silhouette score (fixed random_state so KMeans itself is repeatable)
for n_clusters in cluster_range:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 2 The average silhouette_score is : 0.37293


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 4 The average silhouette_score is : 0.32302183


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 8 The average silhouette_score is : 0.32309246


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 16 The average silhouette_score is : 0.35084388


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 32 The average silhouette_score is : 0.40294448


In [55]:
# Apply KMeans with 16 clusters (same random_state as the sweep above)
clusterer = KMeans(n_clusters=16, random_state=10)
cluster_labels = clusterer.fit_predict(X)

# Add cluster labels to dataframe, one label per book row
books_df['cluster'] = cluster_labels

  super()._check_params_vs_input(X, default_n_init=10)


In [56]:
# Number of books assigned to each cluster.
books_df["cluster"].value_counts()

cluster
4     108
12    107
7      82
2      81
13     76
0      70
1      62
14     62
6      62
10     54
3      51
9      51
5      47
8      38
11     37
15     12
Name: count, dtype: int64

In [1]:
# Peek at the first ten "title by author" strings in cluster 3.
books_df[books_df["cluster"] == 3]["document"].head(10)

NameError: name 'books_df' is not defined

In [60]:
# Drop the 'embedding' column and save the rest (including the cluster labels)
# to CSV without the dataframe index.
books_df.drop(columns=['embedding']).to_csv('books_with_clusters.csv', index=False)