# Reverse search for embeddings
## Embed the queries

In [1]:
# Install the `datasets` package; `load_dataset` is imported from it later in this notebook.
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd

# Load the query table; the preview further down shows `clean_keyphrase` and `avg_score` columns.
# NOTE(review): the trailing slash suggests a parquet dataset directory rather than a single file — confirm.
df = pd.read_parquet('queries_80_20.parquet/')

In [2]:
# HTTP client setup for the embedding server.
# NOTE(review): nothing in this cell actually pings the server; a request is only
# sent once one of the helper functions defined below is called.
import requests
import numpy as np

# Endpoint that the embedding helpers POST to.
EMB_URL = "http://localhost:8080/openai"

def create_embedding(url, input_text):
    """Embed a single text via the embedding server.

    Args:
        url: Endpoint to POST to (e.g. ``EMB_URL``).
        input_text: Text to embed; sent as the ``input`` field of the JSON body.

    Returns:
        A NumPy array built from the ``embedding`` of the first entry in the
        response's ``data`` list.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(url, json={'input': input_text})
    # Fail loudly on HTTP errors instead of a confusing KeyError on "data" below.
    response.raise_for_status()
    return np.array(response.json()["data"][0]["embedding"])

def create_batch_embedding(url, input_texts):
    """Embed several texts with a single request to the embedding server.

    Args:
        url: Endpoint to POST to (e.g. ``EMB_URL``).
        input_texts: List of texts; sent as the ``input`` field of the JSON body.

    Returns:
        A NumPy array holding the ``embedding`` of every entry in the
        response's ``data`` list, in the order the server returned them.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.post(url, json={'input': input_texts})
    # Fail loudly on HTTP errors instead of a confusing KeyError on "data" below.
    response.raise_for_status()
    embeddings = [item["embedding"] for item in response.json()["data"]]
    return np.array(embeddings)

In [3]:
from tqdm import tqdm

def batch_indices(iterable, batch_size):
    """Yield consecutive slices of ``iterable`` with at most ``batch_size`` items each.

    Despite the name, the slices themselves are yielded, not index values.
    ``iterable`` must support ``len()`` and slice indexing.
    """
    total = len(iterable)
    yield from (
        iterable[start:min(start + batch_size, total)]
        for start in range(0, total, batch_size)
    )

def batch_apply(df_column, batch_size, func, url):
    """Apply ``func`` to ``df_column`` one batch at a time and flatten the results.

    Args:
        df_column: Sliceable column (e.g. a pandas Series) whose slices expose ``.tolist()``.
        batch_size: Maximum number of items handed to ``func`` per call.
        func: Callable invoked as ``func(url, list_of_items)``; its return value
            must be iterable.
        url: Passed through to ``func`` unchanged.

    Returns:
        A flat list containing the items of every batch result, in batch order.
    """
    results = []
    # Ceiling division: a trailing partial batch still counts as a batch, so the
    # progress-bar total must include it (floor division under-reports by one).
    total_batches = -(-len(df_column) // batch_size)
    for batch in tqdm(batch_indices(df_column, batch_size), total=total_batches):
        results.extend(func(url, batch.tolist()))
    return results

In [4]:
# Preview the first rows of the query table.
df.head()

Unnamed: 0,clean_keyphrase,avg_score
0,100 free death record search,0.416667
1,1610a,2.083333
2,18 cargo trailer,0.833333
3,1973 jeep cj5 specifications,0.416667
4,1a sehen de,0.416667


Let's experiment on a subset of the queries. This takes a jiffy:

In [30]:
# Embed a fixed-seed random sample of 100k queries as a quick sanity check
df_subset = df.sample(int(1e5), random_state=42)
# Batch apply 64 at a time
batch_size = 64
df_subset['embedding'] = batch_apply(df_subset['clean_keyphrase'], batch_size, create_batch_embedding, EMB_URL)

  0%|          | 0/1562 [00:00<?, ?it/s]

1563it [00:25, 61.97it/s]                          


## Reverse documents

Let's test this by embedding [Project Gutenberg's 1000 most popular works](https://www.gutenberg.org/browse/scores/top):

In [5]:
from datasets import load_dataset

# Load the cleaned top-1k CSV from the `jkeisling/project-gutenberg-top-books-oct-2023` dataset
dataset = load_dataset("jkeisling/project-gutenberg-top-books-oct-2023", data_files="project-gutenberg-top-1k-fixed-cleaned.csv")
# Convert the 'train' split to a pandas DataFrame for the steps below
books_df = dataset['train'].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Reconstruct "title by author" format; this string is what gets embedded for each book.
# NOTE(review): a missing Title or Author would presumably propagate as a missing `document` — confirm there are no nulls.
books_df['document'] = books_df['Title'] + ' by ' + books_df['Author']

In [31]:
# Embed all documents using batch apply, 64 documents per request.
# Yes, I know, this is contrived since the dataset is already embedded upstream, but this is the source.
batch_size = 64
books_df['embedding'] = batch_apply(books_df['document'], batch_size, create_batch_embedding, EMB_URL)

16it [00:00, 43.02it/s]                        


In [23]:
 # Persist the books frame (including the embedding column) to parquet
books_df.to_parquet('books.parquet')
# Preview the first 10 rows
books_df.head(10)

Unnamed: 0,Title,Author,Downloads,document,embedding
0,A Christmas Carol in Prose; Being a Ghost Stor...,Charles Dickens,70650,A Christmas Carol in Prose; Being a Ghost Stor...,"[0.0016174316, -0.0793457, 0.036315918, -0.024..."
1,Pride and Prejudice,Jane Austen,59636,Pride and Prejudice by Jane Austen,"[-0.025238037, -0.020614624, 0.0023059845, -0...."
2,"Frankenstein; Or, The Modern Prometheus",Mary Wollstonecraft Shelley,56171,"Frankenstein; Or, The Modern Prometheus by Mar...","[-0.01550293, -0.038726807, 0.018920898, -0.02..."
3,Alice's Adventures in Wonderland,Lewis Carroll,28040,Alice's Adventures in Wonderland by Lewis Carroll,"[-0.017807007, -0.033721924, 0.03161621, -0.04..."
4,The Adventures of Sherlock Holmes,Arthur Conan Doyle,23345,The Adventures of Sherlock Holmes by Arthur Co...,"[0.004436493, -0.021759033, 0.014953613, -0.05..."
5,A Tale of Two Cities,Charles Dickens,23253,A Tale of Two Cities by Charles Dickens,"[-0.03189087, -0.058746338, 0.017105103, -0.01..."
6,The Scarlet Letter,Nathaniel Hawthorne,22293,The Scarlet Letter by Nathaniel Hawthorne,"[0.01763916, -0.018432617, 0.03918457, 0.00870..."
7,A Modest Proposal,Jonathan Swift,22171,A Modest Proposal by Jonathan Swift,"[0.012031555, -0.029891968, 0.041748047, -0.01..."
8,"Moby Dick; Or, The Whale",Herman Melville,22024,"Moby Dick; Or, The Whale by Herman Melville","[0.015167236, -0.04067993, 0.022979736, -0.018..."
9,St. Benedict’s Rule for Monasteries,Anonymous,21228,St. Benedict’s Rule for Monasteries by Anonymous,"[-0.048553467, 0.0017318726, 0.035614014, -0.0..."


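Now let's try reverse search: instead of finding documents for a query, we start from a document's embedding and look up the queries whose embeddings are closest to it.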

In [13]:
from scipy.spatial.distance import cdist

def search(query_embedding, all_embeddings, top_k=16):
    """Return the positions of the ``top_k`` rows of ``all_embeddings`` closest to the query.

    Closeness is cosine distance (smaller is closer); positions are ordered
    from nearest to farthest.
    """
    # cdist wants 2-D inputs, so wrap the single query as a one-row batch.
    cosine_distances = cdist([query_embedding], all_embeddings, metric='cosine')
    ranking = np.argsort(cosine_distances)
    return ranking[0][:top_k]

def retrieve_documents(indices, df, column='clean_keyphrase'):
    """Return the values of ``column`` at the given positional row indices.

    Args:
        indices: Positional (``iloc``) row indices, e.g. the output of ``search``.
        df: DataFrame to read from.
        column: Name of the column to return. Defaults to ``'clean_keyphrase'``,
            the column this function originally always read.

    Returns:
        A list of the selected values, in the order given by ``indices``.
    """
    # Select the column first so the positional lookup does not copy every column.
    return df[column].iloc[indices].tolist()

In [32]:
# Stack the per-row query embeddings into a single array so `search` can hand them to cdist in one call.
all_query_embeddings = np.vstack(df_subset['embedding'].values)

In [33]:
# Reverse search: take the first book's embedding and list the 32 sampled queries closest to it.
retrieve_documents(search(books_df.iloc[0]["embedding"], all_query_embeddings, top_k=32), df_subset)

['christmas carol book',
 'christmas poem',
 'christmas carol george c scott',
 'charles dickens biography',
 'politics prose',
 'scrooge',
 'caroling',
 'great expectations',
 'moral story',
 'lyrics christmas songs',
 'theme moral story',
 'poem written iambic pentameter',
 'wordsworth',
 'stephen king',
 'last christmas',
 'tom sawyer book',
 'creative writing',
 'tone literature',
 'nat king cole christmas song',
 'christmas chronicles',
 'james joyce',
 'free literature book',
 'christmas puns',
 'tall tale',
 'short narrative',
 'f scott fitzgerald',
 'finding father christmas',
 'yuletide',
 'nathaniel hawthorne',
 'william blake',
 'mark twain',
 'lament son book']

## Addendum: Book clustering

Let's cluster them all, for funsies.

In [42]:
# Install umap-learn, which provides the `umap` module imported in the clustering cell below.
%pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting numba>=0.51.2 (from umap-learn)
  Downloading numba-0.58.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tbb>=2019.0 (from umap-learn)
  Downloading tbb-2021.10.0-py2.py3-none-manylinux1_x86_64

In [9]:
import sklearn

# Let's cluster the books
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Expand the per-book embedding vectors into a frame with one column per dimension
book_embeddings = pd.DataFrame(books_df['embedding'].tolist())

# Reduce dimensionality with UMAP before clustering.
# UMAP is stochastic, so pin the seed; otherwise the reduced coordinates (and
# every cluster assignment derived from them) change on each run.
import umap
reducer = umap.UMAP(n_components=32, random_state=42)
X = reducer.fit_transform(book_embeddings)

NameError: name 'books_df' is not defined

In [50]:
# Create a range of cluster sizes to try: 2, 4, 8, 16, 32
cluster_range = [2**x for x in range(1, 6)]

# For each cluster size, fit a KMeans model and print the silhouette score
for n_clusters in cluster_range:
    # n_init is passed explicitly (10, the default the recorded warning reports)
    # so results do not shift when the library default changes, and the
    # per-fit warning goes away.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 2 The average silhouette_score is : 0.37293


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 4 The average silhouette_score is : 0.32302183


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 8 The average silhouette_score is : 0.32309246


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 16 The average silhouette_score is : 0.35084388


  super()._check_params_vs_input(X, default_n_init=10)


For n_clusters = 32 The average silhouette_score is : 0.40294448


In [55]:
# Apply KMeans with 16 clusters.
# n_init is passed explicitly (10, the default the recorded warning reports) so
# the fit does not change when the library default does.
clusterer = KMeans(n_clusters=16, n_init=10, random_state=10)
cluster_labels = clusterer.fit_predict(X)

# Add cluster labels to dataframe
books_df['cluster'] = cluster_labels

  super()._check_params_vs_input(X, default_n_init=10)


In [56]:
# Number of books assigned to each cluster.
books_df["cluster"].value_counts()

cluster
4     108
12    107
7      82
2      81
13     76
0      70
1      62
14     62
6      62
10     54
3      51
9      51
5      47
8      38
11     37
15     12
Name: count, dtype: int64

In [57]:
# Peek at the first 10 documents assigned to cluster 3.
books_df.loc[books_df["cluster"] == 3, "document"].head(10)

37           The Wonderful Wizard of Oz by L. Frank Baum
40                          The Prophet by Kahlil Gibran
61     Baron Trump's Marvellous Underground Journey b...
79                  The Turn of the Screw by Henry James
81                    The Jungle Book by Rudyard Kipling
86                          The Jungle by Upton Sinclair
173             The King in Yellow by Robert W. Chambers
177           The Wind in the Willows by Kenneth Grahame
197                   Just So Stories by Rudyard Kipling
221    The Pilgrim's Progress from this world to that...
Name: document, dtype: object

In [60]:
# Drop the embedding column and write the remaining columns to CSV, without the row index
books_df.drop(columns=['embedding']).to_csv('books_with_clusters.csv', index=False)