In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# embeddings
from sentence_transformers import SentenceTransformer
# chroma
import chromadb
from chromadb.config import Settings
# Notebook-friendly display
from IPython.display import display, HTML

In [3]:
# Configuration: every tunable value for the notebook lives in this cell.
DATA_CSV = "books_cleaned.csv"          # cleaned book catalogue read into `df`
EMBED_CACHE = "embeddings.npy"          # file name reserved for cached embeddings
ID_CACHE = "ids.npy"                    # file name reserved for the matching ids
# NOTE(review): absolute, machine-specific Windows path; anyone else running
# the notebook must change it to a directory that exists on their machine.
PERSIST_DIR = r"D:\book-recommender\chroma_db"
COLLECTION_NAME = "books_collection"    # name of the Chroma collection
BATCH_SIZE = 32                         # descriptions embedded per batch
# Hugging Face model id. The organisation prefix is spelled with a hyphen
# ("sentence-transformers"); the underscore spelling is not a valid repo id.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

In [4]:
# Load the cleaned book catalogue. The path comes from DATA_CSV in the
# configuration cell, so the file location is defined in exactly one place.
df = pd.read_csv(DATA_CSV)
print("Number of books:", len(df))
# Last expression of the cell: renders the first rows as a rich table.
df.head()

Number of books: 5197


Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,A NOVEL THAT READERS and critics have been eag...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web:A Novel,A new 'Christie for Christmas' -- a full-lengt...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"A memorable, mesmerizing heroine Jennifer -- b..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,Lewis' work on the nature of love divides love...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"""In The Problem of Pain, C.S. Lewis, one of th..."


In [5]:
import warnings

# Install the blanket "ignore" filter *before* loading the model so that it
# also covers any warnings raised while the model is being loaded; the filter
# stays active for every later cell as well.
warnings.filterwarnings("ignore")

# Sentence-embedding model used to turn book descriptions into vectors.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [6]:
from chromadb import PersistentClient

# Persistent client: Chroma keeps its data on disk under PERSIST_DIR.
client = PersistentClient(path=PERSIST_DIR)

# Open the collection if it already exists, otherwise create it. The name
# comes from COLLECTION_NAME in the configuration cell.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    # "hnsw:space": "cosine" asks Chroma to compare vectors by cosine
    # distance (how closely two vectors point in the same direction).
    # HNSW (Hierarchical Navigable Small World) is the graph index Chroma
    # uses for approximate nearest-neighbour search.
    metadata={"hnsw:space": "cosine"},
)

In [7]:
# Chroma takes plain Python lists, so pull the needed columns out of `df`.
# isbn13 is cast to str because it is used as the record id.
ids = df["isbn13"].astype(str).to_list()
documents = df["tagged_description"].to_list()
titles = df["title_and_subtitle"].to_list()

In [8]:
# Embed the descriptions and store them in Chroma, one batch at a time so
# that only BATCH_SIZE descriptions are encoded per model call.
batch_size = BATCH_SIZE
for i in tqdm(range(0, len(documents), batch_size)):
    batch_docs = documents[i:i + batch_size]
    batch_ids = ids[i:i + batch_size]
    batch_titles = titles[i:i + batch_size]

    # Encode the batch; .tolist() converts the result into plain nested
    # lists before it is handed to Chroma.
    embeddings = model.encode(batch_docs).tolist()

    # upsert (insert-or-update) instead of add: the collection is persisted
    # on disk, so re-running this cell must not try to insert ids that are
    # already stored from a previous run.
    collection.upsert(
        documents=batch_docs,
        embeddings=embeddings,
        ids=batch_ids,
        metadatas=[{"title": t} for t in batch_titles],
    )

  0%|          | 0/163 [00:00<?, ?it/s]

In [9]:
# Semantic search over the stored book descriptions.
def recommend_books(query, n=5):
    """
    Search for the top-n books whose descriptions are most similar to the query.

    The query is embedded with the notebook-level `model`, i.e. the same
    encoder that produced the vectors stored in `collection`, and the lookup
    is done with that embedding. This keeps query and document vectors in the
    same embedding space regardless of the collection's own embedding function.

    Parameters:
    query (str): Your search query
    n (int): Number of top results to return

    Returns:
    list of dict: one entry per hit, in the order returned by the collection,
        with keys 'title' (taken from the stored metadata) and 'description'
        (the first 200 characters of the stored document).
    """
    # encode() takes a list of texts; wrap the single query accordingly.
    query_embeddings = model.encode([query]).tolist()
    results = collection.query(query_embeddings=query_embeddings, n_results=n)

    # Results are nested per query; only one query was sent, hence index 0.
    hit_docs = results['documents'][0]
    hit_metas = results['metadatas'][0]

    return [
        {'title': meta['title'], 'description': doc[:200]}
        for doc, meta in zip(hit_docs, hit_metas)
    ]

In [10]:
# Example search: show the three closest matches for a free-text query.
books = recommend_books("Love and Hate", n=3)
for book in books:
    title, snippet = book['title'], book['description']
    print("Title:", title)
    # end="\n\n" leaves one blank line between consecutive results.
    print("Description:", snippet, "...", end="\n\n")

Title: The Four Loves
Description: Lewis' work on the nature of love divides love into four categories; Affection, Friendship, Eros and Charity. The first three come naturally to humanity. Charity, however, the Gift-love of God, is div ...

Title: The Art of Loving
Description: The fiftieth Anniversary Edition of the groundbreaking international bestseller that has shown millions of readers how to achieve rich, productive lives by developing their hidden capacities for love  ...

Title: Sputnik Sweetheart
Description: Twenty two year old, Sumire is in love for the first time with a woman seventeen years her senior, Miu. Surprised that she might, after all, be a lesbian, Sumire spends hours on the phone talking to h ...



In [11]:
import os

# Sanity check: show what Chroma has written to the persistence directory.
persisted_entries = os.listdir(PERSIST_DIR)
print(persisted_entries)


['0b20c569-6376-4a21-851c-64873e740fad', 'chroma.sqlite3']
