In [3]:
# Install requirements if needed (uncomment below in Colab/Jupyter)
# !pip install langchain langchain-community langchain-chroma pandas requests python-dotenv

from dotenv import load_dotenv
import os
import pandas as pd
import requests

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain.embeddings.base import Embeddings
from langchain_core.documents import Document



In [4]:
class GeminiEmbeddings(Embeddings):
    def __init__(self, api_key=None):
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
        self.url = "https://generativelanguage.googleapis.com/v1beta/models/embedding-001:embedContent"

    def embed_documents(self, texts):
        embeddings = []
        for text in texts:
            resp = requests.post(
                self.url,
                headers={
                    "x-goog-api-key": self.api_key,
                    "Content-Type": "application/json"
                },
                json={
                    "content": {
                        "parts": [{"text": text}]
                    }
                }
            )
            resp.raise_for_status()
            data = resp.json()
            # Gemini returns a list of embeddings under 'embedding'
            embedding = data["embedding"]["values"]
            embeddings.append(embedding)
        return embeddings

    def embed_query(self, text):
        return self.embed_documents([text])[0]


In [5]:
# -- Load environment variables --
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
assert GEMINI_API_KEY, "Please set GEMINI_API_KEY in your .env file"


In [6]:
# -- Data Load --
books = pd.read_csv('books_cleaned.csv')


In [7]:
# -- Load documents and split --
documents = []
with open('tagged_description.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line: continue
        first_space = line.find(' ')
        if first_space == -1: continue
        isbn = line[:first_space].strip('"')
        description = line[first_space+1:].strip()
        documents.append(Document(page_content=description, metadata={"isbn13": isbn}))


In [18]:
# Define the directory where the vector store will be saved
persist_directory = "./chroma_db"

# Check if the database already exists
gemini_embeddings = GeminiEmbeddings(api_key=GEMINI_API_KEY)
if os.path.exists(persist_directory):
    # Load the existing database
    print("Loading existing Chroma vector store...")
    db_books = Chroma(
        persist_directory=persist_directory,
        embedding_function=gemini_embeddings  # Pass the embedding function again
    )
    print("Chroma vector store loaded.")
else:
    # Build the database from scratch and save it
    print("Building and saving new Chroma vector store...")
    db_books = Chroma.from_documents(
        documents,
        embedding=gemini_embeddings,
        persist_directory=persist_directory
    )
    print("Vector store built and saved.")

# -- Semantic Recommendation Function --
# ... rest of your code

Loading existing Chroma vector store...
Chroma vector store loaded.


In [27]:
from langchain_community.embeddings import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

db_books = Chroma.from_documents(
    documents,
    embedding=huggingface_embeddings
)

  huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [28]:
query = "a book to teach children about nature"

docs = db_books.similarity_search(query, k=10)

docs

[Document(id='c3e1847f-60a8-43a8-9396-c93a85f2e1c7', metadata={'isbn13': '9780786808069'}, page_content='Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(id='8b718361-0a29-4cd0-ba99-cefe1879af75', metadata={'isbn13': '9780786808380'}, page_content="Introduce your babies to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to expose little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies each image, introducing the baby to some basic -- and sometimes playful -- information about the subjects."),
 Document(id='8ba991d2-de45-4c75-aab4-4d3ce702691a', metadata={'isbn13': '9780786808397'}, page_content="Introduce your 

In [30]:
# Correctly access the isbn13 from the metadata
isbn_to_find = int(docs[0].metadata['isbn13'])

# Filter the DataFrame using the correct ISBN from metadata
books[books['isbn13'] == isbn_to_find]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069 Children will discover the excit...


In [33]:
def retrieve_semantic_recommendation(
        query: str,
        top_k: int = 10,
) -> pd.DataFrame:
    recs = db_books.similarity_search(query, k=50)
    books_list = []
    
    # Iterate through the recommended documents
    for rec in recs:
        # Correctly get the ISBN from the document's metadata
        isbn = rec.metadata.get("isbn13")
        if isbn:
            books_list.append(int(isbn))

    # Convert the isbn13 column in your DataFrame to numeric
    books["isbn13"] = pd.to_numeric(books["isbn13"], errors='coerce')
    
    # Filter the DataFrame based on the list of ISBNs
    recommended_books_df = books[books["isbn13"].isin(books_list)]
    
    return recommended_books_df.head(top_k)

In [34]:
retrieve_semantic_recommendation("A book with cats")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
161,9780060578121,0060578122,Cat Breaking Free,Shirley Rousseau Murphy,Fiction,http://books.google.com/books/content?id=-yFbm...,The fur starts flying when a gang from L.A. co...,2006.0,4.2,375.0,204.0,Cat Breaking Free : A Joe Grey Mystery,9780060578121 The fur starts flying when a gan...
203,9780060736255,0060736259,Weetzie Bat,Francesca Lia Block,Juvenile Fiction,http://books.google.com/books/content?id=vxCXx...,Fifteen years ago Francesca Lia Block made a d...,2004.0,3.74,128.0,12771.0,Weetzie Bat,9780060736255 Fifteen years ago Francesca Lia ...
223,9780060775858,0060775858,Goodnight Moon 60th Anniversary Edition,Margaret Wise Brown,Juvenile Fiction,http://books.google.com/books/content?id=lLYOr...,"In a great green room, tucked away in bed, is ...",2005.0,4.27,32.0,264013.0,Goodnight Moon 60th Anniversary Edition,"9780060775858 In a great green room, tucked aw..."
226,9780060777333,0060777338,The Cat's Pajamas,Ray Bradbury,Fiction,http://books.google.com/books/content?id=Xb6MU...,From the winner of the National Book Foundatio...,2005.0,3.73,234.0,1316.0,The Cat's Pajamas : Stories,9780060777333 From the winner of the National ...
343,9780061015625,0061015628,Cat Laughing Last,Shirley Rousseau Murphy,Fiction,http://books.google.com/books/content?id=KwDSl...,Fans of Lillian Jackson Braun and Rita Mae Bro...,2002.0,4.21,368.0,638.0,Cat Laughing Last : A Joe Grey Mystery,9780061015625 Fans of Lillian Jackson Braun an...
354,9780061056000,0061056006,Cat on the Edge,Shirley Rousseau Murphy,Fiction,http://books.google.com/books/content?id=G0yvy...,It's been quite a week for Joe Grey. First the...,1996.0,3.9,274.0,1855.0,Cat on the Edge : A Joe Grey Mystery,9780061056000 It's been quite a week for Joe G...
355,9780061059476,0061059471,Cat in the Dark,Shirley Rousseau Murphy,Fiction,http://books.google.com/books/content?id=wz1vl...,"""Of course I worry. What if the cops witness a...",1999.0,4.25,320.0,696.0,Cat in the Dark : A Joe Grey Mystery,"9780061059476 ""Of course I worry. What if the ..."
368,9780061127762,0061127760,Charlotte's Web Signature Edition,E. B. White,Juvenile Fiction,http://books.google.com/books/content?id=oi9BP...,This is the story of a little girl named Fern ...,2006.0,4.16,224.0,226.0,Charlotte's Web Signature Edition,9780061127762 This is the story of a little gi...
415,9780064406307,006440630X,The Midwife's Apprentice (rpkg),Karen Cushman,Juvenile Fiction,http://books.google.com/books/content?id=Bhm76...,"'Like Cushman's 1995 Newbery Honor Book, Cathe...",1996.0,3.72,128.0,35319.0,The Midwife's Apprentice (rpkg),9780064406307 'Like Cushman's 1995 Newbery Hon...
433,9780064441766,0064441768,Captain Cat,Syd Hoff,Juvenile Fiction,http://books.google.com/books/content?id=sHz4s...,"A patriotic feline, Captain Cat springs out of...",1994.0,3.66,48.0,107.0,Captain Cat,"9780064441766 A patriotic feline, Captain Cat ..."
