In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.docstore.document import Document

In [2]:
from dotenv import load_dotenv
# Populate the process environment before any cell reads it: EMBEDDING_MODEL
# and CHROMA_DB_DIR are fetched with os.getenv further down.
# NOTE(review): presumably this picks up a local .env file — confirm where it lives.
load_dotenv()

True

In [None]:
import pandas as pd
# Cleaned book catalogue; the later cells embed its "tagged_description" column
# and look rows up by "isbn13".
books=pd.read_csv("../data/books_cleaned.csv")

In [4]:
# Preview the text that gets embedded: each value starts with the ISBN-13
# followed by the book description.
books["tagged_description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5192    9788172235222 On A Train Journey Home To North...
5193    9788173031014 This book tells the tale of a ma...
5194    9788179921623 Wisdom to Create a Life of Passi...
5195    9788185300535 This collection of the timeless ...
5196    9789027712059 Since the three volume edition o...
Name: tagged_description, Length: 5197, dtype: object

In [5]:
# Write one tagged description per row to a plain-text file (no header, no
# index); the next cell reads this file back with TextLoader.
# NOTE(review): the Series has a single column, so the separator is never
# written between fields and sep="\n" looks unnecessary. Newer Python csv
# implementations may reject a newline delimiter — confirm on the target runtime.
books["tagged_description"].to_csv("tagged_description.txt",
                                   sep="\n",
                                   header=False,
                                   index=False)

In [6]:
# Text-file route: load the exported descriptions and prepare a newline
# splitter. Neither object feeds the Documents built below; the split call
# stays disabled.
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=0, chunk_overlap=0)
# documents = text_splitter.split_documents(raw_documents)
from langchain.docstore.document import Document


def _book_to_document(book_row):
    """Wrap one row of ``books`` in a Document carrying its ISBN-13 and title."""
    return Document(
        page_content=str(book_row["tagged_description"]),
        metadata={"isbn13": book_row["isbn13"], "title": book_row["title"]},
    )


# One Document per catalogue row, in DataFrame order.
documents = [_book_to_document(book_row) for _, book_row in books.iterrows()]


In [7]:
# Sanity check: show the first Document (isbn13/title metadata plus the tagged
# description as page content).
documents[0]

Document(metadata={'isbn13': 9780002005883, 'title': 'Gilead'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s det

In [8]:
import os
# NOTE(review): `sentence_transformers` and `keras` are not referenced by name in
# the visible cells — presumably imported so a missing backend fails early; confirm.
import sentence_transformers
import tf_keras as keras

# Read the embedding model name and the Chroma persistence directory from the
# environment. os.getenv returns None for an unset variable and no fallback is
# applied here, so both values are passed on as-is.
model_name = os.getenv("EMBEDDING_MODEL")
chroma_dir = os.getenv("CHROMA_DB_DIR")

# Load the Hugging Face embedding model
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# Embed every Document and store the vectors in a Chroma vector store.
# NOTE(review): with a persist directory set, re-running this cell may add the
# same documents to an already-populated store again — confirm and guard if so.
db_books = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=chroma_dir
)




  embedding_model = HuggingFaceEmbeddings(model_name=model_name)


In [9]:
# Debug output: show which environment configuration was picked up and the
# concrete type of the vector store object.
print("Model name:", os.getenv("EMBEDDING_MODEL"))
print("Chroma dir:", os.getenv("CHROMA_DB_DIR"))
print(type(db_books))

Model name: sentence-transformers/all-MiniLM-L6-v2
Chroma dir: ./chroma_books
<class 'langchain_chroma.vectorstores.Chroma'>


In [10]:
def retrieve_semantic_recomendations(
        query: str,
        top_k: int = 10,
        initial_top_k: int = 50,
) -> pd.DataFrame:
    """Return up to ``top_k`` rows of ``books`` for ``query``, best match first.

    The ISBN-13 is read from the first whitespace-separated token of each
    hit's ``page_content`` (surrounding double quotes are stripped first).
    The misspelled function name is kept so existing callers keep working.

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Maximum number of rows to return.
        initial_top_k: Size of the candidate pool requested from ``db_books``;
            raised to ``top_k`` when smaller so the request can be satisfied.

    Returns:
        The matching rows of ``books`` in the order the vector store returned
        them, with repeated ISBNs collapsed to their first (best) position.
    """
    recs = db_books.similarity_search(query, k=max(initial_top_k, top_k))

    # Map each ISBN to the position of its first appearance in the hit list.
    rank_by_isbn = {}
    for rec in recs:
        tokens = rec.page_content.strip('"').split()
        try:
            isbn = int(tokens[0])
        except (IndexError, ValueError):
            # A hit without a leading ISBN cannot be mapped to a catalogue row.
            continue
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    matches = books[books["isbn13"].isin(list(rank_by_isbn))]

    # isin() yields rows in catalogue order; restore the similarity order
    # before truncating, otherwise head() keeps arbitrary candidates.
    similarity_rank = matches["isbn13"].map(rank_by_isbn)
    ordered = matches.iloc[similarity_rank.to_numpy().argsort(kind="stable")]

    return ordered.head(top_k)

In [14]:
def retrieve_semantic_recommendations(query: str, top_k: int = 20) -> pd.DataFrame:
    """Recommend up to ``top_k`` books for ``query``.

    Requests ``top_k * 20`` candidates from the ``db_books`` vector store and
    keeps the rows of ``books`` whose ``isbn13`` appears in the candidates'
    metadata. Rows are ordered by a keyword score (2 points when the
    lower-cased query occurs in the title, 1 point when it occurs in the
    description), then by ``average_rating`` and ``ratings_count``, all
    descending.

    Note: a later cell defines a function with this same name; once that cell
    has run, its definition is the one that gets called.
    """
    hits = db_books.similarity_search(query, k=top_k * 20)

    # Hits whose metadata lacks an ISBN cannot be mapped back to a catalogue row.
    wanted_isbns = [
        int(hit.metadata["isbn13"]) for hit in hits if "isbn13" in hit.metadata
    ]
    pool = books[books["isbn13"].isin(wanted_isbns)]

    needle = query.lower()

    def _keyword_boost(book) -> int:
        title_points = 2 if needle in str(book["title"]).lower() else 0
        description_points = 1 if needle in str(book["description"]).lower() else 0
        return title_points + description_points

    pool = pool.copy()
    pool["score"] = pool.apply(_keyword_boost, axis=1)
    ranked = pool.sort_values(
        by=["score", "average_rating", "ratings_count"],
        ascending=[False, False, False],
    )

    return ranked.drop(columns=["score"]).head(top_k)

In [18]:
def retrieve_semantic_recommendations(
    query: str,
    top_k: int = 20,
    candidate_multiplier: int = 30,
) -> pd.DataFrame:
    """Recommend up to ``top_k`` books for ``query``.

    Candidates come from the ``db_books`` vector store and are mapped to rows
    of ``books`` through the ``isbn13`` stored in each hit's metadata. The
    candidates are then re-ranked by a keyword score, with ``average_rating``
    and ``ratings_count`` as tie-breakers (all descending).

    Args:
        query: Free-text description of the kind of book wanted.
        top_k: Maximum number of rows to return.
        candidate_multiplier: The vector store is asked for
            ``top_k * candidate_multiplier`` hits so the re-ranking has a
            wider pool to choose from.

    Returns:
        Rows of ``books`` (same columns) in ranked order, at most ``top_k``.
    """
    recs = db_books.similarity_search(query, k=top_k * candidate_multiplier)

    # Hits without an ISBN in their metadata cannot be tied to a catalogue row.
    isbns = {int(rec.metadata["isbn13"]) for rec in recs if "isbn13" in rec.metadata}
    candidates = books[books["isbn13"].isin(list(isbns))]

    query_lower = query.lower()

    def _mentions_query(column: str) -> pd.Series:
        """1 where the column's text contains the query (case-insensitive), else 0."""
        values = candidates[column]
        found = values.astype(str).str.lower().str.contains(
            query_lower, regex=False, na=False
        )
        # Missing values stringify to "nan"/"None"; never count those as a match.
        return (found & values.notna()).astype(int)

    # A title match weighs twice as much as a description match.
    match_score = 2 * _mentions_query("title") + _mentions_query("description")

    # assign() works on a copy, so the global `books` frame is left untouched;
    # the temporary column name avoids clobbering a real "score" column.
    ranked = (
        candidates.assign(_match_score=match_score)
        .sort_values(
            by=["_match_score", "average_rating", "ratings_count"],
            ascending=False,
        )
        .drop(columns=["_match_score"])
    )

    return ranked.head(top_k)

In [19]:
retrieve_semantic_recommendations("a heartwarming journey of love and friendship")

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
4619,9781563893339,1563893339,Death,Neil Gaiman;Chris Bachalo;Mark Buckingham;Mike...,Comics & Graphic Novels,http://books.google.com/books/content?id=VIdbP...,"A tale of music, mortality, friendship and dea...",1997.0,4.22,95.0,12206.0,The Time of Your Life: Death,"9781563893339 A tale of music, mortality, frie..."
5023,9781852864989,1852864982,Death,Neil Gaiman,Death (Fictitious character : Gaiman),http://books.google.com/books/content?id=5XnXO...,"A tale of music, mortality, friendship and dea...",1994.0,4.18,104.0,37678.0,The High Cost of Living: Death,"9781852864989 A tale of music, mortality, frie..."
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897 Lewis' work on the nature of lov...
2186,9780440235163,440235162,The Smoke Jumper,Nicholas Evans,Fiction,http://books.google.com/books/content?id=Ta--z...,"A woman must choose between two men, best frie...",2002.0,4.06,576.0,28164.0,The Smoke Jumper,9780440235163 A woman must choose between two ...
370,9780061129735,61129739,The Art of Loving,Erich Fromm,Self-Help,http://books.google.com/books/content?id=TRMED...,The fiftieth Anniversary Edition of the ground...,2006.0,4.03,192.0,35605.0,The Art of Loving,9780061129735 The fiftieth Anniversary Edition...
71,9780007189953,7189958,Where Rainbows End,Cecelia Ahern,Friendship,http://books.google.com/books/content?id=PA7t6...,The new warm and absorbing story from the auth...,2004.0,3.94,454.0,642.0,Where Rainbows End,9780007189953 The new warm and absorbing story...
3811,9780802130365,802130364,The Malady of Death,Marguerite Duras,Fiction,http://books.google.com/books/content?id=ZDHyw...,A man hires a woman to spend several weeks wit...,1988.0,3.86,60.0,1280.0,The Malady of Death,9780802130365 A man hires a woman to spend sev...
1870,9780385721875,385721870,The Age of Grief,Jane Smiley,Fiction,http://books.google.com/books/content?id=uJNpA...,The luminous novella and stories in The Age of...,2002.0,3.82,224.0,1341.0,The Age of Grief,9780385721875 The luminous novella and stories...
301,9780060930318,60930314,Identity,Milan Kundera,Fiction,http://books.google.com/books/content?id=mXPU2...,There are situations in which we fail for a mo...,1999.0,3.68,168.0,13065.0,A Novel: Identity,9780060930318 There are situations in which we...
101,9780060175641,60175648,Identity,Milan Kundera,Fiction,http://books.google.com/books/content?id=D30Ex...,Milan Kundera's Identity translated from the F...,1998.0,3.68,176.0,260.0,A Novel: Identity,9780060175641 Milan Kundera's Identity transla...
